Mirror of https://github.com/microsoft/LightGBM.git
Predefined bin thresholds (#2325)
* Fix bug where small values of max_bin cause crash.
* Revert "Fix bug where small values of max_bin cause crash."
  This reverts commit fe5c8e2547.
* Add functionality to force bin thresholds.
* Fix style issues.
* Use stable sort.
* Minor style and doc fixes.
* Change binning behavior to be same as PR #2342.
* Use different bin finding function for predefined bounds.
* Minor refactoring, overload FindBinWithZeroAsOneBin.
* Fix bug and add new test.
* Add warning when using categorical features with forced bins.
* Pass forced_upper_bounds by reference.
* Pass container types by const reference.
* Get categorical features using FeatureBinMapper.
* Fix bug for small max_bin.
* Move GetForcedBins to DatasetLoader.
* Find forced bins in dataset_loader.
* Minor fixes.
This commit is contained in:
Parent: f2632a6e1e
Commit: cc7a1e2739
docs/Parameters.rst
@@ -414,6 +414,14 @@ Learning Control Parameters

   - see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example

+- ``forcedbins_filename`` :raw-html:`<a id="forcedbins_filename" title="Permalink to this parameter" href="#forcedbins_filename">🔗︎</a>`, default = ``""``, type = string
+
+   - path to a ``.json`` file that specifies bin upper bounds for some or all features
+
+   - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
+
+   - see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
+
- ``refit_decay_rate`` :raw-html:`<a id="refit_decay_rate" title="Permalink to this parameter" href="#refit_decay_rate">🔗︎</a>`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``

   - decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
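For reference, a forced-bins file in the format described above looks like the following (feature indices and threshold values here are illustrative, not copied from the shipped example):

[
    {
        "feature": 0,
        "bin_upper_bound": [0.3, 0.35, 0.4]
    },
    {
        "feature": 1,
        "bin_upper_bound": [-0.2, -0.15, -0.1]
    }
]

Each listed feature gets the given thresholds as mandatory bin upper bounds; features not listed are binned as usual.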
examples/regression/train.conf
@@ -29,6 +29,9 @@ is_training_metric = true
# number of bins for feature bucket, 255 is a recommend setting, it can save memories, and also has good accuracy.
max_bin = 255

+# forced bin thresholds
+# forcedbins_filename = forced_bins.json
+
# training data
# if exsting weight file, should name to "regression.train.weight"
# alias: train_data, train
include/LightGBM/bin.h
@@ -146,9 +146,10 @@ class BinMapper {
  * \param bin_type Type of this bin
  * \param use_missing True to enable missing value handle
  * \param zero_as_missing True to use zero as missing value
+  * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
  */
  void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
-               bool use_missing, bool zero_as_missing);
+               bool use_missing, bool zero_as_missing, const std::vector<double>& forced_upper_bounds);

  /*!
  * \brief Use specific number of bin to calculate the size of this class
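With the new trailing parameter, a caller hands the per-feature forced thresholds straight to FindBin; an empty vector falls back to the previous unforced behavior (see the overload dispatch in src/io/bin.cpp below). A sketch of a call, assuming a constructed BinMapper and sampled values (the variable names are illustrative):

std::vector<double> forced = {0.3, 0.35, 0.4};  // hypothetical user-supplied thresholds
bin_mapper->FindBin(values, num_values, total_sample_cnt,
                    max_bin, min_data_in_bin, min_split_data,
                    BinType::NumericalBin, use_missing, zero_as_missing,
                    forced);  // pass an empty vector to keep the old binning path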
include/LightGBM/config.h
@@ -412,6 +412,11 @@ struct Config {
  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/binary_classification/forced_splits.json>`__ as an example
  std::string forcedsplits_filename = "";

+  // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
+  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
+  // desc = see `this file <https://github.com/microsoft/LightGBM/tree/master/examples/regression/forced_bins.json>`__ as an example
+  std::string forcedbins_filename = "";
+
  // check = >=0.0
  // check = <=1.0
  // desc = decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
include/LightGBM/dataset.h
@@ -290,6 +290,7 @@ class Dataset {

  void Construct(
    std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
+    const std::vector<std::vector<double>>& forced_bins,
    int** sample_non_zero_indices,
    const int* num_per_col,
    size_t total_sample_cnt,

@@ -630,6 +631,7 @@ class Dataset {
  bool is_finish_load_;
  int max_bin_;
  std::vector<int32_t> max_bin_by_feature_;
+  std::vector<std::vector<double>> forced_bin_bounds_;
  int bin_construct_sample_cnt_;
  int min_data_in_bin_;
  bool use_missing_;
include/LightGBM/dataset_loader.h
@@ -36,6 +36,9 @@ class DatasetLoader {
  /*! \brief Disable copy */
  DatasetLoader(const DatasetLoader&) = delete;

+  static std::vector<std::vector<double>> GetForcedBins(std::string forced_bins_path, int num_total_features,
+                                                        const std::unordered_set<int>& categorical_features);
+
 private:
  Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);

src/io/bin.cpp
@@ -71,7 +71,7 @@ namespace LightGBM {
    return true;
  }

  std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
    int num_distinct_values, int max_bin, size_t total_cnt, int min_data_in_bin) {
    std::vector<double> bin_upper_bound;
    CHECK(max_bin > 0);
@@ -149,8 +149,105 @@ namespace LightGBM {
    return bin_upper_bound;
  }

-  std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts,
-    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
+  std::vector<double> FindBinWithPredefinedBin(const double* distinct_values, const int* counts,
+    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, const std::vector<double>& forced_upper_bounds) {
+    std::vector<double> bin_upper_bound;
+
+    // get list of distinct values
+    int left_cnt_data = 0;
+    int cnt_zero = 0;
+    int right_cnt_data = 0;
+    for (int i = 0; i < num_distinct_values; ++i) {
+      if (distinct_values[i] <= -kZeroThreshold) {
+        left_cnt_data += counts[i];
+      } else if (distinct_values[i] > kZeroThreshold) {
+        right_cnt_data += counts[i];
+      } else {
+        cnt_zero += counts[i];
+      }
+    }
+
+    // get number of positive and negative distinct values
+    int left_cnt = -1;
+    for (int i = 0; i < num_distinct_values; ++i) {
+      if (distinct_values[i] > -kZeroThreshold) {
+        left_cnt = i;
+        break;
+      }
+    }
+    if (left_cnt < 0) {
+      left_cnt = num_distinct_values;
+    }
+    int right_start = -1;
+    for (int i = left_cnt; i < num_distinct_values; ++i) {
+      if (distinct_values[i] > kZeroThreshold) {
+        right_start = i;
+        break;
+      }
+    }
+
+    // include zero bounds and infinity bound
+    if (max_bin == 2) {
+      if (left_cnt == 0) {
+        bin_upper_bound.push_back(kZeroThreshold);
+      } else {
+        bin_upper_bound.push_back(-kZeroThreshold);
+      }
+    } else if (max_bin >= 3) {
+      if (left_cnt > 0) {
+        bin_upper_bound.push_back(-kZeroThreshold);
+      }
+      if (right_start >= 0) {
+        bin_upper_bound.push_back(kZeroThreshold);
+      }
+    }
+    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
+
+    // add forced bounds, excluding zeros since we have already added zero bounds
+    int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
+    int num_inserted = 0;
+    for (size_t i = 0; i < forced_upper_bounds.size(); ++i) {
+      if (num_inserted >= max_to_insert) {
+        break;
+      }
+      if (std::fabs(forced_upper_bounds[i]) > kZeroThreshold) {
+        bin_upper_bound.push_back(forced_upper_bounds[i]);
+        ++num_inserted;
+      }
+    }
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
+
+    // find remaining bounds
+    int free_bins = max_bin - static_cast<int>(bin_upper_bound.size());
+    std::vector<double> bounds_to_add;
+    int value_ind = 0;
+    for (size_t i = 0; i < bin_upper_bound.size(); ++i) {
+      int cnt_in_bin = 0;
+      int distinct_cnt_in_bin = 0;
+      int bin_start = value_ind;
+      while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) {
+        cnt_in_bin += counts[value_ind];
+        ++distinct_cnt_in_bin;
+        ++value_ind;
+      }
+      int bins_remaining = max_bin - static_cast<int>(bin_upper_bound.size()) - static_cast<int>(bounds_to_add.size());
+      int num_sub_bins = static_cast<int>(std::lround((static_cast<double>(cnt_in_bin) * free_bins / total_sample_cnt)));
+      num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1;
+      if (i == bin_upper_bound.size() - 1) {
+        num_sub_bins = bins_remaining + 1;
+      }
+      std::vector<double> new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin,
+        num_sub_bins, cnt_in_bin, min_data_in_bin);
+      bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
+    }
+    bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    CHECK(bin_upper_bound.size() <= static_cast<size_t>(max_bin));
+    return bin_upper_bound;
+  }
+
+  std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values,
+    int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
    std::vector<double> bin_upper_bound;
    int left_cnt_data = 0;
    int cnt_zero = 0;
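In outline, FindBinWithPredefinedBin above seeds the bound list with the zero bounds and the infinity bound, inserts the forced thresholds up to the max_bin budget, and then lets GreedyFindBin distribute the remaining bins between consecutive bounds in proportion to the data counts. A minimal self-contained sketch of the seeding step (the kZeroThreshold value mirrors LightGBM's constant; the rest is simplified for illustration and is not the library code):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

int main() {
  const int max_bin = 8;
  const double kZeroThreshold = 1e-35;            // as in LightGBM's meta.h
  std::vector<double> forced = {0.3, 0.35, 0.4};  // hypothetical forced thresholds

  // zero bounds plus the infinity bound are always present
  std::vector<double> bounds = {-kZeroThreshold, kZeroThreshold,
                                std::numeric_limits<double>::infinity()};

  // insert forced thresholds, skipping near-zero values already covered above
  int max_to_insert = max_bin - static_cast<int>(bounds.size());
  for (double b : forced) {
    if (max_to_insert <= 0) break;
    if (std::fabs(b) > kZeroThreshold) {
      bounds.push_back(b);
      --max_to_insert;
    }
  }
  std::stable_sort(bounds.begin(), bounds.end());

  // bins still unassigned; GreedyFindBin would fill these proportionally to counts
  int free_bins = max_bin - static_cast<int>(bounds.size());
  std::printf("fixed bounds: %d, free bins: %d\n",
              static_cast<int>(bounds.size()), free_bins);
  return 0;
}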
@@ -207,8 +304,19 @@ namespace LightGBM {
    return bin_upper_bound;
  }

+  std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values,
+    int max_bin, size_t total_sample_cnt, int min_data_in_bin, const std::vector<double>& forced_upper_bounds) {
+    if (forced_upper_bounds.empty()) {
+      return FindBinWithZeroAsOneBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+    } else {
+      return FindBinWithPredefinedBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin,
+        forced_upper_bounds);
+    }
+  }
+
  void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
-    int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) {
+    int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing,
+    const std::vector<double>& forced_upper_bounds) {
    int na_cnt = 0;
    int tmp_num_sample_values = 0;
    for (int i = 0; i < num_sample_values; ++i) {
@@ -276,14 +384,17 @@ namespace LightGBM {
    int num_distinct_values = static_cast<int>(distinct_values.size());
    if (bin_type_ == BinType::NumericalBin) {
      if (missing_type_ == MissingType::Zero) {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
+          min_data_in_bin, forced_upper_bounds);
        if (bin_upper_bound_.size() == 2) {
          missing_type_ = MissingType::None;
        }
      } else if (missing_type_ == MissingType::None) {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
+          min_data_in_bin, forced_upper_bounds);
      } else {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt,
+          min_data_in_bin, forced_upper_bounds);
        bin_upper_bound_.push_back(NaN);
      }
      num_bin_ = static_cast<int>(bin_upper_bound_.size());
src/io/config_auto.cpp
@@ -215,6 +215,7 @@ std::unordered_set<std::string> Config::parameter_set({
  "monotone_constraints",
  "feature_contri",
  "forcedsplits_filename",
+  "forcedbins_filename",
  "refit_decay_rate",
  "cegb_tradeoff",
  "cegb_penalty_split",

@@ -406,6 +407,8 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::str

  GetString(params, "forcedsplits_filename", &forcedsplits_filename);

+  GetString(params, "forcedbins_filename", &forcedbins_filename);
+
  GetDouble(params, "refit_decay_rate", &refit_decay_rate);
  CHECK(refit_decay_rate >=0.0);
  CHECK(refit_decay_rate <=1.0);

@@ -621,6 +624,7 @@ std::string Config::SaveMembersToString() const {
  str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast<int8_t, int>(monotone_constraints), ",") << "]\n";
  str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n";
  str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n";
+  str_buf << "[forcedbins_filename: " << forcedbins_filename << "]\n";
  str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n";
  str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n";
  str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n";
src/io/dataset.cpp
@@ -15,6 +15,7 @@
#include <sstream>
#include <unordered_map>

namespace LightGBM {

const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n";

@@ -214,6 +215,7 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_

void Dataset::Construct(
  std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
+  const std::vector<std::vector<double>>& forced_bins,
  int** sample_non_zero_indices,
  const int* num_per_col,
  size_t total_sample_cnt,

@@ -324,6 +326,7 @@ void Dataset::Construct(
    max_bin_by_feature_.resize(num_total_features_);
    max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end());
  }
+  forced_bin_bounds_ = forced_bins;
  max_bin_ = io_config.max_bin;
  min_data_in_bin_ = io_config.min_data_in_bin;
  bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;

@@ -356,6 +359,9 @@ void Dataset::ResetConfig(const char* parameters) {
  if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) {
    Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
  }
+  if (param.count("forcedbins_filename")) {
+    Log::Warning("Cannot change forced bins after constructed Dataset handle.");
+  }

  if (!io_config.monotone_constraints.empty()) {
    CHECK(static_cast<size_t>(num_total_features_) == io_config.monotone_constraints.size());

@@ -430,6 +436,7 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
  group_feature_cnt_ = dataset->group_feature_cnt_;
  monotone_types_ = dataset->monotone_types_;
  feature_penalty_ = dataset->feature_penalty_;
+  forced_bin_bounds_ = dataset->forced_bin_bounds_;
}

void Dataset::CreateValid(const Dataset* dataset) {

@@ -484,6 +491,7 @@ void Dataset::CreateValid(const Dataset* dataset) {
  }
  monotone_types_ = dataset->monotone_types_;
  feature_penalty_ = dataset->feature_penalty_;
+  forced_bin_bounds_ = dataset->forced_bin_bounds_;
}

void Dataset::ReSize(data_size_t num_data) {

@@ -657,6 +665,10 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
  for (int i = 0; i < num_total_features_; ++i) {
    size_of_header += feature_names_[i].size() + sizeof(int);
  }
+  // size of forced bins
+  for (int i = 0; i < num_total_features_; ++i) {
+    size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int);
+  }
  writer->Write(&size_of_header, sizeof(size_of_header));
  // write header
  writer->Write(&num_data_, sizeof(num_data_));

@@ -705,6 +717,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
    const char* c_str = feature_names_[i].c_str();
    writer->Write(c_str, sizeof(char) * str_len);
  }
+  // write forced bins
+  for (int i = 0; i < num_total_features_; ++i) {
+    int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
+    writer->Write(&num_bounds, sizeof(int));
+
+    for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
+      writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
+    }
+  }

  // get size of meta data
  size_t size_of_metadata = metadata_.SizesInByte();

@@ -754,6 +775,13 @@ void Dataset::DumpTextFile(const char* text_filename) {
  for (auto n : feature_names_) {
    fprintf(file, "%s, ", n.c_str());
  }
+  fprintf(file, "\nforced_bins: ");
+  for (int i = 0; i < num_total_features_; ++i) {
+    fprintf(file, "\nfeature %d: ", i);
+    for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
+      fprintf(file, "%lf, ", forced_bin_bounds_[i][j]);
+    }
+  }
  std::vector<std::unique_ptr<BinIterator>> iterators;
  iterators.reserve(num_features_);
  for (int j = 0; j < num_features_; ++j) {

@@ -1005,6 +1033,7 @@ void Dataset::addFeaturesFrom(Dataset* other) {
  PushVector(&feature_names_, other->feature_names_);
  PushVector(&feature2subfeature_, other->feature2subfeature_);
  PushVector(&group_feature_cnt_, other->group_feature_cnt_);
+  PushVector(&forced_bin_bounds_, other->forced_bin_bounds_);
  feature_groups_.reserve(other->feature_groups_.size());
  for (auto& fg : other->feature_groups_) {
    feature_groups_.emplace_back(new FeatureGroup(*fg));

@@ -1027,6 +1056,7 @@ void Dataset::addFeaturesFrom(Dataset* other) {

  PushClearIfEmpty(&monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0);
  PushClearIfEmpty(&feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0);
+  PushClearIfEmpty(&max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1);

  num_features_ += other->num_features_;
  num_total_features_ += other->num_total_features_;
src/io/dataset_loader.cpp
@@ -4,11 +4,16 @@
 */
#include <LightGBM/dataset_loader.h>

+#include <LightGBM/json11.hpp>
#include <LightGBM/network.h>
#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/log.h>
#include <LightGBM/utils/openmp_wrapper.h>

+#include <fstream>
+
+using namespace json11;
+
namespace LightGBM {

DatasetLoader::DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename)
@@ -458,6 +463,21 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
    }
    dataset->feature_names_.emplace_back(str_buf.str());
  }
+  // get forced_bin_bounds_
+  dataset->forced_bin_bounds_ = std::vector<std::vector<double>>(dataset->num_total_features_, std::vector<double>());
+  for (int i = 0; i < dataset->num_total_features_; ++i) {
+    int num_bounds = *(reinterpret_cast<const int*>(mem_ptr));
+    mem_ptr += sizeof(int);
+    dataset->forced_bin_bounds_[i] = std::vector<double>();
+    const double* tmp_ptr_forced_bounds = reinterpret_cast<const double*>(mem_ptr);
+
+    for (int j = 0; j < num_bounds; ++j) {
+      double bound = tmp_ptr_forced_bounds[j];
+      dataset->forced_bin_bounds_[i].push_back(bound);
+    }
+    mem_ptr += num_bounds * sizeof(double);
+  }

  // read size of meta data
  read_cnt = reader->Read(buffer.data(), sizeof(size_t));
@@ -549,6 +569,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* b
  return dataset.release();
}

Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
  int** sample_indices, int num_col, const int* num_per_col,
  size_t total_sample_size, data_size_t num_data) {

@@ -565,6 +586,11 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
    CHECK(static_cast<size_t>(num_col) == config_.max_bin_by_feature.size());
    CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
  }

+  // get forced split
+  std::string forced_bins_path = config_.forcedbins_filename;
+  std::vector<std::vector<double>> forced_bin_bounds = DatasetLoader::GetForcedBins(forced_bins_path, num_col, categorical_features_);
+
  const data_size_t filter_cnt = static_cast<data_size_t>(
    static_cast<double>(config_.min_data_in_leaf * total_sample_size) / num_data);
  if (Network::num_machines() == 1) {
@@ -589,12 +615,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
        if (config_.max_bin_by_feature.empty()) {
          bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size,
                                  config_.max_bin, config_.min_data_in_bin, filter_cnt,
-                                 bin_type, config_.use_missing, config_.zero_as_missing);
+                                 bin_type, config_.use_missing, config_.zero_as_missing,
+                                 forced_bin_bounds[i]);
        } else {
          bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size,
                                  config_.max_bin_by_feature[i], config_.min_data_in_bin,
                                  filter_cnt, bin_type, config_.use_missing,
-                                 config_.zero_as_missing);
+                                 config_.zero_as_missing, forced_bin_bounds[i]);
        }
        OMP_LOOP_EX_END();
      }

@@ -634,12 +661,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
        if (config_.max_bin_by_feature.empty()) {
          bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i],
                                  total_sample_size, config_.max_bin, config_.min_data_in_bin,
-                                 filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                 filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing,
+                                 forced_bin_bounds[i]);
        } else {
          bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i],
                                  total_sample_size, config_.max_bin_by_feature[start[rank] + i],
                                  config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing,
-                                 config_.zero_as_missing);
+                                 config_.zero_as_missing, forced_bin_bounds[i]);
        }
        OMP_LOOP_EX_END();
      }

@@ -692,7 +720,7 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
    }
  }
  auto dataset = std::unique_ptr<Dataset>(new Dataset(num_data));
-  dataset->Construct(&bin_mappers, sample_indices, num_per_col, total_sample_size, config_);
+  dataset->Construct(&bin_mappers, forced_bin_bounds, sample_indices, num_per_col, total_sample_size, config_);
  dataset->set_feature_names(feature_names_);
  return dataset.release();
}
@@ -876,6 +904,11 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
    CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
  }

+  // get forced split
+  std::string forced_bins_path = config_.forcedbins_filename;
+  std::vector<std::vector<double>> forced_bin_bounds = DatasetLoader::GetForcedBins(forced_bins_path, dataset->num_total_features_,
+                                                                                    categorical_features_);
+
  // check the range of label_idx, weight_idx and group_idx
  CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_);
  CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_);
@@ -913,12 +946,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
        if (config_.max_bin_by_feature.empty()) {
          bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
                                  sample_data.size(), config_.max_bin, config_.min_data_in_bin,
-                                 filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                 filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing,
+                                 forced_bin_bounds[i]);
        } else {
          bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
                                  sample_data.size(), config_.max_bin_by_feature[i],
                                  config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing,
-                                 config_.zero_as_missing);
+                                 config_.zero_as_missing, forced_bin_bounds[i]);
        }
        OMP_LOOP_EX_END();
      }

@@ -959,13 +993,14 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
          bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(),
                                  static_cast<int>(sample_values[start[rank] + i].size()),
                                  sample_data.size(), config_.max_bin, config_.min_data_in_bin,
-                                 filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                 filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing,
+                                 forced_bin_bounds[i]);
        } else {
          bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(),
                                  static_cast<int>(sample_values[start[rank] + i].size()),
                                  sample_data.size(), config_.max_bin_by_feature[i],
                                  config_.min_data_in_bin, filter_cnt, bin_type,
-                                 config_.use_missing, config_.zero_as_missing);
+                                 config_.use_missing, config_.zero_as_missing, forced_bin_bounds[i]);
        }
        OMP_LOOP_EX_END();
      }

@@ -1018,7 +1053,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
    }
  }
  sample_values.clear();
-  dataset->Construct(&bin_mappers, Common::Vector2Ptr<int>(&sample_indices).data(),
+  dataset->Construct(&bin_mappers, forced_bin_bounds, Common::Vector2Ptr<int>(&sample_indices).data(),
                     Common::VectorSize<int>(sample_indices).data(), sample_data.size(), config_);
}
@@ -1207,4 +1242,42 @@ std::string DatasetLoader::CheckCanLoadFromBin(const char* filename) {
  }
}

+
+std::vector<std::vector<double>> DatasetLoader::GetForcedBins(std::string forced_bins_path, int num_total_features,
+                                                              const std::unordered_set<int>& categorical_features) {
+  std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
+  if (forced_bins_path != "") {
+    std::ifstream forced_bins_stream(forced_bins_path.c_str());
+    if (forced_bins_stream.fail()) {
+      Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str());
+    } else {
+      std::stringstream buffer;
+      buffer << forced_bins_stream.rdbuf();
+      std::string err;
+      Json forced_bins_json = Json::parse(buffer.str(), err);
+      CHECK(forced_bins_json.is_array());
+      std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+      for (size_t i = 0; i < forced_bins_arr.size(); ++i) {
+        int feature_num = forced_bins_arr[i]["feature"].int_value();
+        CHECK(feature_num < num_total_features);
+        if (categorical_features.count(feature_num)) {
+          Log::Warning("Feature %d is categorical. Will ignore forced bins for this feature.", feature_num);
+        } else {
+          std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+          for (size_t j = 0; j < bounds_arr.size(); ++j) {
+            forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+          }
+        }
+      }
+      // remove duplicates
+      for (int i = 0; i < num_total_features; ++i) {
+        auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+        forced_bins[i].erase(new_end, forced_bins[i].end());
+      }
+    }
+  }
+  return forced_bins;
+}
+
} // namespace LightGBM
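A usage sketch for GetForcedBins (assuming LightGBM is built and linked; the path and feature count are illustrative, and forced_bins.json is the example file shown earlier with entries for features 0 and 1):

#include <LightGBM/dataset_loader.h>

#include <cstdio>
#include <unordered_set>

int main() {
  std::unordered_set<int> categorical;  // assume no categorical features
  // hypothetical path and feature count, for illustration only
  auto forced = LightGBM::DatasetLoader::GetForcedBins("forced_bins.json", 10, categorical);
  std::printf("feature 0 has %d forced bounds\n", static_cast<int>(forced[0].size()));
  return 0;
}

Features without an entry in the file come back as empty vectors, which is what the FindBinWithZeroAsOneBin overload uses to fall back to the ordinary binning path.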
tests/python_package_test/test_engine.py
@@ -1688,3 +1688,68 @@ class TestEngine(unittest.TestCase):
                         num_boost_round=25)
        ret2 = log_loss(y_test, gbm2.predict(X_test))
        self.assertNotEqual(ret, ret2)

+    def test_forced_bins(self):
+        x = np.zeros((100, 2))
+        x[:, 0] = np.arange(0, 1, 0.01)
+        x[:, 1] = -np.arange(0, 1, 0.01)
+        y = np.arange(0, 1, 0.01)
+        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                                           '../../examples/regression/forced_bins.json')
+        params = {'objective': 'regression_l1',
+                  'max_bin': 5,
+                  'forcedbins_filename': forcedbins_filename,
+                  'num_leaves': 2,
+                  'min_data_in_leaf': 1,
+                  'verbose': -1,
+                  'seed': 0}
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        new_x = np.zeros((3, x.shape[1]))
+        new_x[:, 0] = [0.31, 0.37, 0.41]
+        new_x[:, 1] = [0, 0, 0]
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 3)
+        new_x[:, 0] = [0, 0, 0]
+        new_x[:, 1] = [-0.9, -0.6, -0.3]
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 1)
+        params['forcedbins_filename'] = ''
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 3)
+        params['forcedbins_filename'] = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                                                     '../../examples/regression/forced_bins2.json')
+        params['max_bin'] = 11
+        lgb_x = lgb.Dataset(x[:, :1], label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        predicted = est.predict(x[1:, :1])
+        vals, counts = np.unique(predicted, return_counts=True)
+        self.assertGreaterEqual(min(counts), 9)
+        self.assertLessEqual(max(counts), 11)
+
+    def test_binning_same_sign(self):
+        # test that binning works properly for features with only positive or only negative values
+        x = np.zeros((99, 2))
+        x[:, 0] = np.arange(0.01, 1, 0.01)
+        x[:, 1] = -np.arange(0.01, 1, 0.01)
+        y = np.arange(0.01, 1, 0.01)
+        params = {'objective': 'regression_l1',
+                  'max_bin': 5,
+                  'num_leaves': 2,
+                  'min_data_in_leaf': 1,
+                  'verbose': -1,
+                  'seed': 0}
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        new_x = np.zeros((3, 2))
+        new_x[:, 0] = [-1, 0, 1]
+        predicted = est.predict(new_x)
+        self.assertAlmostEqual(predicted[0], predicted[1])
+        self.assertNotAlmostEqual(predicted[1], predicted[2])
+        new_x = np.zeros((3, 2))
+        new_x[:, 1] = [-1, 0, 1]
+        predicted = est.predict(new_x)
+        self.assertNotAlmostEqual(predicted[0], predicted[1])
+        self.assertAlmostEqual(predicted[1], predicted[2])