From cc7a1e27398d86e9befdf6eb08141359882f3d49 Mon Sep 17 00:00:00 2001
From: Belinda Trotta
Date: Sun, 29 Sep 2019 01:31:31 +1000
Subject: [PATCH] Predefined bin thresholds (#2325)

* Fix bug where small values of max_bin cause crash.

* Revert "Fix bug where small values of max_bin cause crash."

This reverts commit fe5c8e2547057c1fa5750bcddd359dd7708fab4b.

* Add functionality to force bin thresholds.

* Fix style issues.

* Use stable sort.

* Minor style and doc fixes.

* Change binning behavior to be same as PR #2342.

* Use different bin finding function for predefined bounds.

* Minor refactoring, overload FindBinWithZeroAsOneBin.

* Fix bug and add new test.

* Add warning when using categorical features with forced bins.

* Pass forced_upper_bounds by reference.

* Pass container types by const reference.

* Get categorical features using FeatureBinMapper.

* Fix bug for small max_bin.

* Move GetForcedBins to DatasetLoader.

* Find forced bins in dataset_loader.

* Minor fixes.
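Note for reviewers: below is a minimal sketch of the file that the new
``forcedbins_filename`` parameter reads. The schema (a JSON array of objects,
each with a ``feature`` key holding an integer feature index and a
``bin_upper_bound`` key holding an array of thresholds) is taken from the docs
change in this patch; the indices and threshold values themselves are only
illustrative, not the contents of the bundled examples/regression/forced_bins.json.

# write_forced_bins.py -- illustrative sketch only
import json

# One object per feature: "feature" is the zero-based feature index,
# "bin_upper_bound" lists the bin upper bounds to force for that feature.
forced_bins = [
    {"feature": 0, "bin_upper_bound": [0.3, 0.35, 0.4]},
    {"feature": 1, "bin_upper_bound": [-0.2, -0.15, -0.1]},
]

with open("forced_bins.json", "w") as f:
    json.dump(forced_bins, f, indent=2)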
---
 docs/Parameters.rst                      |   8 ++
 examples/regression/train.conf           |   3 +
 include/LightGBM/bin.h                   |   3 +-
 include/LightGBM/config.h                |   5 +
 include/LightGBM/dataset.h               |   2 +
 include/LightGBM/dataset_loader.h        |   3 +
 src/io/bin.cpp                           | 125 +++++++++++++++++++++--
 src/io/config_auto.cpp                   |   4 +
 src/io/dataset.cpp                       |  30 ++++++
 src/io/dataset_loader.cpp                |  93 +++++++++++++++--
 tests/python_package_test/test_engine.py |  65 ++++++++++++
 11 files changed, 323 insertions(+), 18 deletions(-)

diff --git a/docs/Parameters.rst b/docs/Parameters.rst
index ad6011adb..a996b0132 100644
--- a/docs/Parameters.rst
+++ b/docs/Parameters.rst
@@ -414,6 +414,14 @@ Learning Control Parameters
 
    - see `this file `__ as an example
 
+- ``forcedbins_filename`` :raw-html:`🔗︎`, default = ``""``, type = string
+
+   - path to a ``.json`` file that specifies bin upper bounds for some or all features
+
+   - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
+
+   - see `this file `__ as an example
+
 - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0``
 
    - decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
diff --git a/examples/regression/train.conf b/examples/regression/train.conf
index 11396c23e..4c73169dc 100644
--- a/examples/regression/train.conf
+++ b/examples/regression/train.conf
@@ -29,6 +29,9 @@ is_training_metric = true
 # number of bins for feature bucket, 255 is a recommend setting, it can save memories, and also has good accuracy.
 max_bin = 255
 
+# forced bin thresholds
+# forcedbins_filename = forced_bins.json
+
 # training data
 # if exsting weight file, should name to "regression.train.weight"
 # alias: train_data, train
diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h
index 46baee58f..7ea86acdd 100644
--- a/include/LightGBM/bin.h
+++ b/include/LightGBM/bin.h
@@ -146,9 +146,10 @@ class BinMapper {
   * \param bin_type Type of this bin
   * \param use_missing True to enable missing value handle
   * \param zero_as_missing True to use zero as missing value
+  * \param forced_upper_bounds Vector of split points that must be used (if this has size less than max_bin, remaining splits are found by the algorithm)
   */
   void FindBin(double* values, int num_values, size_t total_sample_cnt, int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type,
-               bool use_missing, bool zero_as_missing);
+               bool use_missing, bool zero_as_missing, const std::vector<double>& forced_upper_bounds);
 
   /*!
   * \brief Use specific number of bin to calculate the size of this class
diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index f3943b76f..974735532 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -412,6 +412,11 @@ struct Config {
   // desc = see `this file `__ as an example
   std::string forcedsplits_filename = "";
 
+  // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features
+  // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning)
+  // desc = see `this file `__ as an example
+  std::string forcedbins_filename = "";
+
   // check = >=0.0
   // check = <=1.0
   // desc = decay rate of ``refit`` task, will use ``leaf_output = refit_decay_rate * old_leaf_output + (1.0 - refit_decay_rate) * new_leaf_output`` to refit trees
diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h
index c7147a32f..3d0ae9902 100644
--- a/include/LightGBM/dataset.h
+++ b/include/LightGBM/dataset.h
@@ -290,6 +290,7 @@ class Dataset {
 
   void Construct(
     std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
+    const std::vector<std::vector<double>>& forced_bins,
     int** sample_non_zero_indices,
     const int* num_per_col,
     size_t total_sample_cnt,
@@ -630,6 +631,7 @@ class Dataset {
   bool is_finish_load_;
   int max_bin_;
   std::vector<int32_t> max_bin_by_feature_;
+  std::vector<std::vector<double>> forced_bin_bounds_;
   int bin_construct_sample_cnt_;
   int min_data_in_bin_;
   bool use_missing_;
diff --git a/include/LightGBM/dataset_loader.h b/include/LightGBM/dataset_loader.h
index ed4c2af93..c5555ef38 100644
--- a/include/LightGBM/dataset_loader.h
+++ b/include/LightGBM/dataset_loader.h
@@ -36,6 +36,9 @@ class DatasetLoader {
   /*! \brief Disable copy */
   DatasetLoader(const DatasetLoader&) = delete;
 
+  static std::vector<std::vector<double>> GetForcedBins(std::string forced_bins_path, int num_total_features,
+                                                        const std::unordered_set<int>& categorical_features);
+
  private:
   Dataset* LoadFromBinFile(const char* data_filename, const char* bin_filename, int rank, int num_machines, int* num_global_data, std::vector<data_size_t>* used_data_indices);
 
diff --git a/src/io/bin.cpp b/src/io/bin.cpp
index 14e7c0fe9..c1cd35ea3 100644
--- a/src/io/bin.cpp
+++ b/src/io/bin.cpp
@@ -71,7 +71,7 @@ namespace LightGBM {
     return true;
   }
 
-  std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
+  std::vector<double> GreedyFindBin(const double* distinct_values, const int* counts,
     int num_distinct_values, int max_bin, size_t total_cnt, int min_data_in_bin) {
     std::vector<double> bin_upper_bound;
     CHECK(max_bin > 0);
@@ -149,8 +149,105 @@ namespace LightGBM {
     return bin_upper_bound;
   }
 
-  std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts,
-    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
+  std::vector<double> FindBinWithPredefinedBin(const double* distinct_values, const int* counts,
+    int num_distinct_values, int max_bin, size_t total_sample_cnt, int min_data_in_bin, const std::vector<double>& forced_upper_bounds) {
+    std::vector<double> bin_upper_bound;
+
+    // get list of distinct values
+    int left_cnt_data = 0;
+    int cnt_zero = 0;
+    int right_cnt_data = 0;
+    for (int i = 0; i < num_distinct_values; ++i) {
+      if (distinct_values[i] <= -kZeroThreshold) {
+        left_cnt_data += counts[i];
+      } else if (distinct_values[i] > kZeroThreshold) {
+        right_cnt_data += counts[i];
+      } else {
+        cnt_zero += counts[i];
+      }
+    }
+
+    // get number of positive and negative distinct values
+    int left_cnt = -1;
+    for (int i = 0; i < num_distinct_values; ++i) {
+      if (distinct_values[i] > -kZeroThreshold) {
+        left_cnt = i;
+        break;
+      }
+    }
+    if (left_cnt < 0) {
+      left_cnt = num_distinct_values;
+    }
+    int right_start = -1;
+    for (int i = left_cnt; i < num_distinct_values; ++i) {
+      if (distinct_values[i] > kZeroThreshold) {
+        right_start = i;
+        break;
+      }
+    }
+
+    // include zero bounds and infinity bound
+    if (max_bin == 2) {
+      if (left_cnt == 0) {
+        bin_upper_bound.push_back(kZeroThreshold);
+      } else {
+        bin_upper_bound.push_back(-kZeroThreshold);
+      }
+    } else if (max_bin >= 3) {
+      if (left_cnt > 0) {
+        bin_upper_bound.push_back(-kZeroThreshold);
+      }
+      if (right_start >= 0) {
+        bin_upper_bound.push_back(kZeroThreshold);
+      }
+    }
+    bin_upper_bound.push_back(std::numeric_limits<double>::infinity());
+
+    // add forced bounds, excluding zeros since we have already added zero bounds
+    int max_to_insert = max_bin - static_cast<int>(bin_upper_bound.size());
+    int num_inserted = 0;
+    for (size_t i = 0; i < forced_upper_bounds.size(); ++i) {
+      if (num_inserted >= max_to_insert) {
+        break;
+      }
+      if (std::fabs(forced_upper_bounds[i]) > kZeroThreshold) {
+        bin_upper_bound.push_back(forced_upper_bounds[i]);
+        ++num_inserted;
+      }
+    }
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
+
+    // find remaining bounds
+    int free_bins = max_bin - static_cast<int>(bin_upper_bound.size());
+    std::vector<double> bounds_to_add;
+    int value_ind = 0;
+    for (size_t i = 0; i < bin_upper_bound.size(); ++i) {
+      int cnt_in_bin = 0;
+      int distinct_cnt_in_bin = 0;
+      int bin_start = value_ind;
+      while ((value_ind < num_distinct_values) && (distinct_values[value_ind] < bin_upper_bound[i])) {
+        cnt_in_bin += counts[value_ind];
+        ++distinct_cnt_in_bin;
+        ++value_ind;
+      }
+      int bins_remaining = max_bin - static_cast<int>(bin_upper_bound.size()) - static_cast<int>(bounds_to_add.size());
+      int num_sub_bins = static_cast<int>(std::lround((static_cast<double>(cnt_in_bin) * free_bins / total_sample_cnt)));
+      num_sub_bins = std::min(num_sub_bins, bins_remaining) + 1;
+      if (i == bin_upper_bound.size() - 1) {
+        num_sub_bins = bins_remaining + 1;
+      }
+      std::vector<double> new_upper_bounds = GreedyFindBin(distinct_values + bin_start, counts + bin_start, distinct_cnt_in_bin,
+        num_sub_bins, cnt_in_bin, min_data_in_bin);
+      bounds_to_add.insert(bounds_to_add.end(), new_upper_bounds.begin(), new_upper_bounds.end() - 1);  // last bound is infinity
+    }
+    bin_upper_bound.insert(bin_upper_bound.end(), bounds_to_add.begin(), bounds_to_add.end());
+    std::stable_sort(bin_upper_bound.begin(), bin_upper_bound.end());
+    CHECK(bin_upper_bound.size() <= static_cast<size_t>(max_bin));
+    return bin_upper_bound;
+  }
+
+  std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values,
+    int max_bin, size_t total_sample_cnt, int min_data_in_bin) {
     std::vector<double> bin_upper_bound;
     int left_cnt_data = 0;
     int cnt_zero = 0;
@@ -207,8 +304,19 @@ namespace LightGBM {
     return bin_upper_bound;
   }
 
+  std::vector<double> FindBinWithZeroAsOneBin(const double* distinct_values, const int* counts, int num_distinct_values,
+    int max_bin, size_t total_sample_cnt, int min_data_in_bin, const std::vector<double>& forced_upper_bounds) {
+    if (forced_upper_bounds.empty()) {
+      return FindBinWithZeroAsOneBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+    } else {
+      return FindBinWithPredefinedBin(distinct_values, counts, num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin,
+                                      forced_upper_bounds);
+    }
+  }
+
   void BinMapper::FindBin(double* values, int num_sample_values, size_t total_sample_cnt,
-    int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing) {
+    int max_bin, int min_data_in_bin, int min_split_data, BinType bin_type, bool use_missing, bool zero_as_missing,
+    const std::vector<double>& forced_upper_bounds) {
     int na_cnt = 0;
     int tmp_num_sample_values = 0;
     for (int i = 0; i < num_sample_values; ++i) {
@@ -276,14 +384,17 @@ namespace LightGBM {
     int num_distinct_values = static_cast<int>(distinct_values.size());
     if (bin_type_ == BinType::NumericalBin) {
       if (missing_type_ == MissingType::Zero) {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
+                                                   min_data_in_bin, forced_upper_bounds);
         if (bin_upper_bound_.size() == 2) {
           missing_type_ = MissingType::None;
         }
       } else if (missing_type_ == MissingType::None) {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin, total_sample_cnt,
+                                                   min_data_in_bin, forced_upper_bounds);
       } else {
-        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt, min_data_in_bin);
+        bin_upper_bound_ = FindBinWithZeroAsOneBin(distinct_values.data(), counts.data(), num_distinct_values, max_bin - 1, total_sample_cnt - na_cnt,
+                                                   min_data_in_bin, forced_upper_bounds);
         bin_upper_bound_.push_back(NaN);
       }
       num_bin_ = static_cast<int>(bin_upper_bound_.size());
diff --git a/src/io/config_auto.cpp b/src/io/config_auto.cpp
index 8498ef239..542084515 100644
--- a/src/io/config_auto.cpp
+++ b/src/io/config_auto.cpp
@@ -215,6 +215,7 @@ std::unordered_set<std::string> Config::parameter_set({
   "monotone_constraints",
   "feature_contri",
   "forcedsplits_filename",
+  "forcedbins_filename",
   "refit_decay_rate",
   "cegb_tradeoff",
   "cegb_penalty_split",
@@ -406,6 +407,8 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::string>& params) {
   GetString(params, "forcedsplits_filename", &forcedsplits_filename);
 
+  GetString(params, "forcedbins_filename", &forcedbins_filename);
+
   GetDouble(params, "refit_decay_rate", &refit_decay_rate);
   CHECK(refit_decay_rate >=0.0);
   CHECK(refit_decay_rate <=1.0);
@@ -621,6 +624,7 @@ std::string Config::SaveMembersToString() const {
   str_buf << "[monotone_constraints: " << Common::Join(Common::ArrayCast<int8_t, int>(monotone_constraints), ",") << "]\n";
   str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n";
   str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n";
+  str_buf << "[forcedbins_filename: " << forcedbins_filename << "]\n";
   str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n";
   str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n";
   str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n";
diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp
index 3967fabb9..54c8fcc22 100644
--- a/src/io/dataset.cpp
+++ b/src/io/dataset.cpp
@@ -15,6 +15,7 @@
 #include <sstream>
 #include <unordered_map>
 
+
 namespace LightGBM {
 
 const char* Dataset::binary_file_token = "______LightGBM_Binary_File_Token______\n";
@@ -214,6 +215,7 @@ std::vector<std::vector<int>> FastFeatureBundling(const std::vector<std::unique_ptr<BinMapper>>& bin_mappers,
 
 void Dataset::Construct(
   std::vector<std::unique_ptr<BinMapper>>* bin_mappers,
+  const std::vector<std::vector<double>>& forced_bins,
   int** sample_non_zero_indices,
   const int* num_per_col,
   size_t total_sample_cnt,
@@ -324,6 +326,7 @@ void Dataset::Construct(
     max_bin_by_feature_.resize(num_total_features_);
     max_bin_by_feature_.assign(io_config.max_bin_by_feature.begin(), io_config.max_bin_by_feature.end());
   }
+  forced_bin_bounds_ = forced_bins;
   max_bin_ = io_config.max_bin;
   min_data_in_bin_ = io_config.min_data_in_bin;
   bin_construct_sample_cnt_ = io_config.bin_construct_sample_cnt;
@@ -356,6 +359,9 @@ void Dataset::ResetConfig(const char* parameters) {
   if (param.count("sparse_threshold") && io_config.sparse_threshold != sparse_threshold_) {
     Log::Warning("Cannot change sparse_threshold after constructed Dataset handle.");
   }
+  if (param.count("forcedbins_filename")) {
+    Log::Warning("Cannot change forced bins after constructed Dataset handle.");
+  }
 
   if (!io_config.monotone_constraints.empty()) {
     CHECK(static_cast<size_t>(num_total_features_) == io_config.monotone_constraints.size());
@@ -430,6 +436,7 @@ void Dataset::CopyFeatureMapperFrom(const Dataset* dataset) {
   group_feature_cnt_ = dataset->group_feature_cnt_;
   monotone_types_ = dataset->monotone_types_;
   feature_penalty_ = dataset->feature_penalty_;
+  forced_bin_bounds_ = dataset->forced_bin_bounds_;
 }
 
 void Dataset::CreateValid(const Dataset* dataset) {
@@ -484,6 +491,7 @@ void Dataset::CreateValid(const Dataset* dataset) {
   }
   monotone_types_ = dataset->monotone_types_;
   feature_penalty_ = dataset->feature_penalty_;
+  forced_bin_bounds_ = dataset->forced_bin_bounds_;
 }
 
 void Dataset::ReSize(data_size_t num_data) {
@@ -657,6 +665,10 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
     for (int i = 0; i < num_total_features_; ++i) {
       size_of_header += feature_names_[i].size() + sizeof(int);
     }
+    // size of forced bins
+    for (int i = 0; i < num_total_features_; ++i) {
+      size_of_header += forced_bin_bounds_[i].size() * sizeof(double) + sizeof(int);
+    }
     writer->Write(&size_of_header, sizeof(size_of_header));
     // write header
     writer->Write(&num_data_, sizeof(num_data_));
@@ -705,6 +717,15 @@ void Dataset::SaveBinaryFile(const char* bin_filename) {
       const char* c_str = feature_names_[i].c_str();
       writer->Write(c_str, sizeof(char) * str_len);
     }
+    // write forced bins
+    for (int i = 0; i < num_total_features_; ++i) {
+      int num_bounds = static_cast<int>(forced_bin_bounds_[i].size());
+      writer->Write(&num_bounds, sizeof(int));
+
+      for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
+        writer->Write(&forced_bin_bounds_[i][j], sizeof(double));
+      }
+    }
 
     // get size of meta data
     size_t size_of_metadata = metadata_.SizesInByte();
@@ -754,6 +775,13 @@ void Dataset::DumpTextFile(const char* text_filename) {
   for (auto n : feature_names_) {
     fprintf(file, "%s, ", n.c_str());
   }
+  fprintf(file, "\nforced_bins: ");
+  for (int i = 0; i < num_total_features_; ++i) {
+    fprintf(file, "\nfeature %d: ", i);
+    for (size_t j = 0; j < forced_bin_bounds_[i].size(); ++j) {
+      fprintf(file, "%lf, ", forced_bin_bounds_[i][j]);
+    }
+  }
   std::vector<std::unique_ptr<BinIterator>> iterators;
   iterators.reserve(num_features_);
   for (int j = 0; j < num_features_; ++j) {
@@ -1005,6 +1033,7 @@ void Dataset::addFeaturesFrom(Dataset* other) {
   PushVector(&feature_names_, other->feature_names_);
   PushVector(&feature2subfeature_, other->feature2subfeature_);
   PushVector(&group_feature_cnt_, other->group_feature_cnt_);
+  PushVector(&forced_bin_bounds_, other->forced_bin_bounds_);
   feature_groups_.reserve(other->feature_groups_.size());
   for (auto& fg : other->feature_groups_) {
     feature_groups_.emplace_back(new FeatureGroup(*fg));
@@ -1027,6 +1056,7 @@ void Dataset::addFeaturesFrom(Dataset* other) {
   PushClearIfEmpty(&monotone_types_, num_total_features_, other->monotone_types_, other->num_total_features_, (int8_t)0);
   PushClearIfEmpty(&feature_penalty_, num_total_features_, other->feature_penalty_, other->num_total_features_, 1.0);
+  PushClearIfEmpty(&max_bin_by_feature_, num_total_features_, other->max_bin_by_feature_, other->num_total_features_, -1);
 
   num_features_ += other->num_features_;
   num_total_features_ += other->num_total_features_;
diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp
index 265adc27f..da0f6b9df 100644
--- a/src/io/dataset_loader.cpp
+++ b/src/io/dataset_loader.cpp
@@ -4,11 +4,16 @@
  */
 #include <LightGBM/dataset_loader.h>
 
+#include <LightGBM/json11.hpp>
 #include <LightGBM/network.h>
 #include <LightGBM/utils/array_args.h>
 #include <LightGBM/utils/log.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 
+#include <fstream>
+
+using namespace json11;
+
 namespace LightGBM {
 
 DatasetLoader::DatasetLoader(const Config& io_config, const PredictFunction& predict_fun, int num_class, const char* filename)
@@ -458,6 +463,21 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* bin_filename,
     }
     dataset->feature_names_.emplace_back(str_buf.str());
   }
+  // get forced_bin_bounds_
+  dataset->forced_bin_bounds_ = std::vector<std::vector<double>>(dataset->num_total_features_, std::vector<double>());
+  for (int i = 0; i < dataset->num_total_features_; ++i) {
+    int num_bounds = *(reinterpret_cast<const int*>(mem_ptr));
+    mem_ptr += sizeof(int);
+    dataset->forced_bin_bounds_[i] = std::vector<double>();
+    const double* tmp_ptr_forced_bounds = reinterpret_cast<const double*>(mem_ptr);
+
+    for (int j = 0; j < num_bounds; ++j) {
+      double bound = tmp_ptr_forced_bounds[j];
+      dataset->forced_bin_bounds_[i].push_back(bound);
+    }
+    mem_ptr += num_bounds * sizeof(double);
+
+  }
 
   // read size of meta data
   read_cnt = reader->Read(buffer.data(), sizeof(size_t));
@@ -549,6 +569,7 @@ Dataset* DatasetLoader::LoadFromBinFile(const char* data_filename, const char* bin_filename,
   return dataset.release();
 }
 
+
 Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
                                                int** sample_indices, int num_col, const int* num_per_col,
                                                size_t total_sample_size, data_size_t num_data) {
@@ -565,6 +586,11 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
     CHECK(static_cast<size_t>(num_col) == config_.max_bin_by_feature.size());
     CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
   }
+
+  // get forced split
+  std::string forced_bins_path = config_.forcedbins_filename;
+  std::vector<std::vector<double>> forced_bin_bounds = DatasetLoader::GetForcedBins(forced_bins_path, num_col, categorical_features_);
+
   const data_size_t filter_cnt = static_cast<data_size_t>(
     static_cast<double>(config_.min_data_in_leaf * total_sample_size) / num_data);
   if (Network::num_machines() == 1) {
@@ -589,12 +615,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size,
                                 config_.max_bin, config_.min_data_in_bin, filter_cnt,
-                                bin_type, config_.use_missing, config_.zero_as_missing);
+                                bin_type, config_.use_missing, config_.zero_as_missing,
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[i], num_per_col[i], total_sample_size,
                                 config_.max_bin_by_feature[i], config_.min_data_in_bin,
                                 filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -634,12 +661,13 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i],
                                 total_sample_size, config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing,
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i], num_per_col[start[rank] + i],
                                 total_sample_size, config_.max_bin_by_feature[start[rank] + i],
                                 config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -692,7 +720,7 @@ Dataset* DatasetLoader::CostructFromSampleData(double** sample_values,
     }
   }
   auto dataset = std::unique_ptr<Dataset>(new Dataset(num_data));
-  dataset->Construct(&bin_mappers, sample_indices, num_per_col, total_sample_size, config_);
+  dataset->Construct(&bin_mappers, forced_bin_bounds, sample_indices, num_per_col, total_sample_size, config_);
   dataset->set_feature_names(feature_names_);
   return dataset.release();
 }
@@ -876,6 +904,11 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
     CHECK(*(std::min_element(config_.max_bin_by_feature.begin(), config_.max_bin_by_feature.end())) > 1);
   }
 
+  // get forced split
+  std::string forced_bins_path = config_.forcedbins_filename;
+  std::vector<std::vector<double>> forced_bin_bounds = DatasetLoader::GetForcedBins(forced_bins_path, dataset->num_total_features_,
+                                                                                    categorical_features_);
+
   // check the range of label_idx, weight_idx and group_idx
   CHECK(label_idx_ >= 0 && label_idx_ <= dataset->num_total_features_);
   CHECK(weight_idx_ < 0 || weight_idx_ < dataset->num_total_features_);
@@ -913,12 +946,13 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
       if (config_.max_bin_by_feature.empty()) {
         bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
                                 sample_data.size(), config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing,
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[i].data(), static_cast<int>(sample_values[i].size()),
                                 sample_data.size(), config_.max_bin_by_feature[i],
                                 config_.min_data_in_bin, filter_cnt, bin_type, config_.use_missing,
-                                config_.zero_as_missing);
+                                config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -959,13 +993,14 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
         bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(),
                                 static_cast<int>(sample_values[start[rank] + i].size()),
                                 sample_data.size(), config_.max_bin, config_.min_data_in_bin,
-                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing);
+                                filter_cnt, bin_type, config_.use_missing, config_.zero_as_missing,
+                                forced_bin_bounds[i]);
       } else {
         bin_mappers[i]->FindBin(sample_values[start[rank] + i].data(),
                                 static_cast<int>(sample_values[start[rank] + i].size()),
                                 sample_data.size(), config_.max_bin_by_feature[i],
                                 config_.min_data_in_bin, filter_cnt, bin_type,
-                                config_.use_missing, config_.zero_as_missing);
+                                config_.use_missing, config_.zero_as_missing, forced_bin_bounds[i]);
       }
       OMP_LOOP_EX_END();
     }
@@ -1018,7 +1053,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines,
     }
   }
   sample_values.clear();
-  dataset->Construct(&bin_mappers, Common::Vector2Ptr<int>(&sample_indices).data(),
+  dataset->Construct(&bin_mappers, forced_bin_bounds, Common::Vector2Ptr<int>(&sample_indices).data(),
                      Common::VectorSize<int>(sample_indices).data(), sample_data.size(), config_);
 }
 
@@ -1207,4 +1242,42 @@ std::string DatasetLoader::CheckCanLoadFromBin(const char* filename) {
   }
 }
 
+
+
+std::vector<std::vector<double>> DatasetLoader::GetForcedBins(std::string forced_bins_path, int num_total_features,
+                                                              const std::unordered_set<int>& categorical_features) {
+  std::vector<std::vector<double>> forced_bins(num_total_features, std::vector<double>());
+  if (forced_bins_path != "") {
+    std::ifstream forced_bins_stream(forced_bins_path.c_str());
+    if (forced_bins_stream.fail()) {
+      Log::Warning("Could not open %s. Will ignore.", forced_bins_path.c_str());
+    } else {
+      std::stringstream buffer;
+      buffer << forced_bins_stream.rdbuf();
+      std::string err;
+      Json forced_bins_json = Json::parse(buffer.str(), err);
+      CHECK(forced_bins_json.is_array());
+      std::vector<Json> forced_bins_arr = forced_bins_json.array_items();
+      for (size_t i = 0; i < forced_bins_arr.size(); ++i) {
+        int feature_num = forced_bins_arr[i]["feature"].int_value();
+        CHECK(feature_num < num_total_features);
+        if (categorical_features.count(feature_num)) {
+          Log::Warning("Feature %d is categorical. Will ignore forced bins for this feature.", feature_num);
+        } else {
+          std::vector<Json> bounds_arr = forced_bins_arr[i]["bin_upper_bound"].array_items();
+          for (size_t j = 0; j < bounds_arr.size(); ++j) {
+            forced_bins[feature_num].push_back(bounds_arr[j].number_value());
+          }
+        }
+      }
+      // remove duplicates
+      for (int i = 0; i < num_total_features; ++i) {
+        auto new_end = std::unique(forced_bins[i].begin(), forced_bins[i].end());
+        forced_bins[i].erase(new_end, forced_bins[i].end());
+      }
+    }
+  }
+  return forced_bins;
+}
+
 }  // namespace LightGBM
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
index 724124927..51c99494e 100644
--- a/tests/python_package_test/test_engine.py
+++ b/tests/python_package_test/test_engine.py
@@ -1688,3 +1688,68 @@ class TestEngine(unittest.TestCase):
                              num_boost_round=25)
         ret2 = log_loss(y_test, gbm2.predict(X_test))
         self.assertNotEqual(ret, ret2)
+
+    def test_forced_bins(self):
+        x = np.zeros((100, 2))
+        x[:, 0] = np.arange(0, 1, 0.01)
+        x[:, 1] = -np.arange(0, 1, 0.01)
+        y = np.arange(0, 1, 0.01)
+        forcedbins_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                                           '../../examples/regression/forced_bins.json')
+        params = {'objective': 'regression_l1',
+                  'max_bin': 5,
+                  'forcedbins_filename': forcedbins_filename,
+                  'num_leaves': 2,
+                  'min_data_in_leaf': 1,
+                  'verbose': -1,
+                  'seed': 0}
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        new_x = np.zeros((3, x.shape[1]))
+        new_x[:, 0] = [0.31, 0.37, 0.41]
+        new_x[:, 1] = [0, 0, 0]
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 3)
+        new_x[:, 0] = [0, 0, 0]
+        new_x[:, 1] = [-0.9, -0.6, -0.3]
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 1)
+        params['forcedbins_filename'] = ''
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        predicted = est.predict(new_x)
+        self.assertEqual(len(np.unique(predicted)), 3)
+        params['forcedbins_filename'] = os.path.join(os.path.dirname(os.path.realpath(__file__)),
+                                                     '../../examples/regression/forced_bins2.json')
+        params['max_bin'] = 11
+        lgb_x = lgb.Dataset(x[:, :1], label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        predicted = est.predict(x[1:, :1])
+        vals, counts = np.unique(predicted, return_counts=True)
+        self.assertGreaterEqual(min(counts), 9)
+        self.assertLessEqual(max(counts), 11)
+
+    def test_binning_same_sign(self):
+        # test that binning works properly for features with only positive or only negative values
+        x = np.zeros((99, 2))
+        x[:, 0] = np.arange(0.01, 1, 0.01)
+        x[:, 1] = -np.arange(0.01, 1, 0.01)
+        y = np.arange(0.01, 1, 0.01)
+        params = {'objective': 'regression_l1',
+                  'max_bin': 5,
+                  'num_leaves': 2,
+                  'min_data_in_leaf': 1,
+                  'verbose': -1,
+                  'seed': 0}
+        lgb_x = lgb.Dataset(x, label=y)
+        est = lgb.train(params, lgb_x, num_boost_round=100)
+        new_x = np.zeros((3, 2))
+        new_x[:, 0] = [-1, 0, 1]
+        predicted = est.predict(new_x)
+        self.assertAlmostEqual(predicted[0], predicted[1])
+        self.assertNotAlmostEqual(predicted[1], predicted[2])
+        new_x = np.zeros((3, 2))
+        new_x[:, 1] = [-1, 0, 1]
+        predicted = est.predict(new_x)
+        self.assertNotAlmostEqual(predicted[0], predicted[1])
+        self.assertAlmostEqual(predicted[1], predicted[2])
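
Usage sketch (not part of the patch): the snippet below condenses the new
test_forced_bins above into a standalone example. It assumes the
forced_bins.json file sketched earlier in this message sits in the working
directory; the data and thresholds are illustrative.

import lightgbm as lgb
import numpy as np

# Feature 0 takes positive values, feature 1 negative ones.
x = np.zeros((100, 2))
x[:, 0] = np.arange(0, 1, 0.01)
x[:, 1] = -np.arange(0, 1, 0.01)
y = np.arange(0, 1, 0.01)

params = {'objective': 'regression_l1',
          'max_bin': 5,                               # deliberately coarse binning
          'forcedbins_filename': 'forced_bins.json',  # forces bounds for features 0 and 1
          'num_leaves': 2,
          'min_data_in_leaf': 1,
          'verbose': -1}
est = lgb.train(params, lgb.Dataset(x, label=y), num_boost_round=100)

# Points separated by the forced bounds on feature 0 (e.g. 0.31, 0.37, 0.41
# straddling the illustrative bounds 0.3/0.35/0.4) can land in different bins,
# and hence receive different predictions, even though max_bin is only 5.
print(est.predict(np.array([[0.31, 0.0], [0.37, 0.0], [0.41, 0.0]])))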