Mirror of https://github.com/microsoft/LightGBM.git
Add Cost Effective Gradient Boosting (#2014)
* Add configuration parameters for CEGB.
* Add skeleton CEGB tree learner. Like the original CEGB version, this inherits from SerialTreeLearner. Currently, it changes nothing from the original.
* Track features used in CEGB tree learner.
* Pull CEGB tradeoff and coupled feature penalty from config.
* Implement finding best splits for CEGB. This is heavily based on the serial version, but adds use of the coupled penalties.
* Set proper defaults for CEGB parameters.
* Ensure sanity checks don't switch off CEGB.
* Implement per-data-point feature penalties in CEGB.
* Implement split penalty and remove unused parameters.
* Merge changes from CEGB tree learner into serial tree learner.
* Represent features_used_in_data by a bitset, to reduce the memory overhead of CEGB, and add sanity checks for the lengths of the penalty vectors.
* Fix bug where CEGB would incorrectly penalise a previously used feature. The tree learner did not update the gains of previously computed leaf splits when splitting a leaf elsewhere in the tree, which caused it to prefer new features because splits on previously used features were incorrectly penalised.
* Document CEGB parameters and add them to the appropriate section.
* Remove leftover reference to cegb tree learner.
* Remove outdated diff.
* Fix warnings.
* Fix minor issues identified by @StrikerRUS.
* Add docs section on CEGB, including citation.
* Fix link.
* Fix CI failure.
* Add some unit tests.
* Fix pylint issues.
* Fix remaining pylint issue.
Parent: fe115bbb72
Commit: 76102284d1
@@ -41,6 +41,19 @@ LambdaRank

- Use ``max_position`` to set the NDCG optimization position.

Cost Efficient Gradient Boosting
--------------------------------

`Cost Efficient Gradient Boosting <https://papers.nips.cc/paper/6753-cost-efficient-gradient-boosting.pdf>`_ (CEGB) makes it possible to penalise boosting based on the cost of obtaining feature values.
CEGB penalises learning in the following ways:

- Each time a tree is split, a penalty of ``cegb_penalty_split`` is applied.
- When a feature is used for the first time, ``cegb_penalty_feature_coupled`` is applied. This penalty can be different for each feature and should be specified as one ``double`` per feature.
- When a feature is used for the first time for a data row, ``cegb_penalty_feature_lazy`` is applied. Like ``cegb_penalty_feature_coupled``, this penalty is specified as one ``double`` per feature.

Each of the penalties above is scaled by ``cegb_tradeoff``.
Using this parameter, it is possible to change the overall strength of the CEGB penalties by changing only one parameter.
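For orientation, here is a minimal sketch of what these parameters look like from the Python API once this commit is in; the data, penalty values, and feature count are illustrative only (the wrapper converts lists to the comma-separated ``multi-double`` form):

```python
import numpy as np
import lightgbm as lgb

X = np.random.random((1000, 5))
y = np.random.random(1000)
ds = lgb.Dataset(X, label=y)

params = {
    'objective': 'regression',
    'cegb_tradeoff': 1.0,                             # scales all CEGB penalties
    'cegb_penalty_split': 0.01,                       # charged on every split
    'cegb_penalty_feature_coupled': [5, 5, 1, 1, 1],  # per feature, first use in the forest
    'cegb_penalty_feature_lazy': [0.1] * 5,           # per feature, first use per data row
}
booster = lgb.train(params, ds, num_boost_round=10)
```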

Parameters Tuning
-----------------
@@ -374,6 +374,26 @@ Learning Control Parameters

   - used only in ``refit`` task in CLI version or as argument in ``refit`` function in language-specific package

- ``cegb_tradeoff`` :raw-html:`<a id="cegb_tradeoff" title="Permalink to this parameter" href="#cegb_tradeoff">🔗︎</a>`, default = ``1.0``, type = double, constraints: ``cegb_tradeoff >= 0.0``

   - cost-effective gradient boosting multiplier for all penalties

- ``cegb_penalty_split`` :raw-html:`<a id="cegb_penalty_split" title="Permalink to this parameter" href="#cegb_penalty_split">🔗︎</a>`, default = ``0.0``, type = double, constraints: ``cegb_penalty_split >= 0.0``

   - cost-effective gradient boosting penalty for splitting a node

- ``cegb_penalty_feature_lazy`` :raw-html:`<a id="cegb_penalty_feature_lazy" title="Permalink to this parameter" href="#cegb_penalty_feature_lazy">🔗︎</a>`, default = ``0,0,...,0``, type = multi-double

   - cost-effective gradient boosting penalty for using a feature

   - applied per data point

- ``cegb_penalty_feature_coupled`` :raw-html:`<a id="cegb_penalty_feature_coupled" title="Permalink to this parameter" href="#cegb_penalty_feature_coupled">🔗︎</a>`, default = ``0,0,...,0``, type = multi-double

   - cost-effective gradient boosting penalty for using a feature

   - applied once per forest
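A note on the ``multi-double`` type: in a CLI config file these are written as comma-separated doubles, one per feature, which is also the string the Python wrapper produces from a list (the values below are arbitrary):

```python
# Config-file form (one value per feature):
#   cegb_penalty_feature_lazy=0.1,0.2,0.3,0.4,0.5
# Python form, equivalent after parameter conversion:
params = {'cegb_penalty_feature_lazy': [0.1, 0.2, 0.3, 0.4, 0.5]}
```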

IO Parameters
-------------
@@ -377,6 +377,26 @@ struct Config {
  // desc = used only in ``refit`` task in CLI version or as argument in ``refit`` function in language-specific package
  double refit_decay_rate = 0.9;

  // check = >=0.0
  // desc = cost-effective gradient boosting multiplier for all penalties
  double cegb_tradeoff = 1.0;

  // check = >=0.0
  // desc = cost-effective gradient boosting penalty for splitting a node
  double cegb_penalty_split = 0.0;

  // type = multi-double
  // default = 0,0,...,0
  // desc = cost-effective gradient boosting penalty for using a feature
  // desc = applied per data point
  std::vector<double> cegb_penalty_feature_lazy;

  // type = multi-double
  // default = 0,0,...,0
  // desc = cost-effective gradient boosting penalty for using a feature
  // desc = applied once per forest
  std::vector<double> cegb_penalty_feature_coupled;

  #pragma endregion

  #pragma region IO Parameters
@@ -808,6 +808,22 @@ inline static void ObtainMinMaxSum(const T1 *w, int nw, T1 *mi, T1 *ma, T2 *su)
  }
}

// Creates a bitset large enough to hold n bits, with all bits cleared.
inline static std::vector<uint32_t> EmptyBitset(int n) {
  int size = n / 32;
  if (n % 32 != 0) size++;
  return std::vector<uint32_t>(size);
}

// Sets the bit at position val, growing the bitset if needed.
template<typename T>
inline static void InsertBitset(std::vector<uint32_t>& vec, const T val) {
  int i1 = val / 32;  // word index
  int i2 = val % 32;  // bit index within the word
  if (static_cast<int>(vec.size()) < i1 + 1) {
    vec.resize(i1 + 1, 0);
  }
  vec[i1] |= (1U << i2);  // unsigned literal avoids undefined behaviour when i2 == 31
}

// Builds a bitset with the bits listed in vals set.
template<typename T>
inline static std::vector<uint32_t> ConstructBitset(const T* vals, int n) {
  std::vector<uint32_t> ret;
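These helpers pack one flag per index into 32-bit words. A small Python model of the same layout (not part of the commit, purely illustrative) makes the word/bit arithmetic explicit:

```python
def empty_bitset(n):
    # One uint32 word per 32 bits, rounded up.
    return [0] * ((n + 31) // 32)

def insert_bitset(vec, val):
    word, bit = divmod(val, 32)
    if len(vec) < word + 1:
        vec.extend([0] * (word + 1 - len(vec)))
    vec[word] |= 1 << bit

def find_in_bitset(vec, val):
    word, bit = divmod(val, 32)
    return word < len(vec) and bool(vec[word] & (1 << bit))

bits = empty_bitset(100)
insert_bitset(bits, 42)
assert find_in_bitset(bits, 42) and not find_in_bitset(bits, 43)
```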
@@ -197,6 +197,10 @@ std::unordered_set<std::string> Config::parameter_set({
  "feature_contri",
  "forcedsplits_filename",
  "refit_decay_rate",
  "cegb_tradeoff",
  "cegb_penalty_split",
  "cegb_penalty_feature_lazy",
  "cegb_penalty_feature_coupled",
  "verbosity",
  "max_bin",
  "min_data_in_bin",
@@ -369,6 +373,20 @@ void Config::GetMembersFromString(const std::unordered_map<std::string, std::string
  CHECK(refit_decay_rate >= 0.0);
  CHECK(refit_decay_rate <= 1.0);

  GetDouble(params, "cegb_tradeoff", &cegb_tradeoff);
  CHECK(cegb_tradeoff >= 0.0);

  GetDouble(params, "cegb_penalty_split", &cegb_penalty_split);
  CHECK(cegb_penalty_split >= 0.0);

  if (GetString(params, "cegb_penalty_feature_lazy", &tmp_str)) {
    cegb_penalty_feature_lazy = Common::StringToArray<double>(tmp_str, ',');
  }

  if (GetString(params, "cegb_penalty_feature_coupled", &tmp_str)) {
    cegb_penalty_feature_coupled = Common::StringToArray<double>(tmp_str, ',');
  }

  GetInt(params, "verbosity", &verbosity);

  GetInt(params, "max_bin", &max_bin);
@@ -554,6 +572,10 @@ std::string Config::SaveMembersToString() const {
  str_buf << "[feature_contri: " << Common::Join(feature_contri, ",") << "]\n";
  str_buf << "[forcedsplits_filename: " << forcedsplits_filename << "]\n";
  str_buf << "[refit_decay_rate: " << refit_decay_rate << "]\n";
  str_buf << "[cegb_tradeoff: " << cegb_tradeoff << "]\n";
  str_buf << "[cegb_penalty_split: " << cegb_penalty_split << "]\n";
  str_buf << "[cegb_penalty_feature_lazy: " << Common::Join(cegb_penalty_feature_lazy, ",") << "]\n";
  str_buf << "[cegb_penalty_feature_coupled: " << Common::Join(cegb_penalty_feature_coupled, ",") << "]\n";
  str_buf << "[verbosity: " << verbosity << "]\n";
  str_buf << "[max_bin: " << max_bin << "]\n";
  str_buf << "[min_data_in_bin: " << min_data_in_bin << "]\n";
@@ -4,6 +4,7 @@
#include <LightGBM/objective_function.h>

#include <LightGBM/utils/array_args.h>
#include <LightGBM/utils/common.h>

#include <algorithm>
#include <vector>
@@ -64,6 +65,7 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
  histogram_pool_.DynamicChangeSize(train_data_, config_, max_cache_size, config_->num_leaves);
  // push split information for all leaves
  best_split_per_leaf_.resize(config_->num_leaves);
  splits_per_leaf_.resize(config_->num_leaves * train_data_->num_features());

  // get ordered bin
  train_data_->CreateOrderedBins(&ordered_bins_);
@@ -98,6 +100,16 @@ void SerialTreeLearner::Init(const Dataset* train_data, bool is_constant_hessian
    }
  }
  Log::Info("Number of data: %d, number of used features: %d", num_data_, num_features_);
  feature_used.clear();
  feature_used.resize(train_data->num_features());

  if (!config_->cegb_penalty_feature_coupled.empty()) {
    CHECK(config_->cegb_penalty_feature_coupled.size() == static_cast<size_t>(train_data_->num_total_features()));
  }
  if (!config_->cegb_penalty_feature_lazy.empty()) {
    CHECK(config_->cegb_penalty_feature_lazy.size() == static_cast<size_t>(train_data_->num_total_features()));
    feature_used_in_data = Common::EmptyBitset(train_data->num_features() * num_data_);
  }
}

void SerialTreeLearner::ResetTrainingData(const Dataset* train_data) {
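Note the sizing of ``feature_used_in_data``: one bit per (feature, data row) pair, so its footprint grows with both dimensions. A rough back-of-the-envelope check, with assumed shapes that are not from the commit:

```python
num_features, num_data = 100, 1_000_000  # assumed dataset shape
bits = num_features * num_data
words = (bits + 31) // 32  # EmptyBitset rounds up to whole uint32 words
print(f"lazy-penalty bitset: ~{words * 4 / 1e6:.1f} MB")  # ~12.5 MB here
```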
@@ -469,6 +481,28 @@ void SerialTreeLearner::ConstructHistograms(const std::vector<int8_t>& is_feature_used,
#endif
}

double SerialTreeLearner::CalculateOndemandCosts(int feature_index, int leaf_index) {
  if (config_->cegb_penalty_feature_lazy.empty()) {
    return 0.0;
  }

  double penalty = config_->cegb_penalty_feature_lazy[feature_index];

  const int inner_fidx = train_data_->InnerFeatureIndex(feature_index);

  double total = 0.0;
  data_size_t cnt_leaf_data = 0;
  auto tmp_idx = data_partition_->GetIndexOnLeaf(leaf_index, &cnt_leaf_data);

  // Only rows that have not used this feature before contribute to the lazy cost.
  for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
    int real_idx = tmp_idx[i_input];
    if (Common::FindInBitset(feature_used_in_data.data(), train_data_->num_data() * train_data_->num_features(), train_data_->num_data() * inner_fidx + real_idx)) {
      continue;
    }
    total += penalty;
  }

  return total;
}

void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>& is_feature_used, bool use_subtract) {
#ifdef TIMETAG
  auto start_time = std::chrono::steady_clock::now();
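So the lazy cost of a candidate split is the feature's penalty times the number of rows in the leaf that have not yet used that feature. The same rule in a few lines of Python, with hypothetical names mirroring the loop above:

```python
def ondemand_cost(penalty, rows_in_leaf, row_already_used_feature):
    # Rows that already paid for this feature are free; the rest pay once each.
    return penalty * sum(1 for row in rows_in_leaf if not row_already_used_feature(row))

used = {3, 7}
assert ondemand_cost(0.5, range(10), lambda r: r in used) == 4.0  # 8 unpaid rows * 0.5
```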
@@ -496,6 +530,14 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
                                                   smaller_leaf_splits_->max_constraint(),
                                                   &smaller_split);
    smaller_split.feature = real_fidx;
    // CEGB: subtract the scaled split, coupled, and lazy penalties from the candidate gain.
    smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * smaller_leaf_splits_->num_data_in_leaf();
    if (!config_->cegb_penalty_feature_coupled.empty() && !feature_used[feature_index]) {
      smaller_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx];
    }
    if (!config_->cegb_penalty_feature_lazy.empty()) {
      smaller_split.gain -= config_->cegb_tradeoff * CalculateOndemandCosts(real_fidx, smaller_leaf_splits_->LeafIndex());
    }
    splits_per_leaf_[smaller_leaf_splits_->LeafIndex() * train_data_->num_features() + feature_index] = smaller_split;
    if (smaller_split > smaller_best[tid]) {
      smaller_best[tid] = smaller_split;
    }
@@ -519,6 +561,14 @@ void SerialTreeLearner::FindBestSplitsFromHistograms(const std::vector<int8_t>&
                                                  larger_leaf_splits_->max_constraint(),
                                                  &larger_split);
    larger_split.feature = real_fidx;
    // Same CEGB penalties for the larger leaf.
    larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_split * larger_leaf_splits_->num_data_in_leaf();
    if (!config_->cegb_penalty_feature_coupled.empty() && !feature_used[feature_index]) {
      larger_split.gain -= config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[real_fidx];
    }
    if (!config_->cegb_penalty_feature_lazy.empty()) {
      larger_split.gain -= config_->cegb_tradeoff * CalculateOndemandCosts(real_fidx, larger_leaf_splits_->LeafIndex());
    }
    splits_per_leaf_[larger_leaf_splits_->LeafIndex() * train_data_->num_features() + feature_index] = larger_split;
    if (larger_split > larger_best[tid]) {
      larger_best[tid] = larger_split;
    }
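Both the smaller-leaf and larger-leaf branches apply the same adjustment, so the effective score of a candidate split can be summarised as below (a sketch with paraphrased names, not the C++ API):

```python
def penalized_gain(split_gain, tradeoff, penalty_split, n_data_in_leaf,
                   coupled_penalty, feature_first_use, lazy_cost):
    gain = split_gain
    gain -= tradeoff * penalty_split * n_data_in_leaf  # charged on every split
    if feature_first_use:
        gain -= tradeoff * coupled_penalty             # first use of the feature in the forest
    gain -= tradeoff * lazy_cost                       # rows that have not used the feature yet
    return gain
```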
@@ -703,6 +753,26 @@ int32_t SerialTreeLearner::ForceSplits(Tree* tree, Json& forced_split_json, int*
void SerialTreeLearner::Split(Tree* tree, int best_leaf, int* left_leaf, int* right_leaf) {
  const SplitInfo& best_split_info = best_split_per_leaf_[best_leaf];
  const int inner_feature_index = train_data_->InnerFeatureIndex(best_split_info.feature);
  if (!config_->cegb_penalty_feature_coupled.empty() && !feature_used[inner_feature_index]) {
    feature_used[inner_feature_index] = true;
    // The coupled penalty is now paid, so refund it in the cached splits of all other leaves.
    for (int i = 0; i < tree->num_leaves(); ++i) {
      if (i == best_leaf) continue;
      auto split = &splits_per_leaf_[i * train_data_->num_features() + inner_feature_index];
      split->gain += config_->cegb_tradeoff * config_->cegb_penalty_feature_coupled[best_split_info.feature];
      if (*split > best_split_per_leaf_[i]) {
        best_split_per_leaf_[i] = *split;
      }
    }
  }

  if (!config_->cegb_penalty_feature_lazy.empty()) {
    // Mark every row in this leaf as having used the split feature.
    data_size_t cnt_leaf_data = 0;
    auto tmp_idx = data_partition_->GetIndexOnLeaf(best_leaf, &cnt_leaf_data);
    for (data_size_t i_input = 0; i_input < cnt_leaf_data; ++i_input) {
      int real_idx = tmp_idx[i_input];
      Common::InsertBitset(feature_used_in_data, train_data_->num_data() * inner_feature_index + real_idx);
    }
  }

  // left = parent
  *left_leaf = best_leaf;
  bool is_numerical_split = train_data_->FeatureBinMapper(inner_feature_index)->bin_type() == BinType::NumericalBin;
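The first block above is the fix for the stale-gain bug described in the commit message: once a feature's coupled penalty has been paid, cached candidate splits on that feature in every other leaf get the penalty refunded, so already-used features are no longer unfairly disadvantaged against new ones. Schematically (a hedged Python sketch, not the C++ API):

```python
def refund_coupled_penalty(splits_per_leaf, best_split_per_leaf, num_features,
                           feature, penalty, tradeoff, split_leaf):
    for leaf in range(len(best_split_per_leaf)):
        if leaf == split_leaf:
            continue
        split = splits_per_leaf[leaf * num_features + feature]
        split.gain += tradeoff * penalty  # penalty already paid, so give it back
        if split.gain > best_split_per_leaf[leaf].gain:
            best_split_per_leaf[leaf] = split
```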
@@ -112,6 +112,9 @@ class SerialTreeLearner: public TreeLearner {
  * \return The number of data in the leaf_idx leaf
  */
  inline virtual data_size_t GetGlobalDataCountInLeaf(int leaf_idx) const;

  /*! \brief Total CEGB lazy penalty for using feature_index in leaf_index */
  double CalculateOndemandCosts(int feature_index, int leaf_index);

  /*! \brief number of data */
  data_size_t num_data_;
  /*! \brief number of features */
@@ -137,6 +140,8 @@ class SerialTreeLearner: public TreeLearner {

  /*! \brief store best split points for all leaves */
  std::vector<SplitInfo> best_split_per_leaf_;
  /*! \brief store best split per feature for all leaves */
  std::vector<SplitInfo> splits_per_leaf_;

  /*! \brief stores best thresholds for all feature for smaller leaf */
  std::unique_ptr<LeafSplits> smaller_leaf_splits_;
@@ -169,6 +174,9 @@ class SerialTreeLearner: public TreeLearner {
  int num_threads_;
  std::vector<int> ordered_bin_indices_;
  bool is_constant_hessian_;

  /*! \brief whether each feature's coupled CEGB penalty has been paid */
  std::vector<bool> feature_used;
  /*! \brief bitset tracking which (feature, row) pairs have paid the lazy CEGB penalty */
  std::vector<uint32_t> feature_used_in_data;
};

inline data_size_t SerialTreeLearner::GetGlobalDataCountInLeaf(int leaf_idx) const {
@@ -216,3 +216,68 @@ class TestBasic(unittest.TestCase):
            self.assertIsNone(actual)
        else:
            np.testing.assert_array_equal(actual, expected)

    def test_cegb_affects_behavior(self):
        X = np.random.random((1000, 5))
        X[:, [1, 3]] = 0
        y = np.random.random(1000)
        names = ['col_%d' % i for i in range(5)]
        ds = lgb.Dataset(X, feature_name=names).construct()
        ds.set_label(y)
        base = lgb.Booster(train_set=ds)
        for k in range(10):
            base.update()
        with tempfile.NamedTemporaryFile() as f:
            basename = f.name
        base.save_model(basename)
        with open(basename, 'rt') as f:
            basetxt = f.read()
        # Set extremely harsh penalties, so CEGB will block most splits.
        cases = [{'cegb_penalty_feature_coupled': [50, 100, 10, 25, 30]},
                 {'cegb_penalty_feature_lazy': [1, 2, 3, 4, 5]},
                 {'cegb_penalty_split': 1}]
        for case in cases:
            booster = lgb.Booster(train_set=ds, params=case)
            for k in range(10):
                booster.update()
            with tempfile.NamedTemporaryFile() as f:
                casename = f.name
            booster.save_model(casename)
            with open(casename, 'rt') as f:
                casetxt = f.read()
            self.assertNotEqual(basetxt, casetxt)

    def test_cegb_scaling_equalities(self):
        X = np.random.random((1000, 5))
        X[:, [1, 3]] = 0
        y = np.random.random(1000)
        names = ['col_%d' % i for i in range(5)]
        ds = lgb.Dataset(X, feature_name=names).construct()
        ds.set_label(y)
        # Compare pairs of penalties, to ensure scaling works as intended
        pairs = [({'cegb_penalty_feature_coupled': [1, 2, 1, 2, 1]},
                  {'cegb_penalty_feature_coupled': [0.5, 1, 0.5, 1, 0.5], 'cegb_tradeoff': 2}),
                 ({'cegb_penalty_feature_lazy': [0.01, 0.02, 0.03, 0.04, 0.05]},
                  {'cegb_penalty_feature_lazy': [0.005, 0.01, 0.015, 0.02, 0.025], 'cegb_tradeoff': 2}),
                 ({'cegb_penalty_split': 1},
                  {'cegb_penalty_split': 2, 'cegb_tradeoff': 0.5})]
        for (p1, p2) in pairs:
            booster1 = lgb.Booster(train_set=ds, params=p1)
            booster2 = lgb.Booster(train_set=ds, params=p2)
            for k in range(10):
                booster1.update()
                booster2.update()
            with tempfile.NamedTemporaryFile() as f:
                p1name = f.name
            # Reset booster1's parameters to p2, so the parameter section of the file matches.
            booster1.reset_parameter(p2)
            booster1.save_model(p1name)
            with open(p1name, 'rt') as f:
                p1txt = f.read()
            with tempfile.NamedTemporaryFile() as f:
                p2name = f.name
            booster2.save_model(p2name)
            self.maxDiff = None
            with open(p2name, 'rt') as f:
                p2txt = f.read()
            self.assertEqual(p1txt, p2txt)