[python] Configure choice of `feature_importances_` in sklearn API (#1470)

* ignore vim temporary files

* add importance_type arg to sklearn API

* update documentation info

* remove a trailing space

* remove trailing space (again :))

* add instructions on importance choices to sklearn API

* drop mention of constructor in the feature type setting

* adding a test for different feature importance types

* remove trailing spaces, make shorter assert in feature importance type handling test

* fixing style issue introduced with the new test

Authored by Misha Lisovyi on 2018-07-11 17:28:37 +02:00; committed by Nikita Titov
Parent fac4afe099
Commit dae7551629
3 changed files with 25 additions and 3 deletions

.gitignore

@@ -181,6 +181,7 @@ BundleArtifacts/
 ClientBin/
 ~$*
 *~
+.*.swp
 *.dbmdl
 *.dbproj.schemaview
 *.pfx


@@ -134,7 +134,7 @@ class LGBMModel(_LGBMModelBase):
 min_split_gain=0., min_child_weight=1e-3, min_child_samples=20,
 subsample=1., subsample_freq=0, colsample_bytree=1.,
 reg_alpha=0., reg_lambda=0., random_state=None,
-n_jobs=-1, silent=True, **kwargs):
+n_jobs=-1, silent=True, importance_type='split', **kwargs):
 """Construct a gradient boosting model.
 Parameters
@@ -193,6 +193,10 @@ class LGBMModel(_LGBMModelBase):
 Number of parallel threads.
 silent : bool, optional (default=True)
 Whether to print messages while running boosting.
+importance_type : str, optional (default='split')
+The type of feature importance to be filled into ``feature_importances_``.
+If "split", result contains numbers of times the feature is used in a model.
+If "gain", result contains total gains of splits which use the feature.
 **kwargs : other parameters
 Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.
@@ -264,6 +268,7 @@ class LGBMModel(_LGBMModelBase):
 self.random_state = random_state
 self.n_jobs = n_jobs
 self.silent = silent
+self.importance_type = importance_type
 self._Booster = None
 self._evals_result = None
 self._best_score = None
@@ -399,6 +404,7 @@ class LGBMModel(_LGBMModelBase):
 if 'verbose' not in params and self.silent:
 params['verbose'] = 0
 params.pop('silent', None)
+params.pop('importance_type', None)
 params.pop('n_estimators', None)
 params.pop('class_weight', None)
 if self._n_classes is not None and self._n_classes > 2:
@@ -606,11 +612,13 @@ class LGBMModel(_LGBMModelBase):
 Note
 ----
 Feature importance in sklearn interface used to normalize to 1,
-it's deprecated after 2.0.4 and same as Booster.feature_importance() now.
+it's deprecated after 2.0.4 and is the same as Booster.feature_importance() now.
+``importance_type`` attribute is passed to the function
+to configure the type of importance values to be extracted.
 """
 if self._n_features is None:
 raise LGBMNotFittedError('No feature_importances found. Need to call fit beforehand.')
-return self.booster_.feature_importance()
+return self.booster_.feature_importance(importance_type=self.importance_type)
 class LGBMRegressor(LGBMModel, _LGBMRegressorBase):
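
Taken together, the changes above mean the importance type is chosen once on the estimator and honoured every time `feature_importances_` is read. A minimal usage sketch (the iris helper from scikit-learn is used purely for illustration, and this assumes a LightGBM build that already contains this change):

import lightgbm as lgb
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)

# 'gain' fills feature_importances_ with the total gain of the splits
# using each feature; the default 'split' keeps the old usage counts.
clf = lgb.LGBMClassifier(n_estimators=100, importance_type='gain')
clf.fit(X, y)

print(clf.feature_importances_)           # gain-based importances
print(clf.booster_.feature_importance())  # split counts straight from the Booster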


@@ -166,6 +166,19 @@ class TestSklearn(unittest.TestCase):
 importances = clf.feature_importances_
 self.assertEqual(len(importances), 4)
+def test_feature_importances_type(self):
+clf = lgb.LGBMClassifier(n_estimators=100)
+data = load_iris()
+clf.fit(data.data, data.target)
+clf.set_params(importance_type='split')
+importances_split = clf.feature_importances_
+clf.set_params(importance_type='gain')
+importances_gain = clf.feature_importances_
+# Test that the largest element is NOT the same, the smallest can be the same, i.e. zero
+importance_split_top1 = sorted(importances_split, reverse=True)[0]
+importance_gain_top1 = sorted(importances_gain, reverse=True)[0]
+self.assertNotEqual(importance_split_top1, importance_gain_top1)
 def test_sklearn_backward_compatibility(self):
 iris = load_iris()
 X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)
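
The new test also illustrates a handy consequence of forwarding `self.importance_type` at property-access time: the type can be switched on an already fitted estimator via `set_params`, with no retraining. A rough sketch of the same idea outside the unittest harness (again only a sketch, using load_iris for illustration):

import numpy as np
import lightgbm as lgb
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
clf = lgb.LGBMClassifier(n_estimators=100).fit(X, y)

split_imp = clf.feature_importances_   # default 'split': per-feature usage counts

# No refit needed: the property re-queries the underlying Booster with
# whatever importance_type the estimator currently holds.
clf.set_params(importance_type='gain')
gain_imp = clf.feature_importances_    # per-feature total gain

# Mirrors the assertion in test_feature_importances_type: the two views differ.
assert not np.array_equal(split_imp, gain_imp)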