[python] Configure choice of `feature_importances_` in sklearn API (#1470)

* ignore vim temporary files

* add importance_type arg to sklearn API

* update documentation info

* remove a trailing space

* remove trailing space (again :))

* add instructions on importance choices to sklearn API

* drop mention of constructor in the feature type setting

* adding a test for different feature importance types

* remove trailing spaces, make shorter assert in feature importance type handling test

* fixing style issue introduced with the new test

Authored by Misha Lisovyi on 2018-07-11 17:28:37 +02:00; committed by Nikita Titov
Parent fac4afe099
Commit dae7551629
3 changed files with 25 additions and 3 deletions

.gitignore

@@ -181,6 +181,7 @@ BundleArtifacts/
 ClientBin/
 ~$*
 *~
+.*.swp
 *.dbmdl
 *.dbproj.schemaview
 *.pfx


@@ -134,7 +134,7 @@ class LGBMModel(_LGBMModelBase):
 min_split_gain=0., min_child_weight=1e-3, min_child_samples=20,
 subsample=1., subsample_freq=0, colsample_bytree=1.,
 reg_alpha=0., reg_lambda=0., random_state=None,
-n_jobs=-1, silent=True, **kwargs):
+n_jobs=-1, silent=True, importance_type='split', **kwargs):
 """Construct a gradient boosting model.
 Parameters
@@ -193,6 +193,10 @@ class LGBMModel(_LGBMModelBase):
 Number of parallel threads.
 silent : bool, optional (default=True)
 Whether to print messages while running boosting.
+importance_type : str, optional (default='split')
+The type of feature importance to be filled into ``feature_importances_``.
+If "split", result contains numbers of times the feature is used in a model.
+If "gain", result contains total gains of splits which use the feature.
 **kwargs : other parameters
 Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more parameters.
@@ -264,6 +268,7 @@ class LGBMModel(_LGBMModelBase):
 self.random_state = random_state
 self.n_jobs = n_jobs
 self.silent = silent
+self.importance_type = importance_type
 self._Booster = None
 self._evals_result = None
 self._best_score = None
@@ -399,6 +404,7 @@ class LGBMModel(_LGBMModelBase):
 if 'verbose' not in params and self.silent:
 params['verbose'] = 0
 params.pop('silent', None)
+params.pop('importance_type', None)
 params.pop('n_estimators', None)
 params.pop('class_weight', None)
 if self._n_classes is not None and self._n_classes > 2:
@@ -606,11 +612,13 @@ class LGBMModel(_LGBMModelBase):
 Note
 ----
 Feature importance in sklearn interface used to normalize to 1,
-it's deprecated after 2.0.4 and same as Booster.feature_importance() now.
+it's deprecated after 2.0.4 and is the same as Booster.feature_importance() now.
+``importance_type`` attribute is passed to the function
+to configure the type of importance values to be extracted.
 """
 if self._n_features is None:
 raise LGBMNotFittedError('No feature_importances found. Need to call fit beforehand.')
-return self.booster_.feature_importance()
+return self.booster_.feature_importance(importance_type=self.importance_type)
 class LGBMRegressor(LGBMModel, _LGBMRegressorBase):
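
Taken together, the changes above mean the importance type is chosen once on the estimator and honoured every time `feature_importances_` is read. A minimal usage sketch (the iris helper from scikit-learn is used purely for illustration, and this assumes a LightGBM build that already contains this change):

import lightgbm as lgb
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)

# 'gain' fills feature_importances_ with the total gain of the splits
# using each feature; the default 'split' keeps the old usage counts.
clf = lgb.LGBMClassifier(n_estimators=100, importance_type='gain')
clf.fit(X, y)

print(clf.feature_importances_)           # gain-based importances
print(clf.booster_.feature_importance())  # split counts straight from the Booster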


@@ -166,6 +166,19 @@ class TestSklearn(unittest.TestCase):
 importances = clf.feature_importances_
 self.assertEqual(len(importances), 4)
+def test_feature_importances_type(self):
+clf = lgb.LGBMClassifier(n_estimators=100)
+data = load_iris()
+clf.fit(data.data, data.target)
+clf.set_params(importance_type='split')
+importances_split = clf.feature_importances_
+clf.set_params(importance_type='gain')
+importances_gain = clf.feature_importances_
+# Test that the largest element is NOT the same, the smallest can be the same, i.e. zero
+importance_split_top1 = sorted(importances_split, reverse=True)[0]
+importance_gain_top1 = sorted(importances_gain, reverse=True)[0]
+self.assertNotEqual(importance_split_top1, importance_gain_top1)
 def test_sklearn_backward_compatibility(self):
 iris = load_iris()
 X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)
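
The new test also illustrates a handy consequence of forwarding `self.importance_type` at property-access time: the type can be switched on an already fitted estimator via `set_params`, with no retraining. A rough sketch of the same idea outside the unittest harness (again only a sketch, using load_iris for illustration):

import numpy as np
import lightgbm as lgb
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
clf = lgb.LGBMClassifier(n_estimators=100).fit(X, y)

split_imp = clf.feature_importances_   # default 'split': per-feature usage counts

# No refit needed: the property re-queries the underlying Booster with
# whatever importance_type the estimator currently holds.
clf.set_params(importance_type='gain')
gain_imp = clf.feature_importances_    # per-feature total gain

# Mirrors the assertion in test_feature_importances_type: the two views differ.
assert not np.array_equal(split_imp, gain_imp)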