[python-package] Add `feature_names_in_` attribute for scikit-learn estimators (fixes #6279) (#6310)

2024-07-03 16:13:47 -07:00 · 2024-07-03 16:13:47 -07:00 · f811c82708
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@@ -1144,11 +1144,23 @@ class LGBMModel(_LGBMModelBase):

    @property
    def feature_name_(self) -> List[str]:
-        """:obj:`list` of shape = [n_features]: The names of features."""
+        """:obj:`list` of shape = [n_features]: The names of features.
+
+        .. note::
+
+            If the training data does not contain feature names, LightGBM generates them during fitting in the format ``Column_0``, ``Column_1``, ..., ``Column_N``, where ``N = n_features - 1``.
+        """
        if not self.__sklearn_is_fitted__():
            raise LGBMNotFittedError("No feature_name found. Need to call fit beforehand.")
        return self._Booster.feature_name()  # type: ignore[union-attr]

+    @property
+    def feature_names_in_(self) -> np.ndarray:
+        """:obj:`array` of shape = [n_features]: The names of features as a numpy array (scikit-learn compatible version of ``.feature_name_``)."""
+        if not self.__sklearn_is_fitted__():
+            raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.")
+        return np.array(self.feature_name_)
+

 class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
    """LightGBM regressor."""
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -1290,6 +1290,46 @@ def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth):
    assert "Provided parameters constrain tree depth" not in capsys.readouterr().out


+@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
+def test_getting_feature_names_in_np_input(estimator_class):
+    # input is a numpy array, which doesn't have feature names. LightGBM still stores generated names
+    # (Column_0, Column_1, ...) on the fitted model, unlike scikit-learn, which leaves feature_names_in_ undefined
+    X, y = load_digits(n_class=2, return_X_y=True)
+    params = {"n_estimators": 2, "num_leaves": 7}
+    if estimator_class is lgb.LGBMModel:
+        model = estimator_class(**{**params, "objective": "binary"})
+    else:
+        model = estimator_class(**params)
+    with pytest.raises(lgb.compat.LGBMNotFittedError):
+        check_is_fitted(model)
+    if isinstance(model, lgb.LGBMRanker):
+        model.fit(X, y, group=[X.shape[0]])
+    else:
+        model.fit(X, y)
+    np.testing.assert_array_equal(model.feature_names_in_, np.array([f"Column_{i}" for i in range(X.shape[1])]))
+
+
+@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
+def test_getting_feature_names_in_pd_input(estimator_class):
+    X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)
+    col_names = X.columns.to_list()
+    assert isinstance(col_names, list) and all(
+        isinstance(c, str) for c in col_names
+    ), "input data must have feature names for this test to cover the expected functionality"
+    params = {"n_estimators": 2, "num_leaves": 7}
+    if estimator_class is lgb.LGBMModel:
+        model = estimator_class(**{**params, "objective": "binary"})
+    else:
+        model = estimator_class(**params)
+    with pytest.raises(lgb.compat.LGBMNotFittedError):
+        check_is_fitted(model)
+    if isinstance(model, lgb.LGBMRanker):
+        model.fit(X, y, group=[X.shape[0]])
+    else:
+        model.fit(X, y)
+    np.testing.assert_array_equal(model.feature_names_in_, X.columns)
+
+
@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
 def test_sklearn_integration(estimator, check):
    estimator.set_params(min_child_samples=1, min_data_in_bin=1)