[python-package] Add `feature_names_in_` attribute for scikit-learn estimators (fixes #6279) (#6310)

This commit is contained in:
Nick Miller 2024-07-03 16:13:47 -07:00 коммит произвёл GitHub
Родитель 7d9106d209
Коммит f811c82708
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
2 изменённых файлов: 53 добавлений и 1 удалений

Просмотреть файл

@ -1144,11 +1144,23 @@ class LGBMModel(_LGBMModelBase):
@property
def feature_name_(self) -> List[str]:
    """:obj:`list` of shape = [n_features]: The names of features.

    .. note::

        If input does not contain feature names, they will be added during fitting in the format ``Column_0``, ``Column_1``, ..., ``Column_N``.

    Raises
    ------
    LGBMNotFittedError
        If the estimator has not been fitted yet.
    """
    # The names are read from the underlying Booster, so refuse to answer before fit.
    if not self.__sklearn_is_fitted__():
        raise LGBMNotFittedError("No feature_name found. Need to call fit beforehand.")
    return self._Booster.feature_name()  # type: ignore[union-attr]
@property
def feature_names_in_(self) -> np.ndarray:
    """:obj:`array` of shape = [n_features]: Feature names as a NumPy array, the scikit-learn compatible counterpart of ``.feature_name_``."""
    if self.__sklearn_is_fitted__():
        # Reuse the list-valued property and only change the container type.
        names = self.feature_name_
        return np.array(names)
    raise LGBMNotFittedError("No feature_names_in_ found. Need to call fit beforehand.")
class LGBMRegressor(_LGBMRegressorBase, LGBMModel):
"""LightGBM regressor."""

Просмотреть файл

@ -1290,6 +1290,46 @@ def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth):
assert "Provided parameters constrain tree depth" not in capsys.readouterr().out
@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
def test_getting_feature_names_in_np_input(estimator_class):
    # A numpy array carries no feature names. LightGBM still attaches generated
    # names to the fitted model, which differs from scikit-learn's own behavior.
    X, y = load_digits(n_class=2, return_X_y=True)

    init_kwargs = {"n_estimators": 2, "num_leaves": 7}
    if estimator_class is lgb.LGBMModel:
        # the generic estimator is given an explicit objective
        init_kwargs["objective"] = "binary"
    model = estimator_class(**init_kwargs)

    # before fitting, the estimator must report itself as not fitted
    with pytest.raises(lgb.compat.LGBMNotFittedError):
        check_is_fitted(model)

    # the ranker additionally needs query groups; treat all rows as one group
    fit_kwargs = {"group": [X.shape[0]]} if isinstance(model, lgb.LGBMRanker) else {}
    model.fit(X, y, **fit_kwargs)

    expected_names = np.array([f"Column_{i}" for i in range(X.shape[1])])
    np.testing.assert_array_equal(model.feature_names_in_, expected_names)
@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
def test_getting_feature_names_in_pd_input(estimator_class):
    # With DataFrame input, the fitted model should expose the frame's own column names.
    X, y = load_digits(n_class=2, return_X_y=True, as_frame=True)

    # sanity-check the fixture: every column label has to be a string
    col_names = X.columns.to_list()
    names_are_strings = isinstance(col_names, list) and all(isinstance(c, str) for c in col_names)
    assert names_are_strings, "input data must have feature names for this test to cover the expected functionality"

    init_kwargs = {"n_estimators": 2, "num_leaves": 7}
    if estimator_class is lgb.LGBMModel:
        # the generic estimator is given an explicit objective
        init_kwargs["objective"] = "binary"
    model = estimator_class(**init_kwargs)

    # before fitting, the estimator must report itself as not fitted
    with pytest.raises(lgb.compat.LGBMNotFittedError):
        check_is_fitted(model)

    # the ranker additionally needs query groups; treat all rows as one group
    fit_kwargs = {"group": [X.shape[0]]} if isinstance(model, lgb.LGBMRanker) else {}
    model.fit(X, y, **fit_kwargs)

    np.testing.assert_array_equal(model.feature_names_in_, X.columns)
@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
def test_sklearn_integration(estimator, check):
estimator.set_params(min_child_samples=1, min_data_in_bin=1)