move __version__ import to compat.py, test with all ML tasks

2024-10-05 23:46:22 -05:00 · 2024-10-05 23:46:22 -05:00 · 86b5ab3b7b
--- a/python-package/lightgbm/compat.py
+++ b/python-package/lightgbm/compat.py
@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, List
 # see https://github.com/microsoft/LightGBM/issues/6509
 """sklearn"""
 try:
+    from sklearn import __version__ as _sklearn_version
    from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
    from sklearn.preprocessing import LabelEncoder
    from sklearn.utils.class_weight import compute_sample_weight
@ -117,6 +118,7 @@ except ImportError:
    _LGBMCheckClassificationTargets = None
    _LGBMComputeSampleWeight = None
    _LGBMValidateData = None
+    _sklearn_version = None

 # additional scikit-learn imports only for type hints
 if TYPE_CHECKING:
--- a/python-package/lightgbm/sklearn.py
+++ b/python-package/lightgbm/sklearn.py
@ -40,6 +40,7 @@ from .compat import (
    _LGBMModelBase,
    _LGBMRegressorBase,
    _LGBMValidateData,
+    _sklearn_version,
    dt_DataTable,
    pd_DataFrame,
 )
@ -729,11 +730,9 @@ class LGBMModel(_LGBMModelBase):
        # _LGBMModelBase.__sklearn_tags__() cannot be called unconditionally,
        # because that method isn't defined for scikit-learn<1.6
        if not hasattr(_LGBMModelBase, "__sklearn_tags__"):
-            from sklearn import __version__ as sklearn_version
-
            err_msg = (
                "__sklearn_tags__() should not be called when using scikit-learn<1.6. "
-                f"detected version: {sklearn_version}"
+                f"detected version: {_sklearn_version}"
            )
            raise AttributeError(err_msg)

--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@ -43,6 +43,7 @@ task_to_model_factory = {
    "multiclass-classification": lgb.LGBMClassifier,
    "regression": lgb.LGBMRegressor,
 }
+all_tasks = tuple(task_to_model_factory.keys())


 def _create_data(task, n_samples=100, n_features=4):
@ -1459,7 +1460,7 @@ def test_sklearn_tags_should_correctly_reflect_lightgbm_specific_values(estimato
        assert sklearn_tags._xfail_checks == more_tags["_xfail_checks"]


-@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"])
+@pytest.mark.parametrize("task", all_tasks)
 def test_training_succeeds_when_data_is_dataframe_and_label_is_column_array(task):
    pd = pytest.importorskip("pandas")
    X, y, g = _create_data(task)
@ -1563,7 +1564,7 @@ def test_default_n_jobs(tmp_path):


@pytest.mark.skipif(not PANDAS_INSTALLED, reason="pandas is not installed")
-@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "ranking", "regression"])
+@pytest.mark.parametrize("task", all_tasks)
 def test_validate_features(task):
    X, y, g = _create_data(task, n_features=4)
    features = ["x1", "x2", "x3", "x4"]
@ -1586,24 +1587,49 @@ def test_validate_features(task):

 # LightGBM's 'predict_disable_shape_check' mechanism is intentionally not respected by
 # its scikit-learn estimators, for consistency with scikit-learn's own behavior.
+@pytest.mark.parametrize("task", all_tasks)
@pytest.mark.parametrize("predict_disable_shape_check", [True, False])
-def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disable_shape_check):
-    X, y, _ = _create_data(task="regression", n_features=4)
+def test_predict_rejects_inputs_with_incorrect_number_of_features(predict_disable_shape_check, task):
+    X, y, g = _create_data(task, n_features=4)
+    model_factory = task_to_model_factory[task]
+    fit_kwargs = {"X": X[:, :-1], "y": y}
+    if task == "ranking":
+        estimator_name = "LGBMRanker"
+        fit_kwargs.update({"group": g})
+    elif task == "regression":
+        estimator_name = "LGBMRegressor"
+    else:
+        estimator_name = "LGBMClassifier"
+
    # train on the first 3 features
-    model = lgb.LGBMRegressor(n_estimators=5, num_leaves=7, verbose=-1).fit(X[:, :-1], y)
+    model = model_factory(n_estimators=5, num_leaves=7, verbose=-1).fit(**fit_kwargs)

    # more cols in X than features: error
-    with pytest.raises(ValueError, match="X has 4 features, but LGBMRegressor is expecting 3 features as input"):
+    err_msg = f"X has 4 features, but {estimator_name} is expecting 3 features as input"
+    with pytest.raises(ValueError, match=err_msg):
        model.predict(X, predict_disable_shape_check=predict_disable_shape_check)

+    if estimator_name == "LGBMClassifier":
+        with pytest.raises(ValueError, match=err_msg):
+            model.predict_proba(X, predict_disable_shape_check=predict_disable_shape_check)
+
    # fewer cols in X than features: error
-    with pytest.raises(ValueError, match="X has 2 features, but LGBMRegressor is expecting 3 features as input"):
+    err_msg = f"X has 2 features, but {estimator_name} is expecting 3 features as input"
+    with pytest.raises(ValueError, match=err_msg):
        model.predict(X[:, :-2], predict_disable_shape_check=predict_disable_shape_check)

+    if estimator_name == "LGBMClassifier":
+        with pytest.raises(ValueError, match=err_msg):
+            model.predict_proba(X[:, :-2], predict_disable_shape_check=predict_disable_shape_check)
+
    # same number of columns in both: no error
    preds = model.predict(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check)
    assert preds.shape == y.shape

+    if estimator_name == "LGBMClassifier":
+        preds = model.predict(X[:, :-1], predict_disable_shape_check=predict_disable_shape_check)
+        assert preds.shape == y.shape
+

@pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"])