Enable support for sklearn 1.4

Signed-off-by: Keith Battocchi <kebatt@microsoft.com>
2024-02-12 11:58:47 -05:00 · 2024-02-12 11:58:47 -05:00 · ab572dea46
--- a/doc/spec/estimation/dml.rst
+++ b/doc/spec/estimation/dml.rst
@ -663,7 +663,7 @@ To add fixed effect heterogeneity, we can create one-hot encodings of the id, wh
    from econml.dml import LinearDML
    from sklearn.preprocessing import OneHotEncoder
    # removing one id to avoid colinearity, as is standard for fixed effects
-    X_oh = OneHotEncoder(sparse=False).fit_transform(X)[:, 1:]
+    X_oh = OneHotEncoder(sparse_output=False).fit_transform(X)[:, 1:]

    est = LinearDML(model_y=RandomForestRegressor(),
                                 model_t=RandomForestRegressor())
--- a/econml/_ortho_learner.py
+++ b/econml/_ortho_learner.py
@ -44,7 +44,7 @@ from ._cate_estimator import (BaseCateEstimator, LinearCateEstimator,
                              TreatmentExpansionMixin)
 from .inference import BootstrapInference
 from .utilities import (_deprecate_positional, check_input_arrays,
-                        cross_product, filter_none_kwargs, strata_from_discrete_arrays,
+                        cross_product, filter_none_kwargs, one_hot_encoder, strata_from_discrete_arrays,
                        inverse_onehot, jacify_featurizer, ndim, reshape, shape, transpose)
 from .sklearn_extensions.model_selection import ModelSelector

@ -780,7 +780,7 @@ class _OrthoLearner(TreatmentExpansionMixin, LinearCateEstimator):
                categories = self.categories
                if categories != 'auto':
                    categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-                self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
+                self.transformer = one_hot_encoder(categories=categories, drop='first')
                self.transformer.fit(reshape(T, (-1, 1)))
                self._d_t = (len(self.transformer.categories_[0]) - 1,)
            elif self.treatment_featurizer:
@ -792,7 +792,7 @@ class _OrthoLearner(TreatmentExpansionMixin, LinearCateEstimator):
                self.transformer = None

            if self.discrete_instrument:
-                self.z_transformer = OneHotEncoder(categories='auto', sparse=False, drop='first')
+                self.z_transformer = one_hot_encoder(categories='auto', drop='first')
                self.z_transformer.fit(reshape(Z, (-1, 1)))
            else:
                self.z_transformer = None
--- a/econml/metalearners/_metalearners.py
+++ b/econml/metalearners/_metalearners.py
@ -16,7 +16,7 @@ from sklearn.pipeline import Pipeline
 from sklearn.utils import check_array, check_X_y
 from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
 from ..utilities import (check_inputs, check_models, broadcast_unit_treatments, reshape_treatmentwise_effects,
-                         inverse_onehot, transpose, _deprecate_positional)
+                         one_hot_encoder, inverse_onehot, transpose, _deprecate_positional)
 from .._shap import _shap_explain_model_cate


@ -109,7 +109,7 @@ class TLearner(TreatmentExpansionMixin, LinearCateEstimator):
        categories = self.categories
        if categories != 'auto':
            categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-        self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
+        self.transformer = one_hot_encoder(categories=categories, drop='first')
        T = self.transformer.fit_transform(T.reshape(-1, 1))
        self._d_t = T.shape[1:]
        T = inverse_onehot(T)
@ -232,7 +232,7 @@ class SLearner(TreatmentExpansionMixin, LinearCateEstimator):
        categories = self.categories
        if categories != 'auto':
            categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-        self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
+        self.transformer = one_hot_encoder(categories=categories, drop='first')
        T = self.transformer.fit_transform(T.reshape(-1, 1))
        self._d_t = (T.shape[1], )
        # Note: unlike other Metalearners, we need the controls' encoded column for training
@ -375,7 +375,7 @@ class XLearner(TreatmentExpansionMixin, LinearCateEstimator):
        categories = self.categories
        if categories != 'auto':
            categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-        self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
+        self.transformer = one_hot_encoder(categories=categories, drop='first')
        T = self.transformer.fit_transform(T.reshape(-1, 1))
        self._d_t = T.shape[1:]
        T = inverse_onehot(T)
@ -537,7 +537,7 @@ class DomainAdaptationLearner(TreatmentExpansionMixin, LinearCateEstimator):
        categories = self.categories
        if categories != 'auto':
            categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-        self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
+        self.transformer = one_hot_encoder(categories=categories, drop='first')
        T = self.transformer.fit_transform(T.reshape(-1, 1))
        self._d_t = T.shape[1:]
        T = inverse_onehot(T)
--- a/econml/orf/_ortho_forest.py
+++ b/econml/orf/_ortho_forest.py
@ -39,7 +39,7 @@ from .._cate_estimator import BaseCateEstimator, LinearCateEstimator, TreatmentE
 from ._causal_tree import CausalTree
 from ..inference import NormalInferenceResults
 from ..inference._inference import Inference
-from ..utilities import (reshape, reshape_Y_T, MAX_RAND_SEED, check_inputs, _deprecate_positional,
+from ..utilities import (one_hot_encoder, reshape, reshape_Y_T, MAX_RAND_SEED, check_inputs, _deprecate_positional,
                         cross_product, inverse_onehot, check_input_arrays, jacify_featurizer,
                         _RegressionWrapper, deprecated, ndim)
 from sklearn.model_selection import check_cv
@ -676,7 +676,7 @@ class DMLOrthoForest(BaseOrthoForest):
            categories = self.categories
            if categories != 'auto':
                categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-            self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
+            self.transformer = one_hot_encoder(categories=categories, drop='first')
            d_t_in = T.shape[1:]
            T = self.transformer.fit_transform(T.reshape(-1, 1))
            self._d_t = T.shape[1:]
@ -1030,7 +1030,7 @@ class DROrthoForest(BaseOrthoForest):
        categories = self.categories
        if categories != 'auto':
            categories = [categories]  # OneHotEncoder expects a 2D array with features per column
-        self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
+        self.transformer = one_hot_encoder(categories=categories, drop='first')
        d_t_in = T.shape[1:]
        T = self.transformer.fit_transform(T.reshape(-1, 1))
        self._d_t = T.shape[1:]
--- a/econml/sklearn_extensions/linear_model.py
+++ b/econml/sklearn_extensions/linear_model.py
@ -108,7 +108,7 @@ class WeightedModelMixin:

            # Normalize inputs
            X, y, X_offset, y_offset, X_scale = _preprocess_data(
-                X, y, fit_intercept=self.fit_intercept, normalize=False,
+                X, y, fit_intercept=self.fit_intercept,
                copy=self.copy_X, check_input=check_input if check_input is not None else True,
                sample_weight=sample_weight)
            # Weight inputs
@ -737,7 +737,7 @@ class DebiasedLasso(WeightedLasso):
        super().fit(X, y, sample_weight, check_input)
        # Center X, y
        X, y, X_offset, y_offset, X_scale = _preprocess_data(
-            X, y, fit_intercept=self.fit_intercept, normalize=False,
+            X, y, fit_intercept=self.fit_intercept,
            copy=self.copy_X, check_input=check_input, sample_weight=sample_weight)

        # Calculate quantities that will be used later on. Account for centered data
--- a/econml/sklearn_extensions/model_selection.py
+++ b/econml/sklearn_extensions/model_selection.py
@ -842,13 +842,17 @@ def _cross_val_predict(estimator, X, y=None, *, groups=None, cv=None,
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
-    predictions = parallel(delayed(_fit_and_predict)(
-        clone(estimator, safe=safe), X, y, train, test, verbose, fit_params, method)
-        for train, test in splits)
+
    from pkg_resources import parse_version
-    if parse_version(sklearn.__version__) < parse_version("0.24.0"):
-        # Prior to 0.24.0, this private scikit-learn method returned a tuple of two values
-        predictions = [p[0] for p in predictions]
+    # verbose was removed from sklearn's non-public _fit_and_predict method in 1.4
+    if parse_version(sklearn.__version__) < parse_version("1.4"):
+        predictions = parallel(delayed(_fit_and_predict)(
+            clone(estimator, safe=safe), X, y, train, test, verbose, fit_params, method)
+            for train, test in splits)
+    else:
+        predictions = parallel(delayed(_fit_and_predict)(
+            clone(estimator, safe=safe), X, y, train, test, fit_params, method)
+            for train, test in splits)

    inv_test_indices = np.empty(len(test_indices), dtype=int)
    inv_test_indices[test_indices] = np.arange(len(test_indices))
--- a/econml/solutions/causal_analysis/_causal_analysis.py
+++ b/econml/solutions/causal_analysis/_causal_analysis.py
@ -25,7 +25,7 @@ from ...dml import LinearDML, CausalForestDML
 from ...inference import NormalInferenceResults
 from ...sklearn_extensions.linear_model import WeightedLasso
 from ...sklearn_extensions.model_selection import GridSearchCVList
-from ...utilities import _RegressionWrapper, get_feature_names_or_default, inverse_onehot
+from ...utilities import _RegressionWrapper, get_feature_names_or_default, inverse_onehot, one_hot_encoder

 # TODO: this utility is documented but internal; reimplement?
 from sklearn.utils import _safe_indexing
@ -203,8 +203,7 @@ class _ColumnTransformer(TransformerMixin):
        if cat_cols.shape[1] > 0:
            self.has_cats = True
            # NOTE: set handle_unknown to 'ignore' so that we don't throw at runtime if given a novel value
-            self.one_hot_encoder = OneHotEncoder(sparse=False,
-                                                 handle_unknown='ignore').fit(cat_cols)
+            self.one_hot_encoder = one_hot_encoder(handle_unknown='ignore').fit(cat_cols)
        else:
            self.has_cats = False
        self.d_x = X.shape[1]
@ -335,12 +334,12 @@ def _process_feature(name, feat_ind, verbose, categorical_inds, categories, hete
        # we achieve this by pipelining the X scaling with the Y and T models (with fixed scaling, not refitting)

        hinds = heterogeneity_inds[feat_ind]
-        WX_transformer = ColumnTransformer([('encode', OneHotEncoder(drop='first', sparse=False),
+        WX_transformer = ColumnTransformer([('encode', one_hot_encoder(drop='first'),
                                             [ind for ind in categorical_inds
                                              if ind != feat_ind]),
                                            ('drop', 'drop', feat_ind)],
                                           remainder=StandardScaler())
-        W_transformer = ColumnTransformer([('encode', OneHotEncoder(drop='first', sparse=False),
+        W_transformer = ColumnTransformer([('encode', one_hot_encoder(drop='first'),
                                            [ind for ind in categorical_inds
                                             if ind != feat_ind and ind not in hinds]),
                                           ('drop', 'drop', hinds),
@ -732,8 +731,7 @@ class CausalAnalysis:
            if train_y_model:
                # perform model selection for the Y model using all X, not on a per-column basis
                allX = ColumnTransformer([('encode',
-                                           OneHotEncoder(
-                                               drop='first', sparse=False),
+                                           one_hot_encoder(drop='first'),
                                           self.categorical)],
                                         remainder=StandardScaler()).fit_transform(X)

@ -757,7 +755,7 @@ class CausalAnalysis:

            # note that this needs to happen after wrapping to generalize to the multi-class case,
            # since otherwise we'll have too many columns to be able to train a classifier
-            y = OneHotEncoder(drop='first', sparse=False).fit_transform(y)
+            y = one_hot_encoder(drop='first').fit_transform(y)

        assert y.ndim == 1 or y.shape[1] == 1, ("Multiclass classification isn't supported" if self.classification
                                                else "Only a single outcome is supported")
--- a/econml/tests/test_deepiv.py
+++ b/econml/tests/test_deepiv.py
@ -235,7 +235,7 @@ class TestDeepIV(unittest.TestCase):

        def one_hot(col, **kwargs):
            z = col.reshape(-1, 1)
-            enc = OneHotEncoder(sparse=False, **kwargs)
+            enc = OneHotEncoder(sparse_output=False, **kwargs)
            return enc.fit_transform(z)

        def sensf(x):
@ -375,7 +375,7 @@ Response:{y}".format(**{'x': x.shape, 'z': z.shape,

        def one_hot(col, **kwargs):
            z = col.reshape(-1, 1)
-            enc = OneHotEncoder(sparse=False, **kwargs)
+            enc = OneHotEncoder(sparse_output=False, **kwargs)
            return enc.fit_transform(z)

        def sensf(x):
--- a/econml/tests/test_dml.py
+++ b/econml/tests/test_dml.py
@ -910,7 +910,7 @@ class TestDML(unittest.TestCase):
                  [1, 4, 5, 7, 9, 10, 12, 14, 17])

        dml = LinearDML(model_y=LinearRegression(), model_t=LinearRegression(),
-                        fit_cate_intercept=False, featurizer=OneHotEncoder(sparse=False),
+                        fit_cate_intercept=False, featurizer=OneHotEncoder(sparse_output=False),
                        cv=[splits, splits[::-1]])

        T = np.tile([1, 2, 3], 6)
--- a/econml/tests/test_utilities.py
+++ b/econml/tests/test_utilities.py
@ -99,7 +99,7 @@ class TestUtilities(unittest.TestCase):

    def test_inverse_onehot(self):
        T = np.random.randint(4, size=100)
-        T_oh = OneHotEncoder(categories='auto', sparse=False).fit_transform(T.reshape(-1, 1))[:, 1:]
+        T_oh = OneHotEncoder(categories='auto', sparse_output=False).fit_transform(T.reshape(-1, 1))[:, 1:]
        T_inv = inverse_onehot(T_oh)
        np.testing.assert_array_equal(T, T_inv)

--- a/econml/utilities.py
+++ b/econml/utilities.py
@ -6,6 +6,7 @@
 import numpy as np
 import pandas as pd
 import scipy.sparse
+import sklearn
 import sparse as sp
 import itertools
 import inspect
@ -18,7 +19,7 @@ from sklearn.linear_model import LassoCV, MultiTaskLassoCV, Lasso, MultiTaskLass
 from functools import reduce, wraps
 from sklearn.utils import check_array, check_X_y
 from sklearn.utils.validation import assert_all_finite
-from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
+from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, LabelEncoder
 import warnings
 from warnings import warn
 from collections.abc import Iterable
@ -1508,3 +1509,15 @@ def strata_from_discrete_arrays(arrs):
        curr_array = temp + curr_array * len(enc.classes_)

    return curr_array
+
+
+def one_hot_encoder(sparse=False, **kwargs):
+    """
+    Create a sklearn OneHotEncoder (dense output by default), passing `sparse` under whichever keyword the
+    installed sklearn accepts: `sparse_output` (added in 1.2) or the older `sparse` (removed in 1.4).
+    """
+    # feature-detect the keyword rather than parsing versions, which needs the deprecated pkg_resources
+    if 'sparse_output' in inspect.signature(OneHotEncoder).parameters:
+        return OneHotEncoder(sparse_output=sparse, **kwargs)
+    else:
+        return OneHotEncoder(sparse=sparse, **kwargs)
--- a/pyproject.toml
+++ b/pyproject.toml
@ -22,7 +22,7 @@ classifiers = [
 dependencies = [
    "numpy",
    "scipy > 1.4.0",
-    "scikit-learn >= 1.0, < 1.4",
+    "scikit-learn >= 1.0, < 1.5",
    "sparse",
    "joblib >= 0.13.0",
    "statsmodels >= 0.10",