Enable support for sklearn 1.4

Signed-off-by: Keith Battocchi <kebatt@microsoft.com>
This commit is contained in:
Keith Battocchi 2024-02-12 11:58:47 -05:00 коммит произвёл Keith Battocchi
Родитель b8a5e2f3c8
Коммит ab572dea46
12 изменённых файлов: 49 добавлений и 34 удалений

Просмотреть файл

@ -663,7 +663,7 @@ To add fixed effect heterogeneity, we can create one-hot encodings of the id, wh
from econml.dml import LinearDML
from sklearn.preprocessing import OneHotEncoder
# removing one id to avoid collinearity, as is standard for fixed effects
X_oh = OneHotEncoder(sparse=False).fit_transform(X)[:, 1:]
X_oh = OneHotEncoder(sparse_output=False).fit_transform(X)[:, 1:]
est = LinearDML(model_y=RandomForestRegressor(),
model_t=RandomForestRegressor())

Просмотреть файл

@ -44,7 +44,7 @@ from ._cate_estimator import (BaseCateEstimator, LinearCateEstimator,
TreatmentExpansionMixin)
from .inference import BootstrapInference
from .utilities import (_deprecate_positional, check_input_arrays,
cross_product, filter_none_kwargs, strata_from_discrete_arrays,
cross_product, filter_none_kwargs, one_hot_encoder, strata_from_discrete_arrays,
inverse_onehot, jacify_featurizer, ndim, reshape, shape, transpose)
from .sklearn_extensions.model_selection import ModelSelector
@ -780,7 +780,7 @@ class _OrthoLearner(TreatmentExpansionMixin, LinearCateEstimator):
categories = self.categories
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = one_hot_encoder(categories=categories, drop='first')
self.transformer.fit(reshape(T, (-1, 1)))
self._d_t = (len(self.transformer.categories_[0]) - 1,)
elif self.treatment_featurizer:
@ -792,7 +792,7 @@ class _OrthoLearner(TreatmentExpansionMixin, LinearCateEstimator):
self.transformer = None
if self.discrete_instrument:
self.z_transformer = OneHotEncoder(categories='auto', sparse=False, drop='first')
self.z_transformer = one_hot_encoder(categories='auto', drop='first')
self.z_transformer.fit(reshape(Z, (-1, 1)))
else:
self.z_transformer = None

Просмотреть файл

@ -16,7 +16,7 @@ from sklearn.pipeline import Pipeline
from sklearn.utils import check_array, check_X_y
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from ..utilities import (check_inputs, check_models, broadcast_unit_treatments, reshape_treatmentwise_effects,
inverse_onehot, transpose, _deprecate_positional)
one_hot_encoder, inverse_onehot, transpose, _deprecate_positional)
from .._shap import _shap_explain_model_cate
@ -109,7 +109,7 @@ class TLearner(TreatmentExpansionMixin, LinearCateEstimator):
categories = self.categories
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = one_hot_encoder(categories=categories, drop='first')
T = self.transformer.fit_transform(T.reshape(-1, 1))
self._d_t = T.shape[1:]
T = inverse_onehot(T)
@ -232,7 +232,7 @@ class SLearner(TreatmentExpansionMixin, LinearCateEstimator):
categories = self.categories
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = one_hot_encoder(categories=categories, drop='first')
T = self.transformer.fit_transform(T.reshape(-1, 1))
self._d_t = (T.shape[1], )
# Note: unlike other Metalearners, we need the controls' encoded column for training
@ -375,7 +375,7 @@ class XLearner(TreatmentExpansionMixin, LinearCateEstimator):
categories = self.categories
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = one_hot_encoder(categories=categories, drop='first')
T = self.transformer.fit_transform(T.reshape(-1, 1))
self._d_t = T.shape[1:]
T = inverse_onehot(T)
@ -537,7 +537,7 @@ class DomainAdaptationLearner(TreatmentExpansionMixin, LinearCateEstimator):
categories = self.categories
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = one_hot_encoder(categories=categories, drop='first')
T = self.transformer.fit_transform(T.reshape(-1, 1))
self._d_t = T.shape[1:]
T = inverse_onehot(T)

Просмотреть файл

@ -39,7 +39,7 @@ from .._cate_estimator import BaseCateEstimator, LinearCateEstimator, TreatmentE
from ._causal_tree import CausalTree
from ..inference import NormalInferenceResults
from ..inference._inference import Inference
from ..utilities import (reshape, reshape_Y_T, MAX_RAND_SEED, check_inputs, _deprecate_positional,
from ..utilities import (one_hot_encoder, reshape, reshape_Y_T, MAX_RAND_SEED, check_inputs, _deprecate_positional,
cross_product, inverse_onehot, check_input_arrays, jacify_featurizer,
_RegressionWrapper, deprecated, ndim)
from sklearn.model_selection import check_cv
@ -676,7 +676,7 @@ class DMLOrthoForest(BaseOrthoForest):
categories = self.categories
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = one_hot_encoder(categories=categories, drop='first')
d_t_in = T.shape[1:]
T = self.transformer.fit_transform(T.reshape(-1, 1))
self._d_t = T.shape[1:]
@ -1030,7 +1030,7 @@ class DROrthoForest(BaseOrthoForest):
categories = self.categories
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self.transformer = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = one_hot_encoder(categories=categories, drop='first')
d_t_in = T.shape[1:]
T = self.transformer.fit_transform(T.reshape(-1, 1))
self._d_t = T.shape[1:]

Просмотреть файл

@ -108,7 +108,7 @@ class WeightedModelMixin:
# Normalize inputs
X, y, X_offset, y_offset, X_scale = _preprocess_data(
X, y, fit_intercept=self.fit_intercept, normalize=False,
X, y, fit_intercept=self.fit_intercept,
copy=self.copy_X, check_input=check_input if check_input is not None else True,
sample_weight=sample_weight)
# Weight inputs
@ -737,7 +737,7 @@ class DebiasedLasso(WeightedLasso):
super().fit(X, y, sample_weight, check_input)
# Center X, y
X, y, X_offset, y_offset, X_scale = _preprocess_data(
X, y, fit_intercept=self.fit_intercept, normalize=False,
X, y, fit_intercept=self.fit_intercept,
copy=self.copy_X, check_input=check_input, sample_weight=sample_weight)
# Calculate quantities that will be used later on. Account for centered data

Просмотреть файл

@ -842,13 +842,17 @@ def _cross_val_predict(estimator, X, y=None, *, groups=None, cv=None,
# independent, and that it is pickle-able.
parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
pre_dispatch=pre_dispatch)
predictions = parallel(delayed(_fit_and_predict)(
clone(estimator, safe=safe), X, y, train, test, verbose, fit_params, method)
for train, test in splits)
from pkg_resources import parse_version
if parse_version(sklearn.__version__) < parse_version("0.24.0"):
# Prior to 0.24.0, this private scikit-learn method returned a tuple of two values
predictions = [p[0] for p in predictions]
# verbose was removed from sklearn's non-public _fit_and_predict method in 1.4
if parse_version(sklearn.__version__) < parse_version("1.4"):
predictions = parallel(delayed(_fit_and_predict)(
clone(estimator, safe=safe), X, y, train, test, verbose, fit_params, method)
for train, test in splits)
else:
predictions = parallel(delayed(_fit_and_predict)(
clone(estimator, safe=safe), X, y, train, test, fit_params, method)
for train, test in splits)
inv_test_indices = np.empty(len(test_indices), dtype=int)
inv_test_indices[test_indices] = np.arange(len(test_indices))

Просмотреть файл

@ -25,7 +25,7 @@ from ...dml import LinearDML, CausalForestDML
from ...inference import NormalInferenceResults
from ...sklearn_extensions.linear_model import WeightedLasso
from ...sklearn_extensions.model_selection import GridSearchCVList
from ...utilities import _RegressionWrapper, get_feature_names_or_default, inverse_onehot
from ...utilities import _RegressionWrapper, get_feature_names_or_default, inverse_onehot, one_hot_encoder
# TODO: this utility is documented but internal; reimplement?
from sklearn.utils import _safe_indexing
@ -203,8 +203,7 @@ class _ColumnTransformer(TransformerMixin):
if cat_cols.shape[1] > 0:
self.has_cats = True
# NOTE: set handle_unknown to 'ignore' so that we don't throw at runtime if given a novel value
self.one_hot_encoder = OneHotEncoder(sparse=False,
handle_unknown='ignore').fit(cat_cols)
self.one_hot_encoder = one_hot_encoder(handle_unknown='ignore').fit(cat_cols)
else:
self.has_cats = False
self.d_x = X.shape[1]
@ -335,12 +334,12 @@ def _process_feature(name, feat_ind, verbose, categorical_inds, categories, hete
# we achieve this by pipelining the X scaling with the Y and T models (with fixed scaling, not refitting)
hinds = heterogeneity_inds[feat_ind]
WX_transformer = ColumnTransformer([('encode', OneHotEncoder(drop='first', sparse=False),
WX_transformer = ColumnTransformer([('encode', one_hot_encoder(drop='first'),
[ind for ind in categorical_inds
if ind != feat_ind]),
('drop', 'drop', feat_ind)],
remainder=StandardScaler())
W_transformer = ColumnTransformer([('encode', OneHotEncoder(drop='first', sparse=False),
W_transformer = ColumnTransformer([('encode', one_hot_encoder(drop='first'),
[ind for ind in categorical_inds
if ind != feat_ind and ind not in hinds]),
('drop', 'drop', hinds),
@ -732,8 +731,7 @@ class CausalAnalysis:
if train_y_model:
# perform model selection for the Y model using all X, not on a per-column basis
allX = ColumnTransformer([('encode',
OneHotEncoder(
drop='first', sparse=False),
one_hot_encoder(drop='first'),
self.categorical)],
remainder=StandardScaler()).fit_transform(X)
@ -757,7 +755,7 @@ class CausalAnalysis:
# note that this needs to happen after wrapping to generalize to the multi-class case,
# since otherwise we'll have too many columns to be able to train a classifier
y = OneHotEncoder(drop='first', sparse=False).fit_transform(y)
y = one_hot_encoder(drop='first').fit_transform(y)
assert y.ndim == 1 or y.shape[1] == 1, ("Multiclass classification isn't supported" if self.classification
else "Only a single outcome is supported")

Просмотреть файл

@ -235,7 +235,7 @@ class TestDeepIV(unittest.TestCase):
def one_hot(col, **kwargs):
z = col.reshape(-1, 1)
enc = OneHotEncoder(sparse=False, **kwargs)
enc = OneHotEncoder(sparse_output=False, **kwargs)
return enc.fit_transform(z)
def sensf(x):
@ -375,7 +375,7 @@ Response:{y}".format(**{'x': x.shape, 'z': z.shape,
def one_hot(col, **kwargs):
z = col.reshape(-1, 1)
enc = OneHotEncoder(sparse=False, **kwargs)
enc = OneHotEncoder(sparse_output=False, **kwargs)
return enc.fit_transform(z)
def sensf(x):

Просмотреть файл

@ -910,7 +910,7 @@ class TestDML(unittest.TestCase):
[1, 4, 5, 7, 9, 10, 12, 14, 17])
dml = LinearDML(model_y=LinearRegression(), model_t=LinearRegression(),
fit_cate_intercept=False, featurizer=OneHotEncoder(sparse=False),
fit_cate_intercept=False, featurizer=OneHotEncoder(sparse_output=False),
cv=[splits, splits[::-1]])
T = np.tile([1, 2, 3], 6)

Просмотреть файл

@ -99,7 +99,7 @@ class TestUtilities(unittest.TestCase):
def test_inverse_onehot(self):
T = np.random.randint(4, size=100)
T_oh = OneHotEncoder(categories='auto', sparse=False).fit_transform(T.reshape(-1, 1))[:, 1:]
T_oh = OneHotEncoder(categories='auto', sparse_output=False).fit_transform(T.reshape(-1, 1))[:, 1:]
T_inv = inverse_onehot(T_oh)
np.testing.assert_array_equal(T, T_inv)

Просмотреть файл

@ -6,6 +6,7 @@
import numpy as np
import pandas as pd
import scipy.sparse
import sklearn
import sparse as sp
import itertools
import inspect
@ -18,7 +19,7 @@ from sklearn.linear_model import LassoCV, MultiTaskLassoCV, Lasso, MultiTaskLass
from functools import reduce, wraps
from sklearn.utils import check_array, check_X_y
from sklearn.utils.validation import assert_all_finite
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, LabelEncoder
import warnings
from warnings import warn
from collections.abc import Iterable
@ -1508,3 +1509,15 @@ def strata_from_discrete_arrays(arrs):
curr_array = temp + curr_array * len(enc.classes_)
return curr_array
def one_hot_encoder(sparse=False, **kwargs):
    """
    Wrapper for sklearn's OneHotEncoder that handles the name change from `sparse` to `sparse_output`
    between sklearn versions 1.1 and 1.2 (the old `sparse` name was later removed in sklearn 1.4).

    Parameters
    ----------
    sparse : bool, default False
        Whether the encoder should produce sparse output.  Forwarded as `sparse_output` when the
        installed OneHotEncoder accepts that argument, and as `sparse` otherwise.
    **kwargs
        Any other keyword arguments; passed through to OneHotEncoder unchanged.

    Returns
    -------
    OneHotEncoder
        A new, unfitted encoder.
    """
    # Detect the supported keyword from the constructor signature rather than by parsing
    # sklearn.__version__: this avoids importing the deprecated `pkg_resources` module on every
    # call (it is not importable when setuptools is not installed) and selects the same keyword,
    # since `sparse_output` is present in every sklearn release from 1.2 onwards.
    if 'sparse_output' in inspect.signature(OneHotEncoder).parameters:
        return OneHotEncoder(sparse_output=sparse, **kwargs)
    return OneHotEncoder(sparse=sparse, **kwargs)

Просмотреть файл

@ -22,7 +22,7 @@ classifiers = [
dependencies = [
"numpy",
"scipy > 1.4.0",
"scikit-learn >= 1.0, < 1.4",
"scikit-learn >= 1.0, < 1.5",
"sparse",
"joblib >= 0.13.0",
"statsmodels >= 0.10",