# EconML/econml/dml.py

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""Double Machine Learning. The method uses machine learning methods to identify the
part of the observed outcome and treatment that is not predictable by the controls X, W
(aka residual outcome and residual treatment).
Then estimates a CATE model by regressing the residual outcome on the residual treatment
in a manner that accounts for heterogeneity in the regression coefficient, with respect
to X.
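
As a quick illustration of the two stages, here is a hand-rolled sketch on synthetic
data with a constant treatment effect (a sketch for intuition only: it omits the
cross-fitting that the estimators in this module perform)::

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    np.random.seed(0)
    X = np.random.normal(size=(1000, 3))
    T = X[:, 0] + np.random.normal(size=1000)
    y = 2 * T + X[:, 1] + np.random.normal(size=1000)

    # Stage 1: residualize the outcome and the treatment on the controls
    y_res = y - RandomForestRegressor().fit(X, y).predict(X)
    T_res = T - RandomForestRegressor().fit(X, T).predict(X)

    # Stage 2: regress the residual outcome on the residual treatment
    theta = (T_res @ y_res) / (T_res @ T_res)   # close to the true effect of 2
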
References
----------
\\ V. Chernozhukov, D. Chetverikov, M. Demirer, E. Duflo, C. Hansen, and W. Newey.
Double Machine Learning for Treatment and Causal Parameters.
https://arxiv.org/abs/1608.00060, 2016.
\\ X. Nie and S. Wager.
Quasi-Oracle Estimation of Heterogeneous Treatment Effects.
arXiv preprint arXiv:1712.04912, 2017. URL http://arxiv.org/abs/1712.04912.
\\ V. Chernozhukov, M. Goldman, V. Semenova, and M. Taddy.
Orthogonal Machine Learning for Demand Estimation: High Dimensional Causal Inference in Dynamic Panels.
https://arxiv.org/abs/1712.09988, December 2017.
\\ V. Chernozhukov, D. Nekipelov, V. Semenova, and V. Syrgkanis.
Two-Stage Estimation with a High-Dimensional Second Stage.
https://arxiv.org/abs/1806.04823, 2018.
\\ Dylan Foster, Vasilis Syrgkanis (2019).
Orthogonal Statistical Learning.
ACM Conference on Learning Theory. https://arxiv.org/abs/1901.09036
"""
from warnings import warn
import numpy as np
from sklearn.base import TransformerMixin, clone
from sklearn.linear_model import (ElasticNetCV, LassoCV, LogisticRegressionCV)
from sklearn.model_selection import KFold, StratifiedKFold, check_cv
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, LabelEncoder,
OneHotEncoder)
from sklearn.utils import check_random_state
from ._rlearner import _RLearner
from .cate_estimator import (DebiasedLassoCateEstimatorMixin,
ForestModelFinalCateEstimatorMixin,
LinearModelFinalCateEstimatorMixin,
StatsModelsCateEstimatorMixin)
from .inference import StatsModelsInference
from .sklearn_extensions.ensemble import SubsampledHonestForest
from .sklearn_extensions.linear_model import (MultiOutputDebiasedLasso,
StatsModelsLinearRegression,
WeightedLassoCVWrapper)
from .sklearn_extensions.model_selection import WeightedStratifiedKFold
from .utilities import (_deprecate_positional, add_intercept,
broadcast_unit_treatments, check_high_dimensional,
check_input_arrays, cross_product, deprecated,
fit_with_groups, hstack, inverse_onehot, ndim, reshape,
reshape_treatmentwise_effects, shape, transpose)
class _FirstStageWrapper:
def __init__(self, model, is_Y, featurizer, linear_first_stages, discrete_treatment):
self._model = clone(model, safe=False)
self._featurizer = clone(featurizer, safe=False)
self._is_Y = is_Y
self._linear_first_stages = linear_first_stages
self._discrete_treatment = discrete_treatment
def _combine(self, X, W, n_samples, fitting=True):
if X is None:
            # fall back to W alone; if both X and W are None, just return a column of ones
return (W if W is not None else np.ones((n_samples, 1)))
XW = hstack([X, W]) if W is not None else X
if self._is_Y and self._linear_first_stages:
if self._featurizer is None:
F = X
else:
F = self._featurizer.fit_transform(X) if fitting else self._featurizer.transform(X)
return cross_product(XW, hstack([np.ones((shape(XW)[0], 1)), F]))
else:
return XW
def fit(self, X, W, Target, sample_weight=None, groups=None):
if (not self._is_Y) and self._discrete_treatment:
# In this case, the Target is the one-hot-encoding of the treatment variable
# We need to go back to the label representation of the one-hot so as to call
# the classifier.
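            # A column of all zeros in the one-hot Target means some non-baseline treatment
            # is missing from this training fold; the absence of any all-zero row means the
            # baseline treatment is missing. Either case makes the fold invalid.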
if np.any(np.all(Target == 0, axis=0)) or (not np.any(np.all(Target == 0, axis=1))):
raise AttributeError("Provided crossfit folds contain training splits that " +
"don't contain all treatments")
Target = inverse_onehot(Target)
if sample_weight is not None:
fit_with_groups(self._model, self._combine(X, W, Target.shape[0]), Target, groups=groups,
sample_weight=sample_weight)
else:
fit_with_groups(self._model, self._combine(X, W, Target.shape[0]), Target, groups=groups)
def predict(self, X, W):
n_samples = X.shape[0] if X is not None else (W.shape[0] if W is not None else 1)
if (not self._is_Y) and self._discrete_treatment:
return self._model.predict_proba(self._combine(X, W, n_samples, fitting=False))[:, 1:]
else:
return self._model.predict(self._combine(X, W, n_samples, fitting=False))
def score(self, X, W, Target, sample_weight=None):
if hasattr(self._model, 'score'):
if (not self._is_Y) and self._discrete_treatment:
# In this case, the Target is the one-hot-encoding of the treatment variable
# We need to go back to the label representation of the one-hot so as to call
# the classifier.
Target = inverse_onehot(Target)
if sample_weight is not None:
return self._model.score(self._combine(X, W, Target.shape[0]), Target, sample_weight=sample_weight)
else:
return self._model.score(self._combine(X, W, Target.shape[0]), Target)
else:
return None
class _FinalWrapper:
def __init__(self, model_final, fit_cate_intercept, featurizer, use_weight_trick):
self._model = clone(model_final, safe=False)
self._use_weight_trick = use_weight_trick
self._original_featurizer = clone(featurizer, safe=False)
if self._use_weight_trick:
self._fit_cate_intercept = False
self._featurizer = self._original_featurizer
else:
self._fit_cate_intercept = fit_cate_intercept
if self._fit_cate_intercept:
add_intercept_trans = FunctionTransformer(add_intercept,
validate=True)
if featurizer:
self._featurizer = Pipeline([('featurize', self._original_featurizer),
('add_intercept', add_intercept_trans)])
else:
self._featurizer = add_intercept_trans
else:
self._featurizer = self._original_featurizer
def _combine(self, X, T, fitting=True):
if X is not None:
if self._featurizer is not None:
F = self._featurizer.fit_transform(X) if fitting else self._featurizer.transform(X)
else:
F = X
else:
if not self._fit_cate_intercept:
if self._use_weight_trick:
raise AttributeError("Cannot use this method with X=None. Consider "
"using the LinearDML estimator.")
else:
raise AttributeError("Cannot have X=None and also not allow for a CATE intercept!")
F = np.ones((T.shape[0], 1))
return cross_product(F, T)
def fit(self, X, T_res, Y_res, sample_weight=None, sample_var=None):
# Track training dimensions to see if Y or T is a vector instead of a 2-dimensional array
self._d_t = shape(T_res)[1:]
self._d_y = shape(Y_res)[1:]
if not self._use_weight_trick:
fts = self._combine(X, T_res)
if sample_weight is not None:
if sample_var is not None:
self._model.fit(fts,
Y_res, sample_weight=sample_weight, sample_var=sample_var)
else:
self._model.fit(fts,
Y_res, sample_weight=sample_weight)
else:
self._model.fit(fts, Y_res)
self._intercept = None
intercept = self._model.predict(np.zeros_like(fts[0:1]))
if (np.count_nonzero(intercept) > 0):
warn("The final model has a nonzero intercept for at least one outcome; "
"it will be subtracted, but consider fitting a model without an intercept if possible.",
UserWarning)
self._intercept = intercept
elif not self._fit_cate_intercept:
if (np.ndim(T_res) > 1) and (self._d_t[0] > 1):
raise AttributeError("This method can only be used with single-dimensional continuous treatment "
"or binary categorical treatment.")
F = self._combine(X, np.ones(T_res.shape[0]))
self._intercept = None
T_res = T_res.ravel()
sign_T_res = np.sign(T_res)
sign_T_res[(sign_T_res < 1) & (sign_T_res > -1)] = 1
clipped_T_res = sign_T_res * np.clip(np.abs(T_res), 1e-5, np.inf)
if np.ndim(Y_res) > 1:
clipped_T_res = clipped_T_res.reshape(-1, 1)
target = Y_res / clipped_T_res
target_var = sample_var / clipped_T_res**2 if sample_var is not None else None
if sample_weight is not None:
if target_var is not None:
self._model.fit(F, target, sample_weight=sample_weight * T_res.flatten()**2,
sample_var=target_var)
else:
self._model.fit(F, target, sample_weight=sample_weight * T_res.flatten()**2)
else:
self._model.fit(F, target, sample_weight=T_res.flatten()**2)
else:
raise AttributeError("This combination is not a feasible one!")
def predict(self, X):
X2, T = broadcast_unit_treatments(X if X is not None else np.empty((1, 0)),
self._d_t[0] if self._d_t else 1)
        # This works both with or without the weighting trick, since the treatments T are unit-vector
        # treatments. And in the case of the weighting trick we also know that the treatment is single-dimensional.
prediction = self._model.predict(self._combine(None if X is None else X2, T, fitting=False))
if self._intercept is not None:
prediction -= self._intercept
return reshape_treatmentwise_effects(prediction,
self._d_t, self._d_y)
class _BaseDML(_RLearner):
    # A helper class that accesses all the internal fitted objects of a DML CATE estimator. Used by
    # both the parametric and non-parametric DML estimators.
@property
def original_featurizer(self):
return super().model_final._original_featurizer
@property
def featurizer(self):
        # NOTE This is used by the inference methods and has to be the overall featurizer;
        # intended for internal use by the library.
return super().model_final._featurizer
@property
def model_final(self):
# NOTE This is used by the inference methods and is more for internal use to the library
return super().model_final._model
@property
def model_cate(self):
"""
Get the fitted final CATE model.
Returns
-------
model_cate: object of type(model_final)
An instance of the model_final object that was fitted after calling fit which corresponds
to the constant marginal CATE model.
"""
return super().model_final._model
@property
def models_y(self):
"""
Get the fitted models for E[Y | X, W].
Returns
-------
models_y: list of objects of type(`model_y`)
A list of instances of the `model_y` object. Each element corresponds to a crossfitting
fold and is the model instance that was fitted for that training fold.
"""
return [mdl._model for mdl in super().models_y]
@property
def models_t(self):
"""
Get the fitted models for E[T | X, W].
Returns
-------
        models_t: list of objects of type(`model_t`)
            A list of instances of the `model_t` object. Each element corresponds to a crossfitting
fold and is the model instance that was fitted for that training fold.
"""
return [mdl._model for mdl in super().models_t]
def cate_feature_names(self, input_feature_names=None):
"""
Get the output feature names.
Parameters
----------
input_feature_names: list of strings of length X.shape[1] or None
The names of the input features
Returns
-------
        out_feature_names: list of strings or None
            The names of the output features :math:`\\phi(X)`, i.e. the features with respect to which the
            final constant marginal CATE model is linear. These are the names associated with each entry
            of the :meth:`coef_` parameter. Not available when the featurizer is not None and does not
            have a `get_feature_names(input_feature_names)` method; in that case an AttributeError is raised.
"""
if self.original_featurizer is None:
return input_feature_names
elif hasattr(self.original_featurizer, 'get_feature_names'):
return self.original_featurizer.get_feature_names(input_feature_names)
else:
raise AttributeError("Featurizer does not have a method: get_feature_names!")
class DML(LinearModelFinalCateEstimatorMixin, _BaseDML):
"""
The base class for parametric Double ML estimators. The estimator is a special
case of an :class:`._RLearner` estimator, which in turn is a special case
    of an :class:`._OrthoLearner` estimator, so it follows the two-stage
    process, where a set of nuisance functions are estimated in the first stage in a crossfitting
    manner and a final stage estimates the CATE model. See the documentation of
    :class:`._OrthoLearner` for a description of this two-stage process.
In this estimator, the CATE is estimated by using the following estimating equations:
.. math ::
Y - \\E[Y | X, W] = \\Theta(X) \\cdot (T - \\E[T | X, W]) + \\epsilon
    Thus if we estimate the nuisance functions :math:`q(X, W) = \\E[Y | X, W]` and
    :math:`f(X, W)=\\E[T | X, W]` in the first stage, we can estimate the final stage CATE for each
    treatment t by running a regression that minimizes the residual-on-residual square loss:
.. math ::
\\hat{\\theta} = \\arg\\min_{\\Theta}\
\\E_n\\left[ (\\tilde{Y} - \\Theta(X) \\cdot \\tilde{T})^2 \\right]
    where :math:`\\tilde{Y}=Y - \\E[Y | X, W]` and :math:`\\tilde{T}=T-\\E[T | X, W]` denote the
    residual outcome and residual treatment.
    The DML estimator further assumes a linear parametric form for the CATE, i.e. for each outcome
    :math:`i` and treatment :math:`j`:
.. math ::
\\Theta_{i, j}(X) = \\phi(X)' \\cdot \\Theta_{ij}
    for some given feature mapping :math:`\\phi(X)` (the user can provide this featurizer via the `featurizer`
    parameter at init time; it can be any class that adheres to the scikit-learn transformer
    interface :class:`~sklearn.base.TransformerMixin`).
    The nuisance function :math:`q` poses a simple regression problem and the
    :class:`.DML`
    class takes as input the parameter `model_y`, which is an arbitrary scikit-learn regressor that
    is internally used to solve this regression problem.
The problem of estimating the nuisance function :math:`f` is also a regression problem and
the :class:`.DML`
class takes as input the parameter `model_t`, which is an arbitrary scikit-learn regressor that
is internally used to solve this regression problem. If the init flag `discrete_treatment` is set
to `True`, then the parameter `model_t` is treated as a scikit-learn classifier. The input categorical
treatment is one-hot encoded (excluding the lexicographically smallest treatment which is used as the
baseline) and the `predict_proba` method of the `model_t` classifier is used to
residualize the one-hot encoded treatment.
    The final stage is a (potentially multi-task) linear regression problem, with the labels
    :math:`\\tilde{Y}` as outcomes and the composite features
    :math:`\\tilde{T}\\otimes \\phi(X) = \\mathtt{vec}(\\tilde{T}\\cdot \\phi(X)^T)` as regressors.
    The :class:`.DML` class takes as input the parameter
    ``model_final``, which is any linear scikit-learn regressor that is internally used to solve this
    (multi-task) linear regression problem.
Parameters
----------
model_y: estimator
The estimator for fitting the response to the features. Must implement
`fit` and `predict` methods. Must be a linear model for correctness when linear_first_stages is ``True``.
model_t: estimator or 'auto' (default is 'auto')
The estimator for fitting the treatment to the features.
If estimator, it must implement `fit` and `predict` methods. Must be a linear model for correctness
when linear_first_stages is ``True``;
If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV`
will be applied for discrete treatment,
and :class:`.WeightedLassoCV`/
:class:`.WeightedMultiTaskLassoCV`
will be applied for continuous treatment.
model_final: estimator
The estimator for fitting the response residuals to the treatment residuals. Must implement
`fit` and `predict` methods, and must be a linear model for correctness.
featurizer: :term:`transformer`, optional, default None
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.
fit_cate_intercept : bool, optional, default True
Whether the linear CATE model should have a constant term.
linear_first_stages: bool
Whether the first stage models are linear (in which case we will expand the features passed to
`model_y` accordingly)
discrete_treatment: bool, optional, default False
Whether the treatment values should be treated as categorical, rather than continuous, quantities
categories: 'auto' or list, default 'auto'
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
The first category will be treated as the control treatment.
n_splits: int, cross-validation generator or an iterable, optional, default 2
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- :term:`cv splitter`
- An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if the treatment is discrete
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
:class:`~sklearn.model_selection.KFold` is used
(with a random shuffle in either case).
Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.
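
    Examples
    --------
    A minimal usage sketch on synthetic data (most users should prefer a concrete
    subclass such as :class:`.LinearDML`; exact estimates will vary with the seed)::

        import numpy as np
        from sklearn.linear_model import LinearRegression
        from econml.dml import DML

        np.random.seed(123)
        X = np.random.normal(size=(1000, 3))
        T = X[:, 0] + np.random.normal(size=1000)
        y = (1 + 2 * X[:, 0]) * T + np.random.normal(size=1000)

        est = DML(model_y=LinearRegression(),
                  model_t=LinearRegression(),
                  model_final=LinearRegression(fit_intercept=False))
        est.fit(y, T, X=X)              # cross-fits the nuisances, then fits the final model
        te = est.effect(X, T0=0, T1=1)  # CATE estimates, roughly 1 + 2 * X[:, 0]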
"""
def __init__(self,
model_y, model_t, model_final,
featurizer=None,
fit_cate_intercept=True,
linear_first_stages=False,
discrete_treatment=False,
categories='auto',
n_splits=2,
random_state=None):
# TODO: consider whether we need more care around stateful featurizers,
# since we clone it and fit separate copies
if model_t == 'auto':
if discrete_treatment:
model_t = LogisticRegressionCV(cv=WeightedStratifiedKFold())
else:
model_t = WeightedLassoCVWrapper()
self.bias_part_of_coef = fit_cate_intercept
self.fit_cate_intercept = fit_cate_intercept
super().__init__(model_y=_FirstStageWrapper(model_y, True,
featurizer, linear_first_stages, discrete_treatment),
model_t=_FirstStageWrapper(model_t, False,
featurizer, linear_first_stages, discrete_treatment),
model_final=_FinalWrapper(model_final, fit_cate_intercept, featurizer, False),
discrete_treatment=discrete_treatment,
categories=categories,
n_splits=n_splits,
random_state=random_state)
    # override only so that we can update the docstring to indicate support for `LinearModelFinalInference`
@_deprecate_positional("X and W should be passed by keyword only. In a future release "
"we will disallow passing X and W by position.", ['X', 'W'])
def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference='auto'):
"""
Estimate the counterfactual model from data, i.e. estimates functions τ(·,·,·), ∂τ(·,·).
Parameters
----------
Y: (n × d_y) matrix or vector of length n
Outcomes for each sample
T: (n × dₜ) matrix or vector of length n
Treatments for each sample
X: optional (n × dₓ) matrix
Features for each sample
W: optional (n × d_w) matrix
Controls for each sample
        sample_weight: optional (n,) vector
            Weights for each row
        sample_var: optional (n,) vector
            Sample variance for each sample
        groups: (n,) vector, optional
            All rows corresponding to the same group will be kept together during splitting.
            If groups is not None, the n_splits argument passed to this class's initializer
            must support a 'groups' argument to its split method.
inference: string, :class:`.Inference` instance, or None
Method for performing inference. This estimator supports 'bootstrap'
(or an instance of :class:`.BootstrapInference`) and 'auto'
(or an instance of :class:`.LinearModelFinalInference`)
Returns
-------
self
"""
return super().fit(Y, T, X=X, W=W, sample_weight=sample_weight, sample_var=sample_var, groups=groups,
inference=inference)
class LinearDML(StatsModelsCateEstimatorMixin, DML):
"""
The Double ML Estimator with a low-dimensional linear final stage implemented as a statsmodel regression.
Parameters
----------
model_y: estimator, optional (default is :class:`.WeightedLassoCVWrapper`)
The estimator for fitting the response to the features. Must implement
`fit` and `predict` methods.
model_t: estimator or 'auto', optional (default is 'auto')
The estimator for fitting the treatment to the features.
If estimator, it must implement `fit` and `predict` methods;
If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV` will be applied for discrete treatment,
and :class:`.WeightedLassoCV`/:class:`.WeightedMultiTaskLassoCV`
will be applied for continuous treatment.
featurizer : :term:`transformer`, optional, default None
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.
fit_cate_intercept : bool, optional, default True
Whether the linear CATE model should have a constant term.
linear_first_stages: bool
Whether the first stage models are linear (in which case we will expand the features passed to
`model_y` accordingly)
discrete_treatment: bool, optional (default is ``False``)
Whether the treatment values should be treated as categorical, rather than continuous, quantities
categories: 'auto' or list, default 'auto'
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
The first category will be treated as the control treatment.
n_splits: int, cross-validation generator or an iterable, optional (Default=2)
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- :term:`cv splitter`
- An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if the treatment is discrete
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
:class:`~sklearn.model_selection.KFold` is used
(with a random shuffle in either case).
        Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
        W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.
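
    Examples
    --------
    A minimal usage sketch on synthetic data (exact estimates will vary with the seed)::

        import numpy as np
        from econml.dml import LinearDML

        np.random.seed(123)
        X = np.random.normal(size=(1000, 5))
        T = X[:, 0] + np.random.normal(size=1000)
        y = (1 + 2 * X[:, 0]) * T + X[:, 1] + np.random.normal(size=1000)

        est = LinearDML()
        est.fit(y, T, X=X)                  # 'auto' inference defaults to statsmodels here
        te = est.effect(X)                  # CATE estimate at each row of X
        lb, ub = est.effect_interval(X, alpha=0.05)   # pointwise confidence intervals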
"""
def __init__(self,
model_y=WeightedLassoCVWrapper(), model_t='auto',
featurizer=None,
fit_cate_intercept=True,
linear_first_stages=True,
discrete_treatment=False,
categories='auto',
n_splits=2,
random_state=None):
super().__init__(model_y=model_y,
model_t=model_t,
model_final=StatsModelsLinearRegression(fit_intercept=False),
featurizer=featurizer,
fit_cate_intercept=fit_cate_intercept,
linear_first_stages=linear_first_stages,
discrete_treatment=discrete_treatment,
categories=categories,
n_splits=n_splits,
random_state=random_state)
# override only so that we can update the docstring to indicate support for `StatsModelsInference`
@_deprecate_positional("X and W should be passed by keyword only. In a future release "
"we will disallow passing X and W by position.", ['X', 'W'])
def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference='auto'):
"""
Estimate the counterfactual model from data, i.e. estimates functions τ(·,·,·), ∂τ(·,·).
Parameters
----------
Y: (n × d_y) matrix or vector of length n
Outcomes for each sample
T: (n × dₜ) matrix or vector of length n
Treatments for each sample
X: optional (n × dₓ) matrix
Features for each sample
W: optional (n × d_w) matrix
Controls for each sample
sample_weight: optional (n,) vector
Weights for each row
sample_var: (n,) vector, optional
Sample variance for each sample
groups: (n,) vector, optional
All rows corresponding to the same group will be kept together during splitting.
If groups is not None, the n_splits argument passed to this class's initializer
must support a 'groups' argument to its split method.
inference: string, :class:`.Inference` instance, or None
Method for performing inference. This estimator supports 'bootstrap'
(or an instance of :class:`.BootstrapInference`) and 'statsmodels'
(or an instance of :class:`.StatsModelsInference`)
Returns
-------
self
"""
return super().fit(Y, T, X=X, W=W,
sample_weight=sample_weight, sample_var=sample_var, groups=groups,
inference=inference)
class SparseLinearDML(DebiasedLassoCateEstimatorMixin, DML):
"""
A specialized version of the Double ML estimator for the sparse linear case.
This estimator should be used when the features of heterogeneity are high-dimensional
and the coefficients of the linear CATE function are sparse.
The last stage is an instance of the
:class:`.MultiOutputDebiasedLasso`
Parameters
----------
model_y: estimator, optional (default is :class:`WeightedLassoCVWrapper()
<econml.sklearn_extensions.linear_model.WeightedLassoCVWrapper>`)
The estimator for fitting the response to the features. Must implement
`fit` and `predict` methods.
model_t: estimator or 'auto', optional (default is 'auto')
The estimator for fitting the treatment to the features.
If estimator, it must implement `fit` and `predict` methods, and must be a
linear model for correctness;
If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV`
will be applied for discrete treatment,
and :class:`.WeightedLassoCV`/
:class:`.WeightedMultiTaskLassoCV`
will be applied for continuous treatment.
    alpha: string or float, optional (default='auto')
CATE L1 regularization applied through the debiased lasso in the final model.
'auto' corresponds to a CV form of the :class:`MultiOutputDebiasedLasso`.
max_iter : int, optional, default=1000
The maximum number of iterations in the Debiased Lasso
tol : float, optional, default=1e-4
The tolerance for the optimization: if the updates are
smaller than ``tol``, the optimization code checks the
dual gap for optimality and continues until it is smaller
than ``tol``.
featurizer : :term:`transformer`, optional, default None
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.
fit_cate_intercept : bool, optional, default True
Whether the linear CATE model should have a constant term.
linear_first_stages: bool
Whether the first stage models are linear (in which case we will expand the features passed to
`model_y` accordingly)
discrete_treatment: bool, optional (default is ``False``)
Whether the treatment values should be treated as categorical, rather than continuous, quantities
categories: 'auto' or list, default 'auto'
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
The first category will be treated as the control treatment.
n_splits: int, cross-validation generator or an iterable, optional (Default=2)
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- :term:`cv splitter`
- An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if the treatment is discrete
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
:class:`~sklearn.model_selection.KFold` is used
(with a random shuffle in either case).
        Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
        W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.
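
    Examples
    --------
    A minimal usage sketch with a high-dimensional but sparse CATE (synthetic data)::

        import numpy as np
        from econml.dml import SparseLinearDML

        np.random.seed(123)
        X = np.random.normal(size=(500, 30))       # many candidate features...
        T = X[:, 0] + np.random.normal(size=500)
        y = (1 + 2 * X[:, 0]) * T + np.random.normal(size=500)   # ...but the effect depends on one

        est = SparseLinearDML()
        est.fit(y, T, X=X)
        te = est.effect(X)
        lb, ub = est.effect_interval(X, alpha=0.05)   # debiased-lasso intervals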
"""
def __init__(self,
model_y=WeightedLassoCVWrapper(), model_t='auto',
alpha='auto',
max_iter=1000,
tol=1e-4,
featurizer=None,
fit_cate_intercept=True,
linear_first_stages=True,
discrete_treatment=False,
categories='auto',
n_splits=2,
random_state=None):
model_final = MultiOutputDebiasedLasso(
alpha=alpha,
fit_intercept=False,
max_iter=max_iter,
tol=tol)
super().__init__(model_y=model_y,
model_t=model_t,
model_final=model_final,
featurizer=featurizer,
fit_cate_intercept=fit_cate_intercept,
linear_first_stages=linear_first_stages,
discrete_treatment=discrete_treatment,
categories=categories,
n_splits=n_splits,
random_state=random_state)
@_deprecate_positional("X and W should be passed by keyword only. In a future release "
"we will disallow passing X and W by position.", ['X', 'W'])
def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference='auto'):
"""
Estimate the counterfactual model from data, i.e. estimates functions τ(·,·,·), ∂τ(·,·).
Parameters
----------
Y: (n × d_y) matrix or vector of length n
Outcomes for each sample
T: (n × dₜ) matrix or vector of length n
Treatments for each sample
X: optional (n × dₓ) matrix
Features for each sample
W: optional (n × d_w) matrix
Controls for each sample
sample_weight: optional (n,) vector
Weights for each row
        sample_var: optional (n,) vector or (n, d_y) matrix
            Variance of each sample, in case each row corresponds to a summary of many samples. Currently
            not in use by this method but will be supported in a future release.
groups: (n,) vector, optional
All rows corresponding to the same group will be kept together during splitting.
If groups is not None, the n_splits argument passed to this class's initializer
must support a 'groups' argument to its split method.
inference: string, `Inference` instance, or None
Method for performing inference. This estimator supports 'bootstrap'
(or an instance of :class:`.BootstrapInference`) and 'debiasedlasso'
(or an instance of :class:`.LinearModelFinalInference`)
Returns
-------
self
"""
# TODO: support sample_var
if sample_var is not None and inference is not None:
warn("This estimator does not yet support sample variances and inference does not take "
"sample variances into account. This feature will be supported in a future release.")
Y, T, X, W, sample_weight, sample_var = check_input_arrays(Y, T, X, W, sample_weight, sample_var)
check_high_dimensional(X, T, threshold=5, featurizer=self.featurizer,
discrete_treatment=self._discrete_treatment,
msg="The number of features in the final model (< 5) is too small for a sparse model. "
"We recommend using the LinearDML estimator for this low-dimensional setting.")
return super().fit(Y, T, X=X, W=W,
sample_weight=sample_weight, sample_var=None, groups=groups,
inference=inference)
class _RandomFeatures(TransformerMixin):
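    # Random Fourier feature map approximating an RBF kernel with bandwidth `bw`
    # (Rahimi & Recht, 2007): x is mapped to sqrt(2/dim) * cos(x @ omega + b) with
    # omega ~ N(0, (1/bw)**2 * I) and b ~ Uniform[0, 2*pi], so that inner products
    # of the transformed features approximate the kernel.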
def __init__(self, dim, bw, random_state):
self._dim = dim
self._bw = bw
self._random_state = check_random_state(random_state)
def fit(self, X):
self.omegas = self._random_state.normal(0, 1 / self._bw, size=(shape(X)[1], self._dim))
self.biases = self._random_state.uniform(0, 2 * np.pi, size=(1, self._dim))
return self
def transform(self, X):
return np.sqrt(2 / self._dim) * np.cos(np.matmul(X, self.omegas) + self.biases)
class KernelDML(DML):
"""
    A specialized version of the linear Double ML estimator that uses random Fourier features.
Parameters
----------
    model_y: estimator, optional (default is :class:`WeightedLassoCVWrapper()
        <econml.sklearn_extensions.linear_model.WeightedLassoCVWrapper>`)
The estimator for fitting the response to the features. Must implement
`fit` and `predict` methods.
model_t: estimator or 'auto', optional (default is 'auto')
The estimator for fitting the treatment to the features.
If estimator, it must implement `fit` and `predict` methods;
If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV`
will be applied for discrete treatment,
and :class:`.WeightedLassoCV`/
:class:`.WeightedMultiTaskLassoCV`
will be applied for continuous treatment.
fit_cate_intercept : bool, optional, default True
Whether the linear CATE model should have a constant term.
dim: int, optional (default is 20)
The number of random Fourier features to generate
bw: float, optional (default is 1.0)
The bandwidth of the Gaussian used to generate features
discrete_treatment: bool, optional (default is ``False``)
Whether the treatment values should be treated as categorical, rather than continuous, quantities
categories: 'auto' or list, default 'auto'
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
The first category will be treated as the control treatment.
n_splits: int, cross-validation generator or an iterable, optional (Default=2)
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- :term:`cv splitter`
- An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if the treatment is discrete
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
:class:`~sklearn.model_selection.KFold` is used
(with a random shuffle in either case).
        Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
        W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.
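
    Examples
    --------
    A minimal usage sketch with a nonlinear effect surface (synthetic data)::

        import numpy as np
        from econml.dml import KernelDML

        np.random.seed(123)
        X = np.random.uniform(-1, 1, size=(1000, 2))
        T = np.random.normal(size=1000)
        y = np.sin(3 * X[:, 0]) * T + np.random.normal(scale=0.1, size=1000)

        est = KernelDML(dim=50, bw=1.0)
        est.fit(y, T, X=X, inference=None)   # point estimates only in this sketch
        te = est.effect(X)                   # approximates sin(3 * X[:, 0])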
"""
def __init__(self, model_y=WeightedLassoCVWrapper(), model_t='auto', fit_cate_intercept=True,
dim=20, bw=1.0, discrete_treatment=False, categories='auto', n_splits=2, random_state=None):
super().__init__(model_y=model_y, model_t=model_t,
model_final=ElasticNetCV(fit_intercept=False),
featurizer=_RandomFeatures(dim, bw, random_state),
fit_cate_intercept=fit_cate_intercept,
discrete_treatment=discrete_treatment,
categories=categories,
n_splits=n_splits, random_state=random_state)
class NonParamDML(_BaseDML):
"""
    The base class for non-parametric Double ML estimators that can have arbitrary final ML models for the CATE.
    Works only for single-dimensional continuous treatment or for binary categorical treatment, and uses
    the re-weighting trick, reducing the final CATE estimation to a weighted square loss minimization.
The model_final parameter must support the sample_weight keyword argument at fit time.
Parameters
----------
    model_y: estimator
        The estimator for fitting the response to the features. Must implement
        `fit` and `predict` methods.
    model_t: estimator
        The estimator for fitting the treatment to the features. Must implement
        `fit` and `predict` methods.
model_final: estimator
The estimator for fitting the response residuals to the treatment residuals. Must implement
`fit` and `predict` methods. It can be an arbitrary scikit-learn regressor. The `fit` method
must accept `sample_weight` as a keyword argument.
    featurizer: :term:`transformer`, optional, default None
        The transformer used to featurize the raw features when fitting the final model. Must implement
        a `fit_transform` method. If None, the final model is fit on the raw features.
discrete_treatment: bool, optional (default is ``False``)
Whether the treatment values should be treated as categorical, rather than continuous, quantities
categories: 'auto' or list, default 'auto'
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
The first category will be treated as the control treatment.
n_splits: int, cross-validation generator or an iterable, optional (Default=2)
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- :term:`cv splitter`
- An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if the treatment is discrete
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
:class:`~sklearn.model_selection.KFold` is used
(with a random shuffle in either case).
Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.
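
    Examples
    --------
    A minimal usage sketch with fully flexible first and final stages (synthetic data;
    any regressor whose `fit` accepts `sample_weight` can serve as `model_final`)::

        import numpy as np
        from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
        from econml.dml import NonParamDML

        np.random.seed(123)
        X = np.random.uniform(-1, 1, size=(2000, 3))
        T = X[:, 0] + np.random.normal(size=2000)
        y = (X[:, 0] > 0) * T + np.random.normal(scale=0.1, size=2000)

        est = NonParamDML(model_y=RandomForestRegressor(),
                          model_t=RandomForestRegressor(),
                          model_final=GradientBoostingRegressor())
        est.fit(y, T, X=X, inference=None)
        te = est.effect(X)                  # approximates the step function 1{X[:, 0] > 0}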
"""
def __init__(self,
model_y, model_t, model_final,
featurizer=None,
discrete_treatment=False,
categories='auto',
n_splits=2,
random_state=None):
# TODO: consider whether we need more care around stateful featurizers,
# since we clone it and fit separate copies
super().__init__(model_y=_FirstStageWrapper(model_y, True,
featurizer, False, discrete_treatment),
model_t=_FirstStageWrapper(model_t, False,
featurizer, False, discrete_treatment),
model_final=_FinalWrapper(model_final, False, featurizer, True),
discrete_treatment=discrete_treatment,
categories=categories,
n_splits=n_splits,
random_state=random_state)
class ForestDML(ForestModelFinalCateEstimatorMixin, NonParamDML):
""" Instance of NonParamDML with a
:class:`~econml.sklearn_extensions.ensemble.SubsampledHonestForest`
as a final model, so as to enable non-parametric inference.
Parameters
----------
    model_y: estimator
        The estimator for fitting the response to the features. Must implement
        `fit` and `predict` methods.
    model_t: estimator
        The estimator for fitting the treatment to the features. Must implement
        `fit` and `predict` methods.
discrete_treatment: bool, optional (default is ``False``)
Whether the treatment values should be treated as categorical, rather than continuous, quantities
categories: 'auto' or list, default 'auto'
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
The first category will be treated as the control treatment.
n_crossfit_splits: int, cross-validation generator or an iterable, optional (Default=2)
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- :term:`cv splitter`
- An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs, if the treatment is discrete
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
:class:`~sklearn.model_selection.KFold` is used
(with a random shuffle in either case).
Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
n_estimators : integer, optional (default=100)
        The total number of trees in the forest. The forest consists of
        sqrt(n_estimators) sub-forests, where each sub-forest
        contains sqrt(n_estimators) trees.
criterion : string, optional (default="mse")
The function to measure the quality of a split. Supported criteria
are "mse" for the mean squared error, which is equal to variance
reduction as feature selection criterion, and "mae" for the mean
absolute error.
max_depth : integer or None, optional (default=None)
The maximum depth of the tree. If None, then nodes are expanded until
all leaves are pure or until all leaves contain less than
min_samples_split samples.
min_samples_split : int, float, optional (default=2)
The minimum number of splitting samples required to split an internal node.
- If int, then consider `min_samples_split` as the minimum number.
- If float, then `min_samples_split` is a fraction and
`ceil(min_samples_split * n_samples)` are the minimum
number of samples for each split.
min_samples_leaf : int, float, optional (default=1)
The minimum number of samples required to be at a leaf node.
A split point at any depth will only be considered if it leaves at
least ``min_samples_leaf`` splitting samples in each of the left and
right branches. This may have the effect of smoothing the model,
especially in regression. After construction the tree is also pruned
so that there are at least min_samples_leaf estimation samples on
each leaf.
- If int, then consider `min_samples_leaf` as the minimum number.
- If float, then `min_samples_leaf` is a fraction and
`ceil(min_samples_leaf * n_samples)` are the minimum
number of samples for each node.
min_weight_fraction_leaf : float, optional (default=0.)
The minimum weighted fraction of the sum total of weights (of all
splitting samples) required to be at a leaf node. Samples have
equal weight when sample_weight is not provided. After construction
the tree is pruned so that the fraction of the sum total weight
of the estimation samples contained in each leaf node is at
least min_weight_fraction_leaf
max_features : int, float, string or None, optional (default="auto")
The number of features to consider when looking for the best split:
- If int, then consider `max_features` features at each split.
- If float, then `max_features` is a fraction and
`int(max_features * n_features)` features are considered at each
split.
- If "auto", then `max_features=n_features`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.
Note: the search for a split does not stop until at least one
valid partition of the node samples is found, even if it requires to
effectively inspect more than ``max_features`` features.
max_leaf_nodes : int or None, optional (default=None)
Grow trees with ``max_leaf_nodes`` in best-first fashion.
Best nodes are defined as relative reduction in impurity.
If None then unlimited number of leaf nodes.
min_impurity_decrease : float, optional (default=0.)
A node will be split if this split induces a decrease of the impurity
greater than or equal to this value.
The weighted impurity decrease equation is the following::
N_t / N * (impurity - N_t_R / N_t * right_impurity
- N_t_L / N_t * left_impurity)
where ``N`` is the total number of split samples, ``N_t`` is the number of
split samples at the current node, ``N_t_L`` is the number of split samples in the
left child, and ``N_t_R`` is the number of split samples in the right child.
``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
if ``sample_weight`` is passed.
subsample_fr : float or 'auto', optional (default='auto')
        The fraction of the half-samples that are used on each tree. Each tree
        will be built on subsample_fr * n_samples/2 samples.
        If 'auto', then the subsampling fraction is set to::

            (n_samples/2)**(1-1/(2*n_features+2))/(n_samples/2)

        which is sufficient to guarantee asymptotically valid inference.
honest : boolean, optional (default=True)
        Whether to use honest trees, i.e. half of the samples are used for
        creating the tree structure and the other half for the estimation at
        the leaves. If False, then all samples are used for both parts.
n_jobs : int or None, optional (default=None)
The number of jobs to run in parallel for both `fit` and `predict`.
``None`` means 1 unless in a :func:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
verbose : int, optional (default=0)
Controls the verbosity when fitting and predicting.
random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.
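
    Examples
    --------
    A minimal usage sketch with bootstrap-of-little-bags inference (synthetic data)::

        import numpy as np
        from sklearn.ensemble import RandomForestRegressor
        from econml.dml import ForestDML

        np.random.seed(123)
        X = np.random.uniform(-1, 1, size=(4000, 4))
        T = X[:, 0] + np.random.normal(size=4000)
        y = (X[:, 0] > 0) * T + np.random.normal(scale=0.1, size=4000)

        est = ForestDML(model_y=RandomForestRegressor(),
                        model_t=RandomForestRegressor(),
                        n_estimators=400)
        est.fit(y, T, X=X, inference='blb')
        te = est.effect(X)
        lb, ub = est.effect_interval(X, alpha=0.05)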
"""
def __init__(self,
model_y, model_t,
discrete_treatment=False,
categories='auto',
n_crossfit_splits=2,
n_estimators=100,
criterion="mse",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.,
max_features="auto",
max_leaf_nodes=None,
min_impurity_decrease=0.,
subsample_fr='auto',
honest=True,
n_jobs=None,
verbose=0,
random_state=None):
model_final = SubsampledHonestForest(n_estimators=n_estimators,
criterion=criterion,
max_depth=max_depth,
min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf,
min_weight_fraction_leaf=min_weight_fraction_leaf,
max_features=max_features,
max_leaf_nodes=max_leaf_nodes,
min_impurity_decrease=min_impurity_decrease,
subsample_fr=subsample_fr,
honest=honest,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose)
super().__init__(model_y=model_y, model_t=model_t,
model_final=model_final, featurizer=None,
discrete_treatment=discrete_treatment,
categories=categories,
n_splits=n_crossfit_splits, random_state=random_state)
@_deprecate_positional("X and W should be passed by keyword only. In a future release "
"we will disallow passing X and W by position.", ['X', 'W'])
def fit(self, Y, T, X=None, W=None, *, sample_weight=None, sample_var=None, groups=None, inference='auto'):
"""
Estimate the counterfactual model from data, i.e. estimates functions τ(·,·,·), ∂τ(·,·).
Parameters
----------
Y: (n × d_y) matrix or vector of length n
Outcomes for each sample
T: (n × dₜ) matrix or vector of length n
Treatments for each sample
X: optional (n × dₓ) matrix
Features for each sample
W: optional (n × d_w) matrix
Controls for each sample
sample_weight: optional (n,) vector
Weights for each row
        sample_var: optional (n,) vector or (n, d_y) matrix
            Variance of each sample, in case each row corresponds to a summary of many samples. Currently
            not in use by this method (as the inference method does not require sample variance info).
groups: (n,) vector, optional
All rows corresponding to the same group will be kept together during splitting.
If groups is not None, the n_splits argument passed to this class's initializer
must support a 'groups' argument to its split method.
inference: string, `Inference` instance, or None
Method for performing inference. This estimator supports 'bootstrap'
(or an instance of :class:`.BootstrapInference`) and 'blb'
(for Bootstrap-of-Little-Bags based inference)
Returns
-------
self
"""
return super().fit(Y, T, X=X, W=W,
sample_weight=sample_weight, sample_var=None, groups=groups,
inference=inference)
@deprecated("The DMLCateEstimator class has been renamed to DML; "
"an upcoming release will remove support for the old name")
class DMLCateEstimator(DML):
pass
@deprecated("The LinearDMLCateEstimator class has been renamed to LinearDML; "
"an upcoming release will remove support for the old name")
class LinearDMLCateEstimator(LinearDML):
pass
@deprecated("The SparseLinearDMLCateEstimator class has been renamed to SparseLinearDML; "
"an upcoming release will remove support for the old name")
class SparseLinearDMLCateEstimator(SparseLinearDML):
pass
@deprecated("The KernelDMLCateEstimator class has been renamed to KernelDML; "
"an upcoming release will remove support for the old name")
class KernelDMLCateEstimator(KernelDML):
pass
@deprecated("The NonParamDMLCateEstimator class has been renamed to NonParamDML; "
"an upcoming release will remove support for the old name")
class NonParamDMLCateEstimator(NonParamDML):
pass
@deprecated("The ForestDMLCateEstimator class has been renamed to ForestDML; "
"an upcoming release will remove support for the old name")
class ForestDMLCateEstimator(ForestDML):
pass