Enable calling dowhy from econml (#400)

* Add support for calling dowhy through econml

* Fix a shap bug when parsing const_marginal_effect
Maggie Hei 2021-02-11 11:34:37 -05:00 committed by GitHub
Parent 3e66b9507b
Commit f3e46f4097
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
20 changed files: 1361 additions and 954 deletions


@@ -91,6 +91,10 @@ jobs:
- template: azure-pipelines-steps.yml
parameters:
body:
# Work around https://github.com/pypa/pip/issues/9542
- script: 'pip install -U numpy'
displayName: 'Upgrade numpy'
- script: 'python setup.py pytest'
displayName: 'Unit tests'
env:


@@ -213,7 +213,9 @@ intersphinx_mapping = {'python': ('https://docs.python.org/3', None),
'numpy': ('https://docs.scipy.org/doc/numpy/', None),
'sklearn': ('https://scikit-learn.org/stable/', None),
'matplotlib': ('https://matplotlib.org/', None),
'shap': ('https://shap.readthedocs.io/en/stable/', None)}
'shap': ('https://shap.readthedocs.io/en/stable/', None),
'dowhy': ('https://microsoft.github.io/dowhy/', None)}
# -- Options for todo extension ----------------------------------------------


@@ -219,6 +219,17 @@ Inference Methods
econml.inference.StatsModelsInferenceDiscrete
.. _dowhy_api:
Integration with DoWhy
----------------------
.. autosummary::
:toctree: _autosummary
econml.dowhy.DoWhyWrapper
.. _utilities_api:
Utilities


@@ -7,4 +7,4 @@ __all__ = ['automated_ml', 'bootstrap',
'inference', 'iv',
'metalearners', 'ortho_forest', 'orf', 'ortho_iv',
'score', 'sklearn_extensions', 'tree',
'two_stage_least_squares', 'utilities']
'two_stage_least_squares', 'utilities', 'dowhy']


@@ -15,6 +15,7 @@ from .inference import StatsModelsInference, StatsModelsInferenceDiscrete, Linea
LinearModelFinalInferenceDiscrete, NormalInferenceResults, GenericSingleTreatmentModelFinalInference,\
GenericModelFinalInferenceDiscrete
from ._shap import _shap_explain_cme, _shap_explain_joint_linear_model_cate
from .dowhy import DoWhyWrapper
class BaseCateEstimator(metaclass=abc.ABCMeta):
@@ -446,6 +447,18 @@ class BaseCateEstimator(metaclass=abc.ABCMeta):
"""
pass
@property
def dowhy(self):
""" Get an instance of :class:`.DoWhyWrapper` to allow other functionalities from dowhy package.
(e.g. causal graph, refutation test, etc.)
Returns
-------
DoWhyWrapper: instance
An instance of :class:`.DoWhyWrapper`
"""
return DoWhyWrapper(self)
class LinearCateEstimator(BaseCateEstimator):
"""Base class for all CATE estimators with linear treatment effects in this package."""


@@ -84,7 +84,7 @@ def _shap_explain_cme(cme_model, X, d_t, d_y,
return shap_outs
def _shap_explain_model_cate(cme_model, models, X, d_t, d_y, feature_names=None,
def _shap_explain_model_cate(cme_model, models, X, d_t, d_y, featurizer=None, feature_names=None,
treatment_names=None, output_names=None,
input_names=None, background_samples=100):
"""
@@ -100,11 +100,13 @@ def _shap_explain_model_cate(cme_model, models, X, d_t, d_y, feature_names=None,
models: a single estimator or a list of estimators with one estimator per treatment
models for the model's final stage model.
X: (m, d_x) matrix
Features for each sample. Should be in the same shape of fitted X in final stage.
Features for each sample.
d_t: tuple of int
Tuple of number of treatments (excluding control in the discrete treatment scenario).
d_y: tuple of int
Tuple of number of outcomes.
featurizer: optional None or instance of featurizer
Fitted featurizer of feature X.
feature_names: optional None or list of strings of length X.shape[1] (Default=None)
The names of input features.
treatment_names: optional None or list (Default=None)
@@ -129,12 +131,16 @@ def _shap_explain_model_cate(cme_model, models, X, d_t, d_y, feature_names=None,
output_names_, input_names_ = output_names, input_names
(dt, dy, treatment_names, output_names, feature_names) = _define_names(d_t, d_y, treatment_names, output_names,
feature_names, input_names)
if featurizer is not None:
F = featurizer.transform(X)
else:
F = X
if not isinstance(models, list):
models = [models]
assert len(models) == dt, "Number of final stage models doesn't equal the number of treatments!"
# define masker by using entire dataset, otherwise Explainer will only sample 100 obs by default.
bg_samples = X.shape[0] if background_samples is None else min(background_samples, X.shape[0])
background = shap.maskers.Independent(X, max_samples=bg_samples)
bg_samples = F.shape[0] if background_samples is None else min(background_samples, F.shape[0])
background = shap.maskers.Independent(F, max_samples=bg_samples)
shap_outs = defaultdict(dict)
for i in range(dt):
@@ -144,12 +150,12 @@ def _shap_explain_model_cate(cme_model, models, X, d_t, d_y, feature_names=None,
except Exception as e:
print("Final model can't be parsed, explain const_marginal_effect() instead!", repr(e))
return _shap_explain_cme(cme_model, X, d_t_, d_y_,
feature_names=feature_names_,
feature_names=None,
treatment_names=treatment_names_,
output_names=output_names_,
input_names=input_names_,
background_samples=background_samples)
shap_out = explainer(X)
shap_out = explainer(F)
if dy > 1:
for j in range(dy):
base_values = shap_out.base_values[..., j]
@@ -243,7 +249,8 @@ def _shap_explain_joint_linear_model_cate(model_final, X, d_t, d_y, fit_cate_int
return shap_outs
def _shap_explain_multitask_model_cate(cme_model, multitask_model_cate, X, d_t, d_y, feature_names=None,
def _shap_explain_multitask_model_cate(cme_model, multitask_model_cate, X, d_t, d_y, featurizer=None,
feature_names=None,
treatment_names=None, output_names=None,
input_names=None, background_samples=100):
"""
@@ -259,11 +266,13 @@ def _shap_explain_multitask_model_cate(cme_model, multitask_model_cate, X, d_t,
the model's final stage model whose predict represents the const_marginal_effect for
all treatments (or list of models, one for each outcome)
X: (m, d_x) matrix
Features for each sample. Should be in the same shape of fitted X in final stage.
Features for each sample.
d_t: tuple of int
Tuple of number of treatments (excluding control in the discrete treatment scenario).
d_y: tuple of int
Tuple of number of outcomes.
featurizer: optional None or instance of featurizer
Fitted featurizer of feature X.
feature_names: optional None or list of strings of length X.shape[1] (Default=None)
The names of input features.
treatment_names: optional None or list (Default=None)
@@ -288,12 +297,16 @@ def _shap_explain_multitask_model_cate(cme_model, multitask_model_cate, X, d_t,
output_names_, input_names_ = output_names, input_names
(dt, dy, treatment_names, output_names, feature_names) = _define_names(d_t, d_y, treatment_names, output_names,
feature_names, input_names)
if featurizer is not None:
F = featurizer.transform(X)
else:
F = X
if dy == 1 and (not isinstance(multitask_model_cate, list)):
multitask_model_cate = [multitask_model_cate]
# define masker by using entire dataset, otherwise Explainer will only sample 100 obs by default.
bg_samples = X.shape[0] if background_samples is None else min(background_samples, X.shape[0])
background = shap.maskers.Independent(X, max_samples=bg_samples)
bg_samples = F.shape[0] if background_samples is None else min(background_samples, F.shape[0])
background = shap.maskers.Independent(F, max_samples=bg_samples)
shap_outs = defaultdict(dict)
for j in range(dy):
try:
@@ -302,13 +315,13 @@ def _shap_explain_multitask_model_cate(cme_model, multitask_model_cate, X, d_t,
except Exception as e:
print("Final model can't be parsed, explain const_marginal_effect() instead!", repr(e))
return _shap_explain_cme(cme_model, X, d_t_, d_y_,
feature_names=feature_names_,
feature_names=None,
treatment_names=treatment_names_,
output_names=output_names_,
input_names=input_names_,
background_samples=background_samples)
shap_out = explainer(X)
shap_out = explainer(F)
if dt > 1:
for i in range(dt):
base_values = shap_out.base_values[..., i]
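The featurizer handling added above follows one pattern in both helpers: transform X once, then build the masker and the explanations from the featurized matrix. A runnable sketch of that pattern with stand-in models (the featurizer and regression model here are arbitrary; assumes shap ~= 0.38, as pinned in setup.cfg):

import numpy as np
import shap
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

X = np.random.normal(size=(200, 3))
featurizer = PolynomialFeatures(degree=2, include_bias=False).fit(X)
model = LinearRegression().fit(featurizer.transform(X), np.random.normal(size=200))
background_samples = 100

F = featurizer.transform(X) if featurizer is not None else X
# cap the background data at background_samples rows; otherwise Explainer
# silently subsamples to 100 observations by default
bg_samples = F.shape[0] if background_samples is None else min(background_samples, F.shape[0])
background = shap.maskers.Independent(F, max_samples=bg_samples)
explainer = shap.Explainer(model, background)
shap_out = explainer(F)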


@@ -555,13 +555,11 @@ class CausalForestDML(_BaseDML):
return imps.reshape(self._d_y + (-1,))
def shap_values(self, X, *, feature_names=None, treatment_names=None, output_names=None, background_samples=100):
if self.featurizer_ is not None:
F = self.featurizer_.transform(X)
else:
F = X
feature_names = self.cate_feature_names(feature_names)
return _shap_explain_multitask_model_cate(self.const_marginal_effect, self.model_cate.estimators_, F,
self._d_t, self._d_y, feature_names=feature_names,
return _shap_explain_multitask_model_cate(self.const_marginal_effect, self.model_cate.estimators_, X,
self._d_t, self._d_y, featurizer=self.featurizer_,
feature_names=feature_names,
treatment_names=treatment_names,
output_names=output_names,
input_names=self._input_names,


@@ -1177,13 +1177,10 @@ class NonParamDML(_BaseDML):
refit_final.__doc__ = _OrthoLearner.refit_final.__doc__
def shap_values(self, X, *, feature_names=None, treatment_names=None, output_names=None, background_samples=100):
if self.featurizer_ is not None:
F = self.featurizer_.transform(X)
else:
F = X
feature_names = self.cate_feature_names(feature_names)
return _shap_explain_model_cate(self.const_marginal_effect, self.model_cate, F, self._d_t, self._d_y,
return _shap_explain_model_cate(self.const_marginal_effect, self.model_cate, X, self._d_t, self._d_y,
featurizer=self.featurizer_,
feature_names=feature_names,
treatment_names=treatment_names,
output_names=output_names,

econml/dowhy.py (new file, 232 lines)

@@ -0,0 +1,232 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""Helper class to allow other functionalities from dowhy package.
References
----------
DoWhy, https://microsoft.github.io/dowhy/
"""
import inspect
import pandas as pd
import numpy as np
import warnings
from dowhy import CausalModel
from econml.utilities import check_input_arrays, reshape_arrays_2dim
class DoWhyWrapper:
"""
A wrapper class that allows the user to call other methods from the dowhy package through EconML
(e.g. causal graph, refutation tests, etc.).
Parameters
----------
cate_estimator: instance
An instance of any CATE estimator we currently support
"""
def __init__(self, cate_estimator):
self._cate_estimator = cate_estimator
def _get_params(self):
init = self._cate_estimator.__init__
# introspect the constructor arguments to find the model parameters
# to represent
init_signature = inspect.signature(init)
parameters = init_signature.parameters.values()
for p in parameters:
if p.kind == p.VAR_POSITIONAL or p.kind == p.VAR_KEYWORD:
raise RuntimeError("cate estimators should always specify their parameters in the signature "
"of their __init__ (no varargs, no varkwargs). "
f"{self._cate_estimator} with constructor {init_signature} doesn't "
"follow this convention.")
# Extract and sort argument names excluding 'self'
return sorted([p.name for p in parameters])
def fit(self, Y, T, X=None, W=None, Z=None, *, outcome_names=None, treatment_names=None, feature_names=None,
confounder_names=None, instrument_names=None, graph=None, estimand_type="nonparametric-ate",
proceed_when_unidentifiable=True, missing_nodes_as_confounders=False,
control_value=0, treatment_value=1, target_units="ate", **kwargs):
"""
Estimate the counterfactual model from data through the dowhy package.
Parameters
----------
Y: vector of length n
Outcomes for each sample
T: vector of length n
Treatments for each sample
X: optional (n, d_x) matrix (Default=None)
Features for each sample
W: optional (n, d_w) matrix (Default=None)
Controls for each sample
Z: optional (n, d_z) matrix (Default=None)
Instruments for each sample
outcome_names: optional list (Default=None)
Name of the outcome
treatment_names: optional list (Default=None)
Name of the treatment
feature_names: optional list (Default=None)
Name of the features
confounder_names: optional list (Default=None)
Name of the confounders
instrument_names: optional list (Default=None)
Name of the instruments
graph: optional
Path to DOT file containing a DAG or a string containing a DAG specification in DOT format
estimand_type: optional string
Type of estimand requested (currently only "nonparametric-ate" is supported).
In the future, may support other specific parametric forms of identification
proceed_when_unidentifiable: optional bool (Default=True)
Whether the identification should proceed by ignoring potential unobserved confounders
missing_nodes_as_confounders: optional bool (Default=False)
Whether variables in the dataframe that are not included in the causal graph should be automatically
included as confounder nodes
control_value: optional scalar (Default=0)
Value of the treatment in the control group, for effect estimation
treatment_value: optional scalar (Default=1)
Value of the treatment in the treated group, for effect estimation
target_units: optional (Default="ate")
The units for which the treatment effect should be estimated.
This can be of three types:
1. A string for common specifications of target units (namely, "ate", "att" and "atc"),
2. A lambda function that can be used as an index for the data (pandas DataFrame),
3. A new DataFrame that contains values of the effect_modifiers and effect will be estimated
only for this new data
kwargs: optional
Other keyword arguments passed to the fit method of the CATE estimator
Returns
-------
self
"""
Y, T, X, W, Z = check_input_arrays(Y, T, X, W, Z)
# create dataframe
n_obs = Y.shape[0]
Y, T, X, W, Z = reshape_arrays_2dim(n_obs, Y, T, X, W, Z)
# currently dowhy only supports a single outcome and a single treatment
assert Y.shape[1] == 1, "Can only accept single dimensional outcome."
assert T.shape[1] == 1, "Can only accept single dimensional treatment."
# column names
if outcome_names is None:
outcome_names = [f"Y{i}" for i in range(Y.shape[1])]
if treatment_names is None:
treatment_names = [f"T{i}" for i in range(T.shape[1])]
if feature_names is None:
feature_names = [f"X{i}" for i in range(X.shape[1])]
if confounder_names is None:
confounder_names = [f"W{i}" for i in range(W.shape[1])]
if instrument_names is None:
instrument_names = [f"Z{i}" for i in range(Z.shape[1])]
column_names = outcome_names + treatment_names + feature_names + confounder_names + instrument_names
df = pd.DataFrame(np.hstack((Y, T, X, W, Z)), columns=column_names)
self.dowhy_ = CausalModel(
data=df,
treatment=treatment_names,
outcome=outcome_names,
graph=graph,
common_causes=feature_names + confounder_names if X.shape[1] > 0 or W.shape[1] > 0 else None,
instruments=instrument_names if Z.shape[1] > 0 else None,
effect_modifiers=feature_names if X.shape[1] > 0 else None,
estimand_type=estimand_type,
proceed_when_unidentifiable=proceed_when_unidentifiable,
missing_nodes_as_confounders=missing_nodes_as_confounders
)
self.identified_estimand_ = self.dowhy_.identify_effect(proceed_when_unidentifiable=proceed_when_unidentifiable)
method_name = "backdoor." + self._cate_estimator.__module__ + "." + self._cate_estimator.__class__.__name__
init_params = {}
for p in self._get_params():
init_params[p] = getattr(self._cate_estimator, p)
self.estimate_ = self.dowhy_.estimate_effect(self.identified_estimand_,
method_name=method_name,
control_value=control_value,
treatment_value=treatment_value,
target_units=target_units,
method_params={
"init_params": init_params,
"fit_params": kwargs,
},
)
return self
def refute_estimate(self, *, method_name, **kwargs):
"""
Refute an estimated causal effect.
If method_name is provided, the specified method is used. In the future, we may support automatic
selection of suitable refutation tests.
The following refutation methods are supported:
- Adding a randomly-generated confounder: "random_common_cause"
- Adding a confounder that is associated with both treatment and outcome: "add_unobserved_common_cause"
- Replacing the treatment with a placebo (random) variable: "placebo_treatment_refuter"
- Removing a random subset of the data: "data_subset_refuter"
For more details, see the docs for :mod:`dowhy.causal_refuters`
Parameters
----------
method_name: string
Name of the refutation method
kwargs: optional
Additional arguments that are passed directly to the refutation method.
Can specify a random seed here to ensure reproducible results ('random_seed' parameter).
For method-specific parameters, consult the documentation for the specific method.
All refutation methods are in the causal_refuters subpackage.
Returns
-------
RefuteResult: an instance of the RefuteResult class
"""
return self.dowhy_.refute_estimate(
self.identified_estimand_, self.estimate_, method_name=method_name, **kwargs
)
# We don't allow the user to call refit_final from this class, since internally the dowhy effect estimate would
# only update the cate estimator but not the effect.
def refit_final(self, inference=None):
raise AttributeError(
"Method refit_final is not allowed through a dowhy object; please perform a full fit instead.")
def __getattr__(self, attr):
# don't proxy special methods
if attr.startswith('__'):
raise AttributeError(attr)
elif attr in ['_cate_estimator', 'dowhy_',
'identified_estimand_', 'estimate_']:
return super().__getattr__(attr)
elif attr.startswith('dowhy__'):
return getattr(self.dowhy_, attr[len('dowhy__'):])
elif hasattr(self.estimate_._estimator_object, attr):
if hasattr(self.dowhy_, attr):
warnings.warn("This call is ambiguous, "
"we're defaulting to CATE estimator's attribute. "
"Please add 'dowhy__' as prefix if you want to get dowhy attribute.", UserWarning)
return getattr(self.estimate_._estimator_object, attr)
else:
return getattr(self.dowhy_, attr)
def __setattr__(self, attr, value):
if attr in ['_cate_estimator', 'dowhy_',
'identified_estimand_', 'estimate_']:
super().__setattr__(attr, value)
elif attr.startswith('dowhy__'):
setattr(self.dowhy_, attr[len('dowhy__'):], value)
elif hasattr(self.estimate_._estimator_object, attr):
if hasattr(self.dowhy_, attr):
warnings.warn("This call is ambiguous, "
"we're defaulting to CATE estimator's attribute. "
"Please add 'dowhy__' as prefix if you want to set dowhy attribute.", UserWarning)
setattr(self.estimate_._estimator_object, attr, value)
else:
setattr(self.dowhy_, attr, value)
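A hedged sketch of the attribute-resolution rules implemented by __getattr__/__setattr__ above, continuing from the earlier LinearDML example (the custom column names are optional; defaults are Y0, T0, X0..., W0..., Z0...):

est_dowhy = est.dowhy.fit(Y, T, X=X, W=W,
                          outcome_names=["outcome"], treatment_names=["treatment"])

est_dowhy.effect(X)               # found on the fitted CATE estimator, so forwarded there
est_dowhy.view_model()            # not on the estimator, so forwarded to the dowhy CausalModel
est_dowhy.dowhy__refute_estimate  # a 'dowhy__' prefix forces resolution on the CausalModel

try:
    est_dowhy.refit_final()       # explicitly disallowed through the wrapper
except AttributeError:
    pass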


@@ -637,15 +637,12 @@ class DRLearner(_OrthoLearner):
return self.ortho_learner_model_final_.models_cate
def shap_values(self, X, *, feature_names=None, treatment_names=None, output_names=None, background_samples=100):
if self.featurizer_ is not None:
F = self.featurizer_.transform(X)
else:
F = X
feature_names = self.cate_feature_names(feature_names)
if self.ortho_learner_model_final_._multitask_model_final:
return _shap_explain_multitask_model_cate(self.const_marginal_effect, self.multitask_model_cate, F,
return _shap_explain_multitask_model_cate(self.const_marginal_effect, self.multitask_model_cate, X,
self._d_t, self._d_y,
featurizer=self.featurizer_,
feature_names=feature_names,
treatment_names=treatment_names,
output_names=output_names,
@@ -653,7 +650,8 @@ class DRLearner(_OrthoLearner):
background_samples=background_samples)
else:
return _shap_explain_model_cate(self.const_marginal_effect, self.fitted_models_final,
F, self._d_t, self._d_y,
X, self._d_t, self._d_y,
featurizer=self.featurizer_,
feature_names=feature_names,
treatment_names=treatment_names,
output_names=output_names,


@@ -39,12 +39,7 @@ class TLearner(TreatmentExpansionMixin, LinearCateEstimator):
models,
categories='auto'):
self.models = clone(models, safe=False)
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = FunctionTransformer(
func=_EncoderWrapper(self._one_hot_encoder).encode,
validate=False)
self.categories = categories
super().__init__()
@_deprecate_positional("X should be passed by keyword only. In a future release "
@@ -74,8 +69,17 @@ class TLearner(TreatmentExpansionMixin, LinearCateEstimator):
self : an instance of self.
"""
# Check inputs
Y, T, X, _ = check_inputs(Y, T, X, multi_output_T=False)
# OneHotEncoder expects a 2D array with features per column; use a local variable so that
# refitting doesn't re-wrap the stored parameter
categories = [self.categories] if self.categories != 'auto' else 'auto'
self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = FunctionTransformer(
func=_EncoderWrapper(self._one_hot_encoder).encode,
validate=False)
T = self._one_hot_encoder.fit_transform(T.reshape(-1, 1))
self._d_t = T.shape[1:]
T = inverse_onehot(T)
@@ -128,15 +132,7 @@ class SLearner(TreatmentExpansionMixin, LinearCateEstimator):
overall_model,
categories='auto'):
self.overall_model = clone(overall_model, safe=False)
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
# Note: unlike other Metalearners, we don't drop the first column because
# we concatenate all treatments to the other features;
# We might want to revisit, though, since it's linearly determined by the others
self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False)
self.transformer = FunctionTransformer(
func=_EncoderWrapper(self._one_hot_encoder, drop_first=True).encode,
validate=False)
self.categories = categories
super().__init__()
@_deprecate_positional("X should be passed by keyword only. In a future release "
@@ -169,6 +165,17 @@ class SLearner(TreatmentExpansionMixin, LinearCateEstimator):
if X is None:
X = np.zeros((Y.shape[0], 1))
Y, T, X, _ = check_inputs(Y, T, X, multi_output_T=False)
# OneHotEncoder expects a 2D array with features per column; use a local variable so that
# refitting doesn't re-wrap the stored parameter
categories = [self.categories] if self.categories != 'auto' else 'auto'
# Note: unlike other Metalearners, we don't drop the first column because
# we concatenate all treatments to the other features;
# we might want to revisit, though, since it's linearly determined by the others
self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False)
self.transformer = FunctionTransformer(
func=_EncoderWrapper(self._one_hot_encoder, drop_first=True).encode,
validate=False)
T = self._one_hot_encoder.fit_transform(T.reshape(-1, 1))
self._d_t = (T.shape[1] - 1,)
feat_arr = np.concatenate((X, T), axis=1)
@@ -238,12 +245,7 @@ class XLearner(TreatmentExpansionMixin, LinearCateEstimator):
self.models = clone(models, safe=False)
self.cate_models = clone(cate_models, safe=False)
self.propensity_model = clone(propensity_model, safe=False)
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = FunctionTransformer(
func=_EncoderWrapper(self._one_hot_encoder).encode,
validate=False)
self.categories = categories
super().__init__()
@_deprecate_positional("X should be passed by keyword only. In a future release "
@@ -274,6 +276,14 @@
"""
# Check inputs
Y, T, X, _ = check_inputs(Y, T, X, multi_output_T=False)
# OneHotEncoder expects a 2D array with features per column; use a local variable so that
# refitting doesn't re-wrap the stored parameter
categories = [self.categories] if self.categories != 'auto' else 'auto'
self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = FunctionTransformer(
func=_EncoderWrapper(self._one_hot_encoder).encode,
validate=False)
if Y.ndim == 2 and Y.shape[1] == 1:
Y = Y.flatten()
T = self._one_hot_encoder.fit_transform(T.reshape(-1, 1))
@@ -366,12 +376,7 @@ class DomainAdaptationLearner(TreatmentExpansionMixin, LinearCateEstimator):
self.models = clone(models, safe=False)
self.final_models = clone(final_models, safe=False)
self.propensity_model = clone(propensity_model, safe=False)
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = FunctionTransformer(
func=_EncoderWrapper(self._one_hot_encoder).encode,
validate=False)
self.categories = categories
super().__init__()
@_deprecate_positional("X should be passed by keyword only. In a future release "
@@ -402,6 +407,14 @@ class DomainAdaptationLearner(TreatmentExpansionMixin, LinearCateEstimator):
"""
# Check inputs
Y, T, X, _ = check_inputs(Y, T, X, multi_output_T=False)
# OneHotEncoder expects a 2D array with features per column; use a local variable so that
# refitting doesn't re-wrap the stored parameter
categories = [self.categories] if self.categories != 'auto' else 'auto'
self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first')
self.transformer = FunctionTransformer(
func=_EncoderWrapper(self._one_hot_encoder).encode,
validate=False)
T = self._one_hot_encoder.fit_transform(T.reshape(-1, 1))
self._d_t = T.shape[1:]
T = inverse_onehot(T)
@@ -468,6 +481,7 @@ class DomainAdaptationLearner(TreatmentExpansionMixin, LinearCateEstimator):
def shap_values(self, X, *, feature_names=None, treatment_names=None, output_names=None, background_samples=100):
return _shap_explain_model_cate(self.const_marginal_effect, self.final_models, X, self._d_t, self._d_y,
featurizer=None,
feature_names=feature_names,
treatment_names=treatment_names,
output_names=output_names,
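The metalearner changes above (and the ortho forest changes below) all follow one idiom: constructor arguments are stored untouched and the one-hot encoder is only built inside fit, which keeps the estimators compatible with sklearn's clone/get_params contract. A minimal sketch of the idiom (the class is hypothetical; assumes an sklearn version where OneHotEncoder still accepts sparse=):

from sklearn.base import BaseEstimator, clone
from sklearn.preprocessing import OneHotEncoder

class _LazyEncoderSketch(BaseEstimator):
    def __init__(self, categories='auto'):
        self.categories = categories  # store the parameter unmodified

    def fit(self, T):
        # fitted state is created here, not in __init__
        categories = [self.categories] if self.categories != 'auto' else 'auto'
        self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first')
        self._one_hot_encoder.fit(T.reshape(-1, 1))
        return self

clone(_LazyEncoderSketch(categories=[0, 1, 2]))  # safe: __init__ has no side effects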


@@ -247,6 +247,7 @@ class BaseOrthoForest(TreatmentExpansionMixin, LinearCateEstimator):
self.backend = backend
self.verbose = verbose
self.batch_size = batch_size
self.categories = categories
super().__init__()
@_deprecate_positional("X and W should be passed by keyword only. In a future release "
@@ -581,10 +582,6 @@ class DMLOrthoForest(BaseOrthoForest):
self.lambda_reg)
# Define
moment_and_mean_gradient_estimator = _DMLOrthoForest_moment_and_mean_gradient_estimator_func
if discrete_treatment:
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first')
super().__init__(
nuisance_estimator,
second_stage_nuisance_estimator,
@@ -640,6 +637,11 @@ class DMLOrthoForest(BaseOrthoForest):
-------
self: an instance of self.
"""
if self.discrete_treatment:
# OneHotEncoder expects a 2D array with features per column; use a local variable so that
# refitting doesn't re-wrap the stored parameter
categories = [self.categories] if self.categories != 'auto' else 'auto'
self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first')
self._set_input_names(Y, T, X, set_flag=True)
Y, T, X, W = check_inputs(Y, T, X, W)
if self.discrete_treatment:
@@ -910,6 +912,7 @@ class DROrthoForest(BaseOrthoForest):
verbose=3,
batch_size='auto',
random_state=None):
self.lambda_reg = lambda_reg
# Copy and/or define models
self.propensity_model = clone(propensity_model, safe=False)
self.model_Y = clone(model_Y, safe=False)
@@ -928,12 +931,9 @@
# Define parameter estimators
parameter_estimator = DROrthoForest.parameter_estimator_func
second_stage_parameter_estimator = DROrthoForest.second_stage_parameter_estimator_gen(
lambda_reg)
self.lambda_reg)
# Define moment and mean gradient estimator
moment_and_mean_gradient_estimator = DROrthoForest.moment_and_mean_gradient_estimator_func
if categories != 'auto':
categories = [categories] # OneHotEncoder expects a 2D array with features per column
self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first')
super().__init__(
nuisance_estimator,
@@ -983,6 +983,10 @@
-------
self: an instance of self.
"""
# OneHotEncoder expects a 2D array with features per column; use a local variable so that
# refitting doesn't re-wrap the stored parameter
categories = [self.categories] if self.categories != 'auto' else 'auto'
self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first')
self._set_input_names(Y, T, X, set_flag=True)
Y, T, X, W = check_inputs(Y, T, X, W)
# Check that T is shape (n, )


@@ -0,0 +1,66 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import numpy as np
import unittest
from econml.dml import LinearDML, CausalForestDML
from econml.orf import DROrthoForest
from econml.dr import DRLearner
from econml.metalearners import XLearner
from econml.iv.dml import DMLATEIV
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso
class TestDowhy(unittest.TestCase):
def _get_data(self):
X = np.random.normal(0, 1, size=(500, 5))
T = np.random.binomial(1, .5, size=(500,))
Y = np.random.normal(0, 1, size=(500,))
Z = np.random.normal(0, 1, size=(500,))
return Y, T, X[:, [0]], X[:, 1:], Z
def test_dowhy(self):
def reg():
return LinearRegression()
def clf():
return LogisticRegression()
Y, T, X, W, Z = self._get_data()
# test at least one estimator from each category
models = {"dml": LinearDML(model_y=reg(), model_t=clf(), discrete_treatment=True,
linear_first_stages=False),
"dr": DRLearner(model_propensity=clf(), model_regression=reg(),
model_final=reg()),
"xlearner": XLearner(models=reg(), cate_models=reg(), propensity_model=clf()),
"cfdml": CausalForestDML(model_y=reg(), model_t=clf(), discrete_treatment=True),
"orf": DROrthoForest(n_trees=10, propensity_model=clf(), model_Y=reg()),
"dmlateiv": DMLATEIV(model_Y_W=reg(),
model_T_W=clf(),
model_Z_W=reg(),
discrete_treatment=True,
discrete_instrument=False)}
for name, model in models.items():
with self.subTest(name=name):
est = model
if name == "xlearner":
est_dowhy = est.dowhy.fit(Y, T, X=np.hstack((X, W)), W=None)
elif name == "dmlateiv":
est_dowhy = est.dowhy.fit(Y, T, W=W, Z=Z)
else:
est_dowhy = est.dowhy.fit(Y, T, X=X, W=W)
# test causal graph
est_dowhy.view_model()
# test refutation estimate
est_dowhy.refute_estimate(method_name="random_common_cause")
if name != "orf":
est_dowhy.refute_estimate(method_name="add_unobserved_common_cause",
confounders_effect_on_treatment="binary_flip",
confounders_effect_on_outcome="linear",
effect_strength_on_treatment=0.1,
effect_strength_on_outcome=0.1,)
est_dowhy.refute_estimate(method_name="placebo_treatment_refuter", placebo_type="permute",
num_simulations=3)
est_dowhy.refute_estimate(method_name="data_subset_refuter", subset_fraction=0.8,
num_simulations=3)


@@ -39,7 +39,6 @@ class TestShap(unittest.TestCase):
]
for est in est_list:
with self.subTest(est=est, featurizer=featurizer, d_y=d_y, d_t=d_t):
fd_x = featurizer.fit_transform(X).shape[1] if featurizer is not None else d_x
est.fit(Y, T, X, W)
shap_values = est.shap_values(X[:10], feature_names=["a", "b", "c"],
background_samples=None)
@@ -50,19 +49,13 @@
mean_cate = mean_cate.flatten()[0] if not np.isscalar(mean_cate) else mean_cate
self.assertAlmostEqual(shap_values["Y0"]["T0"].base_values[0], mean_cate, delta=1e-2)
if isinstance(est, (CausalForestDML, DMLOrthoForest)):
fd_x = d_x
# test shape of shap values output is as expected
self.assertEqual(len(shap_values["Y0"]), d_t)
self.assertEqual(len(shap_values), d_y)
# test shape of attribute of explanation object is as expected
self.assertEqual(shap_values["Y0"]["T0"].values.shape, (10, fd_x))
self.assertEqual(shap_values["Y0"]["T0"].data.shape, (10, fd_x))
self.assertEqual(shap_values["Y0"]["T0"].values.shape[0], 10)
self.assertEqual(shap_values["Y0"]["T0"].data.shape[0], 10)
self.assertEqual(shap_values["Y0"]["T0"].base_values.shape, (10,))
ind = 6
self.assertEqual(len(shap_values["Y0"]["T0"].feature_names), fd_x)
self.assertEqual(len(shap_values["Y0"]["T0"][ind].feature_names), fd_x)
def test_discrete_t(self):
n = 100
@@ -97,7 +90,6 @@
ForestDRLearner()]
for est in est_list:
with self.subTest(est=est, featurizer=featurizer, d_y=d_y, d_t=d_t):
fd_x = featurizer.fit_transform(X).shape[1] if featurizer is not None else d_x
if isinstance(est, (TLearner, SLearner, XLearner, DomainAdaptationLearner)):
est.fit(Y, T, X)
else:
@@ -111,19 +103,13 @@
mean_cate = mean_cate.flatten()[0] if not np.isscalar(mean_cate) else mean_cate
self.assertAlmostEqual(shap_values["Y0"]["T0"].base_values[0], mean_cate, delta=1e-2)
if isinstance(est, (TLearner, SLearner, XLearner, DomainAdaptationLearner, CausalForestDML,
ForestDRLearner, DROrthoForest)):
fd_x = d_x
# test shape of shap values output is as expected
self.assertEqual(len(shap_values["Y0"]), d_t - 1)
self.assertEqual(len(shap_values), d_y)
# test shape of attribute of explanation object is as expected
self.assertEqual(shap_values["Y0"]["T0"].values.shape, (10, fd_x))
self.assertEqual(shap_values["Y0"]["T0"].data.shape, (10, fd_x))
self.assertEqual(shap_values["Y0"]["T0"].values.shape[0], 10)
self.assertEqual(shap_values["Y0"]["T0"].data.shape[0], 10)
self.assertEqual(shap_values["Y0"]["T0"].base_values.shape, (10,))
ind = 6
self.assertEqual(len(shap_values["Y0"]["T0"].feature_names), fd_x)
self.assertEqual(len(shap_values["Y0"]["T0"][ind].feature_names), fd_x)
def test_identical_output(self):
# Treatment effect function


@@ -1266,6 +1266,34 @@ def transpose_dictionary(d):
return output
def reshape_arrays_2dim(length, *args):
"""
Reshape the input arrays to be two-dimensional.
Arguments that are None are replaced by empty (length, 0) arrays.
Parameters
----------
length: scalar
Number of samples
args: arrays
Inputs to be reshaped
Returns
-------
new_args: arrays
Output of reshaped arrays
"""
new_args = []
for arg in args:
if arg is None:
new_args.append(np.array([]).reshape(length, 0))
elif arg.ndim == 1:
new_args.append(arg.reshape((-1, 1)))
else:
new_args.append(arg)
return new_args
class _RegressionWrapper:
"""
A simple wrapper that makes a binary classifier behave like a regressor.
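A quick sketch of what the new reshape_arrays_2dim helper does for the dowhy wrapper (values illustrative):

import numpy as np
from econml.utilities import reshape_arrays_2dim

Y2, W2, X2 = reshape_arrays_2dim(5, np.ones(5), None, np.ones((5, 2)))
assert Y2.shape == (5, 1)  # 1-d arrays become column vectors
assert W2.shape == (5, 0)  # None becomes an empty (n, 0) array
assert X2.shape == (5, 2)  # 2-d arrays pass through unchanged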

The diffs of 4 files are not shown because one or more lines are too long.


@@ -53,6 +53,7 @@ install_requires =
matplotlib
pandas
shap ~= 0.38.1
dowhy
test_suite = econml.tests
tests_require =
pytest
@@ -63,7 +64,6 @@ tests_require =
nbformat
seaborn
lightgbm
dowhy
xgboost
[options.extras_require]