зеркало из https://github.com/py-why/EconML.git
CATE validation - uplift uniform confidence bands (#840)
Add support for multiplier bootstrap uniform confidence band error bars for uplift curves
This commit is contained in:
Родитель
ed4fe33b2b
Коммит
27d3101e23
|
@ -147,6 +147,20 @@ CATE Interpreters
|
|||
econml.cate_interpreter.SingleTreeCateInterpreter
|
||||
econml.cate_interpreter.SingleTreePolicyInterpreter
|
||||
|
||||
.. _validation_api:
|
||||
|
||||
CATE Validation
|
||||
---------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: _autosummary
|
||||
|
||||
econml.validate.DRTester
|
||||
econml.validate.BLPEvaluationResults
|
||||
econml.validate.CalibrationEvaluationResults
|
||||
econml.validate.UpliftEvaluationResults
|
||||
econml.validate.EvaluationResults
|
||||
|
||||
.. _scorers_api:
|
||||
|
||||
CATE Scorers
|
||||
|
|
|
@ -5,7 +5,7 @@ import pandas as pd
|
|||
import scipy.stats as st
|
||||
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
|
||||
|
||||
from econml.validate.drtester import DRtester
|
||||
from econml.validate.drtester import DRTester
|
||||
from econml.dml import DML
|
||||
|
||||
|
||||
|
@ -70,7 +70,7 @@ class TestDRTester(unittest.TestCase):
|
|||
).fit(Y=Ytrain, T=Dtrain, X=Xtrain)
|
||||
|
||||
# test the DR outcome difference
|
||||
my_dr_tester = DRtester(
|
||||
my_dr_tester = DRTester(
|
||||
model_regression=reg_y,
|
||||
model_propensity=reg_t,
|
||||
cate=cate
|
||||
|
@ -123,7 +123,7 @@ class TestDRTester(unittest.TestCase):
|
|||
).fit(Y=Ytrain, T=Dtrain, X=Xtrain)
|
||||
|
||||
# test the DR outcome difference
|
||||
my_dr_tester = DRtester(
|
||||
my_dr_tester = DRTester(
|
||||
model_regression=reg_y,
|
||||
model_propensity=reg_t,
|
||||
cate=cate
|
||||
|
@ -148,8 +148,8 @@ class TestDRTester(unittest.TestCase):
|
|||
self.assertRaises(ValueError, res.plot_toc, k)
|
||||
else: # real treatment, k = 1
|
||||
self.assertTrue(res.plot_cal(k) is not None)
|
||||
self.assertTrue(res.plot_qini(k) is not None)
|
||||
self.assertTrue(res.plot_toc(k) is not None)
|
||||
self.assertTrue(res.plot_qini(k, 'ucb2') is not None)
|
||||
self.assertTrue(res.plot_toc(k, 'ucb1') is not None)
|
||||
|
||||
self.assertLess(res_df.blp_pval.values[0], 0.05) # heterogeneity
|
||||
self.assertGreater(res_df.cal_r_squared.values[0], 0) # good R2
|
||||
|
@ -171,7 +171,7 @@ class TestDRTester(unittest.TestCase):
|
|||
).fit(Y=Ytrain, T=Dtrain, X=Xtrain)
|
||||
|
||||
# test the DR outcome difference
|
||||
my_dr_tester = DRtester(
|
||||
my_dr_tester = DRTester(
|
||||
model_regression=reg_y,
|
||||
model_propensity=reg_t,
|
||||
cate=cate
|
||||
|
@ -193,8 +193,8 @@ class TestDRTester(unittest.TestCase):
|
|||
for kwargs in [{}, {'Xval': Xval}]:
|
||||
with self.assertRaises(Exception) as exc:
|
||||
my_dr_tester.evaluate_cal(kwargs)
|
||||
self.assertTrue(
|
||||
str(exc.exception) == "Must fit nuisance models on training sample data to use calibration test"
|
||||
self.assertEqual(
|
||||
str(exc.exception), "Must fit nuisance models on training sample data to use calibration test"
|
||||
)
|
||||
|
||||
def test_exceptions(self):
|
||||
|
@ -212,7 +212,7 @@ class TestDRTester(unittest.TestCase):
|
|||
).fit(Y=Ytrain, T=Dtrain, X=Xtrain)
|
||||
|
||||
# test the DR outcome difference
|
||||
my_dr_tester = DRtester(
|
||||
my_dr_tester = DRTester(
|
||||
model_regression=reg_y,
|
||||
model_propensity=reg_t,
|
||||
cate=cate
|
||||
|
@ -223,11 +223,11 @@ class TestDRTester(unittest.TestCase):
|
|||
with self.assertRaises(Exception) as exc:
|
||||
func()
|
||||
if func.__name__ == 'evaluate_cal':
|
||||
self.assertTrue(
|
||||
str(exc.exception) == "Must fit nuisance models on training sample data to use calibration test"
|
||||
self.assertEqual(
|
||||
str(exc.exception), "Must fit nuisance models on training sample data to use calibration test"
|
||||
)
|
||||
else:
|
||||
self.assertTrue(str(exc.exception) == "Must fit nuisances before evaluating")
|
||||
self.assertEqual(str(exc.exception), "Must fit nuisances before evaluating")
|
||||
|
||||
my_dr_tester = my_dr_tester.fit_nuisance(
|
||||
Xval, Dval, Yval, Xtrain, Dtrain, Ytrain
|
||||
|
@ -242,12 +242,12 @@ class TestDRTester(unittest.TestCase):
|
|||
with self.assertRaises(Exception) as exc:
|
||||
func()
|
||||
if func.__name__ == 'evaluate_blp':
|
||||
self.assertTrue(
|
||||
str(exc.exception) == "CATE predictions not yet calculated - must provide Xval"
|
||||
self.assertEqual(
|
||||
str(exc.exception), "CATE predictions not yet calculated - must provide Xval"
|
||||
)
|
||||
else:
|
||||
self.assertTrue(str(exc.exception) ==
|
||||
"CATE predictions not yet calculated - must provide both Xval, Xtrain")
|
||||
self.assertEqual(str(exc.exception),
|
||||
"CATE predictions not yet calculated - must provide both Xval, Xtrain")
|
||||
|
||||
for func in [
|
||||
my_dr_tester.evaluate_cal,
|
||||
|
@ -256,19 +256,19 @@ class TestDRTester(unittest.TestCase):
|
|||
]:
|
||||
with self.assertRaises(Exception) as exc:
|
||||
func(Xval=Xval)
|
||||
self.assertTrue(
|
||||
str(exc.exception) == "CATE predictions not yet calculated - must provide both Xval, Xtrain")
|
||||
self.assertEqual(
|
||||
str(exc.exception), "CATE predictions not yet calculated - must provide both Xval, Xtrain")
|
||||
|
||||
cal_res = my_dr_tester.evaluate_cal(Xval, Xtrain)
|
||||
self.assertGreater(cal_res.cal_r_squared[0], 0) # good R2
|
||||
|
||||
with self.assertRaises(Exception) as exc:
|
||||
my_dr_tester.evaluate_uplift(metric='blah')
|
||||
self.assertTrue(
|
||||
str(exc.exception) == "Unsupported metric - must be one of ['toc', 'qini']"
|
||||
self.assertEqual(
|
||||
str(exc.exception), "Unsupported metric 'blah' - must be one of ['toc', 'qini']"
|
||||
)
|
||||
|
||||
my_dr_tester = DRtester(
|
||||
my_dr_tester = DRTester(
|
||||
model_regression=reg_y,
|
||||
model_propensity=reg_t,
|
||||
cate=cate
|
||||
|
@ -278,5 +278,11 @@ class TestDRTester(unittest.TestCase):
|
|||
qini_res = my_dr_tester.evaluate_uplift(Xval, Xtrain)
|
||||
self.assertLess(qini_res.pvals[0], 0.05)
|
||||
|
||||
with self.assertRaises(Exception) as exc:
|
||||
qini_res.plot_uplift(tmt=1, err_type='blah')
|
||||
self.assertEqual(
|
||||
str(exc.exception), "Invalid error type 'blah'; must be one of [None, 'ucb2', 'ucb1']"
|
||||
)
|
||||
|
||||
autoc_res = my_dr_tester.evaluate_uplift(Xval, Xtrain, metric='toc')
|
||||
self.assertLess(autoc_res.pvals[0], 0.05)
|
||||
|
|
|
@ -5,7 +5,9 @@
|
|||
A suite of validation methods for CATE models.
|
||||
"""
|
||||
|
||||
from .drtester import DRtester
|
||||
from .drtester import DRTester
|
||||
from .results import BLPEvaluationResults, CalibrationEvaluationResults, UpliftEvaluationResults, EvaluationResults
|
||||
|
||||
|
||||
__all__ = ['DRtester']
|
||||
__all__ = ['DRTester',
|
||||
'BLPEvaluationResults', 'CalibrationEvaluationResults', 'UpliftEvaluationResults', 'EvaluationResults']
|
||||
|
|
|
@ -8,12 +8,13 @@ from sklearn.model_selection import cross_val_predict, StratifiedKFold, KFold
|
|||
from statsmodels.api import OLS
|
||||
from statsmodels.tools import add_constant
|
||||
|
||||
from econml.utilities import deprecated
|
||||
|
||||
from .results import CalibrationEvaluationResults, BLPEvaluationResults, UpliftEvaluationResults, EvaluationResults
|
||||
from .utils import calculate_dr_outcomes, calc_uplift
|
||||
|
||||
|
||||
class DRtester:
|
||||
|
||||
class DRTester:
|
||||
"""
|
||||
Validation tests for CATE models. Includes the best linear predictor (BLP) test as in Chernozhukov et al. (2022),
|
||||
the calibration test in Dwivedi et al. (2020), and the QINI coefficient as in Radcliffe (2007).
|
||||
|
@ -46,47 +47,62 @@ class DRtester:
|
|||
The calibration r-squared metric is similar to the standard R-square score in that it can take any value
|
||||
less than or equal to 1, with scores closer to 1 indicating a better calibrated CATE model.
|
||||
|
||||
**QINI**
|
||||
**Uplift Modeling**
|
||||
|
||||
Units are ordered by predicted CATE values and a running measure of the average treatment effect in each cohort is
|
||||
kept as we progress through ranks. The QINI coefficient is then the area under the resulting curve, with a value
|
||||
of 0 interpreted as corresponding to a model with randomly assigned CATE coefficients. All calculations are
|
||||
performed on validation dataset results, using the training set as input.
|
||||
kept as we progress through ranks. The resulting TOC curve can then be plotted and its integral calculated and used
|
||||
as a measure of true heterogeneity captured by the CATE model; this integral is referred to as the AUTOC (area
|
||||
under TOC). The QINI curve is a variant of this curve that also incorporates treatment probability; its integral is
|
||||
referred to as the QINI coefficient.
|
||||
|
||||
More formally, the QINI curve is given by the following function:
|
||||
More formally, the TOC and QINI curves are given by the following functions:
|
||||
|
||||
.. math::
|
||||
|
||||
\\tau_{TOC}(q) = \\mathrm{Cov}(
|
||||
Y^{DR}(g,p),
|
||||
\\frac{
|
||||
\\mathbb{1}\\{\\hat{\\tau}(Z) \\geq \\hat{\\mu}(q)\\}
|
||||
}{
|
||||
\\mathrm{Pr}(\\hat{\\tau}(Z) \\geq \\hat{\\mu}(q))
|
||||
}
|
||||
)
|
||||
|
||||
\\tau_{QINI}(q) = \\mathrm{Cov}(Y^{DR}(g,p), \\mathbb{1}\\{\\hat{\\tau}(Z) \\geq \\hat{\\mu}(q)\\})
|
||||
|
||||
Where :math:`q` is the desired quantile, :math:`\\hat{\\mu}` is the quantile function, and :math:`\\hat{\\tau}` is
|
||||
the predicted CATE function.
|
||||
:math:`Y^{DR}(g,p)` refers to the doubly robust outcome difference (relative to control) for the given observation.
|
||||
|
||||
The QINI coefficient is then given by:
|
||||
The AUTOC and QINI coefficients are then given by:
|
||||
|
||||
.. math::
|
||||
|
||||
AUTOC = \\int_0^1 \\tau_{TOC}(q) dq
|
||||
|
||||
QINI = \\int_0^1 \\tau_{QINI}(q) dq
|
||||
|
||||
Parameters
|
||||
----------
|
||||
model_regression: estimator
|
||||
Nuisance model estimator used to fit the outcome to features. Must be able to implement `fit' and `predict'
|
||||
Nuisance model estimator used to fit the outcome to features. Must be able to implement `fit` and `predict`
|
||||
methods
|
||||
|
||||
model_propensity: estimator
|
||||
Nuisance model estimator used to fit the treatment assignment to features. Must be able to implement `fit'
|
||||
method and either `predict' (in the case of binary treatment) or `predict_proba' methods (in the case of
|
||||
Nuisance model estimator used to fit the treatment assignment to features. Must be able to implement `fit`
|
||||
method and either `predict` (in the case of binary treatment) or `predict_proba` methods (in the case of
|
||||
multiple categorical treatments).
|
||||
|
||||
n_splits: integer, default 5
|
||||
Number of splits used to generate cross-validated predictions
|
||||
cate: estimator
|
||||
Fitted conditional average treatment effect (CATE) estimator to be validated.
|
||||
|
||||
cv: int or list, default 5
|
||||
Splitter used for cross-validation. Can be either an integer (corresponding to the number of desired folds)
|
||||
or a list of indices corresponding to membership in each fold.
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
|
||||
[Chernozhukov2022] V. Chernozhukov et al.
|
||||
Generic Machine Learning Inference on Heterogeneous Treatment Effects in Randomized Experiments
|
||||
arXiv preprint arXiv:1712.04802, 2022.
|
||||
|
@ -97,7 +113,6 @@ class DRtester:
|
|||
arXiv preprint arXiv:2008.10109, 2020.
|
||||
`<https://arxiv.org/abs/2008.10109>`_
|
||||
|
||||
|
||||
[Radcliffe2007] N. Radcliffe
|
||||
Using control groups to target on predicted lift: Building and assessing uplift model.
|
||||
Direct Marketing Analytics Journal (2007), pages 14–21.
|
||||
|
@ -175,7 +190,7 @@ class DRtester:
|
|||
Generates nuisance predictions and calculates doubly robust (DR) outcomes either by (1) cross-fitting in the
|
||||
validation sample, or (2) fitting in the training sample and applying to the validation sample. If Xtrain,
|
||||
Dtrain, and ytrain are all not None, then option (2) will be implemented, otherwise, option (1) will be
|
||||
implemented. In order to use the `evaluate_cal' method then Xtrain, Dtrain, and ytrain must all be specified.
|
||||
implemented. In order to use the `evaluate_cal` method then Xtrain, Dtrain, and ytrain must all be specified.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
@ -186,12 +201,12 @@ class DRtester:
|
|||
the control status be equal to 0, and all other treatments integers starting at 1.
|
||||
yval: vector of length n_val
|
||||
Outcomes for the validation sample
|
||||
Xtrain: (n_train x k) matrix or vector of length n, default ``None``
|
||||
Xtrain: (n_train x k) matrix or vector of length n, optional
|
||||
Features used in nuisance models for training sample
|
||||
Dtrain: vector of length n_train, default ``None''
|
||||
Dtrain: vector of length n_train, optional
|
||||
Treatment assignment of training sample. Control status must be minimum value. It is recommended to have
|
||||
the control status be equal to 0, and all other treatments integers starting at 1.
|
||||
ytrain: vector of length n_train, defaul ``None``
|
||||
ytrain: vector of length n_train, optional
|
||||
Outcomes for the training sample
|
||||
|
||||
Returns
|
||||
|
@ -332,7 +347,7 @@ class DRtester:
|
|||
----------
|
||||
Xval: (n_val x n_treatment) matrix
|
||||
Validation set features to be used to predict (and potentially fit) DR outcomes in CATE model
|
||||
Xtrain (n_train x n_treatment) matrix, defaul ``None``
|
||||
Xtrain: (n_train x n_treatment) matrix, optional
|
||||
Training set features used to fit CATE model
|
||||
|
||||
Returns
|
||||
|
@ -359,11 +374,11 @@ class DRtester:
|
|||
|
||||
Parameters
|
||||
----------
|
||||
Xval: (n_val x n_treatment) matrix, default ``None``
|
||||
Validation sample features for CATE model. If not specified, then `fit_cate' method must already have been
|
||||
Xval: (n_val x n_treatment) matrix, optional
|
||||
Validation sample features for CATE model. If not specified, then `fit_cate` method must already have been
|
||||
implemented
|
||||
Xtrain: (n_train x n_treatment) matrix, default ``None``
|
||||
Training sample features for CATE model. If not specified, then `fit cate' method must already have been
|
||||
Xtrain: (n_train x n_treatment) matrix, optional
|
||||
Training sample features for CATE model. If not specified, then `fit_cate` method must already have been
|
||||
implemented (with Xtrain specified)
|
||||
n_groups: integer, default 4
|
||||
Number of quantile-based groups used to calculate calibration score.
|
||||
|
@ -433,17 +448,17 @@ class DRtester:
|
|||
Xtrain: np.array = None
|
||||
) -> BLPEvaluationResults:
|
||||
"""
|
||||
Implements the best linear predictor (BLP) test as in [Chernozhukov2022]. `fit_nusiance' method must already
|
||||
Implements the best linear predictor (BLP) test as in [Chernozhukov2022]. `fit_nuisance` method must already
|
||||
be implemented.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Xval: (n_val x k) matrix, default ``None''
|
||||
Validation sample features for CATE model. If not specified, then `fit_cate' method must already have been
|
||||
Xval: (n_val x k) matrix, optional
|
||||
Validation sample features for CATE model. If not specified, then `fit_cate` method must already have been
|
||||
implemented
|
||||
Xtrain: (n_train x k) matrix, default ``None''
|
||||
Xtrain: (n_train x k) matrix, optional
|
||||
Training sample features for CATE model. If specified, then CATE is fitted on training sample and applied
|
||||
to Xval. If specified, then Xtrain, Dtrain, Ytrain must have been specified in `fit_nuisance' method (and
|
||||
to Xval. If specified, then Xtrain, Dtrain, Ytrain must have been specified in `fit_nuisance` method (and
|
||||
vice-versa)
|
||||
|
||||
Returns
|
||||
|
@ -489,29 +504,32 @@ class DRtester:
|
|||
Xval: np.array = None,
|
||||
Xtrain: np.array = None,
|
||||
percentiles: np.array = np.linspace(5, 95, 50),
|
||||
metric: str = 'qini'
|
||||
metric: str = 'qini',
|
||||
n_bootstrap: int = 1000
|
||||
) -> UpliftEvaluationResults:
|
||||
"""
|
||||
Calculates QINI coefficient for the given model as in Radcliffe (2007), where units are ordered by predicted
|
||||
Calculates uplift curves and coefficients for the given model, where units are ordered by predicted
|
||||
CATE values and a running measure of the average treatment effect in each cohort is kept as we progress
|
||||
through ranks. The QINI coefficient is then the area under the resulting curve, with a value of 0 interpreted
|
||||
through ranks. The uplift coefficient is then the area under the resulting curve, with a value of 0 interpreted
|
||||
as corresponding to a model with randomly assigned CATE coefficients. All calculations are performed on
|
||||
validation dataset results, using the training set as input.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Xval: (n_val x k) matrix, default ``None''
|
||||
Validation sample features for CATE model. If not specified, then `fit_cate' method must already have been
|
||||
Xval: (n_val x k) matrix, optional
|
||||
Validation sample features for CATE model. If not specified, then `fit_cate` method must already have been
|
||||
implemented
|
||||
Xtrain: (n_train x k) matrix, default ``None''
|
||||
Xtrain: (n_train x k) matrix, optional
|
||||
Training sample features for CATE model. If specified, then CATE is fitted on training sample and applied
|
||||
to Xval. If specified, then Xtrain, Dtrain, Ytrain must have been specified in `fit_nuisance' method (and
|
||||
to Xval. If specified, then Xtrain, Dtrain, Ytrain must have been specified in `fit_nuisance` method (and
|
||||
vice-versa)
|
||||
percentiles: one-dimensional array, default ``np.linspace(5, 95, 50)''
|
||||
percentiles: one-dimensional array, default ``np.linspace(5, 95, 50)``
|
||||
Array of percentiles over which the uplift curve should be constructed. Defaults to 50 evenly spaced
|
||||
percentiles between 5% and 95%.
|
||||
metric: string, default 'qini'
|
||||
Which type of uplift curve to evaluate. Must be one of ['toc', 'qini']
|
||||
n_bootstrap: integer, default 1000
|
||||
Number of bootstrap samples to run when calculating uniform confidence bands.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
@ -532,7 +550,8 @@ class DRtester:
|
|||
self.cate_preds_val_,
|
||||
self.dr_val_,
|
||||
percentiles,
|
||||
metric
|
||||
metric,
|
||||
n_bootstrap
|
||||
)
|
||||
coeffs = [coeff]
|
||||
errs = [err]
|
||||
|
@ -546,7 +565,8 @@ class DRtester:
|
|||
self.cate_preds_val_[:, k],
|
||||
self.dr_val_[:, k],
|
||||
percentiles,
|
||||
metric
|
||||
metric,
|
||||
n_bootstrap
|
||||
)
|
||||
coeffs.append(coeff)
|
||||
errs.append(err)
|
||||
|
@ -568,20 +588,25 @@ class DRtester:
|
|||
self,
|
||||
Xval: np.array = None,
|
||||
Xtrain: np.array = None,
|
||||
n_groups: int = 4
|
||||
n_groups: int = 4,
|
||||
n_bootstrap: int = 1000
|
||||
) -> EvaluationResults:
|
||||
"""
|
||||
Implements the best linear prediction (`evaluate_blp'), calibration (`evaluate_cal'), uplift curve
|
||||
('evaluate_uplift') methods
|
||||
Implements the best linear prediction (`evaluate_blp`), calibration (`evaluate_cal`), uplift curve
|
||||
(`evaluate_uplift`) methods
|
||||
|
||||
Parameters
|
||||
----------
|
||||
Xval: (n_cal x k) matrix, default ``None''
|
||||
Validation sample features for CATE model. If not specified, then `fit_cate' method must already have been
|
||||
Xval: (n_cal x k) matrix, optional
|
||||
Validation sample features for CATE model. If not specified, then `fit_cate` method must already have been
|
||||
implemented
|
||||
Xtrain: (n_train x k) matrix, default ``None''
|
||||
Training sample features for CATE model. If not specified, then `fit_cate' method must already have been
|
||||
Xtrain: (n_train x k) matrix, optional
|
||||
Training sample features for CATE model. If not specified, then `fit_cate` method must already have been
|
||||
implemented
|
||||
n_groups: integer, default 4
|
||||
Number of quantile-based groups used to calculate calibration score.
|
||||
n_bootstrap: integer, default 1000
|
||||
Number of bootstrap samples to run when calculating uniform confidence bands for uplift curves.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
@ -595,8 +620,8 @@ class DRtester:
|
|||
|
||||
blp_res = self.evaluate_blp()
|
||||
cal_res = self.evaluate_cal(n_groups=n_groups)
|
||||
qini_res = self.evaluate_uplift(metric='qini')
|
||||
toc_res = self.evaluate_uplift(metric='toc')
|
||||
qini_res = self.evaluate_uplift(metric='qini', n_bootstrap=n_bootstrap)
|
||||
toc_res = self.evaluate_uplift(metric='toc', n_bootstrap=n_bootstrap)
|
||||
|
||||
self.res = EvaluationResults(
|
||||
blp_res=blp_res,
|
||||
|
@ -606,3 +631,9 @@ class DRtester:
|
|||
)
|
||||
|
||||
return self.res
|
||||
|
||||
|
||||
@deprecated("DRtester has been renamed 'DRTester' and the old name has been deprecated and will be removed "
|
||||
"in a future release. Please use 'DRTester' instead.")
|
||||
class DRtester(DRTester):
|
||||
pass
|
||||
|
|
|
@ -20,6 +20,7 @@ class CalibrationEvaluationResults:
|
|||
treatments: list or numpy array of floats
|
||||
Sequence of treatment labels
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
cal_r_squared: np.array,
|
||||
|
@ -99,6 +100,7 @@ class BLPEvaluationResults:
|
|||
treatments: list or numpy array of floats
|
||||
Sequence of treatment labels
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
params: List[float],
|
||||
|
@ -154,6 +156,7 @@ class UpliftEvaluationResults:
|
|||
Dictionary mapping treatment levels to dataframes containing
|
||||
necessary data for plotting uplift curves
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
params: List[float],
|
||||
|
@ -188,7 +191,7 @@ class UpliftEvaluationResults:
|
|||
}).round(3)
|
||||
return res
|
||||
|
||||
def plot_uplift(self, tmt: Any):
|
||||
def plot_uplift(self, tmt: Any, err_type: str = None):
|
||||
"""
|
||||
Plots uplift curves.
|
||||
|
||||
|
@ -197,6 +200,10 @@ class UpliftEvaluationResults:
|
|||
tmt: any (sortable)
|
||||
Name of treatment to plot.
|
||||
|
||||
err_type: str
|
||||
Type of error to plot. Accepted values are normal (None), two-sided uniform confidence band ('ucb2'),
|
||||
or 1-sided uniform confidence band ('ucb1').
|
||||
|
||||
Returns
|
||||
-------
|
||||
matplotlib plot with percentage treated on x-axis and uplift metric (and 95% CI) on y-axis
|
||||
|
@ -205,18 +212,38 @@ class UpliftEvaluationResults:
|
|||
raise ValueError(f'Invalid treatment; must be one of {self.treatments[1:]}')
|
||||
|
||||
df = self.curves[tmt].copy()
|
||||
df['95_err'] = 1.96 * df['err']
|
||||
|
||||
if err_type is None:
|
||||
df['95_err'] = 1.96 * df['err']
|
||||
elif err_type == 'ucb2':
|
||||
df['95_err'] = df['uniform_critical_value'] * df['err']
|
||||
elif err_type == 'ucb1':
|
||||
df['95_err'] = df['uniform_one_side_critical_value'] * df['err']
|
||||
else:
|
||||
raise ValueError(f"Invalid error type {err_type!r}; must be one of [None, 'ucb2', 'ucb1']")
|
||||
|
||||
res = self.summary()
|
||||
coeff = round(res.loc[res['treatment'] == tmt]['est'].values[0], 3)
|
||||
err = round(res.loc[res['treatment'] == tmt]['se'].values[0], 3)
|
||||
fig = df.plot(
|
||||
kind='scatter',
|
||||
x='Percentage treated',
|
||||
y='value',
|
||||
yerr='95_err',
|
||||
ylabel='Gain over Random',
|
||||
title=f"Treatment = {tmt}, Integral = {coeff} +/- {err}"
|
||||
)
|
||||
|
||||
if err_type == 'ucb1':
|
||||
fig = df.plot(
|
||||
kind='scatter',
|
||||
x='Percentage treated',
|
||||
y='value',
|
||||
yerr=[[df['95_err'], np.zeros(len(df))]],
|
||||
ylabel='Gain over Random',
|
||||
title=f"Treatment = {tmt}, Integral = {coeff} +/- {err}"
|
||||
)
|
||||
else:
|
||||
fig = df.plot(
|
||||
kind='scatter',
|
||||
x='Percentage treated',
|
||||
y='value',
|
||||
yerr='95_err',
|
||||
ylabel='Gain over Random',
|
||||
title=f"Treatment = {tmt}, Integral = {coeff} +/- {err}"
|
||||
)
|
||||
|
||||
return fig
|
||||
|
||||
|
@ -239,6 +266,7 @@ class EvaluationResults:
|
|||
toc_res: UpliftEvaluationResults object
|
||||
Results object for TOC test
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
cal_res: CalibrationEvaluationResults,
|
||||
|
@ -290,7 +318,7 @@ class EvaluationResults:
|
|||
"""
|
||||
return self.cal.plot_cal(tmt)
|
||||
|
||||
def plot_qini(self, tmt: int):
|
||||
def plot_qini(self, tmt: int, err_type: str = None):
|
||||
"""
|
||||
Plots QINI curves.
|
||||
|
||||
|
@ -299,13 +327,17 @@ class EvaluationResults:
|
|||
tmt: integer
|
||||
Treatment level to plot
|
||||
|
||||
err_type: str
|
||||
Type of error to plot. Accepted values are normal (None), two-sided uniform confidence band ('ucb2'),
|
||||
or 1-sided uniform confidence band ('ucb1').
|
||||
|
||||
Returns
|
||||
-------
|
||||
matplotlib plot with percentage treated on x-axis and QINI value (and 95% CI) on y-axis
|
||||
"""
|
||||
return self.qini.plot_uplift(tmt)
|
||||
return self.qini.plot_uplift(tmt, err_type)
|
||||
|
||||
def plot_toc(self, tmt: int):
|
||||
def plot_toc(self, tmt: int, err_type: str = None):
|
||||
"""
|
||||
Plots TOC curves.
|
||||
|
||||
|
@ -314,8 +346,12 @@ class EvaluationResults:
|
|||
tmt: integer
|
||||
Treatment level to plot
|
||||
|
||||
err_type: str
|
||||
Type of error to plot. Accepted values are normal (None), two-sided uniform confidence band ('ucb2'),
|
||||
or 1-sided uniform confidence band ('ucb1').
|
||||
|
||||
Returns
|
||||
-------
|
||||
matplotlib plot with percentage treated on x-axis and TOC value (and 95% CI) on y-axis
|
||||
"""
|
||||
return self.toc.plot_uplift(tmt)
|
||||
return self.toc.plot_uplift(tmt, err_type)
|
||||
|
|
|
@ -53,11 +53,14 @@ def calc_uplift(
|
|||
cate_preds_val: np.array,
|
||||
dr_val: np.array,
|
||||
percentiles: np.array,
|
||||
metric: str
|
||||
metric: str,
|
||||
n_bootstrap: int = 1000
|
||||
) -> Tuple[float, float, pd.DataFrame]:
|
||||
"""
|
||||
Helper function for QINI curve generation and QINI coefficient calculation.
|
||||
See documentation for "evaluate_qini" method for more details.
|
||||
Helper function for uplift curve generation and coefficient calculation.
|
||||
Calculates uplift curve points, integral, and errors on both points and integral.
|
||||
Also calculates appropriate critical value multipliers for confidence intervals (via multiplier bootstrap).
|
||||
See documentation for "drtester.evaluate_uplift" method for more details.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
@ -72,6 +75,8 @@ def calc_uplift(
|
|||
Array of percentiles over which the QINI curve should be constructed. Defaults to 5%-95% in intervals of 5%.
|
||||
metric: string
|
||||
String indicating whether to calculate TOC or QINI; should be one of ['toc', 'qini']
|
||||
n_bootstrap: integer, default 1000
|
||||
Number of bootstrap samples to run when calculating uniform confidence bands.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
@ -94,10 +99,19 @@ def calc_uplift(
|
|||
toc[it] = np.mean(dr_val[inds]) - ate # tau(q) := E[Y(1) - Y(0) | tau(X) >= q[it]] - E[Y(1) - Y(0)]
|
||||
toc_psi[it, :] = np.squeeze((dr_val - ate) * (inds / group_prob - 1) - toc[it])
|
||||
else:
|
||||
raise ValueError("Unsupported metric - must be one of ['toc', 'qini']")
|
||||
raise ValueError(f"Unsupported metric {metric!r} - must be one of ['toc', 'qini']")
|
||||
|
||||
toc_std[it] = np.sqrt(np.mean(toc_psi[it] ** 2) / n) # standard error of tau(q)
|
||||
|
||||
w = np.random.normal(0, 1, size=(n, n_bootstrap))
|
||||
mboot = (toc_psi / toc_std.reshape(-1, 1)) @ w / n
|
||||
|
||||
max_mboot = np.max(np.abs(mboot), axis=0)
|
||||
uniform_critical_value = np.percentile(max_mboot, 95)
|
||||
|
||||
min_mboot = np.min(mboot, axis=0)
|
||||
uniform_one_side_critical_value = np.abs(np.percentile(min_mboot, 5))
|
||||
|
||||
coeff_psi = np.sum(toc_psi[:-1] * np.diff(percentiles).reshape(-1, 1) / 100, 0)
|
||||
coeff = np.sum(toc[:-1] * np.diff(percentiles) / 100)
|
||||
coeff_stderr = np.sqrt(np.mean(coeff_psi ** 2) / n)
|
||||
|
@ -105,7 +119,9 @@ def calc_uplift(
|
|||
curve_df = pd.DataFrame({
|
||||
'Percentage treated': 100 - percentiles,
|
||||
'value': toc,
|
||||
'err': toc_std
|
||||
'err': toc_std,
|
||||
'uniform_critical_value': uniform_critical_value,
|
||||
'uniform_one_side_critical_value': uniform_one_side_critical_value
|
||||
})
|
||||
|
||||
return coeff, coeff_stderr, curve_df
|
||||
|
|
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
Загрузка…
Ссылка в новой задаче