CATE validation - uplift uniform confidence bands (#840)

Add support for multiplier bootstrap uniform confidence band error bars for uplift curves
2024-03-19 09:54:10 -07:00 · 2024-03-19 09:54:10 -07:00 · 27d3101e23
--- a/doc/reference.rst
+++ b/doc/reference.rst
@ -147,6 +147,20 @@ CATE Interpreters
    econml.cate_interpreter.SingleTreeCateInterpreter
    econml.cate_interpreter.SingleTreePolicyInterpreter

+.. _validation_api:
+
+CATE Validation
+---------------
+
+.. autosummary::
+    :toctree: _autosummary
+
+    econml.validate.DRTester
+    econml.validate.BLPEvaluationResults
+    econml.validate.CalibrationEvaluationResults
+    econml.validate.UpliftEvaluationResults
+    econml.validate.EvaluationResults
+
 .. _scorers_api:

 CATE Scorers
--- a/econml/tests/test_drtester.py
+++ b/econml/tests/test_drtester.py
@ -5,7 +5,7 @@ import pandas as pd
 import scipy.stats as st
 from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor

-from econml.validate.drtester import DRtester
+from econml.validate.drtester import DRTester
 from econml.dml import DML


@ -70,7 +70,7 @@ class TestDRTester(unittest.TestCase):
        ).fit(Y=Ytrain, T=Dtrain, X=Xtrain)

        # test the DR outcome difference
-        my_dr_tester = DRtester(
+        my_dr_tester = DRTester(
            model_regression=reg_y,
            model_propensity=reg_t,
            cate=cate
@ -123,7 +123,7 @@ class TestDRTester(unittest.TestCase):
        ).fit(Y=Ytrain, T=Dtrain, X=Xtrain)

        # test the DR outcome difference
-        my_dr_tester = DRtester(
+        my_dr_tester = DRTester(
            model_regression=reg_y,
            model_propensity=reg_t,
            cate=cate
@ -148,8 +148,8 @@ class TestDRTester(unittest.TestCase):
                self.assertRaises(ValueError, res.plot_toc, k)
            else:  # real treatment, k = 1
                self.assertTrue(res.plot_cal(k) is not None)
-                self.assertTrue(res.plot_qini(k) is not None)
-                self.assertTrue(res.plot_toc(k) is not None)
+                self.assertTrue(res.plot_qini(k, 'ucb2') is not None)
+                self.assertTrue(res.plot_toc(k, 'ucb1') is not None)

        self.assertLess(res_df.blp_pval.values[0], 0.05)  # heterogeneity
        self.assertGreater(res_df.cal_r_squared.values[0], 0)  # good R2
@ -171,7 +171,7 @@ class TestDRTester(unittest.TestCase):
        ).fit(Y=Ytrain, T=Dtrain, X=Xtrain)

        # test the DR outcome difference
-        my_dr_tester = DRtester(
+        my_dr_tester = DRTester(
            model_regression=reg_y,
            model_propensity=reg_t,
            cate=cate
@ -193,8 +193,8 @@ class TestDRTester(unittest.TestCase):
        for kwargs in [{}, {'Xval': Xval}]:
            with self.assertRaises(Exception) as exc:
                my_dr_tester.evaluate_cal(kwargs)
-            self.assertTrue(
-                str(exc.exception) == "Must fit nuisance models on training sample data to use calibration test"
+            self.assertEqual(
+                str(exc.exception), "Must fit nuisance models on training sample data to use calibration test"
            )

    def test_exceptions(self):
@ -212,7 +212,7 @@ class TestDRTester(unittest.TestCase):
        ).fit(Y=Ytrain, T=Dtrain, X=Xtrain)

        # test the DR outcome difference
-        my_dr_tester = DRtester(
+        my_dr_tester = DRTester(
            model_regression=reg_y,
            model_propensity=reg_t,
            cate=cate
@ -223,11 +223,11 @@ class TestDRTester(unittest.TestCase):
            with self.assertRaises(Exception) as exc:
                func()
            if func.__name__ == 'evaluate_cal':
-                self.assertTrue(
-                    str(exc.exception) == "Must fit nuisance models on training sample data to use calibration test"
+                self.assertEqual(
+                    str(exc.exception), "Must fit nuisance models on training sample data to use calibration test"
                )
            else:
-                self.assertTrue(str(exc.exception) == "Must fit nuisances before evaluating")
+                self.assertEqual(str(exc.exception), "Must fit nuisances before evaluating")

        my_dr_tester = my_dr_tester.fit_nuisance(
            Xval, Dval, Yval, Xtrain, Dtrain, Ytrain
@ -242,11 +242,11 @@ class TestDRTester(unittest.TestCase):
            with self.assertRaises(Exception) as exc:
                func()
            if func.__name__ == 'evaluate_blp':
-                self.assertTrue(
-                    str(exc.exception) == "CATE predictions not yet calculated - must provide Xval"
+                self.assertEqual(
+                    str(exc.exception), "CATE predictions not yet calculated - must provide Xval"
                )
            else:
-                self.assertTrue(str(exc.exception) ==
+                self.assertEqual(str(exc.exception),
                                 "CATE predictions not yet calculated - must provide both Xval, Xtrain")

        for func in [
@ -256,19 +256,19 @@ class TestDRTester(unittest.TestCase):
        ]:
            with self.assertRaises(Exception) as exc:
                func(Xval=Xval)
-            self.assertTrue(
-                str(exc.exception) == "CATE predictions not yet calculated - must provide both Xval, Xtrain")
+            self.assertEqual(
+                str(exc.exception), "CATE predictions not yet calculated - must provide both Xval, Xtrain")

        cal_res = my_dr_tester.evaluate_cal(Xval, Xtrain)
        self.assertGreater(cal_res.cal_r_squared[0], 0)  # good R2

        with self.assertRaises(Exception) as exc:
            my_dr_tester.evaluate_uplift(metric='blah')
-        self.assertTrue(
-            str(exc.exception) == "Unsupported metric - must be one of ['toc', 'qini']"
+        self.assertEqual(
+            str(exc.exception), "Unsupported metric 'blah' - must be one of ['toc', 'qini']"
        )

-        my_dr_tester = DRtester(
+        my_dr_tester = DRTester(
            model_regression=reg_y,
            model_propensity=reg_t,
            cate=cate
@ -278,5 +278,11 @@ class TestDRTester(unittest.TestCase):
        qini_res = my_dr_tester.evaluate_uplift(Xval, Xtrain)
        self.assertLess(qini_res.pvals[0], 0.05)

+        with self.assertRaises(Exception) as exc:
+            qini_res.plot_uplift(tmt=1, err_type='blah')
+        self.assertEqual(
+            str(exc.exception), "Invalid error type 'blah'; must be one of [None, 'ucb2', 'ucb1']"
+        )
+
        autoc_res = my_dr_tester.evaluate_uplift(Xval, Xtrain, metric='toc')
        self.assertLess(autoc_res.pvals[0], 0.05)
--- a/econml/validate/init.py
+++ b/econml/validate/init.py
@ -5,7 +5,9 @@
 A suite of validation methods for CATE models.
 """

-from .drtester import DRtester
+from .drtester import DRTester
+from .results import BLPEvaluationResults, CalibrationEvaluationResults, UpliftEvaluationResults, EvaluationResults


-__all__ = ['DRtester']
+__all__ = ['DRTester',
+           'BLPEvaluationResults', 'CalibrationEvaluationResults', 'UpliftEvaluationResults', 'EvaluationResults']
--- a/econml/validate/drtester.py
+++ b/econml/validate/drtester.py
@ -8,12 +8,13 @@ from sklearn.model_selection import cross_val_predict, StratifiedKFold, KFold
 from statsmodels.api import OLS
 from statsmodels.tools import add_constant

+from econml.utilities import deprecated
+
 from .results import CalibrationEvaluationResults, BLPEvaluationResults, UpliftEvaluationResults, EvaluationResults
 from .utils import calculate_dr_outcomes, calc_uplift


-class DRtester:
-
+class DRTester:
    """
    Validation tests for CATE models. Includes the best linear predictor (BLP) test as in Chernozhukov et al. (2022),
    the calibration test in Dwivedi et al. (2020), and the QINI coefficient as in Radcliffe (2007).
@ -46,47 +47,62 @@ class DRtester:
    The calibration r-squared metric is similar to the standard R-square score in that it can take any value
    less than or equal to 1, with scores closer to 1 indicating a better calibrated CATE model.

-    **QINI**
+    **Uplift Modeling**

    Units are ordered by predicted CATE values and a running measure of the average treatment effect in each cohort is
-    kept as we progress through ranks. The QINI coefficient is then the area under the resulting curve, with a value
-    of 0 interpreted as corresponding to a model with randomly assigned CATE coefficients. All calculations are
-    performed on validation dataset results, using the training set as input.
+    kept as we progress through ranks. The resulting TOC curve can then be plotted and its integral calculated and used
+    as a measure of true heterogeneity captured by the CATE model; this integral is referred to as the AUTOC (area
+    under TOC). The QINI curve is a variant of this curve that also incorporates treatment probability; its integral is
+    referred to as the QINI coefficient.

-    More formally, the QINI curve is given by the following function:
+    More formally, the TOC and QINI curves are given by the following functions:

    .. math::

+        \\tau_{TOC}(q) = \\mathrm{Cov}(
+            Y^{DR}(g,p),
+            \\frac{
+                \\mathbb{1}\\{\\hat{\\tau}(Z) \\geq \\hat{\\mu}(q)\\}
+            }{
+                \\mathrm{Pr}(\\hat{\\tau}(Z) \\geq \\hat{\\mu}(q))
+            }
+        )
+
        \\tau_{QINI}(q) = \\mathrm{Cov}(Y^{DR}(g,p), \\mathbb{1}\\{\\hat{\\tau}(Z) \\geq \\hat{\\mu}(q)\\})

    Where :math:`q` is the desired quantile, :math:`\\hat{\\mu}` is the quantile function, and :math:`\\hat{\\tau}` is
    the predicted CATE function.
    :math:`Y^{DR}(g,p)` refers to the doubly robust outcome difference (relative to control) for the given observation.

-    The QINI coefficient is then given by:
+    The AUTOC and QINI coefficients are then given by:

    .. math::

+        AUTOC = \\int_0^1 \\tau_{TOC}(q) dq
+
        QINI = \\int_0^1 \\tau_{QINI}(q) dq

    Parameters
    ----------
    model_regression: estimator
-        Nuisance model estimator used to fit the outcome to features. Must be able to implement `fit' and `predict'
+        Nuisance model estimator used to fit the outcome to features. Must be able to implement `fit` and `predict`
        methods

    model_propensity: estimator
-        Nuisance model estimator used to fit the treatment assignment to features. Must be able to implement `fit'
-        method and either `predict' (in the case of binary treatment) or `predict_proba' methods (in the case of
+        Nuisance model estimator used to fit the treatment assignment to features. Must be able to implement `fit`
+        method and either `predict` (in the case of binary treatment) or `predict_proba` methods (in the case of
        multiple categorical treatments).

-    n_splits: integer, default 5
-        Number of splits used to generate cross-validated predictions
+    cate: estimator
+        Fitted conditional average treatment effect (CATE) estimator to be validated.
+
+    cv: int or list, default 5
+        Splitter used for cross-validation. Can be either an integer (corresponding to the number of desired folds)
+        or a list of indices corresponding to membership in each fold.

    References
    ----------

-
    [Chernozhukov2022] V. Chernozhukov et al.
    Generic Machine Learning Inference on Heterogeneous Treatment Effects in Randomized Experiments
    arXiv preprint arXiv:1712.04802, 2022.
@ -97,7 +113,6 @@ class DRtester:
    arXiv preprint 	arXiv:2008.10109, 2020.
    `<https://arxiv.org/abs/2008.10109>`_

-
    [Radcliffe2007] N. Radcliffe
    Using control groups to target on predicted lift: Building and assessing uplift model.
    Direct Marketing Analytics Journal (2007), pages 14–21.
@ -175,7 +190,7 @@ class DRtester:
        Generates nuisance predictions and calculates doubly robust (DR) outcomes either by (1) cross-fitting in the
        validation sample, or (2) fitting in the training sample and applying to the validation sample. If Xtrain,
        Dtrain, and ytrain are all not None, then option (2) will be implemented, otherwise, option (1) will be
-        implemented. In order to use the `evaluate_cal' method then Xtrain, Dtrain, and ytrain must all be specified.
+        implemented. In order to use the `evaluate_cal` method then Xtrain, Dtrain, and ytrain must all be specified.

        Parameters
        ----------
@ -186,12 +201,12 @@ class DRtester:
            the control status be equal to 0, and all other treatments integers starting at 1.
        yval: vector of length n_val
            Outcomes for the validation sample
-        Xtrain: (n_train x k) matrix or vector of length n, default ``None``
+        Xtrain: (n_train x k) matrix or vector of length n, optional
            Features used in nuisance models for training sample
-        Dtrain: vector of length n_train, default ``None''
+        Dtrain: vector of length n_train, optional
            Treatment assignment of training sample. Control status must be minimum value. It is recommended to have
            the control status be equal to 0, and all other treatments integers starting at 1.
-        ytrain: vector of length n_train, defaul ``None``
+        ytrain: vector of length n_train, optional
            Outcomes for the training sample

        Returns
@ -332,7 +347,7 @@ class DRtester:
        ----------
        Xval: (n_val x n_treatment) matrix
            Validation set features to be used to predict (and potentially fit) DR outcomes in CATE model
-        Xtrain (n_train x n_treatment) matrix, defaul ``None``
+        Xtrain: (n_train x n_treatment) matrix, optional
            Training set features used to fit CATE model

        Returns
@ -359,11 +374,11 @@ class DRtester:

        Parameters
        ----------
-        Xval: (n_val x n_treatment) matrix, default ``None``
-            Validation sample features for CATE model. If not specified, then `fit_cate' method must already have been
+        Xval: (n_val x n_treatment) matrix, optional
+            Validation sample features for CATE model. If not specified, then `fit_cate` method must already have been
            implemented
-        Xtrain: (n_train x n_treatment) matrix, default ``None``
-            Training sample features for CATE model. If not specified, then `fit cate' method must already have been
+        Xtrain: (n_train x n_treatment) matrix, optional
+            Training sample features for CATE model. If not specified, then `fit_cate` method must already have been
            implemented (with Xtrain specified)
        n_groups: integer, default 4
            Number of quantile-based groups used to calculate calibration score.
@ -433,17 +448,17 @@ class DRtester:
        Xtrain: np.array = None
    ) -> BLPEvaluationResults:
        """
-        Implements the best linear predictor (BLP) test as in [Chernozhukov2022]. `fit_nusiance' method must already
+        Implements the best linear predictor (BLP) test as in [Chernozhukov2022]. `fit_nuisance` method must already
        be implemented.

        Parameters
        ----------
-        Xval: (n_val x k) matrix, default ``None''
-            Validation sample features for CATE model. If not specified, then `fit_cate' method must already have been
+        Xval: (n_val x k) matrix, optional
+            Validation sample features for CATE model. If not specified, then `fit_cate` method must already have been
            implemented
-        Xtrain: (n_train x k) matrix, default ``None''
+        Xtrain: (n_train x k) matrix, optional
            Training sample features for CATE model. If specified, then CATE is fitted on training sample and applied
-            to Xval. If specified, then Xtrain, Dtrain, Ytrain must have been specified in `fit_nuisance' method (and
+            to Xval. If specified, then Xtrain, Dtrain, Ytrain must have been specified in `fit_nuisance` method (and
            vice-versa)

        Returns
@ -489,29 +504,32 @@ class DRtester:
        Xval: np.array = None,
        Xtrain: np.array = None,
        percentiles: np.array = np.linspace(5, 95, 50),
-        metric: str = 'qini'
+        metric: str = 'qini',
+        n_bootstrap: int = 1000
    ) -> UpliftEvaluationResults:
        """
-        Calculates QINI coefficient for the given model as in Radcliffe (2007), where units are ordered by predicted
+        Calculates uplift curves and coefficients for the given model, where units are ordered by predicted
        CATE values and a running measure of the average treatment effect in each cohort is kept as we progress
-        through ranks. The QINI coefficient is then the area under the resulting curve, with a value of 0 interpreted
+        through ranks. The uplift coefficient is then the area under the resulting curve, with a value of 0 interpreted
        as corresponding to a model with randomly assigned CATE coefficients. All calculations are performed on
        validation dataset results, using the training set as input.

        Parameters
        ----------
-        Xval: (n_val x k) matrix, default ``None''
-            Validation sample features for CATE model. If not specified, then `fit_cate' method must already have been
+        Xval: (n_val x k) matrix, optional
+            Validation sample features for CATE model. If not specified, then `fit_cate` method must already have been
            implemented
-        Xtrain: (n_train x k) matrix, default ``None''
+        Xtrain: (n_train x k) matrix, optional
            Training sample features for CATE model. If specified, then CATE is fitted on training sample and applied
-            to Xval. If specified, then Xtrain, Dtrain, Ytrain must have been specified in `fit_nuisance' method (and
+            to Xval. If specified, then Xtrain, Dtrain, Ytrain must have been specified in `fit_nuisance` method (and
            vice-versa)
-        percentiles: one-dimensional array, default ``np.linspace(5, 95, 50)''
+        percentiles: one-dimensional array, default ``np.linspace(5, 95, 50)``
            Array of percentiles over which the QINI curve should be constructed. Defaults to 5%-95% in intervals of
            5%.
        metric: string, default 'qini'
            Which type of uplift curve to evaluate. Must be one of ['toc', 'qini']
+        n_bootstrap: integer, default 1000
+            Number of bootstrap samples to run when calculating uniform confidence bands.

        Returns
        -------
@ -532,7 +550,8 @@ class DRtester:
                self.cate_preds_val_,
                self.dr_val_,
                percentiles,
-                metric
+                metric,
+                n_bootstrap
            )
            coeffs = [coeff]
            errs = [err]
@ -546,7 +565,8 @@ class DRtester:
                    self.cate_preds_val_[:, k],
                    self.dr_val_[:, k],
                    percentiles,
-                    metric
+                    metric,
+                    n_bootstrap
                )
                coeffs.append(coeff)
                errs.append(err)
@ -568,20 +588,25 @@ class DRtester:
        self,
        Xval: np.array = None,
        Xtrain: np.array = None,
-        n_groups: int = 4
+        n_groups: int = 4,
+        n_bootstrap: int = 1000
    ) -> EvaluationResults:
        """
-        Implements the best linear prediction (`evaluate_blp'), calibration (`evaluate_cal'), uplift curve
-        ('evaluate_uplift') methods
+        Implements the best linear predictor (`evaluate_blp`), calibration (`evaluate_cal`), and uplift curve
+        (`evaluate_uplift`) methods

        Parameters
        ----------
-        Xval: (n_cal x k) matrix, default ``None''
-            Validation sample features for CATE model. If not specified, then `fit_cate' method must already have been
+        Xval: (n_cal x k) matrix, optional
+            Validation sample features for CATE model. If not specified, then `fit_cate` method must already have been
            implemented
-        Xtrain: (n_train x k) matrix, default ``None''
-            Training sample features for CATE model. If not specified, then `fit_cate' method must already have been
+        Xtrain: (n_train x k) matrix, optional
+            Training sample features for CATE model. If not specified, then `fit_cate` method must already have been
            implemented
+        n_groups: integer, default 4
+            Number of quantile-based groups used to calculate calibration score.
+        n_bootstrap: integer, default 1000
+            Number of bootstrap samples to run when calculating uniform confidence bands for uplift curves.

        Returns
        -------
@ -595,8 +620,8 @@ class DRtester:

        blp_res = self.evaluate_blp()
        cal_res = self.evaluate_cal(n_groups=n_groups)
-        qini_res = self.evaluate_uplift(metric='qini')
-        toc_res = self.evaluate_uplift(metric='toc')
+        qini_res = self.evaluate_uplift(metric='qini', n_bootstrap=n_bootstrap)
+        toc_res = self.evaluate_uplift(metric='toc', n_bootstrap=n_bootstrap)

        self.res = EvaluationResults(
            blp_res=blp_res,
@ -606,3 +631,9 @@ class DRtester:
        )

        return self.res
+
+
+@deprecated("DRtester has been renamed 'DRTester' and the old name has been deprecated and will be removed "
+            "in a future release. Please use 'DRTester' instead.")
+class DRtester(DRTester):
+    pass
--- a/econml/validate/results.py
+++ b/econml/validate/results.py
@ -20,6 +20,7 @@ class CalibrationEvaluationResults:
    treatments: list or numpy array of floats
        Sequence of treatment labels
    """
+
    def __init__(
        self,
        cal_r_squared: np.array,
@ -99,6 +100,7 @@ class BLPEvaluationResults:
    treatments: list or numpy array of floats
       Sequence of treatment labels
    """
+
    def __init__(
        self,
        params: List[float],
@ -154,6 +156,7 @@ class UpliftEvaluationResults:
        Dictionary mapping treatment levels to dataframes containing
        necessary data for plotting uplift curves
    """
+
    def __init__(
        self,
        params: List[float],
@ -188,7 +191,7 @@ class UpliftEvaluationResults:
        }).round(3)
        return res

-    def plot_uplift(self, tmt: Any):
+    def plot_uplift(self, tmt: Any, err_type: str = None):
        """
        Plots uplift curves.

@ -197,6 +200,10 @@ class UpliftEvaluationResults:
        tmt: any (sortable)
            Name of treatment to plot.

+        err_type: str, optional
+            Type of error bars to plot. Accepted values are None (pointwise normal 95% intervals, 1.96 * err),
+            'ucb2' (two-sided uniform confidence band), or 'ucb1' (one-sided uniform confidence band).
+
        Returns
        -------
        matplotlib plot with percentage treated on x-axis and uplift metric (and 95% CI) on y-axis
@ -205,10 +212,30 @@ class UpliftEvaluationResults:
            raise ValueError(f'Invalid treatment; must be one of {self.treatments[1:]}')

        df = self.curves[tmt].copy()
+
+        if err_type is None:
            df['95_err'] = 1.96 * df['err']
+        elif err_type == 'ucb2':
+            df['95_err'] = df['uniform_critical_value'] * df['err']
+        elif err_type == 'ucb1':
+            df['95_err'] = df['uniform_one_side_critical_value'] * df['err']
+        else:
+            raise ValueError(f"Invalid error type {err_type!r}; must be one of [None, 'ucb2', 'ucb1']")
+
        res = self.summary()
        coeff = round(res.loc[res['treatment'] == tmt]['est'].values[0], 3)
        err = round(res.loc[res['treatment'] == tmt]['se'].values[0], 3)
+
+        if err_type == 'ucb1':
+            fig = df.plot(
+                kind='scatter',
+                x='Percentage treated',
+                y='value',
+                yerr=[[df['95_err'], np.zeros(len(df))]],
+                ylabel='Gain over Random',
+                title=f"Treatment = {tmt}, Integral = {coeff} +/- {err}"
+            )
+        else:
            fig = df.plot(
                kind='scatter',
                x='Percentage treated',
@ -239,6 +266,7 @@ class EvaluationResults:
    toc_res: UpliftEvaluationResults object
       Results object for TOC test
    """
+
    def __init__(
        self,
        cal_res: CalibrationEvaluationResults,
@ -290,7 +318,7 @@ class EvaluationResults:
        """
        return self.cal.plot_cal(tmt)

-    def plot_qini(self, tmt: int):
+    def plot_qini(self, tmt: int, err_type: str = None):
        """
        Plots QINI curves.

@ -299,13 +327,17 @@ class EvaluationResults:
        tmt: integer
            Treatment level to plot

+        err_type: str, optional
+            Type of error bars to plot. Accepted values are None (pointwise normal 95% intervals),
+            'ucb2' (two-sided uniform confidence band), or 'ucb1' (one-sided uniform confidence band).
+
        Returns
        -------
        matplotlib plot with percentage treated on x-axis and QINI value (and 95% CI) on y-axis
        """
-        return self.qini.plot_uplift(tmt)
+        return self.qini.plot_uplift(tmt, err_type)

-    def plot_toc(self, tmt: int):
+    def plot_toc(self, tmt: int, err_type: str = None):
        """
        Plots TOC curves.

@ -314,8 +346,12 @@ class EvaluationResults:
        tmt: integer
            Treatment level to plot

+        err_type: str, optional
+            Type of error bars to plot. Accepted values are None (pointwise normal 95% intervals),
+            'ucb2' (two-sided uniform confidence band), or 'ucb1' (one-sided uniform confidence band).
+
        Returns
        -------
        matplotlib plot with percentage treated on x-axis and TOC value (and 95% CI) on y-axis
        """
-        return self.toc.plot_uplift(tmt)
+        return self.toc.plot_uplift(tmt, err_type)
--- a/econml/validate/utils.py
+++ b/econml/validate/utils.py
@ -53,11 +53,14 @@ def calc_uplift(
    cate_preds_val: np.array,
    dr_val: np.array,
    percentiles: np.array,
-    metric: str
+    metric: str,
+    n_bootstrap: int = 1000
 ) -> Tuple[float, float, pd.DataFrame]:
    """
-    Helper function for QINI curve generation and QINI coefficient calculation.
-    See documentation for "evaluate_qini" method for more details.
+    Helper function for uplift curve generation and coefficient calculation.
+    Calculates uplift curve points, integral, and errors on both points and integral.
+    Also calculates appropriate critical value multipliers for confidence intervals (via multiplier bootstrap).
+    See documentation for the "DRTester.evaluate_uplift" method for more details.

    Parameters
    ----------
@ -72,6 +75,8 @@ def calc_uplift(
        Array of percentiles over which the QINI curve should be constructed. Defaults to 5%-95% in intervals of 5%.
    metric: string
        String indicating whether to calculate TOC or QINI; should be one of ['toc', 'qini']
+    n_bootstrap: integer, default 1000
+        Number of bootstrap samples to run when calculating uniform confidence bands.

    Returns
    -------
@ -94,10 +99,19 @@ def calc_uplift(
            toc[it] = np.mean(dr_val[inds]) - ate  # tau(q) := E[Y(1) - Y(0) | tau(X) >= q[it]] - E[Y(1) - Y(0)]
            toc_psi[it, :] = np.squeeze((dr_val - ate) * (inds / group_prob - 1) - toc[it])
        else:
-            raise ValueError("Unsupported metric - must be one of ['toc', 'qini']")
+            raise ValueError(f"Unsupported metric {metric!r} - must be one of ['toc', 'qini']")

        toc_std[it] = np.sqrt(np.mean(toc_psi[it] ** 2) / n)  # standard error of tau(q)

+    w = np.random.normal(0, 1, size=(n, n_bootstrap))
+    mboot = (toc_psi / toc_std.reshape(-1, 1)) @ w / n
+
+    max_mboot = np.max(np.abs(mboot), axis=0)
+    uniform_critical_value = np.percentile(max_mboot, 95)
+
+    min_mboot = np.min(mboot, axis=0)
+    uniform_one_side_critical_value = np.abs(np.percentile(min_mboot, 5))
+
    coeff_psi = np.sum(toc_psi[:-1] * np.diff(percentiles).reshape(-1, 1) / 100, 0)
    coeff = np.sum(toc[:-1] * np.diff(percentiles) / 100)
    coeff_stderr = np.sqrt(np.mean(coeff_psi ** 2) / n)
@ -105,7 +119,9 @@ def calc_uplift(
    curve_df = pd.DataFrame({
        'Percentage treated': 100 - percentiles,
        'value': toc,
-        'err': toc_std
+        'err': toc_std,
+        'uniform_critical_value': uniform_critical_value,
+        'uniform_one_side_critical_value': uniform_one_side_critical_value
    })

    return coeff, coeff_stderr, curve_df
--- a/validation.ipynb
+++ b/validation.ipynb