From cce3ded03c2e72234b46e300666bda291428f2a0 Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Wed, 17 Jul 2024 18:45:34 -0400 Subject: [PATCH] Fix E731 Signed-off-by: Keith Battocchi --- ...utomated Machine Learning For EconML.ipynb | 6 +- ...nd Orthogonal Random Forest Examples.ipynb | 18 +- ...sal Model Selection with the RScorer.ipynb | 18 +- notebooks/Choosing First Stage Models.ipynb | 21 +- ... A Software Company - EconML + DoWhy.ipynb | 3 +- ...nt Attribution at A Software Company.ipynb | 675 +++++------ ...line Travel Company - EconML + DoWhy.ipynb | 3 +- ... Testing at An Online Travel Company.ipynb | 575 ++++----- .../Double Machine Learning Examples.ipynb | 18 +- notebooks/ForestLearners Basic Example.ipynb | 9 +- notebooks/Generalized Random Forests.ipynb | 17 +- notebooks/Interpretability with SHAP.ipynb | 7 +- notebooks/Metalearners Examples.ipynb | 1028 +++++++++-------- notebooks/OrthoIV and DRIV Examples.ipynb | 12 +- pyproject.toml | 1 - 15 files changed, 1219 insertions(+), 1192 deletions(-) diff --git a/notebooks/AutomatedML/Automated Machine Learning For EconML.ipynb b/notebooks/AutomatedML/Automated Machine Learning For EconML.ipynb index 3c7a34f7..42adc2f1 100644 --- a/notebooks/AutomatedML/Automated Machine Learning For EconML.ipynb +++ b/notebooks/AutomatedML/Automated Machine Learning For EconML.ipynb @@ -310,11 +310,13 @@ "# Outcome support\n", "support_Y = np.random.choice(np.arange(n_w), size=support_size, replace=False)\n", "coefs_Y = np.random.uniform(0, 1, size=support_size)\n", - "epsilon_sample = lambda n: np.random.uniform(-1, 1, size=n)\n", + "def epsilon_sample(n):\n", + " return np.random.uniform(-1, 1, size=n)\n", "# Treatment support\n", "support_T = support_Y\n", "coefs_T = np.random.uniform(0, 1, size=support_size)\n", - "eta_sample = lambda n: np.random.uniform(-1, 1, size=n)\n", + "def eta_sample(n):\n", + " return np.random.uniform(-1, 1, size=n)\n", "\n", "# Generate controls, covariates, treatments and outcomes\n", "W = np.random.normal(0, 1, size=(n, n_w))\n", diff --git a/notebooks/Causal Forest and Orthogonal Random Forest Examples.ipynb b/notebooks/Causal Forest and Orthogonal Random Forest Examples.ipynb index 673c669e..26e37fd8 100644 --- a/notebooks/Causal Forest and Orthogonal Random Forest Examples.ipynb +++ b/notebooks/Causal Forest and Orthogonal Random Forest Examples.ipynb @@ -125,11 +125,13 @@ "# Outcome support\n", "support_Y = np.random.choice(range(n_w), size=support_size, replace=False)\n", "coefs_Y = np.random.uniform(0, 1, size=support_size)\n", - "epsilon_sample = lambda n: np.random.uniform(-1, 1, size=n)\n", + "def epsilon_sample(n):\n", + " return np.random.uniform(-1, 1, size=n)\n", "# Treatment support \n", "support_T = support_Y\n", "coefs_T = np.random.uniform(0, 1, size=support_size)\n", - "eta_sample = lambda n: np.random.uniform(-1, 1, size=n) \n", + "def eta_sample(n):\n", + " return np.random.uniform(-1, 1, size=n) \n", "\n", "# Generate controls, covariates, treatments and outcomes\n", "W = np.random.normal(0, 1, size=(n, n_w))\n", @@ -552,11 +554,13 @@ "# Outcome support\n", "support_Y = np.random.choice(range(n_w), size=support_size, replace=False)\n", "coefs_Y = np.random.uniform(0, 1, size=support_size)\n", - "epsilon_sample = lambda n: np.random.uniform(-1, 1, size=n)\n", + "def epsilon_sample(n):\n", + " return np.random.uniform(-1, 1, size=n)\n", "# Treatment support\n", "support_T = support_Y\n", "coefs_T = np.random.uniform(0, 1, size=support_size)\n", - "eta_sample = lambda n: np.random.uniform(-1, 1, size=n) \n", + "def eta_sample(n):\n", + " return np.random.uniform(-1, 1, size=n) \n", "\n", "# Generate controls, covariates, treatments and outcomes\n", "W = np.random.normal(0, 1, size=(n, n_w))\n", @@ -895,11 +899,13 @@ " # Outcome support\n", " support_Y = np.random.choice(range(n_w), size=support_size, replace=False)\n", " coefs_Y = np.random.uniform(0, 1, size=support_size)\n", - " epsilon_sample = lambda n: np.random.uniform(-1, 1, size=n)\n", + " def epsilon_sample(n):\n", + " return np.random.uniform(-1, 1, size=n)\n", " # Treatment support \n", " support_T = support_Y\n", " coefs_T = np.random.uniform(0, 1, size=(support_size, n_treatments))\n", - " eta_sample = lambda n: np.random.uniform(-1, 1, size=n) \n", + " def eta_sample(n):\n", + " return np.random.uniform(-1, 1, size=n) \n", " # Generate controls, covariates, treatments and outcomes\n", " W = np.random.normal(0, 1, size=(n, n_w))\n", " X = np.random.uniform(0, 1, size=(n, n_x))\n", diff --git a/notebooks/Causal Model Selection with the RScorer.ipynb b/notebooks/Causal Model Selection with the RScorer.ipynb index 3caf4095..c964252f 100644 --- a/notebooks/Causal Model Selection with the RScorer.ipynb +++ b/notebooks/Causal Model Selection with the RScorer.ipynb @@ -119,11 +119,13 @@ "# Outcome support\n", "support_Y = np.random.choice(range(n_x), size=support_size, replace=False)\n", "coefs_Y = np.random.uniform(0, 1, size=support_size)\n", - "epsilon_sample = lambda n:np.random.uniform(-1, 1, size=n)\n", + "def epsilon_sample(n):\n", + " return np.random.uniform(-1, 1, size=n)\n", "# Treatment support\n", "support_T = support_Y\n", "coefs_T = np.random.uniform(0, 1, size=support_size)\n", - "eta_sample = lambda n: np.random.uniform(-1, 1, size=n) \n", + "def eta_sample(n):\n", + " return np.random.uniform(-1, 1, size=n) \n", "\n", "# Generate controls, covariates, treatments and outcomes\n", "X = np.random.uniform(0, 1, size=(n, n_x))\n", @@ -155,8 +157,10 @@ "metadata": {}, "outputs": [], "source": [ - "reg = lambda: RandomForestRegressor(min_samples_leaf=10)\n", - "clf = lambda: RandomForestClassifier(min_samples_leaf=10)" + "def reg():\n", + " return RandomForestRegressor(min_samples_leaf=10)\n", + "def clf():\n", + " return RandomForestClassifier(min_samples_leaf=10)" ] }, { @@ -421,8 +425,10 @@ "metadata": {}, "outputs": [], "source": [ - "reg = lambda: RandomForestRegressor(min_samples_leaf=10, random_state=123)\n", - "clf = lambda: RandomForestClassifier(min_samples_leaf=10, random_state=123)" + "def reg():\n", + " return RandomForestRegressor(min_samples_leaf=10, random_state=123)\n", + "def clf():\n", + " return RandomForestClassifier(min_samples_leaf=10, random_state=123)" ] }, { diff --git a/notebooks/Choosing First Stage Models.ipynb b/notebooks/Choosing First Stage Models.ipynb index e66c6a6b..7388e8c3 100644 --- a/notebooks/Choosing First Stage Models.ipynb +++ b/notebooks/Choosing First Stage Models.ipynb @@ -83,7 +83,8 @@ "p = 10\n", "W = np.random.uniform(size=(n, p))\n", "X = np.random.uniform(size=(n, 1))\n", - "true_effect = lambda x: x[:, 0] ** 2\n", + "def true_effect(x):\n", + " return x[:, 0] ** 2\n", "T = W[:, 0] + W[:, 1] ** 2 + np.random.uniform(-1, 1, size=n)\n", "Y = (\n", " true_effect(X) * T\n", @@ -158,12 +159,8 @@ } ], "source": [ - "first_stage = lambda: GridSearchCV(\n", - " estimator=GradientBoostingRegressor(),\n", - " param_grid={\"max_depth\": [3, 5, None], \"n_estimators\": (50, 100, 200)},\n", - " cv=2,\n", - " n_jobs=-1,\n", - ")\n", + "def first_stage():\n", + " return GridSearchCV(estimator=GradientBoostingRegressor(), param_grid={\"max_depth\": [3, 5, None], \"n_estimators\": (50, 100, 200)}, cv=2, n_jobs=-1)\n", "est = LinearDML(\n", " model_y=first_stage(),\n", " model_t=first_stage(),\n", @@ -381,14 +378,8 @@ } ], "source": [ - "first_stage = lambda: GridSearchCVList(\n", - " [Lasso(max_iter=10000), GradientBoostingRegressor()],\n", - " param_grid_list=[\n", - " {\"alpha\": [0.001, 0.01, 0.1, 1, 10]},\n", - " {\"max_depth\": [3, 5, None], \"n_estimators\": [50, 100, 200]},\n", - " ],\n", - " cv=2,\n", - ")" + "def first_stage():\n", + " return GridSearchCVList([Lasso(max_iter=10000), GradientBoostingRegressor()], param_grid_list=[{\"alpha\": [0.001, 0.01, 0.1, 1, 10]}, {\"max_depth\": [3, 5, None], \"n_estimators\": [50, 100, 200]}], cv=2)" ] }, { diff --git a/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company - EconML + DoWhy.ipynb b/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company - EconML + DoWhy.ipynb index 710f637a..34a04e9c 100644 --- a/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company - EconML + DoWhy.ipynb +++ b/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company - EconML + DoWhy.ipynb @@ -397,7 +397,8 @@ "outputs": [], "source": [ "# Define underlying treatment effect function\n", - "TE_fn = lambda X: np.hstack([5000 + 2 / 100 * X, 5 / 100 * X])\n", + "def TE_fn(X):\n", + " return np.hstack([5000 + 2 / 100 * X, 5 / 100 * X])\n", "true_TE = TE_fn(X)\n", "\n", "# Define true coefficients for the three treatments\n", diff --git a/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company.ipynb b/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company.ipynb index 2f8d05b5..0e444e64 100644 --- a/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company.ipynb +++ b/notebooks/CustomerScenarios/Case Study - Multi-investment Attribution at A Software Company.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "\n", "\n", @@ -12,11 +13,11 @@ "In an ideal world, the startup would run several randomized experiments where each customer would receive a random assortment of investments. However, this can be logistically prohibitive or strategically unsound: the startup might not have the resources to design such experiments or they might not want to risk losing out on big opportunities due to lack of incentives.\n", "\n", "In this customer scenario walkthrough, we show how tools from the [EconML](https://aka.ms/econml) library can use historical investment data to learn the effects of multiple investments." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### Summary\n", "\n", @@ -26,11 +27,11 @@ "4. [Understand Treatment Effects with EconML](#Understand-Treatment-Effects-with-EconML)\n", "5. [Make Policy Decisions with EconML](#Make-Policy-Decisions-with-EconML)\n", "6. [Conclusions](#Conclusions)" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "# Background\n", "\n", @@ -43,12 +44,13 @@ "The startup faces two challenges: 1) the dataset is biased because historically the larger customers received the most incentives and 2) the observed outcome combines effects from two different investments. Thus, they need a causal model that can accommodate multiple concurrent interventions. \n", "\n", "**Solution:** EconML’s `Doubly Robust Learner` model jointly estimates the effects of multiple discrete treatments. The model uses flexible functions of observed customer features to filter out spurious correlations in existing data and deliver the causal effect of each intervention on revenue.\n" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 1, + "metadata": {}, + "outputs": [], "source": [ "# Some imports to get us started\r\n", "import warnings\r\n", @@ -69,12 +71,11 @@ "import seaborn as sns\r\n", "\r\n", "%matplotlib inline" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "# Data\n", "\n", @@ -99,46 +100,26 @@ "**Revenue** | Y | \\\\$ Revenue from customer given by the amount of software purchased\n", "\n", "**To protect the privacy of the startup's customers, the data used in this scenario is synthetically generated and the feature distributions don't correspond to real distributions. However, the feature names have preserved their names and meaning.*" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 2, + "metadata": {}, + "outputs": [], "source": [ "# Import the sample multi-attribution data\n", "file_url = \"https://msalicedatapublic.z5.web.core.windows.net/datasets/ROI/multi_attribution_sample.csv\"\n", "multi_data = pd.read_csv(file_url)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": 3, - "source": [ - "# Data sample\n", - "multi_data.head()" - ], + "metadata": {}, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " Global Flag Major Flag SMC Flag Commercial Flag IT Spend \\\n", - "0 1 0 1 0 45537 \n", - "1 0 0 1 1 20842 \n", - "2 0 0 0 1 82171 \n", - "3 0 0 0 0 30288 \n", - "4 0 0 1 0 25930 \n", - "\n", - " Employee Count PC Count Size Tech Support Discount Revenue \n", - "0 26 26 152205 0 1 17688.36300 \n", - "1 107 70 159038 0 1 14981.43559 \n", - "2 10 7 264935 1 1 32917.13894 \n", - "3 40 39 77522 1 1 14773.76855 \n", - "4 37 43 91446 1 1 17098.69823 " - ], "text/html": [ "
\n", "