Merge pull request #24 from sonumehta/master

Added a notebook - using dowhy for ihdp dataset
This commit is contained in:
Amit Sharma 2019-01-21 15:05:41 +05:30 committed by GitHub
Parents 408f0473bc 625fab9c43
Commit b817812f9d
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
1 changed file with 708 additions and 0 deletions


@@ -0,0 +1,708 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Do why example on ihdp(Infant Health and Development Program) dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# importing required libraries\n",
"import os, sys\n",
"sys.path.append(os.path.abspath(\"../../\"))\n",
"import dowhy\n",
"from dowhy.do_why import CausalModel\n",
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Loading Data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>treatment</th>\n",
" <th>y_factual</th>\n",
" <th>y_cfactual</th>\n",
" <th>mu0</th>\n",
" <th>mu1</th>\n",
" <th>x1</th>\n",
" <th>x2</th>\n",
" <th>x3</th>\n",
" <th>x4</th>\n",
" <th>x5</th>\n",
" <th>...</th>\n",
" <th>x16</th>\n",
" <th>x17</th>\n",
" <th>x18</th>\n",
" <th>x19</th>\n",
" <th>x20</th>\n",
" <th>x21</th>\n",
" <th>x22</th>\n",
" <th>x23</th>\n",
" <th>x24</th>\n",
" <th>x25</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>5.599916</td>\n",
" <td>4.318780</td>\n",
" <td>3.268256</td>\n",
" <td>6.854457</td>\n",
" <td>-0.528603</td>\n",
" <td>-0.343455</td>\n",
" <td>1.128554</td>\n",
" <td>0.161703</td>\n",
" <td>-0.316603</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>6.875856</td>\n",
" <td>7.856495</td>\n",
" <td>6.636059</td>\n",
" <td>7.562718</td>\n",
" <td>-1.736945</td>\n",
" <td>-1.802002</td>\n",
" <td>0.383828</td>\n",
" <td>2.244320</td>\n",
" <td>-0.629189</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>2.996273</td>\n",
" <td>6.633952</td>\n",
" <td>1.570536</td>\n",
" <td>6.121617</td>\n",
" <td>-0.807451</td>\n",
" <td>-0.202946</td>\n",
" <td>-0.360898</td>\n",
" <td>-0.879606</td>\n",
" <td>0.808706</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>1.366206</td>\n",
" <td>5.697239</td>\n",
" <td>1.244738</td>\n",
" <td>5.889125</td>\n",
" <td>0.390083</td>\n",
" <td>0.596582</td>\n",
" <td>-1.850350</td>\n",
" <td>-0.879606</td>\n",
" <td>-0.004017</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>1.963538</td>\n",
" <td>6.202582</td>\n",
" <td>1.685048</td>\n",
" <td>6.191994</td>\n",
" <td>-1.045229</td>\n",
" <td>-0.602710</td>\n",
" <td>0.011465</td>\n",
" <td>0.161703</td>\n",
" <td>0.683672</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 30 columns</p>\n",
"</div>"
],
"text/plain": [
" treatment y_factual y_cfactual mu0 mu1 x1 x2 \\\n",
"0 1 5.599916 4.318780 3.268256 6.854457 -0.528603 -0.343455 \n",
"1 0 6.875856 7.856495 6.636059 7.562718 -1.736945 -1.802002 \n",
"2 0 2.996273 6.633952 1.570536 6.121617 -0.807451 -0.202946 \n",
"3 0 1.366206 5.697239 1.244738 5.889125 0.390083 0.596582 \n",
"4 0 1.963538 6.202582 1.685048 6.191994 -1.045229 -0.602710 \n",
"\n",
" x3 x4 x5 ... x16 x17 x18 x19 x20 x21 x22 x23 \\\n",
"0 1.128554 0.161703 -0.316603 ... 1 1 1 1 0 0 0 0 \n",
"1 0.383828 2.244320 -0.629189 ... 1 1 1 1 0 0 0 0 \n",
"2 -0.360898 -0.879606 0.808706 ... 1 0 1 1 0 0 0 0 \n",
"3 -1.850350 -0.879606 -0.004017 ... 1 0 1 1 0 0 0 0 \n",
"4 0.011465 0.161703 0.683672 ... 1 1 1 1 0 0 0 0 \n",
"\n",
" x24 x25 \n",
"0 0 0 \n",
"1 0 0 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 \n",
"\n",
"[5 rows x 30 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data= pd.read_csv(\"C:/Users/someh/Desktop/dowhy/datasets/CEVAE/datasets/IHDP/csv/ihdp_npci_1.csv\", header = None)\n",
"col = [\"treatment\", \"y_factual\", \"y_cfactual\", \"mu0\", \"mu1\" ,]\n",
"\n",
"for i in range(1,26):\n",
" col.append(\"x\"+str(i))\n",
"data.columns = col\n",
"data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 1.Model"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:dowhy.do_why:Causal Graph not provided. DoWhy will construct a graph based on data inputs.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model to find the causal effect of treatment treatment on outcome y_factual\n"
]
}
],
"source": [
"# Create a causal model from the data and given common causes.\n",
"xs = \"\"\n",
"for i in range(1,26):\n",
" xs += (\"x\"+str(i)+\"+\")\n",
" \n",
"model=CausalModel(\n",
" data = data,\n",
" treatment='treatment',\n",
" outcome='y_factual',\n",
" common_causes=xs.split('+')\n",
" )\n"
]
},
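{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, the automatically constructed graph can be rendered. This is a sketch that assumes the installed DoWhy version provides `CausalModel.view_model()` and that a graph-rendering backend (e.g. pygraphviz) is available."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: render the causal graph that DoWhy constructed from the\n",
"# treatment, outcome and common causes above. Assumes view_model() exists in\n",
"# this DoWhy version and that a rendering backend such as pygraphviz is installed.\n",
"model.view_model()"
]
},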
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 2.Identify"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:dowhy.causal_identifier:Common causes of treatment and outcome:{'', 'x1', 'x5', 'x6', 'x19', 'x21', 'x14', 'x12', 'x16', 'x3', 'x9', 'x8', 'x4', 'x17', 'x7', 'x23', 'x10', 'x13', 'x25', 'x22', 'x15', 'x20', 'x18', 'x24', 'x11', 'x2'}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'observed': 'no'}\n",
"There are unobserved common causes. Causal effect cannot be identified.\n",
"WARN: Do you want to continue by ignoring these unobserved confounders? [y/n] \n",
"Please respond with 'y' or 'n'\n",
"WARN: Do you want to continue by ignoring these unobserved confounders? [y/n] n\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:dowhy.causal_identifier:Instrumental variables for treatment and outcome:[]\n"
]
}
],
"source": [
"#Identify the causal effect\n",
"identified_estimand = model.identify_effect()\n",
"\n"
]
},
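{
"cell_type": "markdown",
"metadata": {},
"source": [
"The identified estimand can be inspected before estimation; printing it shows the backdoor expression over `x1`-`x25` and the unconfoundedness assumption it relies on (the same text also appears in the estimate summaries below)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the identified estimand (backdoor expression and its assumptions)\n",
"print(identified_estimand)"
]
},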
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 3. Estimate (using different methods)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 3.1 Using Linear Regression"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:dowhy.causal_estimator:INFO: Using Linear Regression Estimator\n",
"INFO:dowhy.causal_estimator:b: y_factual~treatment+x1+x5+x6+x19+x21+x14+x12+x16+x3+x9+x8+x4+x17+x7+x23+x10+x13+x25+x22+x15+x20+x18+x24+x11+x2\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"LinearRegressionEstimator\n",
"*** Causal Estimate ***\n",
"\n",
"## Target estimand\n",
"Estimand type: ate\n",
"### Estimand : 1\n",
"Estimand name: backdoor\n",
"Estimand expression:\n",
" d \n",
"──────────(Expectation(y_factual|x1,x5,x6,x19,x21,x14,x12,x16,x3,x9,x8,x4,x17,\n",
"dtreatment \n",
"\n",
" \n",
"x7,x23,x10,x13,x25,x22,x15,x20,x18,x24,x11,x2))\n",
" \n",
"Estimand assumption 1, Unconfoundedness: If U→treatment and U→y_factual then P(y_factual|treatment,x1,x5,x6,x19,x21,x14,x12,x16,x3,x9,x8,x4,x17,x7,x23,x10,x13,x25,x22,x15,x20,x18,x24,x11,x2,U) = P(y_factual|treatment,x1,x5,x6,x19,x21,x14,x12,x16,x3,x9,x8,x4,x17,x7,x23,x10,x13,x25,x22,x15,x20,x18,x24,x11,x2)\n",
"### Estimand : 2\n",
"Estimand name: iv\n",
"No such variable found!\n",
"\n",
"## Realized estimand\n",
"b: y_factual~treatment+x1+x5+x6+x19+x21+x14+x12+x16+x3+x9+x8+x4+x17+x7+x23+x10+x13+x25+x22+x15+x20+x18+x24+x11+x2\n",
"## Estimate\n",
"Value: 3.928671750872711\n",
"\n",
"## Statistical Significance\n",
"p-value: 0.0\n",
"\n",
"Causal Estimate is 3.928671750872711\n",
"ATE 4.021121012430832\n"
]
}
],
"source": [
"# Estimate the causal effect and compare it with Average Treatment Effect\n",
"estimate = model.estimate_effect(identified_estimand,\n",
" method_name=\"backdoor.linear_regression\", test_significance=True\n",
")\n",
"\n",
"print(estimate)\n",
"\n",
"print(\"Causal Estimate is \" + str(estimate.value))\n",
"data_1 = data[data[\"treatment\"]==1]\n",
"data_0 = data[data[\"treatment\"]==0]\n",
"\n",
"print(\"ATE\", np.mean(data_1[\"y_factual\"])- np.mean(data_0[\"y_factual\"]))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 3.2 Using Propensity Score Matching"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:dowhy.causal_estimator:INFO: Using Propensity Score Matching Estimator\n",
"INFO:dowhy.causal_estimator:b: y_factual~treatment+x1+x5+x6+x19+x21+x14+x12+x16+x3+x9+x8+x4+x17+x7+x23+x10+x13+x25+x22+x15+x20+x18+x24+x11+x2\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"PropensityScoreMatchingEstimator\n",
"Causal Estimate is 3.8436503200364402\n",
"ATE 4.021121012430832\n"
]
}
],
"source": [
"estimate = model.estimate_effect(identified_estimand,\n",
" method_name=\"backdoor.propensity_score_matching\"\n",
")\n",
"\n",
"print(\"Causal Estimate is \" + str(estimate.value))\n",
"\n",
"print(\"ATE\", np.mean(data_1[\"y_factual\"])- np.mean(data_0[\"y_factual\"]))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 3.3 Using Propensity Score Stratification"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:dowhy.causal_estimator:INFO: Using Propensity Score Stratification Estimator\n",
"INFO:dowhy.causal_estimator:b: y_factual~treatment+x1+x5+x6+x19+x21+x14+x12+x16+x3+x9+x8+x4+x17+x7+x23+x10+x13+x25+x22+x15+x20+x18+x24+x11+x2\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"PropensityScoreStratificationEstimator\n",
"Causal Estimate is nan\n",
"ATE 4.021121012430832\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\someh\\Desktop\\dowhy\\dowhy\\dowhy\\causal_estimators\\propensity_score_stratification_estimator.py:70: RuntimeWarning: invalid value encountered in double_scalars\n",
" ate = (weightedoutcomes['effect'] * weightedoutcomes[treatment_sum_name]).sum() / totaltreatmentpopulation\n"
]
}
],
"source": [
"estimate = model.estimate_effect(identified_estimand,\n",
" method_name=\"backdoor.propensity_score_stratification\"\n",
")\n",
"\n",
"print(\"Causal Estimate is \" + str(estimate.value))\n",
"print(\"ATE\", np.mean(data_1[\"y_factual\"])- np.mean(data_0[\"y_factual\"]))\n",
"\n"
]
},
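{
"cell_type": "markdown",
"metadata": {},
"source": [
"The stratification estimate above is `NaN`: the runtime warning points to a division by zero when aggregating over strata, which typically happens when no stratum retains both treated and control units. The next cell is a rough diagnostic sketch (not DoWhy API; it assumes scikit-learn is available, and the 50 quantile strata used here need not match the estimator's internal default)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Diagnostic sketch: fit a simple propensity model and count how many quantile\n",
"# strata contain both treated and control units. If none do, the stratification\n",
"# estimator's weighted average ends up dividing by zero.\n",
"from sklearn.linear_model import LogisticRegression\n",
"\n",
"covariates = [\"x\" + str(i) for i in range(1, 26)]\n",
"ps = LogisticRegression().fit(data[covariates], data[\"treatment\"]).predict_proba(data[covariates])[:, 1]\n",
"strata = pd.qcut(ps, 50, labels=False, duplicates=\"drop\")\n",
"counts = pd.crosstab(strata, data[\"treatment\"])\n",
"print(\"Strata containing both groups:\", int((counts > 0).all(axis=1).sum()), \"of\", len(counts))"
]
},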
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 3.4 Using Propensity Score Weighting? IPTW??"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:dowhy.causal_identifier:Common causes of treatment and outcome:{'', 'x1', 'x5', 'x6', 'x19', 'x21', 'x14', 'x12', 'x16', 'x3', 'x9', 'x8', 'x4', 'x17', 'x7', 'x23', 'x10', 'x13', 'x25', 'x22', 'x15', 'x20', 'x18', 'x24', 'x11', 'x2'}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'observed': 'no'}\n",
"There are unobserved common causes. Causal effect cannot be identified.\n",
"WARN: Do you want to continue by ignoring these unobserved confounders? [y/n] n\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:dowhy.causal_identifier:Instrumental variables for treatment and outcome:[]\n",
"INFO:dowhy.causal_estimator:INFO: Using Propensity Score Weighting Estimator\n",
"INFO:dowhy.causal_estimator:b: y_factual~treatment+x1+x5+x6+x19+x21+x14+x12+x16+x3+x9+x8+x4+x17+x7+x23+x10+x13+x25+x22+x15+x20+x18+x24+x11+x2\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"PropensityScoreWeightingEstimator\n",
"Causal Estimate is 4.047618153454541\n",
"ATE 4.021121012430832\n"
]
}
],
"source": [
"identified_estimand = model.identify_effect()\n",
"estimate = model.estimate_effect(identified_estimand,\n",
" method_name=\"backdoor.propensity_score_weighting\"\n",
")\n",
"\n",
"print(\"Causal Estimate is \" + str(estimate.value))\n",
"\n",
"print(\"ATE\", np.mean(data_1[\"y_factual\"])- np.mean(data_0[\"y_factual\"]))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 4. Refute\n",
"##### Refute the obtained estimate using multiple robustness checks.\n",
"##### 4.1 Adding a random common cause"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:dowhy.causal_estimator:INFO: Using Propensity Score Weighting Estimator\n",
"INFO:dowhy.causal_estimator:b: y_factual~treatment+x1+x5+x6+x19+x21+x14+x12+x16+x3+x9+x8+x4+x17+x7+x23+x10+x13+x25+x22+x15+x20+x18+x24+x11+x2+w_random\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Refute: Add a Random Common Cause\n",
"Estimated effect:(4.047618153454541,)\n",
"New effect:(4.043273388112319,)\n",
"\n"
]
}
],
"source": [
"\n",
"refute_results=model.refute_estimate(identified_estimand, estimate,\n",
" method_name=\"random_common_cause\")\n",
"print(refute_results)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### 4.2 Using a placebo treatment"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:dowhy.causal_estimator:INFO: Using Propensity Score Weighting Estimator\n",
"INFO:dowhy.causal_estimator:b: y_factual~placebo+x1+x5+x6+x19+x21+x14+x12+x16+x3+x9+x8+x4+x17+x7+x23+x10+x13+x25+x22+x15+x20+x18+x24+x11+x2\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Refute: Use a Placebo Treatment\n",
"Estimated effect:(4.047618153454541,)\n",
"New effect:(-0.4264218091545624,)\n",
"\n"
]
}
],
"source": [
"\n",
"res_placebo=model.refute_estimate(identified_estimand, estimate,\n",
" method_name=\"placebo_treatment_refuter\", placebo_type=\"permute\")\n",
"print(res_placebo)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### 4.3 Data Subset Refuter"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:dowhy.causal_estimator:INFO: Using Propensity Score Weighting Estimator\n",
"INFO:dowhy.causal_estimator:b: y_factual~treatment+x1+x5+x6+x19+x21+x14+x12+x16+x3+x9+x8+x4+x17+x7+x23+x10+x13+x25+x22+x15+x20+x18+x24+x11+x2\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Refute: Use a subset of data\n",
"Estimated effect:(4.047618153454541,)\n",
"New effect:(4.028518717202439,)\n",
"\n"
]
}
],
"source": [
"res_subset=model.refute_estimate(identified_estimand, estimate,\n",
" method_name=\"data_subset_refuter\", subset_fraction=0.9)\n",
"print(res_subset)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}