Load Lalonde dataset natively with Python

This was copied from https://github.com/wayfair/pylift/blob/5afc9088e96f25672423663f5c9b4bb889b4dfc0/examples/Lalonde/Lalonde_sample.ipynb?short_path=b1d451f#L94-L99 and uses the information provided on https://users.nber.org/~rdehejia/nswdata2.html

Signed-off-by: Peter Goetz <pego@amazon.com>
This commit is contained in:
Peter Goetz 2022-10-24 12:18:17 +02:00 коммит произвёл Peter Götz
Родитель 8eb9374d4f
Коммит 0578861299
3 изменённых файлов: 80 добавлений и 44 удалений

Просмотреть файл

@ -19,12 +19,7 @@
"sys.path.append(os.path.abspath(\"../../../\"))\n",
"\n",
"import dowhy\n",
"from dowhy import CausalModel\n",
"from rpy2.robjects import r as R\n",
"%load_ext rpy2.ipython\n",
"\n",
"#%R install.packages(\"Matching\")\n",
"%R library(Matching)\n"
"from dowhy import CausalModel"
]
},
{
@ -34,15 +29,40 @@
"## 1. Load the data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"----------------------\n",
"The following code for loading the Lalonde dataset was copied from [wayfair/pylift](https://github.com/wayfair/pylift/blob/5afc9088e96f25672423663f5c9b4bb889b4dfc0/examples/Lalonde/Lalonde_sample.ipynb?short_path=b1d451f#L94-L99).\n",
"\n",
"_Copyright 2018, Wayfair, Inc._\n",
"\n",
"Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n",
"\n",
"1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n",
"\n",
"2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n",
"\n",
"_THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE._\n",
"\n",
"----------------------"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%R data(lalonde)\n",
"%R -o lalonde\n",
"lalonde = lalonde.astype({'treat':'bool'}, copy=False)"
"import pandas as pd, numpy as np\n",
"\n",
"cols = ['treat', 'age', 'educ', 'black', 'hisp', 'married', 'nodegr','re74','re75','re78']\n",
"control = pd.read_csv('http://www.nber.org/~rdehejia/data/nswre74_control.txt', sep='\\s+', header = None, names = cols)\n",
"treated = pd.read_csv('http://www.nber.org/~rdehejia/data/nswre74_treated.txt', sep='\\s+', header = None, names = cols)\n",
"lalonde = pd.concat([control, treated], ignore_index=True).astype({'treat':'bool'}, copy=False)\n",
"lalonde['u74'] = np.where(lalonde['re74'] == 0, 1.0, 0.0)\n",
"lalonde['u75'] = np.where(lalonde['re75'] == 0, 1.0, 0.0)"
]
},
{

Просмотреть файл

@ -128,22 +128,23 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"cell_type": "markdown",
"metadata": {},
"source": [
"from rpy2.robjects import r as R\n",
"----------------------\n",
"The following code for loading the Lalonde dataset was copied from [wayfair/pylift](https://github.com/wayfair/pylift/blob/5afc9088e96f25672423663f5c9b4bb889b4dfc0/examples/Lalonde/Lalonde_sample.ipynb?short_path=b1d451f#L94-L99).\n",
"\n",
"from os.path import expanduser\n",
"home = expanduser(\"~\")\n",
"_Copyright 2018, Wayfair, Inc._\n",
"\n",
"%reload_ext rpy2.ipython\n",
"Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n",
"\n",
"# %R install.packages(\"Matching\")\n",
"%R library(Matching)"
"1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n",
"\n",
"2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n",
"\n",
"_THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE._\n",
"\n",
"----------------------"
]
},
{
@ -152,9 +153,13 @@
"metadata": {},
"outputs": [],
"source": [
"%R data(lalonde)\n",
"%R -o lalonde\n",
"lalonde = lalonde.astype({'treat':'bool'}, copy=False)\n",
"cols = ['treat', 'age', 'educ', 'black', 'hisp', 'married', 'nodegr','re74','re75','re78']\n",
"control = pd.read_csv('http://www.nber.org/~rdehejia/data/nswre74_control.txt', sep='\\s+', header = None, names = cols)\n",
"treated = pd.read_csv('http://www.nber.org/~rdehejia/data/nswre74_treated.txt', sep='\\s+', header = None, names = cols)\n",
"lalonde = pd.concat([control, treated], ignore_index=True).astype({'treat':'bool'}, copy=False)\n",
"lalonde['u74'] = np.where(lalonde['re74'] == 0, 1.0, 0.0)\n",
"lalonde['u75'] = np.where(lalonde['re75'] == 0, 1.0, 0.0)\n",
"\n",
"lalonde.head()"
]
},

Просмотреть файл

@ -14,11 +14,7 @@
"source": [
"We'll run through a quick example using the high-level Python API for the DoSampler. The DoSampler is different from most classic causal effect estimators. Instead of estimating statistics under interventions, it aims to provide the generality of Pearlian causal inference. In that context, the joint distribution of the variables under an intervention is the quantity of interest. It's hard to represent a joint distribution nonparametrically, so instead we provide a sample from that distribution, which we call a \"do\" sample.\n",
"\n",
"Here, when you specify an outcome, that is the variable you're sampling under an intervention. We still have to do the usual process of making sure the quantity (the conditional interventional distribution of the outcome) is identifiable. We leverage the familiar components of the rest of the package to do that \"under the hood\". You'll notice some similarity in the kwargs for the DoSampler.\n",
"\n",
"## Getting the Data\n",
"\n",
"First, download the data from the LaLonde example."
"Here, when you specify an outcome, that is the variable you're sampling under an intervention. We still have to do the usual process of making sure the quantity (the conditional interventional distribution of the outcome) is identifiable. We leverage the familiar components of the rest of the package to do that \"under the hood\". You'll notice some similarity in the kwargs for the DoSampler."
]
},
{
@ -32,19 +28,32 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"from rpy2.robjects import r as R\n",
"## Getting the Data\n",
"\n",
"%load_ext rpy2.ipython\n",
"#%R install.packages(\"Matching\")\n",
"%R library(Matching)\n",
"%R data(lalonde)\n",
"%R -o lalonde\n",
"lalonde.to_csv(\"lalonde.csv\",index=False)"
"First, download the data from the LaLonde example."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"----------------------\n",
"The following code for loading the Lalonde dataset was copied from [wayfair/pylift](https://github.com/wayfair/pylift/blob/5afc9088e96f25672423663f5c9b4bb889b4dfc0/examples/Lalonde/Lalonde_sample.ipynb?short_path=b1d451f#L94-L99).\n",
"\n",
"_Copyright 2018, Wayfair, Inc._\n",
"\n",
"Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:\n",
"\n",
"1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.\n",
"\n",
"2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.\n",
"\n",
"_THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE._\n",
"\n",
"----------------------"
]
},
{
@ -53,12 +62,14 @@
"metadata": {},
"outputs": [],
"source": [
"# the data already loaded in the previous cell. we include the import\n",
"# here you so you don't have to keep re-downloading it.\n",
"import pandas as pd, numpy as np\n",
"\n",
"import pandas as pd\n",
"\n",
"lalonde=pd.read_csv(\"lalonde.csv\")"
"cols = ['treat', 'age', 'educ', 'black', 'hisp', 'married', 'nodegr','re74','re75','re78']\n",
"control = pd.read_csv('http://www.nber.org/~rdehejia/data/nswre74_control.txt', sep='\\s+', header = None, names = cols)\n",
"treated = pd.read_csv('http://www.nber.org/~rdehejia/data/nswre74_treated.txt', sep='\\s+', header = None, names = cols)\n",
"lalonde = pd.concat([control, treated], ignore_index=True).astype({'treat':'bool'}, copy=False)\n",
"lalonde['u74'] = np.where(lalonde['re74'] == 0, 1.0, 0.0)\n",
"lalonde['u75'] = np.where(lalonde['re75'] == 0, 1.0, 0.0)"
]
},
{