Download and clean stsbenchmark data

This commit is contained in:
Casey Hong 2019-04-15 17:45:40 -04:00 коммит произвёл Abhiram E
Родитель e0bd4a510b
Коммит f06630a55d
3 изменённых файлов: 1085 добавлений и 0 удалений

Просмотреть файл

@ -0,0 +1,954 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Load & Prep\n",
"In this notebook we show how to download the [STS Benchmark](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) data and prepare it for pre-processing. Because open-source data may have been curated for tasks that differ slightly from our own, it is useful to do some basic preliminary data exploration and clean the data before moving forward in the NLP pipeline. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 00 Global Settings"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n",
"[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]\n"
]
}
],
"source": [
"import sys\n",
"sys.path.append(\"../../\") ## set the environment path\n",
"\n",
"import os\n",
"\n",
"import pandas as pd\n",
"import azureml.dataprep as dp\n",
"\n",
"from utils_nlp.dataset.url_utils import maybe_download\n",
"from utils_nlp.dataset.stsbenchmark import extract_sts\n",
"\n",
"print(\"System version: {}\".format(sys.version))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"STS_URL = \"http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz\"\n",
"BASE_DATA_PATH = \"../../data\"\n",
"RAW_DATA_PATH = os.path.join(BASE_DATA_PATH, \"raw\")\n",
"CLEAN_DATA_PATH = os.path.join(BASE_DATA_PATH, \"clean\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 01 Data Download"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Make a directory for the data if it doesn't already exist, and then download it."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(RAW_DATA_PATH):\n",
" os.makedirs(RAW_DATA_PATH)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def download_sts(url, dirpath):\n",
" zipfile = maybe_download(url, work_directory = dirpath)\n",
" unzipped = extract_sts(zipfile, target_dirpath = dirpath, tmode = \"r:gz\")\n",
" return zipfile, unzipped"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"418kB [00:02, 208kB/s] "
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data downloaded to ../../data/raw/stsbenchmark\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"tarfile, datapath = download_sts(STS_URL, RAW_DATA_PATH)\n",
"print(\"Data downloaded to {}\".format(datapath))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 02 Data Understanding\n",
"In this section we show how to: \n",
"* load raw data into a dataframe\n",
"* peek into the first n rows"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One way to do this is by checking the filetypes of the data we've downloaded and utilizing the appropriate pandas `read` function."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['sts-test.csv', 'sts-dev.csv', 'readme.txt', 'correlation.pl', 'LICENSE.txt', 'sts-train.csv']\n"
]
}
],
"source": [
"print(os.listdir(datapath))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because the data is in csv format, we can use the pandas `read_csv` function."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"ename": "ParserError",
"evalue": "Error tokenizing data. C error: Expected 7 fields in line 2508, saw 8\n",
"output_type": "error",
"traceback": [
"\u001b[0;31m--------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mParserError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-11-7c1611fa1e1b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m## TODO figure out how to integrate the runtools extension that lets you run the entire NB at once, skipping errors\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdatapath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"sts-train.csv\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'\\t'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda2/envs/nlp/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[1;32m 700\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[1;32m 701\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 702\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 703\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 704\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda2/envs/nlp/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 433\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 434\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 435\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 436\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 437\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda2/envs/nlp/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1137\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1138\u001b[0m \u001b[0mnrows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_validate_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'nrows'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1139\u001b[0;31m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1140\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1141\u001b[0m \u001b[0;31m# May alter columns / col_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/miniconda2/envs/nlp/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1993\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1994\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1995\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1996\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1997\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_first_chunk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_low_memory\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._tokenize_rows\u001b[0;34m()\u001b[0m\n",
"\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.raise_parser_error\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mParserError\u001b[0m: Error tokenizing data. C error: Expected 7 fields in line 2508, saw 8\n"
]
}
],
"source": [
"## TODO figure out how to integrate the runtools extension that lets you run the entire NB at once, skipping errors\n",
"df = pd.read_csv(os.path.join(datapath, \"sts-train.csv\"), sep='\\t')\n",
"df.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We see that this throws a parsing error: \"Expected 7 fields, saw 8\". One workaround is as follows:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>1</td>\n",
" <td>5.00</td>\n",
" <td>A plane is taking off.</td>\n",
" <td>An air plane is taking off.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>4</td>\n",
" <td>3.80</td>\n",
" <td>A man is playing a large flute.</td>\n",
" <td>A man is playing a flute.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>5</td>\n",
" <td>3.80</td>\n",
" <td>A man is spreading shreded cheese on a pizza.</td>\n",
" <td>A man is spreading shredded cheese on an uncoo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>6</td>\n",
" <td>2.60</td>\n",
" <td>Three men are playing chess.</td>\n",
" <td>Two men are playing chess.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>9</td>\n",
" <td>4.25</td>\n",
" <td>A man is playing the cello.</td>\n",
" <td>A man seated is playing the cello.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>11</td>\n",
" <td>4.25</td>\n",
" <td>Some men are fighting.</td>\n",
" <td>Two men are fighting.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>12</td>\n",
" <td>0.50</td>\n",
" <td>A man is smoking.</td>\n",
" <td>A man is skating.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>13</td>\n",
" <td>1.60</td>\n",
" <td>The man is playing the piano.</td>\n",
" <td>The man is playing the guitar.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>14</td>\n",
" <td>2.20</td>\n",
" <td>A man is playing on a guitar and singing.</td>\n",
" <td>A woman is playing an acoustic guitar and sing...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>16</td>\n",
" <td>5.00</td>\n",
" <td>A person is throwing a cat on to the ceiling.</td>\n",
" <td>A person throws a cat on the ceiling.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 \\\n",
"0 main-captions MSRvid 2012test 1 5.00 \n",
"1 main-captions MSRvid 2012test 4 3.80 \n",
"2 main-captions MSRvid 2012test 5 3.80 \n",
"3 main-captions MSRvid 2012test 6 2.60 \n",
"4 main-captions MSRvid 2012test 9 4.25 \n",
"5 main-captions MSRvid 2012test 11 4.25 \n",
"6 main-captions MSRvid 2012test 12 0.50 \n",
"7 main-captions MSRvid 2012test 13 1.60 \n",
"8 main-captions MSRvid 2012test 14 2.20 \n",
"9 main-captions MSRvid 2012test 16 5.00 \n",
"\n",
" 5 \\\n",
"0 A plane is taking off. \n",
"1 A man is playing a large flute. \n",
"2 A man is spreading shreded cheese on a pizza. \n",
"3 Three men are playing chess. \n",
"4 A man is playing the cello. \n",
"5 Some men are fighting. \n",
"6 A man is smoking. \n",
"7 The man is playing the piano. \n",
"8 A man is playing on a guitar and singing. \n",
"9 A person is throwing a cat on to the ceiling. \n",
"\n",
" 6 \n",
"0 An air plane is taking off. \n",
"1 A man is playing a flute. \n",
"2 A man is spreading shredded cheese on an uncoo... \n",
"3 Two men are playing chess. \n",
"4 A man seated is playing the cello. \n",
"5 Two men are fighting. \n",
"6 A man is skating. \n",
"7 The man is playing the guitar. \n",
"8 A woman is playing an acoustic guitar and sing... \n",
"9 A person throws a cat on the ceiling. "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(os.path.join(datapath, \"sts-train.csv\"), sep='\\t', names=list(range(7)))\n",
"df.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We could alternatively use a `read` function that has built-in automatic filetype inference:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Column1</th>\n",
" <th>Column2</th>\n",
" <th>Column3</th>\n",
" <th>Column4</th>\n",
" <th>Column5</th>\n",
" <th>Column6</th>\n",
" <th>Column7</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>1</td>\n",
" <td>5.00</td>\n",
" <td>A plane is taking off.</td>\n",
" <td>An air plane is taking off.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>4</td>\n",
" <td>3.80</td>\n",
" <td>A man is playing a large flute.</td>\n",
" <td>A man is playing a flute.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>5</td>\n",
" <td>3.80</td>\n",
" <td>A man is spreading shreded cheese on a pizza.</td>\n",
" <td>A man is spreading shredded cheese on an uncoo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>6</td>\n",
" <td>2.60</td>\n",
" <td>Three men are playing chess.</td>\n",
" <td>Two men are playing chess.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>9</td>\n",
" <td>4.25</td>\n",
" <td>A man is playing the cello.</td>\n",
" <td>A man seated is playing the cello.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>11</td>\n",
" <td>4.25</td>\n",
" <td>Some men are fighting.</td>\n",
" <td>Two men are fighting.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>12</td>\n",
" <td>0.50</td>\n",
" <td>A man is smoking.</td>\n",
" <td>A man is skating.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>13</td>\n",
" <td>1.60</td>\n",
" <td>The man is playing the piano.</td>\n",
" <td>The man is playing the guitar.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>14</td>\n",
" <td>2.20</td>\n",
" <td>A man is playing on a guitar and singing.</td>\n",
" <td>A woman is playing an acoustic guitar and sing...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>main-captions</td>\n",
" <td>MSRvid</td>\n",
" <td>2012test</td>\n",
" <td>16</td>\n",
" <td>5.00</td>\n",
" <td>A person is throwing a cat on to the ceiling.</td>\n",
" <td>A person throws a cat on the ceiling.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Column1 Column2 Column3 Column4 Column5 \\\n",
"0 main-captions MSRvid 2012test 1 5.00 \n",
"1 main-captions MSRvid 2012test 4 3.80 \n",
"2 main-captions MSRvid 2012test 5 3.80 \n",
"3 main-captions MSRvid 2012test 6 2.60 \n",
"4 main-captions MSRvid 2012test 9 4.25 \n",
"5 main-captions MSRvid 2012test 11 4.25 \n",
"6 main-captions MSRvid 2012test 12 0.50 \n",
"7 main-captions MSRvid 2012test 13 1.60 \n",
"8 main-captions MSRvid 2012test 14 2.20 \n",
"9 main-captions MSRvid 2012test 16 5.00 \n",
"\n",
" Column6 \\\n",
"0 A plane is taking off. \n",
"1 A man is playing a large flute. \n",
"2 A man is spreading shreded cheese on a pizza. \n",
"3 Three men are playing chess. \n",
"4 A man is playing the cello. \n",
"5 Some men are fighting. \n",
"6 A man is smoking. \n",
"7 The man is playing the piano. \n",
"8 A man is playing on a guitar and singing. \n",
"9 A person is throwing a cat on to the ceiling. \n",
"\n",
" Column7 \n",
"0 An air plane is taking off. \n",
"1 A man is playing a flute. \n",
"2 A man is spreading shredded cheese on an uncoo... \n",
"3 Two men are playing chess. \n",
"4 A man seated is playing the cello. \n",
"5 Two men are fighting. \n",
"6 A man is skating. \n",
"7 The man is playing the guitar. \n",
"8 A woman is playing an acoustic guitar and sing... \n",
"9 A person throws a cat on the ceiling. "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dflow = dp.auto_read_file(path=os.path.join(datapath, \"sts-train.csv\"))\n",
"dflow.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `auto_read_file` function from the AzureML Data Prep module actually returns a `Dataflow` object, which you can read more about [here](https://docs.microsoft.com/en-us/python/api/azureml-dataprep/azureml.dataprep.dataflow?view=azure-dataprep-py). We can easily transfer the data into a Pandas DataFrame (as before) in a single line using the `to_pandas_dataframe` function, or we can continue manipulating the data as a Dataflow object using the AzureML Data Prep API. For the remainder of this notebook we will be doing the latter."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 03 Data Cleaning\n",
"Now that we know about the general shape of the data, we can clean it so that it is ready for further preprocessing. The main operation we need for the STS Benchmark data is to drop all of the columns except for the sentence pairs."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>s1</th>\n",
" <th>s2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A plane is taking off.</td>\n",
" <td>An air plane is taking off.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A man is playing a large flute.</td>\n",
" <td>A man is playing a flute.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>A man is spreading shreded cheese on a pizza.</td>\n",
" <td>A man is spreading shredded cheese on an uncoo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Three men are playing chess.</td>\n",
" <td>Two men are playing chess.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>A man is playing the cello.</td>\n",
" <td>A man seated is playing the cello.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Some men are fighting.</td>\n",
" <td>Two men are fighting.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>A man is smoking.</td>\n",
" <td>A man is skating.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>The man is playing the piano.</td>\n",
" <td>The man is playing the guitar.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>A man is playing on a guitar and singing.</td>\n",
" <td>A woman is playing an acoustic guitar and sing...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>A person is throwing a cat on to the ceiling.</td>\n",
" <td>A person throws a cat on the ceiling.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" s1 \\\n",
"0 A plane is taking off. \n",
"1 A man is playing a large flute. \n",
"2 A man is spreading shreded cheese on a pizza. \n",
"3 Three men are playing chess. \n",
"4 A man is playing the cello. \n",
"5 Some men are fighting. \n",
"6 A man is smoking. \n",
"7 The man is playing the piano. \n",
"8 A man is playing on a guitar and singing. \n",
"9 A person is throwing a cat on to the ceiling. \n",
"\n",
" s2 \n",
"0 An air plane is taking off. \n",
"1 A man is playing a flute. \n",
"2 A man is spreading shredded cheese on an uncoo... \n",
"3 Two men are playing chess. \n",
"4 A man seated is playing the cello. \n",
"5 Two men are fighting. \n",
"6 A man is skating. \n",
"7 The man is playing the guitar. \n",
"8 A woman is playing an acoustic guitar and sing... \n",
"9 A person throws a cat on the ceiling. "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sentences = dflow.keep_columns(['Column6', 'Column7']).rename_columns({'Column6': 's1', 'Column7': 's2'})\n",
"sentences.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will want to do this for all the datasets (train, dev, and test) and then persist the results into a new clean directory."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def clean_sts(src_dir, filenames, target_dir):\n",
" if not os.path.exists(target_dir):\n",
" os.makedirs(target_dir)\n",
" filepaths = [os.path.join(src_dir, f) for f in filenames]\n",
" for i,fp in enumerate(filepaths):\n",
" dat = dp.auto_read_file(path=fp)\n",
" s = dat.keep_columns(['Column6', 'Column7']).rename_columns({'Column6': 's1', 'Column7': 's2'})\n",
" sdf = s.to_pandas_dataframe().to_csv(os.path.join(target_dir, filenames[i]), sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"sts_files = [f for f in os.listdir(os.path.join(RAW_DATA_PATH, \"stsbenchmark\")) if f.endswith(\".csv\")]\n",
"clean_sts(os.path.join(RAW_DATA_PATH, \"stsbenchmark\"), sts_files, os.path.join(CLEAN_DATA_PATH, \"stsbenchmark\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### TODO: Remove or put somewhere else\n",
"You can also use our STSBenchmark utils to automatically download, extract, and persist the data. You can then load the sanitized data as a pandas DataFrame in one line. "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from utils_nlp.dataset.stsbenchmark import STSBenchmark"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"418kB [00:04, 72.1kB/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data downloaded to ../../data/raw/stsbenchmark\n"
]
}
],
"source": [
"# Initializing this instance runs the downloader and extractor behind the scenes\n",
"sts_dev = STSBenchmark(\"dev\", base_data_path=BASE_DATA_PATH)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1500 sentence pairs\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>s1</th>\n",
" <th>s2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A man with a hard hat is dancing.</td>\n",
" <td>A man wearing a hard hat is dancing.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A young child is riding a horse.</td>\n",
" <td>A child is riding a horse.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>A man is feeding a mouse to a snake.</td>\n",
" <td>The man is feeding a mouse to the snake.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>A woman is playing the guitar.</td>\n",
" <td>A man is playing guitar.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>A woman is playing the flute.</td>\n",
" <td>A man is playing a flute.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>A woman is cutting an onion.</td>\n",
" <td>A man is cutting onions.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>A man is erasing a chalk board.</td>\n",
" <td>The man is erasing the chalk board.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>A woman is carrying a boy.</td>\n",
" <td>A woman is carrying her baby.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Three men are playing guitars.</td>\n",
" <td>Three men are on stage playing guitars.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>A woman peels a potato.</td>\n",
" <td>A woman is peeling a potato.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" s1 \\\n",
"0 A man with a hard hat is dancing. \n",
"1 A young child is riding a horse. \n",
"2 A man is feeding a mouse to a snake. \n",
"3 A woman is playing the guitar. \n",
"4 A woman is playing the flute. \n",
"5 A woman is cutting an onion. \n",
"6 A man is erasing a chalk board. \n",
"7 A woman is carrying a boy. \n",
"8 Three men are playing guitars. \n",
"9 A woman peels a potato. \n",
"\n",
" s2 \n",
"0 A man wearing a hard hat is dancing. \n",
"1 A child is riding a horse. \n",
"2 The man is feeding a mouse to the snake. \n",
"3 A man is playing guitar. \n",
"4 A man is playing a flute. \n",
"5 A man is cutting onions. \n",
"6 The man is erasing the chalk board. \n",
"7 A woman is carrying her baby. \n",
"8 Three men are on stage playing guitars. \n",
"9 A woman is peeling a potato. "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = sts_dev.as_dataframe()\n",
"print(\"{} sentence pairs\".format(df.shape[0]))\n",
"df.head(10)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Просмотреть файл

@ -0,0 +1,60 @@
import os
import tarfile
import pandas as pd
import azureml.dataprep as dp
from utils_nlp.dataset.url_utils import maybe_download
"""
Download and extract data from http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz
"""
def download_sts(
    dirpath,
    sts_url="http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz",
):
    """Download the STS Benchmark archive and extract it under ``dirpath``.

    Args:
        dirpath (str): Directory the archive is downloaded to and extracted
            into.
        sts_url (str): URL of the ``tar.gz`` archive to fetch. Defaults to
            the STS Benchmark archive on the STS wiki, so existing callers
            that pass only ``dirpath`` behave as before.

    Returns:
        str: The path reported by ``extract_sts`` for the extracted data.
    """
    filepath = maybe_download(sts_url, work_directory=dirpath)
    # The archive is gzip-compressed, hence the explicit "r:gz" mode.
    extracted_path = extract_sts(filepath, target_dirpath=dirpath, tmode="r:gz")
    print("Data downloaded to {}".format(extracted_path))
    return extracted_path
"""
Extract data from the sts tar.gz archive
"""
def extract_sts(tarpath, target_dirpath=".", tmode="r"):
    """Extract a tar archive into ``target_dirpath`` and delete the archive.

    Every member name is checked before anything is written, so an archive
    cannot place files outside ``target_dirpath`` through ``..`` components
    or absolute paths. NOTE(review): only member names are checked; link
    targets inside the archive are not inspected.

    Args:
        tarpath (str): Path of the archive to extract. The file is removed
            once extraction succeeds.
        target_dirpath (str): Directory to extract into.
        tmode (str): Mode passed to ``tarfile.open`` (for example ``"r:gz"``).

    Returns:
        str: ``target_dirpath`` joined with the name of the archive's first
        member.

    Raises:
        ValueError: If the archive has no members, or if a member would be
            extracted outside ``target_dirpath``. The archive is left on
            disk in both cases.
    """
    target_root = os.path.realpath(target_dirpath)
    with tarfile.open(tarpath, mode=tmode) as t:
        names = t.getnames()
        if not names:
            raise ValueError("Archive {} contains no members".format(tarpath))
        for name in names:
            destination = os.path.realpath(os.path.join(target_root, name))
            # commonpath equals the root only when destination is inside it.
            if os.path.commonpath([target_root, destination]) != target_root:
                raise ValueError(
                    "Archive member {!r} would be extracted outside {}".format(
                        name, target_dirpath
                    )
                )
        t.extractall(target_dirpath)
    # Remove the archive only after the handle is closed and extraction succeeded.
    os.remove(tarpath)
    return os.path.join(target_dirpath, names[0])
"""
Drop columns containing irrelevant metadata and save as new csv files in the target_dir
"""
def clean_sts(filenames, src_dir, target_dir):
    """Keep only the two sentence columns of each file and save the result.

    Each file is read with ``dp.auto_read_file``, reduced to ``Column6`` and
    ``Column7`` (renamed to ``s1`` and ``s2``), and written as a
    tab-separated file of the same name in ``target_dir``.

    Args:
        filenames (list): Names of the files inside ``src_dir`` to clean.
        src_dir (str): Directory containing the raw files.
        target_dir (str): Directory the cleaned files are written to; it is
            created if it does not exist.
    """
    os.makedirs(target_dir, exist_ok=True)
    for filename in filenames:
        dat = dp.auto_read_file(path=os.path.join(src_dir, filename))
        sentences = dat.keep_columns(["Column6", "Column7"]).rename_columns(
            {"Column6": "s1", "Column7": "s2"}
        )
        # NOTE(review): to_csv is called with pandas' default index setting,
        # so the row index is presumably written as a leading column --
        # confirm against the readers of these files.
        sentences.to_pandas_dataframe().to_csv(
            os.path.join(target_dir, filename), sep="\t"
        )
class STSBenchmark():
    """Accessor for one cleaned split of the STS Benchmark data.

    Creating an instance makes sure the cleaned file for the requested split
    exists under ``<base_data_path>/clean/stsbenchmark``; when it does not,
    the raw archive is downloaded and cleaned first.
    """

    def __init__(self, which_split, base_data_path="./data"):
        """Resolve the cleaned file for ``which_split``, fetching data if needed.

        Args:
            which_split (str): One of ``"train"``, ``"test"`` or ``"dev"``.
            base_data_path (str): Root directory holding the ``raw`` and
                ``clean`` data directories.

        Raises:
            ValueError: If ``which_split`` is not a known split name.
        """
        # Raise explicitly: an ``assert`` would be stripped under ``python -O``.
        if which_split not in {"train", "test", "dev"}:
            raise ValueError(
                "which_split must be one of 'train', 'test' or 'dev', got {!r}".format(
                    which_split
                )
            )
        self.base_data_path = base_data_path
        self.filepath = os.path.join(
            self.base_data_path,
            "clean",
            "stsbenchmark",
            "sts-{}.csv".format(which_split),
        )
        self._maybe_download_and_extract()

    def _maybe_download_and_extract(self):
        """Download, extract and clean the data unless ``self.filepath`` exists."""
        if os.path.exists(self.filepath):
            return
        raw_path = os.path.join(self.base_data_path, "raw")
        os.makedirs(raw_path, exist_ok=True)
        sts_path = download_sts(raw_path)
        sts_files = [f for f in os.listdir(sts_path) if f.endswith(".csv")]
        clean_sts(
            sts_files,
            sts_path,
            os.path.join(self.base_data_path, "clean", "stsbenchmark"),
        )

    def as_dflow(self):
        """Return the cleaned split as read by ``dp.auto_read_file``, minus ``Column1``."""
        # NOTE(review): 'Column1' is presumably the index column written by
        # clean_sts -- confirm.
        return dp.auto_read_file(self.filepath).drop_columns('Column1')

    def as_dataframe(self):
        """Return the same data as ``as_dflow``, converted to a pandas DataFrame."""
        return self.as_dflow().to_pandas_dataframe()

Просмотреть файл

@ -0,0 +1,71 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
from urllib.request import urlretrieve
import logging
from contextlib import contextmanager
from tempfile import TemporaryDirectory
from tqdm import tqdm
log = logging.getLogger(__name__)
class TqdmUpTo(tqdm):
    """tqdm progress bar extended with an ``update_to`` method."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Advance the bar to the position given by ``b`` blocks of ``bsize``.

        Args:
            b (int): Number of blocks transferred so far.
            bsize (int): Size of each block (in tqdm units).
            tsize (int): Total size (in tqdm units); the bar's total is left
                unchanged when this is None.
        """
        if tsize is not None:
            self.total = tsize
        transferred = b * bsize
        # update() takes an increment, so pass the distance from the current
        # position; this will also set self.n = b * bsize.
        self.update(transferred - self.n)
def maybe_download(url, filename=None, work_directory=".", expected_bytes=None):
    """Download a file if it is not already downloaded.

    The download is written to a ``.part`` file next to the destination and
    moved into place only once it completes. An interrupted transfer
    therefore cannot leave a truncated file that a later call would treat as
    already downloaded.

    Args:
        url (str): URL of the file to download.
        filename (str): File name. Defaults to the last ``/``-separated
            component of ``url``.
        work_directory (str): Working directory.
        expected_bytes (int): Expected file size in bytes. No size check is
            made when this is None.

    Returns:
        str: File path of the file downloaded.

    Raises:
        IOError: If ``expected_bytes`` is given and does not match the size
            of the file on disk; the file is removed before raising.
    """
    if filename is None:
        filename = url.split("/")[-1]
    filepath = os.path.join(work_directory, filename)
    if not os.path.exists(filepath):
        partial_path = filepath + ".part"
        try:
            with TqdmUpTo(unit="B", unit_scale=True) as t:
                urlretrieve(url, partial_path, reporthook=t.update_to)
            os.replace(partial_path, filepath)
        finally:
            # Only present here if the transfer or the move failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    else:
        log.debug("File %s already downloaded", filepath)
    if expected_bytes is not None:
        statinfo = os.stat(filepath)
        if statinfo.st_size != expected_bytes:
            os.remove(filepath)
            raise IOError("Failed to verify {}".format(filepath))
    return filepath
@contextmanager
def download_path(path):
    """Yield a directory path to download into.

    Args:
        path (str): Directory to use. When None, a temporary directory is
            created for the duration of the ``with`` block and removed
            afterwards.

    Yields:
        str: ``os.path.realpath(path)`` when ``path`` is given, otherwise the
        path of the temporary directory.
    """
    if path is not None:
        # A caller-supplied directory needs no temporary directory at all.
        yield os.path.realpath(path)
        return
    with TemporaryDirectory() as tmp_dir:
        yield tmp_dir