Download and clean stsbenchmark data

2019-04-15 17:45:40 -04:00 · 2019-04-15 17:45:40 -04:00 · f06630a55d
--- a/examples/01-prep-data/sts_load.ipynb
+++ b/examples/01-prep-data/sts_load.ipynb
@ -0,0 +1,954 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data Load & Prep\n",
+    "In this notebook we show how to download the [STS Benchmark](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) data and prepare it for pre-processing. Because open-source data may have been curated for tasks that differ slightly from our own, it is useful to do some basic preliminary data exploration and clean the data before moving forward in the NLP pipeline.  "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 00 Global Settings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n",
+      "[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "sys.path.append(\"../../\") ## set the environment path\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "import pandas as pd\n",
+    "import azureml.dataprep as dp\n",
+    "\n",
+    "from utils_nlp.dataset.url_utils import maybe_download\n",
+    "from utils_nlp.dataset.stsbenchmark import extract_sts\n",
+    "\n",
+    "print(\"System version: {}\".format(sys.version))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "STS_URL = \"http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz\"\n",
+    "BASE_DATA_PATH = \"../../data\"\n",
+    "RAW_DATA_PATH = os.path.join(BASE_DATA_PATH, \"raw\")\n",
+    "CLEAN_DATA_PATH = os.path.join(BASE_DATA_PATH, \"clean\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 01 Data Download"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Make a directory for the data if it doesn't already exist, and then download."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# exist_ok=True makes this cell safe to re-run and avoids a separate existence check\n",
+    "os.makedirs(RAW_DATA_PATH, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def download_sts(url, dirpath):\n",
+    "    \"\"\"Download the STS Benchmark archive and extract it under dirpath.\n",
+    "\n",
+    "    Args:\n",
+    "        url (str): Location of the archive to download.\n",
+    "        dirpath (str): Directory used both as the download location and the extraction target.\n",
+    "\n",
+    "    Returns:\n",
+    "        tuple: (value returned by maybe_download, value returned by extract_sts).\n",
+    "        The second item is used below as the path of the directory holding the extracted files.\n",
+    "    \"\"\"\n",
+    "    archive_path = maybe_download(url, work_directory=dirpath)\n",
+    "    # tmode is passed straight to extract_sts; presumably the tarfile mode for a gzip archive -- confirm in utils_nlp\n",
+    "    extracted_dir = extract_sts(archive_path, target_dirpath=dirpath, tmode=\"r:gz\")\n",
+    "    return archive_path, extracted_dir"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "418kB [00:02, 208kB/s]                             "
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data downloaded to ../../data/raw/stsbenchmark\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "tarfile, datapath = download_sts(STS_URL, RAW_DATA_PATH)\n",
+    "print(\"Data downloaded to {}\".format(datapath))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 02 Data Understanding\n",
+    "In this section we show how to: \n",
+    "* load raw data into a dataframe\n",
+    "* peek into the first n rows"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "One way to do this is by checking the filetypes of the data we've downloaded and utilizing the appropriate pandas `read` function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['sts-test.csv', 'sts-dev.csv', 'readme.txt', 'correlation.pl', 'LICENSE.txt', 'sts-train.csv']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(os.listdir(datapath))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Because the data is in csv format, we can use the pandas `read_csv` function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "ename": "ParserError",
+     "evalue": "Error tokenizing data. C error: Expected 7 fields in line 2508, saw 8\n",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m--------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mParserError\u001b[0m                        Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-11-7c1611fa1e1b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m## TODO figure out how to integrate the runtools extension that lets you run the entire NB at once, skipping errors\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdatapath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"sts-train.csv\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'\\t'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/miniconda2/envs/nlp/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[1;32m    700\u001b[0m                     skip_blank_lines=skip_blank_lines)\n\u001b[1;32m    701\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 702\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    703\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    704\u001b[0m     \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/miniconda2/envs/nlp/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m    433\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    434\u001b[0m     \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 435\u001b[0;31m         \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    436\u001b[0m     \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    437\u001b[0m         \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/miniconda2/envs/nlp/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m   1137\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1138\u001b[0m         \u001b[0mnrows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_validate_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'nrows'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1139\u001b[0;31m         \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1140\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1141\u001b[0m         \u001b[0;31m# May alter columns / col_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m~/miniconda2/envs/nlp/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m   1993\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1994\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1995\u001b[0;31m             \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1996\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1997\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_first_chunk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read\u001b[0;34m()\u001b[0m\n",
+      "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_low_memory\u001b[0;34m()\u001b[0m\n",
+      "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
+      "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._tokenize_rows\u001b[0;34m()\u001b[0m\n",
+      "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.raise_parser_error\u001b[0;34m()\u001b[0m\n",
+      "\u001b[0;31mParserError\u001b[0m: Error tokenizing data. C error: Expected 7 fields in line 2508, saw 8\n"
+     ]
+    }
+   ],
+   "source": [
+    "## TODO figure out how to integrate the runtools extension that lets you run the entire NB at once, skipping errors\n",
+    "df = pd.read_csv(os.path.join(datapath, \"sts-train.csv\"), sep='\\t')\n",
+    "df.head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We see that this throws a parsing error: \"Expected 7 fields, saw 8\". One workaround is to supply the seven column names explicitly:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "      <th>5</th>\n",
+       "      <th>6</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5.00</td>\n",
+       "      <td>A plane is taking off.</td>\n",
+       "      <td>An air plane is taking off.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>4</td>\n",
+       "      <td>3.80</td>\n",
+       "      <td>A man is playing a large flute.</td>\n",
+       "      <td>A man is playing a flute.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>5</td>\n",
+       "      <td>3.80</td>\n",
+       "      <td>A man is spreading shreded cheese on a pizza.</td>\n",
+       "      <td>A man is spreading shredded cheese on an uncoo...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>6</td>\n",
+       "      <td>2.60</td>\n",
+       "      <td>Three men are playing chess.</td>\n",
+       "      <td>Two men are playing chess.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>9</td>\n",
+       "      <td>4.25</td>\n",
+       "      <td>A man is playing the cello.</td>\n",
+       "      <td>A man seated is playing the cello.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>11</td>\n",
+       "      <td>4.25</td>\n",
+       "      <td>Some men are fighting.</td>\n",
+       "      <td>Two men are fighting.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>12</td>\n",
+       "      <td>0.50</td>\n",
+       "      <td>A man is smoking.</td>\n",
+       "      <td>A man is skating.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>13</td>\n",
+       "      <td>1.60</td>\n",
+       "      <td>The man is playing the piano.</td>\n",
+       "      <td>The man is playing the guitar.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>14</td>\n",
+       "      <td>2.20</td>\n",
+       "      <td>A man is playing on a guitar and singing.</td>\n",
+       "      <td>A woman is playing an acoustic guitar and sing...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>16</td>\n",
+       "      <td>5.00</td>\n",
+       "      <td>A person is throwing a cat on to the ceiling.</td>\n",
+       "      <td>A person throws a cat on the ceiling.</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "               0       1         2   3     4  \\\n",
+       "0  main-captions  MSRvid  2012test   1  5.00   \n",
+       "1  main-captions  MSRvid  2012test   4  3.80   \n",
+       "2  main-captions  MSRvid  2012test   5  3.80   \n",
+       "3  main-captions  MSRvid  2012test   6  2.60   \n",
+       "4  main-captions  MSRvid  2012test   9  4.25   \n",
+       "5  main-captions  MSRvid  2012test  11  4.25   \n",
+       "6  main-captions  MSRvid  2012test  12  0.50   \n",
+       "7  main-captions  MSRvid  2012test  13  1.60   \n",
+       "8  main-captions  MSRvid  2012test  14  2.20   \n",
+       "9  main-captions  MSRvid  2012test  16  5.00   \n",
+       "\n",
+       "                                               5  \\\n",
+       "0                         A plane is taking off.   \n",
+       "1                A man is playing a large flute.   \n",
+       "2  A man is spreading shreded cheese on a pizza.   \n",
+       "3                   Three men are playing chess.   \n",
+       "4                    A man is playing the cello.   \n",
+       "5                         Some men are fighting.   \n",
+       "6                              A man is smoking.   \n",
+       "7                  The man is playing the piano.   \n",
+       "8      A man is playing on a guitar and singing.   \n",
+       "9  A person is throwing a cat on to the ceiling.   \n",
+       "\n",
+       "                                                   6  \n",
+       "0                        An air plane is taking off.  \n",
+       "1                          A man is playing a flute.  \n",
+       "2  A man is spreading shredded cheese on an uncoo...  \n",
+       "3                         Two men are playing chess.  \n",
+       "4                 A man seated is playing the cello.  \n",
+       "5                              Two men are fighting.  \n",
+       "6                                  A man is skating.  \n",
+       "7                     The man is playing the guitar.  \n",
+       "8  A woman is playing an acoustic guitar and sing...  \n",
+       "9              A person throws a cat on the ceiling.  "
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.read_csv(os.path.join(datapath, \"sts-train.csv\"), sep='\\t', names=list(range(7)))\n",
+    "df.head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We could alternatively use a `read` function that has built-in automatic filetype inference:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Column1</th>\n",
+       "      <th>Column2</th>\n",
+       "      <th>Column3</th>\n",
+       "      <th>Column4</th>\n",
+       "      <th>Column5</th>\n",
+       "      <th>Column6</th>\n",
+       "      <th>Column7</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>1</td>\n",
+       "      <td>5.00</td>\n",
+       "      <td>A plane is taking off.</td>\n",
+       "      <td>An air plane is taking off.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>4</td>\n",
+       "      <td>3.80</td>\n",
+       "      <td>A man is playing a large flute.</td>\n",
+       "      <td>A man is playing a flute.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>5</td>\n",
+       "      <td>3.80</td>\n",
+       "      <td>A man is spreading shreded cheese on a pizza.</td>\n",
+       "      <td>A man is spreading shredded cheese on an uncoo...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>6</td>\n",
+       "      <td>2.60</td>\n",
+       "      <td>Three men are playing chess.</td>\n",
+       "      <td>Two men are playing chess.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>9</td>\n",
+       "      <td>4.25</td>\n",
+       "      <td>A man is playing the cello.</td>\n",
+       "      <td>A man seated is playing the cello.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>11</td>\n",
+       "      <td>4.25</td>\n",
+       "      <td>Some men are fighting.</td>\n",
+       "      <td>Two men are fighting.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>12</td>\n",
+       "      <td>0.50</td>\n",
+       "      <td>A man is smoking.</td>\n",
+       "      <td>A man is skating.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>13</td>\n",
+       "      <td>1.60</td>\n",
+       "      <td>The man is playing the piano.</td>\n",
+       "      <td>The man is playing the guitar.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>14</td>\n",
+       "      <td>2.20</td>\n",
+       "      <td>A man is playing on a guitar and singing.</td>\n",
+       "      <td>A woman is playing an acoustic guitar and sing...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>main-captions</td>\n",
+       "      <td>MSRvid</td>\n",
+       "      <td>2012test</td>\n",
+       "      <td>16</td>\n",
+       "      <td>5.00</td>\n",
+       "      <td>A person is throwing a cat on to the ceiling.</td>\n",
+       "      <td>A person throws a cat on the ceiling.</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         Column1 Column2   Column3  Column4  Column5  \\\n",
+       "0  main-captions  MSRvid  2012test        1     5.00   \n",
+       "1  main-captions  MSRvid  2012test        4     3.80   \n",
+       "2  main-captions  MSRvid  2012test        5     3.80   \n",
+       "3  main-captions  MSRvid  2012test        6     2.60   \n",
+       "4  main-captions  MSRvid  2012test        9     4.25   \n",
+       "5  main-captions  MSRvid  2012test       11     4.25   \n",
+       "6  main-captions  MSRvid  2012test       12     0.50   \n",
+       "7  main-captions  MSRvid  2012test       13     1.60   \n",
+       "8  main-captions  MSRvid  2012test       14     2.20   \n",
+       "9  main-captions  MSRvid  2012test       16     5.00   \n",
+       "\n",
+       "                                         Column6  \\\n",
+       "0                         A plane is taking off.   \n",
+       "1                A man is playing a large flute.   \n",
+       "2  A man is spreading shreded cheese on a pizza.   \n",
+       "3                   Three men are playing chess.   \n",
+       "4                    A man is playing the cello.   \n",
+       "5                         Some men are fighting.   \n",
+       "6                              A man is smoking.   \n",
+       "7                  The man is playing the piano.   \n",
+       "8      A man is playing on a guitar and singing.   \n",
+       "9  A person is throwing a cat on to the ceiling.   \n",
+       "\n",
+       "                                             Column7  \n",
+       "0                        An air plane is taking off.  \n",
+       "1                          A man is playing a flute.  \n",
+       "2  A man is spreading shredded cheese on an uncoo...  \n",
+       "3                         Two men are playing chess.  \n",
+       "4                 A man seated is playing the cello.  \n",
+       "5                              Two men are fighting.  \n",
+       "6                                  A man is skating.  \n",
+       "7                     The man is playing the guitar.  \n",
+       "8  A woman is playing an acoustic guitar and sing...  \n",
+       "9              A person throws a cat on the ceiling.  "
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dflow = dp.auto_read_file(path=os.path.join(datapath, \"sts-train.csv\"))\n",
+    "dflow.head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `auto_read_file` function from the AzureML Data Prep module actually returns a `Dataflow` object, which you can read more about [here](https://docs.microsoft.com/en-us/python/api/azureml-dataprep/azureml.dataprep.dataflow?view=azure-dataprep-py). We can easily transfer the data into a Pandas DataFrame (as before) in a single line using the `to_pandas_dataframe` function, or we can continue manipulating the data as a Dataflow object using the AzureML Data Prep API. For the remainder of this notebook we will be doing the latter."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 03 Data Cleaning\n",
+    "Now that we know about the general shape of the data, we can clean it so that it is ready for further preprocessing. The main operation we need for the STS Benchmark data is to drop all of the columns except for the sentence pairs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>s1</th>\n",
+       "      <th>s2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>A plane is taking off.</td>\n",
+       "      <td>An air plane is taking off.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>A man is playing a large flute.</td>\n",
+       "      <td>A man is playing a flute.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>A man is spreading shreded cheese on a pizza.</td>\n",
+       "      <td>A man is spreading shredded cheese on an uncoo...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Three men are playing chess.</td>\n",
+       "      <td>Two men are playing chess.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>A man is playing the cello.</td>\n",
+       "      <td>A man seated is playing the cello.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Some men are fighting.</td>\n",
+       "      <td>Two men are fighting.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>A man is smoking.</td>\n",
+       "      <td>A man is skating.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>The man is playing the piano.</td>\n",
+       "      <td>The man is playing the guitar.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>A man is playing on a guitar and singing.</td>\n",
+       "      <td>A woman is playing an acoustic guitar and sing...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>A person is throwing a cat on to the ceiling.</td>\n",
+       "      <td>A person throws a cat on the ceiling.</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                              s1  \\\n",
+       "0                         A plane is taking off.   \n",
+       "1                A man is playing a large flute.   \n",
+       "2  A man is spreading shreded cheese on a pizza.   \n",
+       "3                   Three men are playing chess.   \n",
+       "4                    A man is playing the cello.   \n",
+       "5                         Some men are fighting.   \n",
+       "6                              A man is smoking.   \n",
+       "7                  The man is playing the piano.   \n",
+       "8      A man is playing on a guitar and singing.   \n",
+       "9  A person is throwing a cat on to the ceiling.   \n",
+       "\n",
+       "                                                  s2  \n",
+       "0                        An air plane is taking off.  \n",
+       "1                          A man is playing a flute.  \n",
+       "2  A man is spreading shredded cheese on an uncoo...  \n",
+       "3                         Two men are playing chess.  \n",
+       "4                 A man seated is playing the cello.  \n",
+       "5                              Two men are fighting.  \n",
+       "6                                  A man is skating.  \n",
+       "7                     The man is playing the guitar.  \n",
+       "8  A woman is playing an acoustic guitar and sing...  \n",
+       "9              A person throws a cat on the ceiling.  "
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Keep only the two sentence columns and give them descriptive names\n",
+    "sentences = (\n",
+    "    dflow.keep_columns(['Column6', 'Column7'])\n",
+    "    .rename_columns({'Column6': 's1', 'Column7': 's2'})\n",
+    ")\n",
+    "sentences.head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We will want to do this for all the datasets (train, dev, and test) and then persist the results into a new clean directory."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_sts(src_dir, filenames, target_dir):\n",
+    "    \"\"\"Keep only the sentence columns of each STS file and save the result in target_dir.\n",
+    "\n",
+    "    Args:\n",
+    "        src_dir (str): Directory containing the files to clean.\n",
+    "        filenames (list of str): Names of the files inside src_dir.\n",
+    "        target_dir (str): Directory for the cleaned files; created if missing.\n",
+    "    \"\"\"\n",
+    "    os.makedirs(target_dir, exist_ok=True)\n",
+    "    for filename in filenames:\n",
+    "        dat = dp.auto_read_file(path=os.path.join(src_dir, filename))\n",
+    "        s = dat.keep_columns(['Column6', 'Column7']).rename_columns({'Column6': 's1', 'Column7': 's2'})\n",
+    "        # Each cleaned file keeps its original name and is written tab-separated\n",
+    "        s.to_pandas_dataframe().to_csv(os.path.join(target_dir, filename), sep='\\t')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Clean every csv file found in the extracted raw directory\n",
+    "sts_raw_dir = os.path.join(RAW_DATA_PATH, \"stsbenchmark\")\n",
+    "sts_files = [f for f in os.listdir(sts_raw_dir) if f.endswith(\".csv\")]\n",
+    "clean_sts(sts_raw_dir, sts_files, os.path.join(CLEAN_DATA_PATH, \"stsbenchmark\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### TODO: Remove or put somewhere else\n",
+    "You can also use our STSBenchmark utils to automatically download, extract, and persist the data. You can then load the sanitized data as a pandas DataFrame in one line. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from utils_nlp.dataset.stsbenchmark import STSBenchmark"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "418kB [00:04, 72.1kB/s]                            \n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data downloaded to ../../data/raw/stsbenchmark\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Constructing the object downloads, extracts and cleans the data when the clean split file is missing\n",
+    "sts_dev = STSBenchmark(which_split=\"dev\", base_data_path=BASE_DATA_PATH)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1500 sentence pairs\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>s1</th>\n",
+       "      <th>s2</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>A man with a hard hat is dancing.</td>\n",
+       "      <td>A man wearing a hard hat is dancing.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>A young child is riding a horse.</td>\n",
+       "      <td>A child is riding a horse.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>A man is feeding a mouse to a snake.</td>\n",
+       "      <td>The man is feeding a mouse to the snake.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>A woman is playing the guitar.</td>\n",
+       "      <td>A man is playing guitar.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>A woman is playing the flute.</td>\n",
+       "      <td>A man is playing a flute.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>A woman is cutting an onion.</td>\n",
+       "      <td>A man is cutting onions.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>A man is erasing a chalk board.</td>\n",
+       "      <td>The man is erasing the chalk board.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>A woman is carrying a boy.</td>\n",
+       "      <td>A woman is carrying her baby.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Three men are playing guitars.</td>\n",
+       "      <td>Three men are on stage playing guitars.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>A woman peels a potato.</td>\n",
+       "      <td>A woman is peeling a potato.</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                     s1  \\\n",
+       "0     A man with a hard hat is dancing.   \n",
+       "1      A young child is riding a horse.   \n",
+       "2  A man is feeding a mouse to a snake.   \n",
+       "3        A woman is playing the guitar.   \n",
+       "4         A woman is playing the flute.   \n",
+       "5          A woman is cutting an onion.   \n",
+       "6       A man is erasing a chalk board.   \n",
+       "7            A woman is carrying a boy.   \n",
+       "8        Three men are playing guitars.   \n",
+       "9               A woman peels a potato.   \n",
+       "\n",
+       "                                         s2  \n",
+       "0      A man wearing a hard hat is dancing.  \n",
+       "1                A child is riding a horse.  \n",
+       "2  The man is feeding a mouse to the snake.  \n",
+       "3                  A man is playing guitar.  \n",
+       "4                 A man is playing a flute.  \n",
+       "5                  A man is cutting onions.  \n",
+       "6       The man is erasing the chalk board.  \n",
+       "7             A woman is carrying her baby.  \n",
+       "8   Three men are on stage playing guitars.  \n",
+       "9              A woman is peeling a potato.  "
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Load the cleaned dev split and report how many sentence pairs it holds\n",
+    "df = sts_dev.as_dataframe()\n",
+    "print(\"{} sentence pairs\".format(len(df)))\n",
+    "df.head(10)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/utils_nlp/dataset/stsbenchmark.py
+++ b/utils_nlp/dataset/stsbenchmark.py
@ -0,0 +1,60 @@
+import os
+import tarfile
+import pandas as pd
+import azureml.dataprep as dp
+
+from utils_nlp.dataset.url_utils import maybe_download
+
+"""
+Download and extract data from http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz 
+"""
+def download_sts(dirpath):
+	sts_url = "http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz"
+	filepath = maybe_download(sts_url, work_directory = dirpath)
+	extracted_path = extract_sts(filepath, target_dirpath = dirpath, tmode = "r:gz")
+	print("Data downloaded to {}".format(extracted_path))
+	return extracted_path
+
+"""
+Extract data from the sts tar.gz archive
+"""
+def extract_sts(tarpath, target_dirpath = ".", tmode = "r"):
+	with tarfile.open(tarpath, mode = tmode) as t:
+		t.extractall(target_dirpath)
+		extracted = t.getnames()[0]
+	os.remove(tarpath)
+	return os.path.join(target_dirpath, extracted)
+
+"""
+Drop columns containing irrelevant metadata and save as new csv files in the target_dir
+"""
+def clean_sts(filenames, src_dir, target_dir):
+	if not os.path.exists(target_dir):
+		os.makedirs(target_dir)
+	filepaths = [os.path.join(src_dir, f) for f in filenames]
+	for i,fp in enumerate(filepaths):
+		dat = dp.auto_read_file(path=fp)
+		s = dat.keep_columns(['Column6', 'Column7']).rename_columns({'Column6': 's1', 'Column7': 's2'})
+		sdf = s.to_pandas_dataframe().to_csv(os.path.join(target_dir, filenames[i]), sep='\t')
+
+class STSBenchmark():
+	def __init__(self, which_split, base_data_path = "./data"):
+		assert which_split in set(["train", "test", "dev"])
+		self.base_data_path = base_data_path
+		self.filepath = os.path.join(self.base_data_path, "clean", "stsbenchmark", "sts-{}.csv".format(which_split))
+		self._maybe_download_and_extract()
+
+	def _maybe_download_and_extract(self):
+		if not os.path.exists(self.filepath):
+			raw_path = os.path.join(self.base_data_path, "raw")
+			if not os.path.exists(raw_path):
+				os.makedirs(raw_path)
+			sts_path = download_sts(raw_path)
+			sts_files = [f for f in os.listdir(sts_path) if f.endswith(".csv")]
+			clean_sts(sts_files, sts_path, os.path.join(self.base_data_path, "clean", "stsbenchmark"))
+
+	def as_dflow(self):
+		return dp.auto_read_file(self.filepath).drop_columns('Column1')
+
+	def as_dataframe(self):
+		return dp.auto_read_file(self.filepath).drop_columns('Column1').to_pandas_dataframe()
--- a/utils_nlp/dataset/url_utils.py
+++ b/utils_nlp/dataset/url_utils.py
@ -0,0 +1,71 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import os
+from urllib.request import urlretrieve
+import logging
+from contextlib import contextmanager
+from tempfile import TemporaryDirectory
+from tqdm import tqdm
+
+
+log = logging.getLogger(__name__)
+
+
+class TqdmUpTo(tqdm):
+    """Wrapper class for the progress bar tqdm to get `update_to(n)` functionality"""
+
+    def update_to(self, b=1, bsize=1, tsize=None):
+        """A progress bar showing how much is left to finish the opperation
+        
+        Args:
+            b (int): Number of blocks transferred so far.
+            bsize (int): Size of each block (in tqdm units).
+            tsize (int): Total size (in tqdm units). 
+        """
+        if tsize is not None:
+            self.total = tsize
+        self.update(b * bsize - self.n)  # will also set self.n = b * bsize
+
+
+def maybe_download(url, filename=None, work_directory=".", expected_bytes=None):
+    """Download a file if it is not already downloaded.
+    
+    Args:
+        filename (str): File name.
+        work_directory (str): Working directory.
+        url (str): URL of the file to download.
+        expected_bytes (int): Expected file size in bytes.
+
+    Returns:
+        str: File path of the file downloaded.
+    """
+    if filename is None:
+        filename = url.split("/")[-1]
+    filepath = os.path.join(work_directory, filename)
+    if not os.path.exists(filepath):
+        with TqdmUpTo(unit="B", unit_scale=True) as t:
+            filepath, _ = urlretrieve(url, filepath, reporthook=t.update_to)
+    else:
+        log.debug("File {} already downloaded".format(filepath))
+    if expected_bytes is not None:
+        statinfo = os.stat(filepath)
+        if statinfo.st_size != expected_bytes:
+            os.remove(filepath)
+            raise IOError("Failed to verify {}".format(filepath))
+
+    return filepath
+
+
+@contextmanager
+def download_path(path):
+    tmp_dir = TemporaryDirectory()
+    if path is None:
+        path = tmp_dir.name
+    else:
+        path = os.path.realpath(path)
+
+    try:
+        yield path
+    finally:
+        tmp_dir.cleanup()