From f06630a55d13b0785cbfc056db9b2fec54503b58 Mon Sep 17 00:00:00 2001 From: Casey Hong Date: Mon, 15 Apr 2019 17:45:40 -0400 Subject: [PATCH] Download and clean stsbenchmark data --- examples/01-prep-data/sts_load.ipynb | 954 +++++++++++++++++++++++++++ utils_nlp/dataset/stsbenchmark.py | 60 ++ utils_nlp/dataset/url_utils.py | 71 ++ 3 files changed, 1085 insertions(+) create mode 100644 examples/01-prep-data/sts_load.ipynb create mode 100644 utils_nlp/dataset/stsbenchmark.py create mode 100644 utils_nlp/dataset/url_utils.py diff --git a/examples/01-prep-data/sts_load.ipynb b/examples/01-prep-data/sts_load.ipynb new file mode 100644 index 0000000..727c9c5 --- /dev/null +++ b/examples/01-prep-data/sts_load.ipynb @@ -0,0 +1,954 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Load & Prep\n", + "In this notebook we show how to download the [STS Benchmark](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) data and prepare it for pre-processing. Because open-source data may have been curated for tasks that differ slightly from our own, it is useful to do some basic preliminary data exploration and clean the data before moving forward in the NLP pipeline. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 00 Global Settings" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n", + "[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.path.append(\"../../\") ## set the environment path\n", + "\n", + "import os\n", + "\n", + "import pandas as pd\n", + "import azureml.dataprep as dp\n", + "\n", + "from utils_nlp.dataset.url_utils import maybe_download\n", + "from utils_nlp.dataset.stsbenchmark import extract_sts\n", + "\n", + "print(\"System version: {}\".format(sys.version))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "STS_URL = \"http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz\"\n", + "BASE_DATA_PATH = \"../../data\"\n", + "RAW_DATA_PATH = os.path.join(BASE_DATA_PATH, \"raw\")\n", + "CLEAN_DATA_PATH = os.path.join(BASE_DATA_PATH, \"clean\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 01 Data Download" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make a directory for THE data if it doesn't already exist, and then download." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists(RAW_DATA_PATH):\n", + " os.makedirs(RAW_DATA_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def download_sts(url, dirpath):\n", + " zipfile = maybe_download(url, work_directory = dirpath)\n", + " unzipped = extract_sts(zipfile, target_dirpath = dirpath, tmode = \"r:gz\")\n", + " return zipfile, unzipped" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "418kB [00:02, 208kB/s] " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data downloaded to ../../data/raw/stsbenchmark\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "tarfile, datapath = download_sts(STS_URL, RAW_DATA_PATH)\n", + "print(\"Data downloaded to {}\".format(datapath))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 02 Data Understanding\n", + "In this section we show how to: \n", + "* load raw data into a dataframe\n", + "* peek into the first n rows" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One way to do this is by checking the filetypes of the data we've downloaded and utilizing the appropriate pandas `read` function." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['sts-test.csv', 'sts-dev.csv', 'readme.txt', 'correlation.pl', 'LICENSE.txt', 'sts-train.csv']\n" + ] + } + ], + "source": [ + "print(os.listdir(datapath))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Because the data is in csv format, we can use the pandas `read_csv` function." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "ename": "ParserError", + "evalue": "Error tokenizing data. C error: Expected 7 fields in line 2508, saw 8\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m--------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mParserError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m## TODO figure out how to integrate the runtools extension that lets you run the entire NB at once, skipping errors\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdatapath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"sts-train.csv\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'\\t'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda2/envs/nlp/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, 
dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)\u001b[0m\n\u001b[1;32m 700\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[1;32m 701\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 702\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 703\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 704\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda2/envs/nlp/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 433\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 434\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 435\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 436\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 437\u001b[0m \u001b[0mparser\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda2/envs/nlp/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, 
nrows)\u001b[0m\n\u001b[1;32m 1137\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1138\u001b[0m \u001b[0mnrows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_validate_integer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'nrows'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1139\u001b[0;31m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1140\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1141\u001b[0m \u001b[0;31m# May alter columns / col_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda2/envs/nlp/lib/python3.6/site-packages/pandas/io/parsers.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1993\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1994\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1995\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1996\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1997\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_first_chunk\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_low_memory\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._tokenize_rows\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.raise_parser_error\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mParserError\u001b[0m: Error tokenizing data. C error: Expected 7 fields in line 2508, saw 8\n" + ] + } + ], + "source": [ + "## TODO figure out how to integrate the runtools extension that lets you run the entire NB at once, skipping errors\n", + "df = pd.read_csv(os.path.join(datapath, \"sts-train.csv\"), sep='\\t')\n", + "df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that this throws a parsing error: \"Expected 7 fields, saw 8\". One workaround is as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456
0main-captionsMSRvid2012test15.00A plane is taking off.An air plane is taking off.
1main-captionsMSRvid2012test43.80A man is playing a large flute.A man is playing a flute.
2main-captionsMSRvid2012test53.80A man is spreading shreded cheese on a pizza.A man is spreading shredded cheese on an uncoo...
3main-captionsMSRvid2012test62.60Three men are playing chess.Two men are playing chess.
4main-captionsMSRvid2012test94.25A man is playing the cello.A man seated is playing the cello.
5main-captionsMSRvid2012test114.25Some men are fighting.Two men are fighting.
6main-captionsMSRvid2012test120.50A man is smoking.A man is skating.
7main-captionsMSRvid2012test131.60The man is playing the piano.The man is playing the guitar.
8main-captionsMSRvid2012test142.20A man is playing on a guitar and singing.A woman is playing an acoustic guitar and sing...
9main-captionsMSRvid2012test165.00A person is throwing a cat on to the ceiling.A person throws a cat on the ceiling.
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 \\\n", + "0 main-captions MSRvid 2012test 1 5.00 \n", + "1 main-captions MSRvid 2012test 4 3.80 \n", + "2 main-captions MSRvid 2012test 5 3.80 \n", + "3 main-captions MSRvid 2012test 6 2.60 \n", + "4 main-captions MSRvid 2012test 9 4.25 \n", + "5 main-captions MSRvid 2012test 11 4.25 \n", + "6 main-captions MSRvid 2012test 12 0.50 \n", + "7 main-captions MSRvid 2012test 13 1.60 \n", + "8 main-captions MSRvid 2012test 14 2.20 \n", + "9 main-captions MSRvid 2012test 16 5.00 \n", + "\n", + " 5 \\\n", + "0 A plane is taking off. \n", + "1 A man is playing a large flute. \n", + "2 A man is spreading shreded cheese on a pizza. \n", + "3 Three men are playing chess. \n", + "4 A man is playing the cello. \n", + "5 Some men are fighting. \n", + "6 A man is smoking. \n", + "7 The man is playing the piano. \n", + "8 A man is playing on a guitar and singing. \n", + "9 A person is throwing a cat on to the ceiling. \n", + "\n", + " 6 \n", + "0 An air plane is taking off. \n", + "1 A man is playing a flute. \n", + "2 A man is spreading shredded cheese on an uncoo... \n", + "3 Two men are playing chess. \n", + "4 A man seated is playing the cello. \n", + "5 Two men are fighting. \n", + "6 A man is skating. \n", + "7 The man is playing the guitar. \n", + "8 A woman is playing an acoustic guitar and sing... \n", + "9 A person throws a cat on the ceiling. " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(os.path.join(datapath, \"sts-train.csv\"), sep='\\t', names=list(range(7)))\n", + "df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We could alternatively use a `read` function that has built-in automatic filetype inference:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Column1Column2Column3Column4Column5Column6Column7
0main-captionsMSRvid2012test15.00A plane is taking off.An air plane is taking off.
1main-captionsMSRvid2012test43.80A man is playing a large flute.A man is playing a flute.
2main-captionsMSRvid2012test53.80A man is spreading shreded cheese on a pizza.A man is spreading shredded cheese on an uncoo...
3main-captionsMSRvid2012test62.60Three men are playing chess.Two men are playing chess.
4main-captionsMSRvid2012test94.25A man is playing the cello.A man seated is playing the cello.
5main-captionsMSRvid2012test114.25Some men are fighting.Two men are fighting.
6main-captionsMSRvid2012test120.50A man is smoking.A man is skating.
7main-captionsMSRvid2012test131.60The man is playing the piano.The man is playing the guitar.
8main-captionsMSRvid2012test142.20A man is playing on a guitar and singing.A woman is playing an acoustic guitar and sing...
9main-captionsMSRvid2012test165.00A person is throwing a cat on to the ceiling.A person throws a cat on the ceiling.
\n", + "
" + ], + "text/plain": [ + " Column1 Column2 Column3 Column4 Column5 \\\n", + "0 main-captions MSRvid 2012test 1 5.00 \n", + "1 main-captions MSRvid 2012test 4 3.80 \n", + "2 main-captions MSRvid 2012test 5 3.80 \n", + "3 main-captions MSRvid 2012test 6 2.60 \n", + "4 main-captions MSRvid 2012test 9 4.25 \n", + "5 main-captions MSRvid 2012test 11 4.25 \n", + "6 main-captions MSRvid 2012test 12 0.50 \n", + "7 main-captions MSRvid 2012test 13 1.60 \n", + "8 main-captions MSRvid 2012test 14 2.20 \n", + "9 main-captions MSRvid 2012test 16 5.00 \n", + "\n", + " Column6 \\\n", + "0 A plane is taking off. \n", + "1 A man is playing a large flute. \n", + "2 A man is spreading shreded cheese on a pizza. \n", + "3 Three men are playing chess. \n", + "4 A man is playing the cello. \n", + "5 Some men are fighting. \n", + "6 A man is smoking. \n", + "7 The man is playing the piano. \n", + "8 A man is playing on a guitar and singing. \n", + "9 A person is throwing a cat on to the ceiling. \n", + "\n", + " Column7 \n", + "0 An air plane is taking off. \n", + "1 A man is playing a flute. \n", + "2 A man is spreading shredded cheese on an uncoo... \n", + "3 Two men are playing chess. \n", + "4 A man seated is playing the cello. \n", + "5 Two men are fighting. \n", + "6 A man is skating. \n", + "7 The man is playing the guitar. \n", + "8 A woman is playing an acoustic guitar and sing... \n", + "9 A person throws a cat on the ceiling. " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dflow = dp.auto_read_file(path=os.path.join(datapath, \"sts-train.csv\"))\n", + "dflow.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `auto_read_file` function from the AzureML Data Prep module actually returns a `Dataflow` object, which you can read more about [here](https://docs.microsoft.com/en-us/python/api/azureml-dataprep/azureml.dataprep.dataflow?view=azure-dataprep-py). 
We can easily transfer the data into a Pandas DataFrame (as before) in a single line using the `to_pandas_dataframe` function, or we can continue manipulating the data as a Dataflow object using the AzureML Data Prep API. For the remainder of this notebook we will be doing the latter." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 03 Data Cleaning\n", + "Now that we know about the general shape of the data, we can clean it so that it is ready for further preprocessing. The main operation we need for the STS Benchmark data is to drop all of columns except for the sentence pairs." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
s1s2
0A plane is taking off.An air plane is taking off.
1A man is playing a large flute.A man is playing a flute.
2A man is spreading shreded cheese on a pizza.A man is spreading shredded cheese on an uncoo...
3Three men are playing chess.Two men are playing chess.
4A man is playing the cello.A man seated is playing the cello.
5Some men are fighting.Two men are fighting.
6A man is smoking.A man is skating.
7The man is playing the piano.The man is playing the guitar.
8A man is playing on a guitar and singing.A woman is playing an acoustic guitar and sing...
9A person is throwing a cat on to the ceiling.A person throws a cat on the ceiling.
\n", + "
" + ], + "text/plain": [ + " s1 \\\n", + "0 A plane is taking off. \n", + "1 A man is playing a large flute. \n", + "2 A man is spreading shreded cheese on a pizza. \n", + "3 Three men are playing chess. \n", + "4 A man is playing the cello. \n", + "5 Some men are fighting. \n", + "6 A man is smoking. \n", + "7 The man is playing the piano. \n", + "8 A man is playing on a guitar and singing. \n", + "9 A person is throwing a cat on to the ceiling. \n", + "\n", + " s2 \n", + "0 An air plane is taking off. \n", + "1 A man is playing a flute. \n", + "2 A man is spreading shredded cheese on an uncoo... \n", + "3 Two men are playing chess. \n", + "4 A man seated is playing the cello. \n", + "5 Two men are fighting. \n", + "6 A man is skating. \n", + "7 The man is playing the guitar. \n", + "8 A woman is playing an acoustic guitar and sing... \n", + "9 A person throws a cat on the ceiling. " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sentences = dflow.keep_columns(['Column6', 'Column7']).rename_columns({'Column6': 's1', 'Column7': 's2'})\n", + "sentences.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will want to do this for all the datasets (train, dev, and test) and then persist the results into a new clean directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def clean_sts(src_dir, filenames, target_dir):\n", + " if not os.path.exists(target_dir):\n", + " os.makedirs(target_dir)\n", + " filepaths = [os.path.join(src_dir, f) for f in filenames]\n", + " for i,fp in enumerate(filepaths):\n", + " dat = dp.auto_read_file(path=fp)\n", + " s = dat.keep_columns(['Column6', 'Column7']).rename_columns({'Column6': 's1', 'Column7': 's2'})\n", + " sdf = s.to_pandas_dataframe().to_csv(os.path.join(target_dir, filenames[i]), sep='\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "sts_files = [f for f in os.listdir(os.path.join(RAW_DATA_PATH, \"stsbenchmark\")) if f.endswith(\".csv\")]\n", + "clean_sts(os.path.join(RAW_DATA_PATH, \"stsbenchmark\"), sts_files, os.path.join(CLEAN_DATA_PATH, \"stsbenchmark\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TODO: Remove or put somewhere else\n", + "You can also use our STSBenchmark utils to automatically download, extract, and persist the data. You can then load the sanitized data as a pandas DataFrame in one line. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from utils_nlp.dataset.stsbenchmark import STSBenchmark" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "418kB [00:04, 72.1kB/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data downloaded to ../../data/raw/stsbenchmark\n" + ] + } + ], + "source": [ + "# Initializing this instance runs the downloader and extractor behind the scenes\n", + "sts_dev = STSBenchmark(\"dev\", base_data_path=BASE_DATA_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1500 sentence pairs\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
s1s2
0A man with a hard hat is dancing.A man wearing a hard hat is dancing.
1A young child is riding a horse.A child is riding a horse.
2A man is feeding a mouse to a snake.The man is feeding a mouse to the snake.
3A woman is playing the guitar.A man is playing guitar.
4A woman is playing the flute.A man is playing a flute.
5A woman is cutting an onion.A man is cutting onions.
6A man is erasing a chalk board.The man is erasing the chalk board.
7A woman is carrying a boy.A woman is carrying her baby.
8Three men are playing guitars.Three men are on stage playing guitars.
9A woman peels a potato.A woman is peeling a potato.
\n", + "
" + ], + "text/plain": [ + " s1 \\\n", + "0 A man with a hard hat is dancing. \n", + "1 A young child is riding a horse. \n", + "2 A man is feeding a mouse to a snake. \n", + "3 A woman is playing the guitar. \n", + "4 A woman is playing the flute. \n", + "5 A woman is cutting an onion. \n", + "6 A man is erasing a chalk board. \n", + "7 A woman is carrying a boy. \n", + "8 Three men are playing guitars. \n", + "9 A woman peels a potato. \n", + "\n", + " s2 \n", + "0 A man wearing a hard hat is dancing. \n", + "1 A child is riding a horse. \n", + "2 The man is feeding a mouse to the snake. \n", + "3 A man is playing guitar. \n", + "4 A man is playing a flute. \n", + "5 A man is cutting onions. \n", + "6 The man is erasing the chalk board. \n", + "7 A woman is carrying her baby. \n", + "8 Three men are on stage playing guitars. \n", + "9 A woman is peeling a potato. " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = sts_dev.as_dataframe()\n", + "print(\"{} sentence pairs\".format(df.shape[0]))\n", + "df.head(10)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/utils_nlp/dataset/stsbenchmark.py b/utils_nlp/dataset/stsbenchmark.py new file mode 100644 index 0000000..1da377e --- /dev/null +++ b/utils_nlp/dataset/stsbenchmark.py @@ -0,0 +1,60 @@ +import os +import tarfile +import pandas as pd +import azureml.dataprep as dp + +from utils_nlp.dataset.url_utils import maybe_download + +""" +Download and extract data from http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz +""" +def download_sts(dirpath): + 
def download_sts(dirpath):
    """Download and extract the STS Benchmark data.

    Source: http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz

    Args:
        dirpath (str): Directory to download the archive into.

    Returns:
        str: Path of the directory the archive was extracted to.
    """
    sts_url = "http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz"
    filepath = maybe_download(sts_url, work_directory=dirpath)
    extracted_path = extract_sts(filepath, target_dirpath=dirpath, tmode="r:gz")
    print("Data downloaded to {}".format(extracted_path))
    return extracted_path


def extract_sts(tarpath, target_dirpath=".", tmode="r"):
    """Extract the STS tar archive, then delete the archive file.

    Args:
        tarpath (str): Path of the tar archive.
        target_dirpath (str): Directory to extract into.
        tmode (str): Mode passed to ``tarfile.open`` (e.g. "r:gz").

    Returns:
        str: Path of the archive's first top-level entry after extraction
            (the "stsbenchmark" directory for the STS archive).
    """
    with tarfile.open(tarpath, mode=tmode) as t:
        t.extractall(target_dirpath)
        # Assumes the first archive member is the single top-level directory;
        # this holds for the STS Benchmark archive — TODO confirm for others.
        extracted = t.getnames()[0]
    # Delete the archive only after the tarfile handle is closed: removing
    # a still-open file fails on Windows.
    os.remove(tarpath)
    return os.path.join(target_dirpath, extracted)


def clean_sts(filenames, src_dir, target_dir):
    """Drop metadata columns and persist the sentence pairs as new csv files.

    Keeps only Column6/Column7 (the two sentences), renamed to s1/s2, and
    writes one tab-separated file per input file into ``target_dir``.

    Args:
        filenames (list of str): csv file names to clean.
        src_dir (str): Directory containing the raw files.
        target_dir (str): Directory the cleaned files are written to
            (created if missing).
    """
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    for fname in filenames:
        dat = dp.auto_read_file(path=os.path.join(src_dir, fname))
        pairs = dat.keep_columns(["Column6", "Column7"]).rename_columns(
            {"Column6": "s1", "Column7": "s2"}
        )
        # NOTE: the pandas index written here becomes "Column1" when the file
        # is read back; STSBenchmark.as_dflow/as_dataframe rely on dropping it.
        pairs.to_pandas_dataframe().to_csv(os.path.join(target_dir, fname), sep="\t")


class STSBenchmark:
    """Loader for one split of the cleaned STS Benchmark data.

    Instantiating runs the downloader, extractor and cleaner behind the
    scenes if the cleaned split file is not already present.
    """

    def __init__(self, which_split, base_data_path="./data"):
        """
        Args:
            which_split (str): One of "train", "test" or "dev".
            base_data_path (str): Root data directory; raw/ and clean/
                subdirectories live under it.

        Raises:
            ValueError: If ``which_split`` is not a known split name.
        """
        # raise instead of assert so validation survives `python -O`
        if which_split not in ("train", "test", "dev"):
            raise ValueError(
                "which_split must be 'train', 'test' or 'dev', got {!r}".format(which_split)
            )
        self.base_data_path = base_data_path
        self.filepath = os.path.join(
            self.base_data_path, "clean", "stsbenchmark", "sts-{}.csv".format(which_split)
        )
        self._maybe_download_and_extract()

    def _maybe_download_and_extract(self):
        # Download, extract and clean only when the cleaned split file is missing.
        if not os.path.exists(self.filepath):
            raw_path = os.path.join(self.base_data_path, "raw")
            if not os.path.exists(raw_path):
                os.makedirs(raw_path)
            sts_path = download_sts(raw_path)
            sts_files = [f for f in os.listdir(sts_path) if f.endswith(".csv")]
            clean_sts(sts_files, sts_path, os.path.join(self.base_data_path, "clean", "stsbenchmark"))

    def as_dflow(self):
        """Return the split as an azureml.dataprep Dataflow (index column dropped)."""
        return dp.auto_read_file(self.filepath).drop_columns("Column1")

    def as_dataframe(self):
        """Return the split as a pandas DataFrame (index column dropped)."""
        return dp.auto_read_file(self.filepath).drop_columns("Column1").to_pandas_dataframe()
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import os
from urllib.request import urlretrieve
import logging
from contextlib import contextmanager
from tempfile import TemporaryDirectory
from tqdm import tqdm


log = logging.getLogger(__name__)


class TqdmUpTo(tqdm):
    """tqdm wrapper exposing ``update_to``, usable as a urlretrieve reporthook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Update the progress bar from block-transfer callback arguments.

        Args:
            b (int): Number of blocks transferred so far.
            bsize (int): Size of each block (in tqdm units).
            tsize (int): Total size (in tqdm units), or None if unknown.
        """
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)  # will also set self.n = b * bsize


def maybe_download(url, filename=None, work_directory=".", expected_bytes=None):
    """Download a file if it is not already downloaded.

    Args:
        url (str): URL of the file to download.
        filename (str): Target file name; defaults to the last segment of the URL.
        work_directory (str): Directory to place the file in.
        expected_bytes (int): Expected file size in bytes; if given, the
            downloaded size is verified.

    Returns:
        str: File path of the file downloaded.

    Raises:
        IOError: If ``expected_bytes`` is given and the size does not match
            (the file is removed before raising).
    """
    if filename is None:
        filename = url.split("/")[-1]
    filepath = os.path.join(work_directory, filename)
    if not os.path.exists(filepath):
        # Download to a temporary name first so an interrupted transfer never
        # leaves a partial file that later calls would treat as a valid cache.
        partial = filepath + ".part"
        with TqdmUpTo(unit="B", unit_scale=True) as t:
            urlretrieve(url, partial, reporthook=t.update_to)
        os.replace(partial, filepath)
    else:
        log.debug("File {} already downloaded".format(filepath))
    if expected_bytes is not None:
        statinfo = os.stat(filepath)
        if statinfo.st_size != expected_bytes:
            os.remove(filepath)
            raise IOError("Failed to verify {}".format(filepath))

    return filepath


@contextmanager
def download_path(path):
    """Yield a directory to download into.

    If ``path`` is None, a temporary directory is created and its name is
    yielded; otherwise the real path of ``path`` is yielded unchanged. The
    temporary directory is cleaned up on exit either way.
    """
    tmp_dir = TemporaryDirectory()
    if path is None:
        path = tmp_dir.name
    else:
        path = os.path.realpath(path)

    try:
        yield path
    finally:
        tmp_dir.cleanup()