feat(data_prep) SNLI notebook showcasing data prep, Corrected nltk util for column_name

This commit is contained in:
Janhavi Mahajan 2019-04-17 18:01:08 -04:00 коммит произвёл Casey Hong
Родитель 3964c04a7c
Коммит 6e46eade15
2 изменённых файлов: 926 добавлений и 2 удалений

Просмотреть файл

@ -0,0 +1,924 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SNLI Data Preparation notebook\n",
"\n",
"This notebook displays how to load SNLI dataset using provided util functions. \n",
"\n",
"## 0 Global Settings"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\n"
]
}
],
"source": [
"import sys\n",
"sys.path.append(\"../../\")\n",
"\n",
"from utils_nlp.dataset import snli\n",
"\n",
"print(\"System version: {}\".format(sys.version))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"DATA_DIR_PATH = '../../data'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1 Load SNLI Dataset into pandas dataframe\n",
"\n",
"\n",
"SNLI zip contains [snli_1.0_dev.txt, snli_1.0_train.txt, snli_1.0_test.tx, snli_1.0_dev.jsonl, snli_1.0_train.jsonl, snli_1.0_test.jsonl]\n",
"\n",
"Below are the steps when load_pandas_df() is called.<br>\n",
"\n",
"- Download snli zip file at specified directory location.\n",
"- Extracts the file based on the split mentioned.\n",
"- Loads the split into a pandas dataframe. "
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# defaults to txt\n",
"df = snli.load_pandas_df(DATA_DIR_PATH, 'train')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Glimpse of data"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>gold_label</th>\n",
" <th>sentence1_binary_parse</th>\n",
" <th>sentence2_binary_parse</th>\n",
" <th>sentence1_parse</th>\n",
" <th>sentence2_parse</th>\n",
" <th>sentence1</th>\n",
" <th>sentence2</th>\n",
" <th>captionID</th>\n",
" <th>pairID</th>\n",
" <th>label1</th>\n",
" <th>label2</th>\n",
" <th>label3</th>\n",
" <th>label4</th>\n",
" <th>label5</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>neutral</td>\n",
" <td>( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...</td>\n",
" <td>( ( A person ) ( ( is ( ( training ( his horse...</td>\n",
" <td>(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...</td>\n",
" <td>(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...</td>\n",
" <td>A person on a horse jumps over a broken down a...</td>\n",
" <td>A person is training his horse for a competition.</td>\n",
" <td>3416050480.jpg#4</td>\n",
" <td>3416050480.jpg#4r1n</td>\n",
" <td>neutral</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>contradiction</td>\n",
" <td>( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...</td>\n",
" <td>( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...</td>\n",
" <td>(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...</td>\n",
" <td>(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...</td>\n",
" <td>A person on a horse jumps over a broken down a...</td>\n",
" <td>A person is at a diner, ordering an omelette.</td>\n",
" <td>3416050480.jpg#4</td>\n",
" <td>3416050480.jpg#4r1c</td>\n",
" <td>contradiction</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>entailment</td>\n",
" <td>( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...</td>\n",
" <td>( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...</td>\n",
" <td>(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...</td>\n",
" <td>(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...</td>\n",
" <td>A person on a horse jumps over a broken down a...</td>\n",
" <td>A person is outdoors, on a horse.</td>\n",
" <td>3416050480.jpg#4</td>\n",
" <td>3416050480.jpg#4r1e</td>\n",
" <td>entailment</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>neutral</td>\n",
" <td>( Children ( ( ( smiling and ) waving ) ( at c...</td>\n",
" <td>( They ( are ( smiling ( at ( their parents ) ...</td>\n",
" <td>(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...</td>\n",
" <td>(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...</td>\n",
" <td>Children smiling and waving at camera</td>\n",
" <td>They are smiling at their parents</td>\n",
" <td>2267923837.jpg#2</td>\n",
" <td>2267923837.jpg#2r1n</td>\n",
" <td>neutral</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>entailment</td>\n",
" <td>( Children ( ( ( smiling and ) waving ) ( at c...</td>\n",
" <td>( There ( ( are children ) present ) )</td>\n",
" <td>(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...</td>\n",
" <td>(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...</td>\n",
" <td>Children smiling and waving at camera</td>\n",
" <td>There are children present</td>\n",
" <td>2267923837.jpg#2</td>\n",
" <td>2267923837.jpg#2r1e</td>\n",
" <td>entailment</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" gold_label sentence1_binary_parse \\\n",
"0 neutral ( ( ( A person ) ( on ( a horse ) ) ) ( ( jump... \n",
"1 contradiction ( ( ( A person ) ( on ( a horse ) ) ) ( ( jump... \n",
"2 entailment ( ( ( A person ) ( on ( a horse ) ) ) ( ( jump... \n",
"3 neutral ( Children ( ( ( smiling and ) waving ) ( at c... \n",
"4 entailment ( Children ( ( ( smiling and ) waving ) ( at c... \n",
"\n",
" sentence2_binary_parse \\\n",
"0 ( ( A person ) ( ( is ( ( training ( his horse... \n",
"1 ( ( A person ) ( ( ( ( is ( at ( a diner ) ) )... \n",
"2 ( ( A person ) ( ( ( ( is outdoors ) , ) ( on ... \n",
"3 ( They ( are ( smiling ( at ( their parents ) ... \n",
"4 ( There ( ( are children ) present ) ) \n",
"\n",
" sentence1_parse \\\n",
"0 (ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o... \n",
"1 (ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o... \n",
"2 (ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o... \n",
"3 (ROOT (NP (S (NP (NNP Children)) (VP (VBG smil... \n",
"4 (ROOT (NP (S (NP (NNP Children)) (VP (VBG smil... \n",
"\n",
" sentence2_parse \\\n",
"0 (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ... \n",
"1 (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ... \n",
"2 (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ... \n",
"3 (ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB... \n",
"4 (ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN... \n",
"\n",
" sentence1 \\\n",
"0 A person on a horse jumps over a broken down a... \n",
"1 A person on a horse jumps over a broken down a... \n",
"2 A person on a horse jumps over a broken down a... \n",
"3 Children smiling and waving at camera \n",
"4 Children smiling and waving at camera \n",
"\n",
" sentence2 captionID \\\n",
"0 A person is training his horse for a competition. 3416050480.jpg#4 \n",
"1 A person is at a diner, ordering an omelette. 3416050480.jpg#4 \n",
"2 A person is outdoors, on a horse. 3416050480.jpg#4 \n",
"3 They are smiling at their parents 2267923837.jpg#2 \n",
"4 There are children present 2267923837.jpg#2 \n",
"\n",
" pairID label1 label2 label3 label4 label5 \n",
"0 3416050480.jpg#4r1n neutral NaN NaN NaN NaN \n",
"1 3416050480.jpg#4r1c contradiction NaN NaN NaN NaN \n",
"2 3416050480.jpg#4r1e entailment NaN NaN NaN NaN \n",
"3 2267923837.jpg#2r1n neutral NaN NaN NaN NaN \n",
"4 2267923837.jpg#2r1e entailment NaN NaN NaN NaN "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## load pandas df with snli_1.0_dev.txt "
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"txt_df = snli.load_pandas_df(DATA_DIR_PATH, 'dev', 'txt')"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>gold_label</th>\n",
" <th>sentence1_binary_parse</th>\n",
" <th>sentence2_binary_parse</th>\n",
" <th>sentence1_parse</th>\n",
" <th>sentence2_parse</th>\n",
" <th>sentence1</th>\n",
" <th>sentence2</th>\n",
" <th>captionID</th>\n",
" <th>pairID</th>\n",
" <th>label1</th>\n",
" <th>label2</th>\n",
" <th>label3</th>\n",
" <th>label4</th>\n",
" <th>label5</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>neutral</td>\n",
" <td>( ( Two women ) ( ( are ( embracing ( while ( ...</td>\n",
" <td>( ( The sisters ) ( ( are ( ( hugging goodbye ...</td>\n",
" <td>(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar...</td>\n",
" <td>(ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP ...</td>\n",
" <td>Two women are embracing while holding to go pa...</td>\n",
" <td>The sisters are hugging goodbye while holding ...</td>\n",
" <td>4705552913.jpg#2</td>\n",
" <td>4705552913.jpg#2r1n</td>\n",
" <td>neutral</td>\n",
" <td>entailment</td>\n",
" <td>neutral</td>\n",
" <td>neutral</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>entailment</td>\n",
" <td>( ( Two women ) ( ( are ( embracing ( while ( ...</td>\n",
" <td>( ( Two woman ) ( ( are ( holding packages ) )...</td>\n",
" <td>(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar...</td>\n",
" <td>(ROOT (S (NP (CD Two) (NN woman)) (VP (VBP are...</td>\n",
" <td>Two women are embracing while holding to go pa...</td>\n",
" <td>Two woman are holding packages.</td>\n",
" <td>4705552913.jpg#2</td>\n",
" <td>4705552913.jpg#2r1e</td>\n",
" <td>entailment</td>\n",
" <td>entailment</td>\n",
" <td>entailment</td>\n",
" <td>entailment</td>\n",
" <td>entailment</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>contradiction</td>\n",
" <td>( ( Two women ) ( ( are ( embracing ( while ( ...</td>\n",
" <td>( ( The men ) ( ( are ( fighting ( outside ( a...</td>\n",
" <td>(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar...</td>\n",
" <td>(ROOT (S (NP (DT The) (NNS men)) (VP (VBP are)...</td>\n",
" <td>Two women are embracing while holding to go pa...</td>\n",
" <td>The men are fighting outside a deli.</td>\n",
" <td>4705552913.jpg#2</td>\n",
" <td>4705552913.jpg#2r1c</td>\n",
" <td>contradiction</td>\n",
" <td>contradiction</td>\n",
" <td>contradiction</td>\n",
" <td>contradiction</td>\n",
" <td>contradiction</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>entailment</td>\n",
" <td>( ( ( Two ( young children ) ) ( in ( ( ( ( ( ...</td>\n",
" <td>( ( ( Two kids ) ( in ( numbered jerseys ) ) )...</td>\n",
" <td>(ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil...</td>\n",
" <td>(ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ...</td>\n",
" <td>Two young children in blue jerseys, one with t...</td>\n",
" <td>Two kids in numbered jerseys wash their hands.</td>\n",
" <td>2407214681.jpg#0</td>\n",
" <td>2407214681.jpg#0r1e</td>\n",
" <td>entailment</td>\n",
" <td>entailment</td>\n",
" <td>entailment</td>\n",
" <td>entailment</td>\n",
" <td>entailment</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>neutral</td>\n",
" <td>( ( ( Two ( young children ) ) ( in ( ( ( ( ( ...</td>\n",
" <td>( ( ( Two kids ) ( at ( a ballgame ) ) ) ( ( w...</td>\n",
" <td>(ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil...</td>\n",
" <td>(ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ...</td>\n",
" <td>Two young children in blue jerseys, one with t...</td>\n",
" <td>Two kids at a ballgame wash their hands.</td>\n",
" <td>2407214681.jpg#0</td>\n",
" <td>2407214681.jpg#0r1n</td>\n",
" <td>neutral</td>\n",
" <td>neutral</td>\n",
" <td>neutral</td>\n",
" <td>entailment</td>\n",
" <td>entailment</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" gold_label sentence1_binary_parse \\\n",
"0 neutral ( ( Two women ) ( ( are ( embracing ( while ( ... \n",
"1 entailment ( ( Two women ) ( ( are ( embracing ( while ( ... \n",
"2 contradiction ( ( Two women ) ( ( are ( embracing ( while ( ... \n",
"3 entailment ( ( ( Two ( young children ) ) ( in ( ( ( ( ( ... \n",
"4 neutral ( ( ( Two ( young children ) ) ( in ( ( ( ( ( ... \n",
"\n",
" sentence2_binary_parse \\\n",
"0 ( ( The sisters ) ( ( are ( ( hugging goodbye ... \n",
"1 ( ( Two woman ) ( ( are ( holding packages ) )... \n",
"2 ( ( The men ) ( ( are ( fighting ( outside ( a... \n",
"3 ( ( ( Two kids ) ( in ( numbered jerseys ) ) )... \n",
"4 ( ( ( Two kids ) ( at ( a ballgame ) ) ) ( ( w... \n",
"\n",
" sentence1_parse \\\n",
"0 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n",
"1 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n",
"2 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n",
"3 (ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil... \n",
"4 (ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil... \n",
"\n",
" sentence2_parse \\\n",
"0 (ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP ... \n",
"1 (ROOT (S (NP (CD Two) (NN woman)) (VP (VBP are... \n",
"2 (ROOT (S (NP (DT The) (NNS men)) (VP (VBP are)... \n",
"3 (ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ... \n",
"4 (ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ... \n",
"\n",
" sentence1 \\\n",
"0 Two women are embracing while holding to go pa... \n",
"1 Two women are embracing while holding to go pa... \n",
"2 Two women are embracing while holding to go pa... \n",
"3 Two young children in blue jerseys, one with t... \n",
"4 Two young children in blue jerseys, one with t... \n",
"\n",
" sentence2 captionID \\\n",
"0 The sisters are hugging goodbye while holding ... 4705552913.jpg#2 \n",
"1 Two woman are holding packages. 4705552913.jpg#2 \n",
"2 The men are fighting outside a deli. 4705552913.jpg#2 \n",
"3 Two kids in numbered jerseys wash their hands. 2407214681.jpg#0 \n",
"4 Two kids at a ballgame wash their hands. 2407214681.jpg#0 \n",
"\n",
" pairID label1 label2 label3 \\\n",
"0 4705552913.jpg#2r1n neutral entailment neutral \n",
"1 4705552913.jpg#2r1e entailment entailment entailment \n",
"2 4705552913.jpg#2r1c contradiction contradiction contradiction \n",
"3 2407214681.jpg#0r1e entailment entailment entailment \n",
"4 2407214681.jpg#0r1n neutral neutral neutral \n",
"\n",
" label4 label5 \n",
"0 neutral neutral \n",
"1 entailment entailment \n",
"2 contradiction contradiction \n",
"3 entailment entailment \n",
"4 entailment entailment "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"txt_df.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## load pandas df with snli_1.0_dev.jsonl "
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"jsonl_df = snli.load_pandas_df(DATA_DIR_PATH, 'dev', 'jsonl')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>annotator_labels</th>\n",
" <th>captionID</th>\n",
" <th>gold_label</th>\n",
" <th>pairID</th>\n",
" <th>sentence1</th>\n",
" <th>sentence1_binary_parse</th>\n",
" <th>sentence1_parse</th>\n",
" <th>sentence2</th>\n",
" <th>sentence2_binary_parse</th>\n",
" <th>sentence2_parse</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[neutral, entailment, neutral, neutral, neutral]</td>\n",
" <td>4705552913.jpg#2</td>\n",
" <td>neutral</td>\n",
" <td>4705552913.jpg#2r1n</td>\n",
" <td>Two women are embracing while holding to go pa...</td>\n",
" <td>( ( Two women ) ( ( are ( embracing ( while ( ...</td>\n",
" <td>(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar...</td>\n",
" <td>The sisters are hugging goodbye while holding ...</td>\n",
" <td>( ( The sisters ) ( ( are ( ( hugging goodbye ...</td>\n",
" <td>(ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[entailment, entailment, entailment, entailmen...</td>\n",
" <td>4705552913.jpg#2</td>\n",
" <td>entailment</td>\n",
" <td>4705552913.jpg#2r1e</td>\n",
" <td>Two women are embracing while holding to go pa...</td>\n",
" <td>( ( Two women ) ( ( are ( embracing ( while ( ...</td>\n",
" <td>(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar...</td>\n",
" <td>Two woman are holding packages.</td>\n",
" <td>( ( Two woman ) ( ( are ( holding packages ) )...</td>\n",
" <td>(ROOT (S (NP (CD Two) (NN woman)) (VP (VBP are...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[contradiction, contradiction, contradiction, ...</td>\n",
" <td>4705552913.jpg#2</td>\n",
" <td>contradiction</td>\n",
" <td>4705552913.jpg#2r1c</td>\n",
" <td>Two women are embracing while holding to go pa...</td>\n",
" <td>( ( Two women ) ( ( are ( embracing ( while ( ...</td>\n",
" <td>(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar...</td>\n",
" <td>The men are fighting outside a deli.</td>\n",
" <td>( ( The men ) ( ( are ( fighting ( outside ( a...</td>\n",
" <td>(ROOT (S (NP (DT The) (NNS men)) (VP (VBP are)...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[entailment, entailment, entailment, entailmen...</td>\n",
" <td>2407214681.jpg#0</td>\n",
" <td>entailment</td>\n",
" <td>2407214681.jpg#0r1e</td>\n",
" <td>Two young children in blue jerseys, one with t...</td>\n",
" <td>( ( ( Two ( young children ) ) ( in ( ( ( ( ( ...</td>\n",
" <td>(ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil...</td>\n",
" <td>Two kids in numbered jerseys wash their hands.</td>\n",
" <td>( ( ( Two kids ) ( in ( numbered jerseys ) ) )...</td>\n",
" <td>(ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[neutral, neutral, neutral, entailment, entail...</td>\n",
" <td>2407214681.jpg#0</td>\n",
" <td>neutral</td>\n",
" <td>2407214681.jpg#0r1n</td>\n",
" <td>Two young children in blue jerseys, one with t...</td>\n",
" <td>( ( ( Two ( young children ) ) ( in ( ( ( ( ( ...</td>\n",
" <td>(ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil...</td>\n",
" <td>Two kids at a ballgame wash their hands.</td>\n",
" <td>( ( ( Two kids ) ( at ( a ballgame ) ) ) ( ( w...</td>\n",
" <td>(ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" annotator_labels captionID \\\n",
"0 [neutral, entailment, neutral, neutral, neutral] 4705552913.jpg#2 \n",
"1 [entailment, entailment, entailment, entailmen... 4705552913.jpg#2 \n",
"2 [contradiction, contradiction, contradiction, ... 4705552913.jpg#2 \n",
"3 [entailment, entailment, entailment, entailmen... 2407214681.jpg#0 \n",
"4 [neutral, neutral, neutral, entailment, entail... 2407214681.jpg#0 \n",
"\n",
" gold_label pairID \\\n",
"0 neutral 4705552913.jpg#2r1n \n",
"1 entailment 4705552913.jpg#2r1e \n",
"2 contradiction 4705552913.jpg#2r1c \n",
"3 entailment 2407214681.jpg#0r1e \n",
"4 neutral 2407214681.jpg#0r1n \n",
"\n",
" sentence1 \\\n",
"0 Two women are embracing while holding to go pa... \n",
"1 Two women are embracing while holding to go pa... \n",
"2 Two women are embracing while holding to go pa... \n",
"3 Two young children in blue jerseys, one with t... \n",
"4 Two young children in blue jerseys, one with t... \n",
"\n",
" sentence1_binary_parse \\\n",
"0 ( ( Two women ) ( ( are ( embracing ( while ( ... \n",
"1 ( ( Two women ) ( ( are ( embracing ( while ( ... \n",
"2 ( ( Two women ) ( ( are ( embracing ( while ( ... \n",
"3 ( ( ( Two ( young children ) ) ( in ( ( ( ( ( ... \n",
"4 ( ( ( Two ( young children ) ) ( in ( ( ( ( ( ... \n",
"\n",
" sentence1_parse \\\n",
"0 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n",
"1 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n",
"2 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n",
"3 (ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil... \n",
"4 (ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil... \n",
"\n",
" sentence2 \\\n",
"0 The sisters are hugging goodbye while holding ... \n",
"1 Two woman are holding packages. \n",
"2 The men are fighting outside a deli. \n",
"3 Two kids in numbered jerseys wash their hands. \n",
"4 Two kids at a ballgame wash their hands. \n",
"\n",
" sentence2_binary_parse \\\n",
"0 ( ( The sisters ) ( ( are ( ( hugging goodbye ... \n",
"1 ( ( Two woman ) ( ( are ( holding packages ) )... \n",
"2 ( ( The men ) ( ( are ( fighting ( outside ( a... \n",
"3 ( ( ( Two kids ) ( in ( numbered jerseys ) ) )... \n",
"4 ( ( ( Two kids ) ( at ( a ballgame ) ) ) ( ( w... \n",
"\n",
" sentence2_parse \n",
"0 (ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP ... \n",
"1 (ROOT (S (NP (CD Two) (NN woman)) (VP (VBP are... \n",
"2 (ROOT (S (NP (DT The) (NNS men)) (VP (VBP are)... \n",
"3 (ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ... \n",
"4 (ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ... "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"jsonl_df.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## load azure_ml dataflow object with snli_1.0_dev.txt \n",
"\n",
" P.S : Does not create a dataflow object as expected with jsonl file, cannot read a file when temp dir created"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"\n",
"azureml_dataflow = snli.load_azureml_df(DATA_DIR_PATH) "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>gold_label</th>\n",
" <th>sentence1_binary_parse</th>\n",
" <th>sentence2_binary_parse</th>\n",
" <th>sentence1_parse</th>\n",
" <th>sentence2_parse</th>\n",
" <th>sentence1</th>\n",
" <th>sentence2</th>\n",
" <th>captionID</th>\n",
" <th>pairID</th>\n",
" <th>label1</th>\n",
" <th>label2</th>\n",
" <th>label3</th>\n",
" <th>label4</th>\n",
" <th>label5</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>neutral</td>\n",
" <td>( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...</td>\n",
" <td>( ( A person ) ( ( is ( ( training ( his horse...</td>\n",
" <td>(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...</td>\n",
" <td>(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...</td>\n",
" <td>A person on a horse jumps over a broken down a...</td>\n",
" <td>A person is training his horse for a competition.</td>\n",
" <td>3416050480.jpg#4</td>\n",
" <td>3416050480.jpg#4r1n</td>\n",
" <td>neutral</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>contradiction</td>\n",
" <td>( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...</td>\n",
" <td>( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...</td>\n",
" <td>(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...</td>\n",
" <td>(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...</td>\n",
" <td>A person on a horse jumps over a broken down a...</td>\n",
" <td>A person is at a diner, ordering an omelette.</td>\n",
" <td>3416050480.jpg#4</td>\n",
" <td>3416050480.jpg#4r1c</td>\n",
" <td>contradiction</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>entailment</td>\n",
" <td>( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...</td>\n",
" <td>( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...</td>\n",
" <td>(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...</td>\n",
" <td>(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...</td>\n",
" <td>A person on a horse jumps over a broken down a...</td>\n",
" <td>A person is outdoors, on a horse.</td>\n",
" <td>3416050480.jpg#4</td>\n",
" <td>3416050480.jpg#4r1e</td>\n",
" <td>entailment</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>neutral</td>\n",
" <td>( Children ( ( ( smiling and ) waving ) ( at c...</td>\n",
" <td>( They ( are ( smiling ( at ( their parents ) ...</td>\n",
" <td>(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...</td>\n",
" <td>(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...</td>\n",
" <td>Children smiling and waving at camera</td>\n",
" <td>They are smiling at their parents</td>\n",
" <td>2267923837.jpg#2</td>\n",
" <td>2267923837.jpg#2r1n</td>\n",
" <td>neutral</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>entailment</td>\n",
" <td>( Children ( ( ( smiling and ) waving ) ( at c...</td>\n",
" <td>( There ( ( are children ) present ) )</td>\n",
" <td>(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...</td>\n",
" <td>(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...</td>\n",
" <td>Children smiling and waving at camera</td>\n",
" <td>There are children present</td>\n",
" <td>2267923837.jpg#2</td>\n",
" <td>2267923837.jpg#2r1e</td>\n",
" <td>entailment</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" gold_label sentence1_binary_parse \\\n",
"0 neutral ( ( ( A person ) ( on ( a horse ) ) ) ( ( jump... \n",
"1 contradiction ( ( ( A person ) ( on ( a horse ) ) ) ( ( jump... \n",
"2 entailment ( ( ( A person ) ( on ( a horse ) ) ) ( ( jump... \n",
"3 neutral ( Children ( ( ( smiling and ) waving ) ( at c... \n",
"4 entailment ( Children ( ( ( smiling and ) waving ) ( at c... \n",
"\n",
" sentence2_binary_parse \\\n",
"0 ( ( A person ) ( ( is ( ( training ( his horse... \n",
"1 ( ( A person ) ( ( ( ( is ( at ( a diner ) ) )... \n",
"2 ( ( A person ) ( ( ( ( is outdoors ) , ) ( on ... \n",
"3 ( They ( are ( smiling ( at ( their parents ) ... \n",
"4 ( There ( ( are children ) present ) ) \n",
"\n",
" sentence1_parse \\\n",
"0 (ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o... \n",
"1 (ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o... \n",
"2 (ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o... \n",
"3 (ROOT (NP (S (NP (NNP Children)) (VP (VBG smil... \n",
"4 (ROOT (NP (S (NP (NNP Children)) (VP (VBG smil... \n",
"\n",
" sentence2_parse \\\n",
"0 (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ... \n",
"1 (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ... \n",
"2 (ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ... \n",
"3 (ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB... \n",
"4 (ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN... \n",
"\n",
" sentence1 \\\n",
"0 A person on a horse jumps over a broken down a... \n",
"1 A person on a horse jumps over a broken down a... \n",
"2 A person on a horse jumps over a broken down a... \n",
"3 Children smiling and waving at camera \n",
"4 Children smiling and waving at camera \n",
"\n",
" sentence2 captionID \\\n",
"0 A person is training his horse for a competition. 3416050480.jpg#4 \n",
"1 A person is at a diner, ordering an omelette. 3416050480.jpg#4 \n",
"2 A person is outdoors, on a horse. 3416050480.jpg#4 \n",
"3 They are smiling at their parents 2267923837.jpg#2 \n",
"4 There are children present 2267923837.jpg#2 \n",
"\n",
" pairID label1 label2 label3 label4 label5 \n",
"0 3416050480.jpg#4r1n neutral \n",
"1 3416050480.jpg#4r1c contradiction \n",
"2 3416050480.jpg#4r1e entailment \n",
"3 2267923837.jpg#2r1n neutral \n",
"4 2267923837.jpg#2r1e entailment "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"azureml_dataflow.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Просмотреть файл

@ -1,8 +1,8 @@
import nltk
def nltk_tokenizer(snli_df):
    """Add NLTK word-token columns for both sentences of an SNLI dataframe.

    For each sentence (1 and 2) the text column is tokenized with
    ``nltk.word_tokenize`` and the resulting token lists are stored in
    ``sentence1_tokens`` / ``sentence2_tokens``.

    The sentence text may be stored under either naming used in this
    project: ``sentence_1`` / ``sentence_2`` or ``sentence1`` / ``sentence2``.
    The underscore form is preferred when both are present.

    Args:
        snli_df (pandas.DataFrame): Dataframe holding the two sentence
            columns. It is modified in place.

    Returns:
        pandas.DataFrame: The same dataframe object, with the two token
        columns added (or overwritten).

    Raises:
        KeyError: If neither spelling of a sentence column is present.
    """
    for sentence_no in (1, 2):
        candidates = (
            "sentence_{}".format(sentence_no),
            "sentence{}".format(sentence_no),
        )
        # Pick the first spelling that actually exists in the frame so that a
        # frame using only one naming scheme does not fail on the other.
        source_col = next(
            (name for name in candidates if name in snli_df.columns), None
        )
        if source_col is None:
            raise KeyError(
                "Expected one of the columns {} in the dataframe".format(
                    list(candidates)
                )
            )
        # Series.apply tokenizes value-by-value; unlike a row-wise
        # DataFrame.apply(axis=1) it also yields an empty Series (not an empty
        # DataFrame) when the frame has no rows.
        snli_df["sentence{}_tokens".format(sentence_no)] = snli_df[
            source_col
        ].apply(nltk.word_tokenize)
    return snli_df