Merge pull request #47 from microsoft/maidap-sentence-similarity

Baseline model notebook and embeddings trainer notebook
Said Bleik 2019-05-11 01:09:28 +00:00 committed by GitHub
Parent 9707c35c68 9338f40cdc
Commit 07ca05dd04
No key matching this signature was found
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 2848 additions and 700 deletions

View file

@@ -0,0 +1,898 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Developing Word Embeddings"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Rather than use pre-trained embeddings (as we did in the baseline_deep_dive notebook), we can train word embeddings using our own dataset. In this notebook, we demonstrate the training process for producing word embeddings using the word2vec, GloVe, and fastText models. We'll utilize the STS Benchmark dataset for this task. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Table of Contents\n",
"* [Data Loading and Preprocessing](#Load-and-Preprocess-Data)\n",
"* [Word2Vec](#Word2Vec)\n",
"* [fastText](#fastText)\n",
"* [GloVe](#GloVe)\n",
"* [Concluding Remarks](#Concluding-Remarks)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import gensim\n",
"import sys\n",
"# Set the environment path\n",
"sys.path.append(\"../../\") \n",
"import os\n",
"import numpy as np\n",
"from utils_nlp.dataset.preprocess import (\n",
" to_lowercase,\n",
" to_spacy_tokens,\n",
" rm_spacy_stopwords,\n",
")\n",
"from utils_nlp.dataset import stsbenchmark\n",
"from utils_nlp.common.timer import Timer\n",
"from gensim.models import Word2Vec\n",
"from gensim.models.fasttext import FastText"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Set the path for where your datasets are located\n",
"BASE_DATA_PATH = \"../../data\" \n",
"# Location to save embeddings\n",
"SAVE_FILES_PATH = BASE_DATA_PATH + \"/trained_word_embeddings/\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(SAVE_FILES_PATH):\n",
" os.makedirs(SAVE_FILES_PATH)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and Preprocess Data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Produce a pandas dataframe for the training set\n",
"sts_train = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>score</th>\n",
" <th>sentence1</th>\n",
" <th>sentence2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5.00</td>\n",
" <td>A plane is taking off.</td>\n",
" <td>An air plane is taking off.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3.80</td>\n",
" <td>A man is playing a large flute.</td>\n",
" <td>A man is playing a flute.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.80</td>\n",
" <td>A man is spreading shreded cheese on a pizza.</td>\n",
" <td>A man is spreading shredded cheese on an uncoo...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2.60</td>\n",
" <td>Three men are playing chess.</td>\n",
" <td>Two men are playing chess.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4.25</td>\n",
" <td>A man is playing the cello.</td>\n",
" <td>A man seated is playing the cello.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" score sentence1 \\\n",
"0 5.00 A plane is taking off. \n",
"1 3.80 A man is playing a large flute. \n",
"2 3.80 A man is spreading shreded cheese on a pizza. \n",
"3 2.60 Three men are playing chess. \n",
"4 4.25 A man is playing the cello. \n",
"\n",
" sentence2 \n",
"0 An air plane is taking off. \n",
"1 A man is playing a flute. \n",
"2 A man is spreading shredded cheese on an uncoo... \n",
"3 Two men are playing chess. \n",
"4 A man seated is playing the cello. "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sts_train.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(5749, 3)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check the size of our dataframe\n",
"sts_train.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Training set preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Convert all text to lowercase\n",
"df_low = to_lowercase(sts_train) \n",
"# Tokenize text\n",
"sts_tokenize = to_spacy_tokens(df_low) \n",
"# Tokenize with removal of stopwords\n",
"sts_train_stop = rm_spacy_stopwords(sts_tokenize) "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Append together the two sentence columns to get a list of all tokenized sentences.\n",
"all_sentences = sts_train_stop[[\"sentence1_tokens_rm_stopwords\", \"sentence2_tokens_rm_stopwords\"]]\n",
"# Flatten two columns into one list and remove all sentences that are size 0 after tokenization and stop word removal.\n",
"sentences = [i for i in all_sentences.values.flatten().tolist() if len(i) > 0]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"11492"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(sentences)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Minimum sentence length is 1 tokens\n",
"Maximum sentence length is 43 tokens\n",
"Median sentence length is 6.0 tokens\n"
]
}
],
"source": [
"sentence_lengths = [len(i) for i in sentences]\n",
"print(\"Minimum sentence length is {} tokens\".format(min(sentence_lengths)))\n",
"print(\"Maximum sentence length is {} tokens\".format(max(sentence_lengths)))\n",
"print(\"Median sentence length is {} tokens\".format(np.median(sentence_lengths)))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['plane', 'taking', '.'],\n",
" ['air', 'plane', 'taking', '.'],\n",
" ['man', 'playing', 'large', 'flute', '.'],\n",
" ['man', 'playing', 'flute', '.'],\n",
" ['man', 'spreading', 'shreded', 'cheese', 'pizza', '.'],\n",
" ['man', 'spreading', 'shredded', 'cheese', 'uncooked', 'pizza', '.'],\n",
" ['men', 'playing', 'chess', '.'],\n",
" ['men', 'playing', 'chess', '.'],\n",
" ['man', 'playing', 'cello', '.'],\n",
" ['man', 'seated', 'playing', 'cello', '.']]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sentences[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Word2Vec"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Word2vec is a predictive model for learning word embeddings from text (see [original research paper](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)). Word embeddings are learned such that words that share common contexts in the corpus will be close together in the vector space. There are two different model architectures that can be used to produce word2vec embeddings: continuous bag-of-words (CBOW) or continuous skip-gram. The former uses a window of surrounding words (the \"context\") to predict the current word and the latter uses the current word to predict the surrounding context words. See this [tutorial](https://www.guru99.com/word-embedding-word2vec.html#3) on word2vec for more detailed background on the model."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The gensim Word2Vec model has many different parameters (see [here](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec)) but the ones that are useful to know about are: \n",
"- size: length of the word embedding/vector (defaults to 100)\n",
"- window: maximum distance between the word being predicted and the current word (defaults to 5)\n",
"- min_count: ignores all words that have a frequency lower than this value (defaults to 5)\n",
"- workers: number of worker threads used to train the model (defaults to 3)\n",
"- sg: training algorithm; 1 for skip-gram and 0 for CBOW (defaults to 0)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Set up a Timer to see how long the model takes to train\n",
"t = Timer()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"t.start()\n",
"\n",
"# Train the Word2vec model\n",
"word2vec_model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=3, sg=0)\n",
"\n",
"t.stop()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time elapsed: 0.3874\n"
]
}
],
"source": [
"print(\"Time elapsed: {}\".format(t))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now that the model is trained we can:\n",
"\n",
"1. Query for the word embeddings of a given word. \n",
"2. Inspect the model vocabulary\n",
"3. Save the word embeddings"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Embedding for apple: [ 0.1108162 0.24349137 -0.01440436 0.03533127 -0.06876028 0.07968962\n",
" 0.01578981 0.14264993 -0.06832716 0.00339077 0.07635406 0.06265593\n",
" 0.03414075 0.10075415 -0.05965225 0.00968812 0.16405381 -0.24480335\n",
" -0.06949984 -0.18414594 0.0465034 0.2028756 0.09074208 0.20703372\n",
" 0.1098601 -0.32350177 -0.10786435 0.08799383 -0.19245893 -0.09788057\n",
" 0.09563518 0.08567159 0.15692063 0.08486914 -0.10940372 0.10400604\n",
" 0.03643018 0.15096138 0.12341096 -0.06584675 -0.21533655 -0.01426107\n",
" -0.06800868 -0.03641699 -0.15752348 -0.01934456 0.0068708 -0.06268159\n",
" 0.04240354 -0.06285387 -0.0215644 -0.00047655 -0.0192252 -0.12477098\n",
" -0.08567388 0.08970863 0.07633136 0.21374965 0.19123942 0.01627954\n",
" 0.11209694 0.06009139 -0.03454148 0.0743629 0.03803044 0.059964\n",
" 0.08909379 -0.04600987 0.06926275 -0.09804282 0.02527839 0.16690746\n",
" -0.11900123 -0.0311705 -0.05939943 -0.14164011 0.22661647 0.08943615\n",
" -0.03721635 0.03887443 -0.15312009 0.06582782 0.13990967 0.08372186\n",
" -0.03915371 0.09002874 0.14046906 -0.04060138 0.11289847 0.0010503\n",
" -0.1014872 -0.08762068 -0.19562078 -0.03109288 -0.16293499 -0.00314896\n",
" -0.02791101 0.04398078 0.04605171 -0.08095105]\n",
"\n",
"First 30 vocabulary words: ['plane', 'taking', '.', 'air', 'man', 'playing', 'large', 'flute', 'spreading', 'cheese', 'pizza', 'men', 'seated', 'fighting', 'smoking', 'piano', 'guitar', 'singing', 'woman', 'person']\n"
]
}
],
"source": [
"# 1. Let's see the word embedding for \"apple\" by accessing the \"wv\" attribute and passing in \"apple\" as the key.\n",
"print(\"Embedding for apple:\", word2vec_model.wv[\"apple\"])\n",
"\n",
"# 2. Inspect the model vocabulary by accessing keys of the \"wv.vocab\" attribute. We'll print the first 20 words.\n",
"print(\"\\nFirst 30 vocabulary words:\", list(word2vec_model.wv.vocab)[:20])\n",
"\n",
"# 3. Save the word embeddings. We can save as binary format (to save space) or ASCII format.\n",
"word2vec_model.wv.save_word2vec_format(SAVE_FILES_PATH+\"word2vec_model\", binary=True) # binary format\n",
"word2vec_model.wv.save_word2vec_format(SAVE_FILES_PATH+\"word2vec_model\", binary=False) # ASCII format"
]
},
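{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional sketch, we could also train the skip-gram variant (sg=1) on the same tokenized sentences and compare its nearest neighbours with the CBOW model's using gensim's `most_similar` method. On a corpus this small the neighbours will be noisy, so treat the output as illustrative only."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: train a skip-gram variant on the same tokenized sentences\n",
"word2vec_sg_model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=3, sg=1)\n",
"\n",
"# Compare nearest neighbours for a word we know is in the vocabulary\n",
"print(\"CBOW neighbours of 'man':\", word2vec_model.wv.most_similar(\"man\", topn=5))\n",
"print(\"Skip-gram neighbours of 'man':\", word2vec_sg_model.wv.most_similar(\"man\", topn=5))"
]
},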
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## fastText"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"fastText is an unsupervised algorithm created by Facebook Research for efficiently learning word embeddings (see [original research paper](https://arxiv.org/pdf/1607.04606.pdf)). fastText is significantly different than word2vec or GloVe in that these two algorithms treat each word as the smallest possible unit to find an embedding for. Conversely, fastText assumes that words are formed by an n-gram of characters (i.e. 2-grams of the word \"language\" would be {la, an, ng, gu, ua, ag, ge}). The embedding for a word is then composed of the sum of these character n-grams. This has advantages when finding word embeddings for rare words and words not present in the dictionary, as these words can still be broken down into character n-grams. Typically, for smaller datasets, fastText performs better than word2vec or GloVe. See this [tutorial](https://fasttext.cc/docs/en/unsupervised-tutorial.html) on fastText for more detail."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The gensim fastText model has many different parameters (see [here](https://radimrehurek.com/gensim/models/fasttext.html#gensim.models.fasttext.FastText)) but the ones that are useful to know about are: \n",
"- size: length of the word embedding/vector (defaults to 100)\n",
"- window: maximum distance between the word being predicted and the current word (defaults to 5)\n",
"- min_count: ignores all words that have a frequency lower than this value (defaults to 5)\n",
"- workers: number of worker threads used to train the model (defaults to 3)\n",
"- sg: training algorithm- 1 for skip-gram and 0 for CBOW (defaults to 0)\n",
"- iter: number of epochs (defaults to 5)\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Set up a Timer to see how long the model takes to train\n",
"t = Timer()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"t.start()\n",
"\n",
"# Train the FastText model\n",
"fastText_model = FastText(size=100, window=5, min_count=5, sentences=sentences, iter=5)\n",
"\n",
"t.stop()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time elapsed: 10.4061\n"
]
}
],
"source": [
"print(\"Time elapsed: {}\".format(t))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can utilize the same attributes as we saw above for word2vec due to them both originating from the gensim package"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Embedding for apple: [ 0.24594913 0.0478383 0.576843 -0.14472146 -0.13372016 0.3994271\n",
" -0.18761183 -0.10253572 -0.5489808 0.3115678 0.18665203 0.08805989\n",
" 0.565551 0.26285723 0.01494028 0.39692047 -0.39978772 -0.30473194\n",
" 0.05508447 0.10066988 0.20679028 0.30775183 0.0472638 -0.239493\n",
" 0.12949444 -0.20410636 -0.13940431 -0.03945793 0.4396631 -0.08924853\n",
" 0.08834386 -0.22228362 0.28431413 0.18899629 0.3427995 -0.2114068\n",
" -0.01075403 0.8549923 0.09068774 -0.04244559 -0.22046468 0.06916029\n",
" -0.31791446 0.11447909 -0.05693823 0.10290135 -0.09406947 -0.26463747\n",
" -0.17336299 0.07076416 -0.26909345 0.1761348 0.14077482 0.24621071\n",
" -0.0408617 -0.3031526 0.10244257 0.4772046 0.25927255 -0.02917116\n",
" 0.2211562 0.04355185 0.19956268 0.13878216 0.28868207 -0.5039835\n",
" 0.41010958 0.07107946 -0.09606131 -0.22969621 0.05883528 -0.01241339\n",
" 0.00676485 0.311163 0.08247512 -0.13799056 0.15181121 0.08045118\n",
" -0.06654785 0.04279696 0.532607 0.2505259 0.10194286 0.05519621\n",
" -0.451315 -0.24121635 0.10120259 0.36105216 0.47429752 0.4230102\n",
" -0.07235575 -0.16397384 0.28193682 -0.21931437 -0.16088559 -0.03915804\n",
" 0.41476008 -0.03525754 0.34007013 -0.152273 ]\n",
"\n",
"First 30 vocabulary words: ['plane', 'taking', '.', 'air', 'man', 'playing', 'large', 'flute', 'spreading', 'cheese', 'pizza', 'men', 'seated', 'fighting', 'smoking', 'piano', 'guitar', 'singing', 'woman', 'person']\n"
]
}
],
"source": [
"# 1. Let's see the word embedding for \"apple\" by accessing the \"wv\" attribute and passing in \"apple\" as the key.\n",
"print(\"Embedding for apple:\", fastText_model.wv[\"apple\"])\n",
"\n",
"# 2. Inspect the model vocabulary by accessing keys of the \"wv.vocab\" attribute. We'll print the first 20 words.\n",
"print(\"\\nFirst 30 vocabulary words:\", list(fastText_model.wv.vocab)[:20])\n",
"\n",
"# 3. Save the word embeddings. We can save as binary format (to save space) or ASCII format.\n",
"fastText_model.wv.save_word2vec_format(SAVE_FILES_PATH+\"fastText_model\", binary=True) # binary format\n",
"fastText_model.wv.save_word2vec_format(SAVE_FILES_PATH+\"fastText_model\", binary=False) # ASCII format"
]
},
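{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because fastText composes word vectors from character n-grams, it can usually produce an embedding even for a word that never appeared in the training corpus, whereas the word2vec model trained above has no vector for such a word. The cell below is a minimal sketch of this, using \"pineapple\" as an (assumed) out-of-vocabulary word."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# \"pineapple\" is assumed to be absent from the training vocabulary\n",
"oov_word = \"pineapple\"\n",
"print(oov_word in fastText_model.wv.vocab)  # expected: False\n",
"\n",
"# fastText can still compose a vector from the word's character n-grams\n",
"print(\"fastText embedding shape:\", fastText_model.wv[oov_word].shape)\n",
"\n",
"# word2vec has no subword information, so an out-of-vocabulary lookup fails\n",
"try:\n",
"    word2vec_model.wv[oov_word]\n",
"except KeyError:\n",
"    print(\"word2vec has no vector for\", oov_word)"
]
},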
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## GloVe"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"GloVe is an unsupervised algorithm for obtaining word embeddings created by the Stanford NLP group (see [original research paper](https://nlp.stanford.edu/pubs/glove.pdf)). Training occurs on word-word co-occurrence statistics with the objective of learning word embeddings such that the dot product of two words' embeddings is equal to the words' probability of co-occurrence. See this [tutorial](https://nlp.stanford.edu/projects/glove/) on GloVe for more detailed background on the model. \n",
"\n",
"Gensim doesn't have an implementation of the GloVe model and the other python packages that implement GloVe are unstable, so we suggest getting the code directly from the Stanford NLP [repo](https://github.com/stanfordnlp/GloVe). Run the following commands to clone the repo and then make. Clone the repo in the same location as this notebook! Otherwise, the paths below will need to be modified. \n",
"\n",
" git clone http://github.com/stanfordnlp/glove \n",
" cd glove && make "
]
},
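{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the weighted least-squares objective minimized by GloVe (as given in the original paper) is\n",
"\n",
"$$ J = \\sum_{i,j=1}^{V} f(X_{ij}) \\left( w_i^\\top \\tilde{w}_j + b_i + \\tilde{b}_j - \\log X_{ij} \\right)^2 $$\n",
"\n",
"where $X_{ij}$ counts how often word $j$ occurs in the context of word $i$, $w_i$ and $\\tilde{w}_j$ are the word and context vectors, $b_i$ and $\\tilde{b}_j$ are bias terms, and $f$ is a weighting function that caps the influence of very frequent co-occurrences (the x-max parameter used in the training command below)."
]
},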
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train GloVe vectors"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Training GloVe embeddings requires some data prep and then 4 steps (also documented in the original Stanford NLP repo [here](https://github.com/stanfordnlp/GloVe/tree/master/src))."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Step 0: Prepare Data**\n",
" \n",
"In order to train our GloVe vectors, we first need to save our corpus as a text file with all words separated by 1+ spaces or tabs. Each document/sentence is separated by a new line character."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Save our corpus as tokens delimited by spaces with new line characters in between sentences.\n",
"with open(BASE_DATA_PATH+'/clean/stsbenchmark/training-corpus-cleaned.txt', 'w', encoding='utf8') as file:\n",
" for sent in sentences:\n",
" file.write(\" \".join(sent) + \"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# Set up a Timer to see how long the model takes to train\n",
"t = Timer()\n",
"t.start()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Step 1: Build Vocabulary**\n",
"\n",
"Run the vocab_count executable. There are 3 optional parameters:\n",
"1. min-count: lower limit on how many times a word must appear in dataset. Otherwise the word is discarded from our vocabulary.\n",
"2. max-vocab: upper bound on the number of vocabulary words to keep\n",
"3. verbose: 0, 1, or 2 (default)\n",
"\n",
"Then provide the path to the text file we created in Step 0 followed by a file path that we'll save the vocabulary to "
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"BUILDING VOCABULARY\n",
"Processed 0 tokens.Processed 84997 tokens.\n",
"Counted 11716 unique words.\n",
"Truncating vocabulary at min count 5.\n",
"Using vocabulary of size 2943.\n",
"\n"
]
}
],
"source": [
"!\"glove/build/vocab_count\" -min-count 5 -verbose 2 <\"../../data/clean/stsbenchmark/training-corpus-cleaned.txt\"> \"../../data/trained_word_embeddings/vocab.txt\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Step 2: Construct Word Co-occurrence Statistics**\n",
"\n",
"Run the cooccur executable. There are many optional parameters, but we list the top ones here:\n",
"1. symmetric: 0 for only looking at left context, 1 (default) for looking at both left and right context\n",
"2. window-size: number of context words to use (default 15)\n",
"3. verbose: 0, 1, or 2 (default)\n",
"4. vocab-file: path/name of the vocabulary file created in Step 1\n",
"5. memory: soft limit for memory consumption, default 4\n",
"6. max-product: limit the size of dense co-occurrence array by specifying the max product (integer) of the frequency counts of the two co-occurring words\n",
"\n",
"Then provide the path to the text file we created in Step 0 followed by a file path that we'll save the co-occurrences to"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"COUNTING COOCCURRENCES\n",
"window size: 15\n",
"context: symmetric\n",
"max product: 13752509\n",
"overflow length: 38028356\n",
"Reading vocab from file \"../../data/trained_word_embeddings/vocab.txt\"...loaded 2943 words.\n",
"Building lookup table...table contains 8661250 elements.\n",
"Processing token: 0Processed 84997 tokens.\n",
"Writing cooccurrences to disk......2 files in total.\n",
"Merging cooccurrence files: processed 0 lines.0 lines.100000 lines.Merging cooccurrence files: processed 187717 lines.\n",
"\n"
]
}
],
"source": [
"!\"glove/build/cooccur\" -memory 4 -vocab-file \"../../data/trained_word_embeddings/vocab.txt\" -verbose 2 -window-size 15 <\"../../data/clean/stsbenchmark/training-corpus-cleaned.txt\"> \"../../data/trained_word_embeddings/cooccurrence.bin\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Step 3: Shuffle the Co-occurrences**\n",
"\n",
"Run the shuffle executable. The parameters are as follows:\n",
"1. verbose: 0, 1, or 2 (default)\n",
"2. memory: soft limit for memory consumption, default 4\n",
"3. array-size: limit to the length of the buffer which stores chunks of data to shuffle before writing to disk\n",
"\n",
"Then provide the path to the co-occurrence file we created in Step 2 followed by a file path that we'll save the shuffled co-occurrences to"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"SHUFFLING COOCCURRENCES\n",
"array size: 255013683\n",
"Shuffling by chunks: processed 0 lines.processed 187717 lines.\n",
"Wrote 1 temporary file(s).\n",
"Merging temp files: processed 0 lines.187717 lines.Merging temp files: processed 187717 lines.\n",
"\n"
]
}
],
"source": [
"!\"glove/build/shuffle\" -memory 4 -verbose 2 <\"../../data/trained_word_embeddings/cooccurrence.bin\"> \"../../data/trained_word_embeddings/cooccurrence.shuf.bin\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Step 4: Train GloVe model**\n",
"\n",
"Run the glove executable. There are many parameter options, but the top ones are listed below:\n",
"1. verbose: 0, 1, or 2 (default)\n",
"2. vector-size: dimension of word embeddings (50 is default)\n",
"3. threads: number threads, default 8\n",
"4. iter: number of iterations, default 25\n",
"5. eta: learning rate, default 0.05\n",
"6. binary: whether to save binary format (0: text = default, 1: binary, 2: both)\n",
"7. x-max: cutoff for weighting function, default is 100\n",
"8. vocab-file: file containing vocabulary as produced in Step 1\n",
"9. save-file: filename to save vectors to \n",
"10. input-file: filename with co-occurrences as returned from Step 3"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"TRAINING MODEL\n",
"Read 187717 lines.\n",
"Initializing parameters...done.\n",
"vector size: 50\n",
"vocab size: 2943\n",
"x_max: 10.000000\n",
"alpha: 0.750000\n",
"05/09/19 - 03:10.13PM, iter: 001, cost: 0.078329\n",
"05/09/19 - 03:10.13PM, iter: 002, cost: 0.072090\n",
"05/09/19 - 03:10.13PM, iter: 003, cost: 0.070081\n",
"05/09/19 - 03:10.13PM, iter: 004, cost: 0.067171\n",
"05/09/19 - 03:10.13PM, iter: 005, cost: 0.063501\n",
"05/09/19 - 03:10.13PM, iter: 006, cost: 0.060700\n",
"05/09/19 - 03:10.13PM, iter: 007, cost: 0.058092\n",
"05/09/19 - 03:10.13PM, iter: 008, cost: 0.056080\n",
"05/09/19 - 03:10.13PM, iter: 009, cost: 0.054016\n",
"05/09/19 - 03:10.13PM, iter: 010, cost: 0.051806\n",
"05/09/19 - 03:10.13PM, iter: 011, cost: 0.049565\n",
"05/09/19 - 03:10.13PM, iter: 012, cost: 0.047378\n",
"05/09/19 - 03:10.13PM, iter: 013, cost: 0.045232\n",
"05/09/19 - 03:10.13PM, iter: 014, cost: 0.043136\n",
"05/09/19 - 03:10.13PM, iter: 015, cost: 0.041132\n"
]
}
],
"source": [
"!\"glove/build/glove\" -save-file \"../../data/trained_word_embeddings/GloVe_vectors\" -threads 8 -input-file \\\n",
"\"../../data/trained_word_embeddings/cooccurrence.shuf.bin\" -x-max 10 -iter 15 -vector-size 50 -binary 2 \\\n",
"-vocab-file \"../../data/trained_word_embeddings/vocab.txt\" -verbose 2"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"t.stop()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Time elapsed: 8.1586\n"
]
}
],
"source": [
"print(\"Time elapsed: {}\".format(t))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Inspect Word Vectors"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Like we did above for the word2vec and fastText models, let's now inspect our word embeddings"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"#load in the saved word vectors.\n",
"glove_wv = {}\n",
"with open(\"../../data/trained_word_embeddings/GloVe_vectors.txt\", encoding='utf-8') as f:\n",
" for line in f:\n",
" split_line = line.split(\" \")\n",
" glove_wv[split_line[0]] = [float(i) for i in split_line[1:]]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Embedding for apple: [0.123773, -0.053006, 0.070493, 0.108794, 0.056317, -0.121031, 0.031882, 0.036723, -0.080099, 0.070415, -0.049969, 0.13519, 0.02835, 0.077195, 0.038348, -0.07014, 0.064163, -0.073477, 0.054575, 0.000798, 0.144856, 0.129294, 0.088421, 0.098318, -0.208831, 0.003972, 0.043487, 0.098745, -0.135213, -0.080192, 0.033854, -0.092947, -0.086098, 0.063487, -0.003857, -0.040265, 0.006533, -0.028026, -0.0315, -0.046298, 0.053757, -0.038117, 0.008664, -0.141584, 0.105524, 0.106604, -0.102875, 0.062868, -0.185542, -0.002386]\n",
"\n",
"First 30 vocabulary words: ['.', ',', 'man', '-', 'woman', \"'\", 'said', 'dog', '\"', 'playing', ':', 'white', 'black', '$', 'killed', 'percent', 'new', 'syria', 'people', 'china']\n"
]
}
],
"source": [
"# 1. Let's see the word embedding for \"apple\" by passing in \"apple\" as the key.\n",
"print(\"Embedding for apple:\", glove_wv[\"apple\"])\n",
"\n",
"# 2. Inspect the model vocabulary by accessing keys of the \"wv.vocab\" attribute. We'll print the first 20 words.\n",
"print(\"\\nFirst 30 vocabulary words:\", list(glove_wv.keys())[:20])"
]
},
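{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternatively, the GloVe output can be loaded through gensim by first converting it to word2vec text format, which then gives us the same `most_similar`-style queries used earlier. This is a sketch that assumes the gensim version in use still ships the `glove2word2vec` conversion script and that the file paths from the training step above are unchanged."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: convert the GloVe output to word2vec text format and load it with gensim\n",
"from gensim.scripts.glove2word2vec import glove2word2vec\n",
"from gensim.models import KeyedVectors\n",
"\n",
"glove_input_path = SAVE_FILES_PATH + \"GloVe_vectors.txt\"\n",
"w2v_output_path = SAVE_FILES_PATH + \"GloVe_vectors_w2v.txt\"\n",
"\n",
"# Prepend the \"<vocab size> <dimensions>\" header line that the word2vec text format expects\n",
"glove2word2vec(glove_input_path, w2v_output_path)\n",
"\n",
"glove_kv = KeyedVectors.load_word2vec_format(w2v_output_path)\n",
"print(glove_kv.most_similar(\"man\", topn=5))"
]
},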
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Concluding Remarks\n",
"\n",
"In this notebook we have shown how to train word2vec, GloVe, and fastText word embeddings on the STS Benchmark dataset. We also inspected how long each model took to train on our dataset: word2vec took 0.39 seconds, GloVe took 8.16 seconds, and fastText took 10.41 seconds.\n",
"\n",
"FastText is typically regarded as the best baseline for word embeddings (see [blog](https://medium.com/huggingface/universal-word-sentence-embeddings-ce48ddc8fc3a)) and is a good place to start when generating word embeddings. Now that we generated word embeddings on our dataset, we could also repeat the baseline_deep_dive notebook using these embeddings (versus the pre-trained ones from the internet). "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@@ -38,7 +38,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\n"
"System version: 3.6.8 |Anaconda, Inc.| (default, Dec 29 2018, 19:04:46) \n",
"[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]\n"
]
}
],
@@ -90,16 +91,13 @@
"# defaults to txt\n",
"train = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n",
"\n",
"#load dataframe from jsonl file format\n",
"dev = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\", file_type=\"jsonl\")\n",
"\n",
"#specify txt format \n",
"test = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"test\", file_type=\"txt\")\n"
"# or, load dataframe from jsonl\n",
"dev = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\", file_type=\"jsonl\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -280,7 +278,7 @@
"4 2267923837.jpg#2r1e entailment NaN NaN NaN NaN "
]
},
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -289,422 +287,44 @@
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>annotator_labels</th>\n",
" <th>captionID</th>\n",
" <th>gold_label</th>\n",
" <th>pairID</th>\n",
" <th>sentence1</th>\n",
" <th>sentence1_binary_parse</th>\n",
" <th>sentence1_parse</th>\n",
" <th>sentence2</th>\n",
" <th>sentence2_binary_parse</th>\n",
" <th>sentence2_parse</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[neutral, entailment, neutral, neutral, neutral]</td>\n",
" <td>4705552913.jpg#2</td>\n",
" <td>neutral</td>\n",
" <td>4705552913.jpg#2r1n</td>\n",
" <td>Two women are embracing while holding to go pa...</td>\n",
" <td>( ( Two women ) ( ( are ( embracing ( while ( ...</td>\n",
" <td>(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar...</td>\n",
" <td>The sisters are hugging goodbye while holding ...</td>\n",
" <td>( ( The sisters ) ( ( are ( ( hugging goodbye ...</td>\n",
" <td>(ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[entailment, entailment, entailment, entailmen...</td>\n",
" <td>4705552913.jpg#2</td>\n",
" <td>entailment</td>\n",
" <td>4705552913.jpg#2r1e</td>\n",
" <td>Two women are embracing while holding to go pa...</td>\n",
" <td>( ( Two women ) ( ( are ( embracing ( while ( ...</td>\n",
" <td>(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar...</td>\n",
" <td>Two woman are holding packages.</td>\n",
" <td>( ( Two woman ) ( ( are ( holding packages ) )...</td>\n",
" <td>(ROOT (S (NP (CD Two) (NN woman)) (VP (VBP are...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[contradiction, contradiction, contradiction, ...</td>\n",
" <td>4705552913.jpg#2</td>\n",
" <td>contradiction</td>\n",
" <td>4705552913.jpg#2r1c</td>\n",
" <td>Two women are embracing while holding to go pa...</td>\n",
" <td>( ( Two women ) ( ( are ( embracing ( while ( ...</td>\n",
" <td>(ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar...</td>\n",
" <td>The men are fighting outside a deli.</td>\n",
" <td>( ( The men ) ( ( are ( fighting ( outside ( a...</td>\n",
" <td>(ROOT (S (NP (DT The) (NNS men)) (VP (VBP are)...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[entailment, entailment, entailment, entailmen...</td>\n",
" <td>2407214681.jpg#0</td>\n",
" <td>entailment</td>\n",
" <td>2407214681.jpg#0r1e</td>\n",
" <td>Two young children in blue jerseys, one with t...</td>\n",
" <td>( ( ( Two ( young children ) ) ( in ( ( ( ( ( ...</td>\n",
" <td>(ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil...</td>\n",
" <td>Two kids in numbered jerseys wash their hands.</td>\n",
" <td>( ( ( Two kids ) ( in ( numbered jerseys ) ) )...</td>\n",
" <td>(ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[neutral, neutral, neutral, entailment, entail...</td>\n",
" <td>2407214681.jpg#0</td>\n",
" <td>neutral</td>\n",
" <td>2407214681.jpg#0r1n</td>\n",
" <td>Two young children in blue jerseys, one with t...</td>\n",
" <td>( ( ( Two ( young children ) ) ( in ( ( ( ( ( ...</td>\n",
" <td>(ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil...</td>\n",
" <td>Two kids at a ballgame wash their hands.</td>\n",
" <td>( ( ( Two kids ) ( at ( a ballgame ) ) ) ( ( w...</td>\n",
" <td>(ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" annotator_labels captionID \\\n",
"0 [neutral, entailment, neutral, neutral, neutral] 4705552913.jpg#2 \n",
"1 [entailment, entailment, entailment, entailmen... 4705552913.jpg#2 \n",
"2 [contradiction, contradiction, contradiction, ... 4705552913.jpg#2 \n",
"3 [entailment, entailment, entailment, entailmen... 2407214681.jpg#0 \n",
"4 [neutral, neutral, neutral, entailment, entail... 2407214681.jpg#0 \n",
"\n",
" gold_label pairID \\\n",
"0 neutral 4705552913.jpg#2r1n \n",
"1 entailment 4705552913.jpg#2r1e \n",
"2 contradiction 4705552913.jpg#2r1c \n",
"3 entailment 2407214681.jpg#0r1e \n",
"4 neutral 2407214681.jpg#0r1n \n",
"\n",
" sentence1 \\\n",
"0 Two women are embracing while holding to go pa... \n",
"1 Two women are embracing while holding to go pa... \n",
"2 Two women are embracing while holding to go pa... \n",
"3 Two young children in blue jerseys, one with t... \n",
"4 Two young children in blue jerseys, one with t... \n",
"\n",
" sentence1_binary_parse \\\n",
"0 ( ( Two women ) ( ( are ( embracing ( while ( ... \n",
"1 ( ( Two women ) ( ( are ( embracing ( while ( ... \n",
"2 ( ( Two women ) ( ( are ( embracing ( while ( ... \n",
"3 ( ( ( Two ( young children ) ) ( in ( ( ( ( ( ... \n",
"4 ( ( ( Two ( young children ) ) ( in ( ( ( ( ( ... \n",
"\n",
" sentence1_parse \\\n",
"0 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n",
"1 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n",
"2 (ROOT (S (NP (CD Two) (NNS women)) (VP (VBP ar... \n",
"3 (ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil... \n",
"4 (ROOT (S (NP (NP (CD Two) (JJ young) (NNS chil... \n",
"\n",
" sentence2 \\\n",
"0 The sisters are hugging goodbye while holding ... \n",
"1 Two woman are holding packages. \n",
"2 The men are fighting outside a deli. \n",
"3 Two kids in numbered jerseys wash their hands. \n",
"4 Two kids at a ballgame wash their hands. \n",
"\n",
" sentence2_binary_parse \\\n",
"0 ( ( The sisters ) ( ( are ( ( hugging goodbye ... \n",
"1 ( ( Two woman ) ( ( are ( holding packages ) )... \n",
"2 ( ( The men ) ( ( are ( fighting ( outside ( a... \n",
"3 ( ( ( Two kids ) ( in ( numbered jerseys ) ) )... \n",
"4 ( ( ( Two kids ) ( at ( a ballgame ) ) ) ( ( w... \n",
"\n",
" sentence2_parse \n",
"0 (ROOT (S (NP (DT The) (NNS sisters)) (VP (VBP ... \n",
"1 (ROOT (S (NP (CD Two) (NN woman)) (VP (VBP are... \n",
"2 (ROOT (S (NP (DT The) (NNS men)) (VP (VBP are)... \n",
"3 (ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ... \n",
"4 (ROOT (S (NP (NP (CD Two) (NNS kids)) (PP (IN ... "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dev.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>gold_label</th>\n",
" <th>sentence1_binary_parse</th>\n",
" <th>sentence2_binary_parse</th>\n",
" <th>sentence1_parse</th>\n",
" <th>sentence2_parse</th>\n",
" <th>sentence1</th>\n",
" <th>sentence2</th>\n",
" <th>captionID</th>\n",
" <th>pairID</th>\n",
" <th>label1</th>\n",
" <th>label2</th>\n",
" <th>label3</th>\n",
" <th>label4</th>\n",
" <th>label5</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>neutral</td>\n",
" <td>( ( This ( church choir ) ) ( ( ( sings ( to (...</td>\n",
" <td>( ( The church ) ( ( has ( cracks ( in ( the c...</td>\n",
" <td>(ROOT (S (NP (DT This) (NN church) (NN choir))...</td>\n",
" <td>(ROOT (S (NP (DT The) (NN church)) (VP (VBZ ha...</td>\n",
" <td>This church choir sings to the masses as they ...</td>\n",
" <td>The church has cracks in the ceiling.</td>\n",
" <td>2677109430.jpg#1</td>\n",
" <td>2677109430.jpg#1r1n</td>\n",
" <td>neutral</td>\n",
" <td>contradiction</td>\n",
" <td>contradiction</td>\n",
" <td>neutral</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>entailment</td>\n",
" <td>( ( This ( church choir ) ) ( ( ( sings ( to (...</td>\n",
" <td>( ( The church ) ( ( is ( filled ( with song )...</td>\n",
" <td>(ROOT (S (NP (DT This) (NN church) (NN choir))...</td>\n",
" <td>(ROOT (S (NP (DT The) (NN church)) (VP (VBZ is...</td>\n",
" <td>This church choir sings to the masses as they ...</td>\n",
" <td>The church is filled with song.</td>\n",
" <td>2677109430.jpg#1</td>\n",
" <td>2677109430.jpg#1r1e</td>\n",
" <td>entailment</td>\n",
" <td>entailment</td>\n",
" <td>entailment</td>\n",
" <td>neutral</td>\n",
" <td>entailment</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>contradiction</td>\n",
" <td>( ( This ( church choir ) ) ( ( ( sings ( to (...</td>\n",
" <td>( ( ( A choir ) ( singing ( at ( a ( baseball ...</td>\n",
" <td>(ROOT (S (NP (DT This) (NN church) (NN choir))...</td>\n",
" <td>(ROOT (NP (NP (DT A) (NN choir)) (VP (VBG sing...</td>\n",
" <td>This church choir sings to the masses as they ...</td>\n",
" <td>A choir singing at a baseball game.</td>\n",
" <td>2677109430.jpg#1</td>\n",
" <td>2677109430.jpg#1r1c</td>\n",
" <td>contradiction</td>\n",
" <td>contradiction</td>\n",
" <td>contradiction</td>\n",
" <td>contradiction</td>\n",
" <td>contradiction</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>neutral</td>\n",
" <td>( ( ( A woman ) ( with ( ( ( ( ( a ( green hea...</td>\n",
" <td>( ( The woman ) ( ( is young ) . ) )</td>\n",
" <td>(ROOT (NP (NP (DT A) (NN woman)) (PP (IN with)...</td>\n",
" <td>(ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is)...</td>\n",
" <td>A woman with a green headscarf, blue shirt and...</td>\n",
" <td>The woman is young.</td>\n",
" <td>6160193920.jpg#4</td>\n",
" <td>6160193920.jpg#4r1n</td>\n",
" <td>neutral</td>\n",
" <td>neutral</td>\n",
" <td>neutral</td>\n",
" <td>neutral</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>entailment</td>\n",
" <td>( ( ( A woman ) ( with ( ( ( ( ( a ( green hea...</td>\n",
" <td>( ( The woman ) ( ( is ( very happy ) ) . ) )</td>\n",
" <td>(ROOT (NP (NP (DT A) (NN woman)) (PP (IN with)...</td>\n",
" <td>(ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is)...</td>\n",
" <td>A woman with a green headscarf, blue shirt and...</td>\n",
" <td>The woman is very happy.</td>\n",
" <td>6160193920.jpg#4</td>\n",
" <td>6160193920.jpg#4r1e</td>\n",
" <td>entailment</td>\n",
" <td>entailment</td>\n",
" <td>contradiction</td>\n",
" <td>entailment</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" gold_label sentence1_binary_parse \\\n",
"0 neutral ( ( This ( church choir ) ) ( ( ( sings ( to (... \n",
"1 entailment ( ( This ( church choir ) ) ( ( ( sings ( to (... \n",
"2 contradiction ( ( This ( church choir ) ) ( ( ( sings ( to (... \n",
"3 neutral ( ( ( A woman ) ( with ( ( ( ( ( a ( green hea... \n",
"4 entailment ( ( ( A woman ) ( with ( ( ( ( ( a ( green hea... \n",
"\n",
" sentence2_binary_parse \\\n",
"0 ( ( The church ) ( ( has ( cracks ( in ( the c... \n",
"1 ( ( The church ) ( ( is ( filled ( with song )... \n",
"2 ( ( ( A choir ) ( singing ( at ( a ( baseball ... \n",
"3 ( ( The woman ) ( ( is young ) . ) ) \n",
"4 ( ( The woman ) ( ( is ( very happy ) ) . ) ) \n",
"\n",
" sentence1_parse \\\n",
"0 (ROOT (S (NP (DT This) (NN church) (NN choir))... \n",
"1 (ROOT (S (NP (DT This) (NN church) (NN choir))... \n",
"2 (ROOT (S (NP (DT This) (NN church) (NN choir))... \n",
"3 (ROOT (NP (NP (DT A) (NN woman)) (PP (IN with)... \n",
"4 (ROOT (NP (NP (DT A) (NN woman)) (PP (IN with)... \n",
"\n",
" sentence2_parse \\\n",
"0 (ROOT (S (NP (DT The) (NN church)) (VP (VBZ ha... \n",
"1 (ROOT (S (NP (DT The) (NN church)) (VP (VBZ is... \n",
"2 (ROOT (NP (NP (DT A) (NN choir)) (VP (VBG sing... \n",
"3 (ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is)... \n",
"4 (ROOT (S (NP (DT The) (NN woman)) (VP (VBZ is)... \n",
"\n",
" sentence1 \\\n",
"0 This church choir sings to the masses as they ... \n",
"1 This church choir sings to the masses as they ... \n",
"2 This church choir sings to the masses as they ... \n",
"3 A woman with a green headscarf, blue shirt and... \n",
"4 A woman with a green headscarf, blue shirt and... \n",
"\n",
" sentence2 captionID \\\n",
"0 The church has cracks in the ceiling. 2677109430.jpg#1 \n",
"1 The church is filled with song. 2677109430.jpg#1 \n",
"2 A choir singing at a baseball game. 2677109430.jpg#1 \n",
"3 The woman is young. 6160193920.jpg#4 \n",
"4 The woman is very happy. 6160193920.jpg#4 \n",
"\n",
" pairID label1 label2 label3 \\\n",
"0 2677109430.jpg#1r1n neutral contradiction contradiction \n",
"1 2677109430.jpg#1r1e entailment entailment entailment \n",
"2 2677109430.jpg#1r1c contradiction contradiction contradiction \n",
"3 6160193920.jpg#4r1n neutral neutral neutral \n",
"4 6160193920.jpg#4r1e entailment entailment contradiction \n",
"\n",
" label4 label5 \n",
"0 neutral neutral \n",
"1 neutral entailment \n",
"2 contradiction contradiction \n",
"3 neutral neutral \n",
"4 entailment neutral "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 02 Tokenize\n",
"\n",
"We have loaded the dataset into pandas.DataFrame, we now convert sentences to tokens.\n",
"Now that we've loaded the data into a pandas.DataFrame, we can tokenize the sentences.\n",
"We also clean the data before tokenizing. This includes dropping unneccessary columns and renaming the relevant columns as score, sentence_1, and sentence_2."
]
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def clean(df, file_split):\n",
" src_file_path = os.path.join(BASE_DATA_PATH, \"raw/snli_1.0/snli_1.0_{}.txt\".format(file_split))\n",
" return snli.clean_snli(src_file_path).dropna() # drop rows with any NaN vals"
" if not os.path.exists(os.path.join(BASE_DATA_PATH, \"clean/snli_1.0\")):\n",
" os.makedirs(os.path.join(BASE_DATA_PATH, \"clean/snli_1.0\"))\n",
" dest_file_path = os.path.join(BASE_DATA_PATH, \"clean/snli_1.0/snli_1.0_{}.txt\".format(file_split))\n",
" clean_df = snli.clean_snli(src_file_path).dropna() # drop rows with any NaN vals\n",
" clean_df.to_csv(dest_file_path)\n",
" return clean_df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"train = clean(train, 'train')\n",
"dev = clean(dev, 'dev')\n",
"test = clean(test, 'test')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Glimpse of the data"
"train = clean(train, 'train')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -784,7 +404,7 @@
"4 There are children present "
]
},
"execution_count": 11,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -793,225 +413,34 @@
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>score</th>\n",
" <th>sentence1</th>\n",
" <th>sentence2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>neutral</td>\n",
" <td>Two women are embracing while holding to go pa...</td>\n",
" <td>The sisters are hugging goodbye while holding ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>entailment</td>\n",
" <td>Two women are embracing while holding to go pa...</td>\n",
" <td>Two woman are holding packages.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>contradiction</td>\n",
" <td>Two women are embracing while holding to go pa...</td>\n",
" <td>The men are fighting outside a deli.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>entailment</td>\n",
" <td>Two young children in blue jerseys, one with t...</td>\n",
" <td>Two kids in numbered jerseys wash their hands.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>neutral</td>\n",
" <td>Two young children in blue jerseys, one with t...</td>\n",
" <td>Two kids at a ballgame wash their hands.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" score sentence1 \\\n",
"0 neutral Two women are embracing while holding to go pa... \n",
"1 entailment Two women are embracing while holding to go pa... \n",
"2 contradiction Two women are embracing while holding to go pa... \n",
"3 entailment Two young children in blue jerseys, one with t... \n",
"4 neutral Two young children in blue jerseys, one with t... \n",
"\n",
" sentence2 \n",
"0 The sisters are hugging goodbye while holding ... \n",
"1 Two woman are holding packages. \n",
"2 The men are fighting outside a deli. \n",
"3 Two kids in numbered jerseys wash their hands. \n",
"4 Two kids at a ballgame wash their hands. "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dev.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>score</th>\n",
" <th>sentence1</th>\n",
" <th>sentence2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>neutral</td>\n",
" <td>This church choir sings to the masses as they ...</td>\n",
" <td>The church has cracks in the ceiling.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>entailment</td>\n",
" <td>This church choir sings to the masses as they ...</td>\n",
" <td>The church is filled with song.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>contradiction</td>\n",
" <td>This church choir sings to the masses as they ...</td>\n",
" <td>A choir singing at a baseball game.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>neutral</td>\n",
" <td>A woman with a green headscarf, blue shirt and...</td>\n",
" <td>The woman is young.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>entailment</td>\n",
" <td>A woman with a green headscarf, blue shirt and...</td>\n",
" <td>The woman is very happy.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" score sentence1 \\\n",
"0 neutral This church choir sings to the masses as they ... \n",
"1 entailment This church choir sings to the masses as they ... \n",
"2 contradiction This church choir sings to the masses as they ... \n",
"3 neutral A woman with a green headscarf, blue shirt and... \n",
"4 entailment A woman with a green headscarf, blue shirt and... \n",
"\n",
" sentence2 \n",
"0 The church has cracks in the ceiling. \n",
"1 The church is filled with song. \n",
"2 A choir singing at a baseball game. \n",
"3 The woman is young. \n",
"4 The woman is very happy. "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once we have the clean pandas dataframes, we do lowercase standardization and tokenization. We use the [NLTK] (https://www.nltk.org/) library for tokenization."
"Once we have the clean pandas dataframes, we do lowercase standardization and tokenization. We use the [NLTK](https://www.nltk.org/) library for tokenization."
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\jamahaja\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\jamahaja\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\jamahaja\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
"train_tok = to_nltk_tokens(to_lowercase(train))\n",
"dev_tok = to_nltk_tokens(to_lowercase(dev))\n",
"test_tok = to_nltk_tokens(to_lowercase(test))"
"train_tok = to_nltk_tokens(to_lowercase(train))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -1046,42 +475,42 @@
" <tr>\n",
" <th>0</th>\n",
" <td>neutral</td>\n",
" <td>two women are embracing while holding to go pa...</td>\n",
" <td>the sisters are hugging goodbye while holding ...</td>\n",
" <td>[two, women, are, embracing, while, holding, t...</td>\n",
" <td>[the, sisters, are, hugging, goodbye, while, h...</td>\n",
" <td>a person on a horse jumps over a broken down a...</td>\n",
" <td>a person is training his horse for a competition.</td>\n",
" <td>[a, person, on, a, horse, jumps, over, a, brok...</td>\n",
" <td>[a, person, is, training, his, horse, for, a, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>entailment</td>\n",
" <td>two women are embracing while holding to go pa...</td>\n",
" <td>two woman are holding packages.</td>\n",
" <td>[two, women, are, embracing, while, holding, t...</td>\n",
" <td>[two, woman, are, holding, packages, .]</td>\n",
" <td>contradiction</td>\n",
" <td>a person on a horse jumps over a broken down a...</td>\n",
" <td>a person is at a diner, ordering an omelette.</td>\n",
" <td>[a, person, on, a, horse, jumps, over, a, brok...</td>\n",
" <td>[a, person, is, at, a, diner, ,, ordering, an,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>contradiction</td>\n",
" <td>two women are embracing while holding to go pa...</td>\n",
" <td>the men are fighting outside a deli.</td>\n",
" <td>[two, women, are, embracing, while, holding, t...</td>\n",
" <td>[the, men, are, fighting, outside, a, deli, .]</td>\n",
" <td>entailment</td>\n",
" <td>a person on a horse jumps over a broken down a...</td>\n",
" <td>a person is outdoors, on a horse.</td>\n",
" <td>[a, person, on, a, horse, jumps, over, a, brok...</td>\n",
" <td>[a, person, is, outdoors, ,, on, a, horse, .]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>entailment</td>\n",
" <td>two young children in blue jerseys, one with t...</td>\n",
" <td>two kids in numbered jerseys wash their hands.</td>\n",
" <td>[two, young, children, in, blue, jerseys, ,, o...</td>\n",
" <td>[two, kids, in, numbered, jerseys, wash, their...</td>\n",
" <td>neutral</td>\n",
" <td>children smiling and waving at camera</td>\n",
" <td>they are smiling at their parents</td>\n",
" <td>[children, smiling, and, waving, at, camera]</td>\n",
" <td>[they, are, smiling, at, their, parents]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>neutral</td>\n",
" <td>two young children in blue jerseys, one with t...</td>\n",
" <td>two kids at a ballgame wash their hands.</td>\n",
" <td>[two, young, children, in, blue, jerseys, ,, o...</td>\n",
" <td>[two, kids, at, a, ballgame, wash, their, hand...</td>\n",
" <td>entailment</td>\n",
" <td>children smiling and waving at camera</td>\n",
" <td>there are children present</td>\n",
" <td>[children, smiling, and, waving, at, camera]</td>\n",
" <td>[there, are, children, present]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@@ -1089,41 +518,41 @@
],
"text/plain": [
" score sentence1 \\\n",
"0 neutral two women are embracing while holding to go pa... \n",
"1 entailment two women are embracing while holding to go pa... \n",
"2 contradiction two women are embracing while holding to go pa... \n",
"3 entailment two young children in blue jerseys, one with t... \n",
"4 neutral two young children in blue jerseys, one with t... \n",
"0 neutral a person on a horse jumps over a broken down a... \n",
"1 contradiction a person on a horse jumps over a broken down a... \n",
"2 entailment a person on a horse jumps over a broken down a... \n",
"3 neutral children smiling and waving at camera \n",
"4 entailment children smiling and waving at camera \n",
"\n",
" sentence2 \\\n",
"0 the sisters are hugging goodbye while holding ... \n",
"1 two woman are holding packages. \n",
"2 the men are fighting outside a deli. \n",
"3 two kids in numbered jerseys wash their hands. \n",
"4 two kids at a ballgame wash their hands. \n",
"0 a person is training his horse for a competition. \n",
"1 a person is at a diner, ordering an omelette. \n",
"2 a person is outdoors, on a horse. \n",
"3 they are smiling at their parents \n",
"4 there are children present \n",
"\n",
" sentence1_tokens \\\n",
"0 [two, women, are, embracing, while, holding, t... \n",
"1 [two, women, are, embracing, while, holding, t... \n",
"2 [two, women, are, embracing, while, holding, t... \n",
"3 [two, young, children, in, blue, jerseys, ,, o... \n",
"4 [two, young, children, in, blue, jerseys, ,, o... \n",
"0 [a, person, on, a, horse, jumps, over, a, brok... \n",
"1 [a, person, on, a, horse, jumps, over, a, brok... \n",
"2 [a, person, on, a, horse, jumps, over, a, brok... \n",
"3 [children, smiling, and, waving, at, camera] \n",
"4 [children, smiling, and, waving, at, camera] \n",
"\n",
" sentence2_tokens \n",
"0 [the, sisters, are, hugging, goodbye, while, h... \n",
"1 [two, woman, are, holding, packages, .] \n",
"2 [the, men, are, fighting, outside, a, deli, .] \n",
"3 [two, kids, in, numbered, jerseys, wash, their... \n",
"4 [two, kids, at, a, ballgame, wash, their, hand... "
"0 [a, person, is, training, his, horse, for, a, ... \n",
"1 [a, person, is, at, a, diner, ,, ordering, an,... \n",
"2 [a, person, is, outdoors, ,, on, a, horse, .] \n",
"3 [they, are, smiling, at, their, parents] \n",
"4 [there, are, children, present] "
]
},
"execution_count": 15,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dev_tok.head()"
"train_tok.head()"
]
},
{
@@ -1138,10 +567,35 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 10,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package punkt to /Users/caseyhong/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
"train = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n",
"dev = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\")\n",
"test = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"test\")\n",
"\n",
"clean_train = clean(train, file_split=\"train\")\n",
"clean_dev = clean(dev, file_split=\"dev\")\n",
"clean_test = clean(dev, file_split=\"test\")\n",
"\n",
"train_tok = to_nltk_tokens(to_lowercase(clean_train))\n",
"dev_tok = to_nltk_tokens(to_lowercase(clean_dev))\n",
"test_tok = to_nltk_tokens(to_lowercase(clean_test))\n",
"\n",
"split_map = {'train': train_tok, 'dev': dev_tok, 'test': test_tok}\n",
"for file_split, df in split_map.items():\n",
" base_txt_path = os.path.join(BASE_DATA_PATH, \"clean/snli_1.0/snli_1.0_{}.txt\".format(file_split))\n",
@ -1155,7 +609,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@ -1176,13 +630,6 @@
" shutil.move(\"{}.tmp\".format(s1_tok_path), s1_tok_path)\n",
" shutil.move(\"{}.tmp\".format(s2_tok_path), s2_tok_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

File differences are hidden because one or more lines are too long

View file

@ -0,0 +1,42 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import pytest
from utils_nlp.pretrained_embeddings.word2vec import (
load_pretrained_vectors as load_word2vec,
)
from utils_nlp.pretrained_embeddings.glove import (
load_pretrained_vectors as load_glove,
)
from utils_nlp.pretrained_embeddings.fasttext import (
load_pretrained_vectors as load_fasttext,
)
@pytest.mark.smoke
def test_load_pretrained_vectors_word2vec(tmp_path):
filename = "word2vec.bin"
load_word2vec(tmp_path, filename)
filepath = os.path.join(os.path.join(tmp_path, "word2vec"), filename)
statinfo = os.stat(filepath)
assert statinfo.st_size == 3644258522
@pytest.mark.smoke
def test_load_pretrained_vectors_glove(tmp_path):
filename = "glove.840B.300d.txt"
load_glove(tmp_path)
filepath = os.path.join(os.path.join(tmp_path, "gloVe"), filename)
statinfo = os.stat(filepath)
assert statinfo.st_size == 5646236541
@pytest.mark.smoke
def test_load_pretrained_vectors_fasttext(tmp_path):
filename = "wiki.en.bin"
load_fasttext(tmp_path)
filepath = os.path.join(os.path.join(tmp_path, "fastText"), filename)
statinfo = os.stat(filepath)
assert statinfo.st_size == 8493673445
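These smoke tests download the full pre-trained embedding files and check their expected sizes, so they are meant to be run selectively via the pytest marker. A minimal sketch of a programmatic invocation follows; the module path is hypothetical and not shown in this diff.
import pytest
# Run only the tests decorated with @pytest.mark.smoke; "-v" prints one line per test.
# The path below is an assumption -- substitute wherever this test module lives in the repo.
pytest.main(["-m", "smoke", "-v", "tests/smoke/test_pretrained_embeddings_smoke.py"])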

View file

@ -0,0 +1,59 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import shutil
from pathlib import Path
from gensim.models.fasttext import FastText
from gensim.models.keyedvectors import Word2VecKeyedVectors
from utils_nlp.pretrained_embeddings.fasttext import (
load_pretrained_vectors as load_fasttext,
)
from utils_nlp.pretrained_embeddings.glove import (
load_pretrained_vectors as load_glove,
)
from utils_nlp.pretrained_embeddings.word2vec import (
load_pretrained_vectors as load_word2vec,
)
def test_load_pretrained_vectors_word2vec():
dir_path = "temp_data/"
file_path = os.path.join(
os.path.join(dir_path, "word2vec"),
"GoogleNews-vectors-negative300.bin",
)
assert isinstance(load_word2vec(dir_path), Word2VecKeyedVectors)
file_path = Path(file_path)
assert file_path.is_file()
shutil.rmtree(os.path.join(os.getcwd(), dir_path))
def test_load_pretrained_vectors_glove():
dir_path = "temp_data/"
file_path = os.path.join(
os.path.join(dir_path, "gloVe"), "glove.840B.300d.txt"
)
assert isinstance(load_glove(dir_path), Word2VecKeyedVectors)
file_path = Path(file_path)
assert file_path.is_file()
shutil.rmtree(os.path.join(os.getcwd(), dir_path))
def test_load_pretrained_vectors_fasttext():
dir_path = "temp_data/"
file_path = os.path.join(os.path.join(dir_path, "fastText"), "wiki.en.bin")
assert isinstance(load_fasttext(dir_path), FastText)
file_path = Path(file_path)
assert file_path.is_file()
shutil.rmtree(os.path.join(os.getcwd(), dir_path))

View file

@ -7,19 +7,39 @@ import nltk
from nltk.corpus import stopwords
def to_lowercase(df):
def to_lowercase_all(df):
"""
This function transforms all strings in the dataframe to lowercase
This function transforms all strings in the dataframe to lowercase
Args:
df (pd.DataFrame): Raw dataframe with some text columns.
Args:
df (pd.DataFrame): Raw dataframe with some text columns.
Returns:
pd.DataFrame: Dataframe with lowercase standardization.
"""
Returns:
pd.DataFrame: Dataframe with lowercase standardization.
"""
return df.applymap(lambda s: s.lower() if type(s) == str else s)
def to_lowercase(df, column_names=[]):
"""
This function transforms the strings in the specified columns of the dataframe to lowercase
Args:
df (pd.DataFrame): Raw dataframe with some text columns.
column_names(list, optional): column names to be changed to lowercase.
Returns:
pd.DataFrame: Dataframe with columns with lowercase standardization.
"""
if not column_names:
return to_lowercase_all(df)
else:
df[column_names] = df[column_names].applymap(
lambda s: s.lower() if type(s) == str else s
)
return df
def to_spacy_tokens(
df,
sentence_cols=["sentence1", "sentence2"],
@ -81,7 +101,7 @@ def rm_spacy_stopwords(
tok_df = nlp_df.applymap(
lambda doc: [token.text for token in doc if not token.is_stop]
)
tok_df.columns = token_cols
tok_df.columns = stop_cols
tokenized = pd.concat([df, tok_df], axis=1)
return tokenized
@ -92,62 +112,54 @@ def to_nltk_tokens(
token_cols=["sentence1_tokens", "sentence2_tokens"],
):
"""
This function converts a sentence to word tokens using nltk.
Args:
df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
sentence_cols (list, optional): Column names for the raw sentence pairs.
token_cols (list, optional): Column names for the tokenized sentences.
Returns:
pd.DataFrame: Dataframe with new columns token_cols, each containing a
list of tokens for their respective sentences.
"""
nltk.download("punkt")
df[token_cols[0]] = df.apply(
lambda row: nltk.word_tokenize(row[sentence_cols[0]]), axis=1
)
df[token_cols[1]] = df.apply(
lambda row: nltk.word_tokenize(row[sentence_cols[1]]), axis=1
)
This function converts a sentence to word tokens using nltk.
Args:
df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
sentence_cols (list, optional): Column names for the raw sentences.
token_cols (list, optional): Column names for the tokenized sentences.
Returns:
pd.DataFrame: Dataframe with new columns token_cols, each containing a
list of tokens for their respective sentences.
"""
nltk.download("punkt")
df[token_cols] = df[sentence_cols].applymap(
lambda sentence: nltk.word_tokenize(sentence)
)
pd.concat([df[sentence_cols], df[token_cols]], axis=1)
return df
def rm_nltk_stopwords(
df,
token_cols=["sentence1_tokens", "sentence2_tokens"],
sentence_cols=["sentence1", "sentence2"],
stop_cols=[
"sentence1_tokens_rm_stopwords",
"sentence2_tokens_rm_stopwords",
],
):
"""
This function removes stop words from a sentence using nltk.
Args:
df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
token_cols (list, optional): Column names for the tokenized sentence
pairs.
stop_cols (list, optional): Column names for the tokenized sentences
without stop words.
Returns:
pd.DataFrame: Dataframe with new columns stop_cols, each containing a
list of tokens for their respective sentences.
"""
if not set(tok_cols).issubset(df.columns):
df = to_nltk_tokens(df)
This function removes stop words from a sentence using nltk.
Args:
df (pd.DataFrame): Dataframe with columns sentence_cols to tokenize.
sentence_cols (list, optional): Column names for the raw sentences.
stop_cols (list, optional): Column names for the tokenized sentences
without stop words.
Returns:
pd.DataFrame: Dataframe with new columns stop_cols, each containing a
list of tokens for their respective sentences.
"""
stop_words = tuple(stopwords.words("english"))
df[stop_cols[0]] = [
[word for word in row if word not in stop_words]
for row in df[token_cols[0]]
]
df[stop_cols[1]] = [
[word for word in row if word not in stop_words]
for row in df[token_cols[1]]
]
df[stop_cols] = (
df[sentence_cols]
.applymap(lambda sentence: nltk.word_tokenize(sentence))
.applymap(lambda l: [word for word in l if word not in stop_words])
)
return df
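Taken together, these nltk helpers are meant to be chained on a sentence-pair dataframe. A minimal usage sketch follows, assuming the module is importable as utils_nlp.dataset.preprocess; the toy dataframe is illustrative only and not part of the repo.
import nltk
import pandas as pd
from utils_nlp.dataset.preprocess import to_lowercase, to_nltk_tokens, rm_nltk_stopwords
# The stop-word list is needed by rm_nltk_stopwords and is not downloaded by the module itself.
nltk.download("stopwords")
# Toy sentence-pair dataframe (illustrative only).
df = pd.DataFrame(
    {
        "sentence1": ["A man is playing a flute."],
        "sentence2": ["A man plays a large flute."],
    }
)
df = to_lowercase(df, column_names=["sentence1", "sentence2"])
df = to_nltk_tokens(df)      # adds sentence1_tokens / sentence2_tokens
df = rm_nltk_stopwords(df)   # adds sentence1_tokens_rm_stopwords / sentence2_tokens_rm_stopwords
print(df.iloc[0]["sentence1_tokens_rm_stopwords"])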

View file

@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
WORD2VEC_URL = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
FASTTEXT_EN_URL = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip'
GLOVE_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'

View file

@ -0,0 +1,93 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import zipfile
from gensim.models.fasttext import load_facebook_model
from utils_nlp.dataset.url_utils import maybe_download
from utils_nlp.pretrained_embeddings import FASTTEXT_EN_URL
def _extract_fasttext_vectors(zip_path, dest_path="."):
""" Extracts fastText embeddings from zip file.
Args:
zip_path(str): Path to the downloaded compressed zip file.
dest_path(str): Final destination directory path to the extracted zip file.
Picks the current working directory by default.
Returns:
str: Returns the absolute path to the extracted folder.
"""
if os.path.exists(zip_path):
with zipfile.ZipFile(zip_path, "r") as zip_ref:
zip_ref.extractall(path=dest_path)
else:
raise Exception("Zipped file not found!")
os.remove(zip_path)
return dest_path
def _download_fasttext_vectors(download_dir, file_name="wiki.en.zip"):
""" Downloads pre-trained word vectors for English, trained on Wikipedia using
fastText. You can directly download the vectors from here:
https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
Args:
download_dir (str): File path to download the file
file_name (str) : File name given by default but can be changed by the user.
Returns:
str: file_path to the downloaded vectors.
"""
return maybe_download(
FASTTEXT_EN_URL, filename=file_name, work_directory=download_dir
)
def _maybe_download_and_extract(dest_path, file_name):
""" Downloads and extracts fastText vectors if they dont already exist
Args:
dest_path(str): Final path where the vectors will be extracted.
file_name(str): File name of the fastText vector file.
Returns:
str: File path to the fastText vector file.
"""
dir_path = os.path.join(dest_path, "fastText")
file_path = os.path.join(dir_path, file_name)
if not os.path.exists(file_path):
if not os.path.exists(dir_path):
os.makedirs(dir_path)
zip_path = _download_fasttext_vectors(dir_path)
_extract_fasttext_vectors(zip_path, dir_path)
else:
print("Vector file already exists. No changes made.")
return file_path
def load_pretrained_vectors(dest_path, file_name="wiki.en.bin"):
""" Method that loads fastText vectors. Downloads if it doesn't exist.
Args:
file_name(str): Name of the fastText file.
dest_path(str): Path to the directory where fastText vectors exist or will be
downloaded.
Returns:
gensim.models.fasttext.FastText: Loaded fastText model
"""
file_path = _maybe_download_and_extract(dest_path, file_name)
model = load_facebook_model(file_path)
return model
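For reference, a hedged usage sketch of this loader; the relative data directory below is an assumption, not part of this module.
from utils_nlp.pretrained_embeddings.fasttext import load_pretrained_vectors
# Downloads and extracts wiki.en.bin under <dest_path>/fastText/ on first use
# (the extracted .bin is roughly 8.5 GB, per the smoke test above).
model = load_pretrained_vectors("../../data")
# load_facebook_model returns a full FastText model, so out-of-vocabulary words
# still get vectors composed from character n-grams.
print(model.wv["sentence"].shape)
print(model.wv.most_similar("sentence", topn=3))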

View file

@ -0,0 +1,96 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import zipfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile
from utils_nlp.dataset.url_utils import maybe_download
from utils_nlp.pretrained_embeddings import GLOVE_URL
def _extract_glove_vectors(zip_path, dest_path="."):
""" Extracts gloVe embeddings from zip file.
Args:
zip_path(str): Path to the downloaded compressed zip file.
dest_path(str): Final destination directory path to the extracted zip file.
Picks the current working directory by default.
Returns:
str: Returns the absolute path to the extracted folder.
"""
if os.path.exists(zip_path):
with zipfile.ZipFile(zip_path, "r") as zip_ref:
zip_ref.extractall(path=dest_path)
else:
raise Exception("Zipped file not found!")
os.remove(zip_path)
return dest_path
def _download_glove_vectors(download_dir, file_name="glove.840B.300d.zip"):
""" Downloads gloVe word vectors trained on Common Crawl corpus. You can
directly download the vectors from here:
http://nlp.stanford.edu/data/glove.840B.300d.zip
Args:
download_dir (str): File path to download the file
file_name (str) : File name given by default but can be changed by the user.
Returns:
str: file_path to the downloaded vectors.
"""
return maybe_download(GLOVE_URL, filename=file_name, work_directory=download_dir)
def _maybe_download_and_extract(dest_path, file_name):
""" Downloads and extracts gloVe vectors if they dont already exist
Args:
dest_path(str): Final path where the vectors will be extracted.
file_name(str): File name of the gloVe vector file.
Returns:
str: File path to the gloVe vector file.
"""
dir_path = os.path.join(dest_path, "gloVe")
file_path = os.path.join(dir_path, file_name)
if not os.path.exists(file_path):
if not os.path.exists(dir_path):
os.makedirs(dir_path)
filepath = _download_glove_vectors(dir_path)
_extract_glove_vectors(filepath, dir_path)
else:
print("Vector file already exists. No changes made.")
return file_path
def load_pretrained_vectors(dir_path, file_name="glove.840B.300d.txt"):
""" Method that loads gloVe vectors. Downloads if it doesn't exist.
Args:
file_name(str): Name of the gloVe file.
dir_path(str): Path to the directory where gloVe vectors exist or will be
downloaded.
Returns:
gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word vectors
"""
file_path = _maybe_download_and_extract(dir_path, file_name)
tmp_file = get_tmpfile("test_word2vec.txt")
_ = glove2word2vec(file_path, tmp_file)
model = KeyedVectors.load_word2vec_format(tmp_file)
return model
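A similarly hedged usage sketch for the gloVe loader; the data directory is again an assumption.
from utils_nlp.pretrained_embeddings.glove import load_pretrained_vectors
# Downloads and extracts glove.840B.300d.txt under <dir_path>/gloVe/ on first use,
# converts it to word2vec text format, and loads it as KeyedVectors.
glove_vectors = load_pretrained_vectors("../../data")
print(glove_vectors["king"].shape)               # 300-dimensional vector
print(glove_vectors.most_similar("king", topn=3))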

View file

@ -0,0 +1,95 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import gzip
import os
from gensim.models.keyedvectors import KeyedVectors
from utils_nlp.dataset.url_utils import maybe_download
from utils_nlp.pretrained_embeddings import WORD2VEC_URL
def _extract_word2vec_vectors(zip_path, dest_filepath):
""" Extracts word2vec embeddings from bin.gz archive
Args:
zip_path: Path to the downloaded compressed file.
dest_filepath: Final destination file path to the extracted zip file.
"""
if os.path.exists(zip_path):
with gzip.GzipFile(zip_path, "rb") as f_in, open(
dest_filepath, "wb"
) as f_out:
f_out.writelines(f_in)
else:
raise Exception("Zipped file not found!")
os.remove(zip_path)
def _download_word2vec_vectors(
download_dir, file_name="GoogleNews-vectors-negative300.bin.gz"
):
""" Downloads pretrained word vectors trained on GoogleNews corpus. You can
directly download the vectors from here:
https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Args:
download_dir (str): File path to download the file
file_name (str) : File name given by default but can be changed by the user.
Returns:
str: file_path to the downloaded vectors.
"""
return maybe_download(WORD2VEC_URL, filename=file_name, work_directory=download_dir)
def _maybe_download_and_extract(dest_path, file_name):
""" Downloads and extracts Word2vec vectors if they dont already exist
Args:
dest_path: Path to the directory where the vectors will be extracted.
file_name: File name of the word2vec vector file.
Returns:
str: File path to the word2vec vector file.
"""
dir_path = os.path.join(dest_path, "word2vec")
file_path = os.path.join(dir_path, file_name)
if not os.path.exists(file_path):
if not os.path.exists(dir_path):
os.makedirs(dir_path)
filepath = _download_word2vec_vectors(dir_path)
_extract_word2vec_vectors(filepath, file_path)
else:
print("Vector file already exists. No changes made.")
return file_path
def load_pretrained_vectors(
dir_path, file_name="GoogleNews-vectors-negative300.bin"
):
""" Method that loads word2vec vectors. Downloads if it doesn't exist.
Args:
file_name(str): Name of the word2vec file.
dir_path(str): Path to the directory where word2vec vectors exist or will be
downloaded.
Returns:
gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word vectors
"""
file_path = _maybe_download_and_extract(dir_path, file_name)
word2vec_vectors = KeyedVectors.load_word2vec_format(
file_path, binary=True
)
return word2vec_vectors
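And a hedged usage sketch for the word2vec loader; the data directory is an assumption.
from utils_nlp.pretrained_embeddings.word2vec import load_pretrained_vectors
# Downloads and unpacks GoogleNews-vectors-negative300.bin under <dir_path>/word2vec/
# on first use, then loads the binary file as KeyedVectors.
w2v = load_pretrained_vectors("../../data")
print(w2v["queen"].shape)               # 300-dimensional vector
print(w2v.similarity("queen", "king"))  # cosine similarity of two in-vocabulary words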