Edits to embedding trainer and baselines
Parent
1f52f797b4
Commit
5d8583601c
The diff for one file is not shown because it is too large.
@@ -33,7 +33,8 @@
"outputs": [],
"source": [
"import gensim\n",
"import sys"
"import sys\n",
"import os"
]
},
{
@@ -50,15 +51,18 @@
"outputs": [],
"source": [
"sys.path.append(\"../../../\") ## set the environment path\n",
"\n",
"BASE_DATA_PATH = \"../../../data\"\n",
"SAVE_FILES_PATH = BASE_DATA_PATH + \"/trained_word_embeddings/\"\n",
"\n",
"from utils_nlp.dataset.stsbenchmark import STSBenchmark\n",
"if not os.path.exists(SAVE_FILES_PATH):\n",
" os.makedirs(SAVE_FILES_PATH)\n",
" \n",
"from utils_nlp.dataset.preprocess import (\n",
" to_lowercase,\n",
" to_spacy_tokens,\n",
" rm_spacy_stopwords,\n",
")"
")\n",
"from utils_nlp.dataset import stsbenchmark"
]
},
{
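The directory check added in this cell can also be collapsed into a single standard-library call via the exist_ok flag. A minimal equivalent sketch, reusing the SAVE_FILES_PATH value defined above:

    import os

    SAVE_FILES_PATH = "../../../data/trained_word_embeddings/"
    # Same effect as the "if not os.path.exists(...): os.makedirs(...)" pattern in the diff:
    # create the directory if it is missing, do nothing if it already exists.
    os.makedirs(SAVE_FILES_PATH, exist_ok=True)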
@@ -67,8 +71,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Initializing this instance runs the downloader and extractor behind the scenes, then convert to dataframe\n",
"stsTrain = STSBenchmark(\"train\", base_data_path=BASE_DATA_PATH).as_dataframe()"
"# Produce a pandas dataframe for the training set\n",
"stsTrain = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")"
]
},
{
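The new loader takes the split name through the file_split argument. A hypothetical sketch of loading the remaining STS Benchmark splits the same way; only the "train" call appears in this diff, so the "dev" and "test" split names are assumptions:

    from utils_nlp.dataset import stsbenchmark

    BASE_DATA_PATH = "../../../data"
    # "train" is the call shown in the diff; "dev" and "test" are assumed split names.
    sts_train = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split="train")
    sts_dev = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split="dev")
    sts_test = stsbenchmark.load_pandas_df(BASE_DATA_PATH, file_split="test")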
@@ -180,31 +184,23 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Embedding for apple: [-1.30064473e-01 1.84295833e-01 -1.53965428e-01 -9.69498605e-02\n",
" 4.99420874e-02 -1.23197936e-01 7.28140250e-02 -4.12699208e-02\n",
" 2.47626036e-01 -2.69805547e-04 -7.65557750e-04 2.08947986e-01\n",
" 7.81186996e-03 5.42742060e-03 5.25087006e-02 2.47807354e-01\n",
" -2.48165410e-02 9.91394650e-03 3.54040265e-02 -2.14830145e-01\n",
" 2.24868301e-02 1.52286962e-01 1.85761824e-01 2.33249858e-01\n",
" -1.46878466e-01 -7.60829672e-02 4.50950442e-03 1.15145534e-01\n",
" -9.11297649e-02 6.20169528e-02 -5.24968617e-02 -8.68254527e-02\n",
" -1.77496113e-04 8.58828798e-02 1.19839951e-01 2.51445977e-04\n",
" -3.06774918e-02 2.70280894e-03 -9.14655998e-02 5.54770082e-02\n",
" 6.70319721e-02 -1.10063367e-01 -9.94274616e-02 -1.62537303e-02\n",
" 1.07709818e-01 -1.17890313e-01 -1.68436840e-02 2.67276943e-01\n",
" 1.66485235e-02 -1.05556019e-01 8.72049183e-02 -2.79379219e-01\n",
" -7.61673898e-02 -1.26047105e-01 -2.10570037e-01 1.06335968e-01\n",
" -1.13933079e-01 8.91806409e-02 2.40348503e-02 1.27991261e-02\n",
" -9.80987865e-03 -8.29416886e-02 -1.05351470e-01 9.63128060e-02\n",
" -1.32907405e-01 -5.90794981e-02 -1.05936542e-01 5.24872467e-02\n",
" -1.62810262e-04 1.90204114e-03 -1.07438803e-01 -1.86693370e-02\n",
" -1.74428806e-01 -2.69948710e-02 -4.38663997e-02 -4.28975448e-02\n",
" 9.05705541e-02 -2.10348725e-01 -1.16732195e-01 3.60293575e-02\n",
" -2.08853818e-02 2.63118356e-01 1.76015347e-01 1.23300500e-01\n",
" -3.50267850e-02 -4.52703685e-02 -1.70624122e-01 -3.28516886e-02\n",
" 5.28835841e-02 8.53991881e-02 -8.47622007e-02 2.25594401e-01\n",
" -1.77075803e-01 -5.37518365e-03 9.42931976e-03 1.78159177e-02\n",
" -7.26433992e-02 -3.52309011e-02 -1.68363556e-01 2.79879309e-02]\n",
"Embedding for apple: [-0.09213913 -0.02462959 -0.11255068 0.11652157 -0.18142793 -0.17555593\n",
" 0.07121698 0.086779 -0.03097944 -0.01890221 -0.04537104 -0.10696206\n",
" 0.02276987 0.08645772 0.09701958 -0.22489007 0.03993007 -0.0748188\n",
" 0.0185363 -0.257262 0.06551826 0.01579769 -0.18179104 -0.22390445\n",
" -0.06907904 -0.08859113 0.00603421 -0.01953833 -0.0306666 -0.20717207\n",
" -0.07466035 -0.10690664 -0.06131361 -0.0747569 -0.03541371 -0.02307771\n",
" -0.04890924 0.09401437 0.14955166 0.03299814 -0.20348735 0.1091179\n",
" -0.05915498 0.07897269 -0.0392515 -0.1337506 0.16920352 0.00084969\n",
" 0.09151786 -0.07067705 -0.00130636 -0.00040609 -0.09070218 -0.05848758\n",
" 0.01417456 0.12759478 0.06773403 -0.03618362 0.05180905 -0.03987553\n",
" 0.15119544 0.1374909 -0.2100861 -0.12180148 -0.01784294 0.09922534\n",
" -0.01852375 0.2757332 -0.07551172 0.06188574 -0.0189024 0.08390908\n",
" 0.06324708 -0.02126443 0.07884526 -0.06014811 -0.1291807 0.03968196\n",
" -0.00395843 -0.05398612 0.25687164 0.06331551 -0.07450255 -0.12246329\n",
" -0.1481028 0.11168568 -0.24994832 -0.05962377 0.04101507 0.06981998\n",
" 0.02528387 0.1725297 0.10974599 0.12216322 -0.16961183 0.0819602\n",
" 0.15518941 0.12973912 0.09754901 -0.0033999 ]\n",
"\n",
"First 30 vocabulary words: ['a', 'plane', 'is', 'taking', 'off', '.', 'man', 'playing', 'large', 'flute', 'spreading', 'cheese', 'on', 'pizza', 'three', 'men', 'are', 'the', 'some', 'fighting']\n"
]
@@ -218,8 +214,8 @@
"print(\"\\nFirst 30 vocabulary words:\", list(word2vec_model.wv.vocab)[:20])\n",
"\n",
"# 3. Save the word embeddings. We can save as binary format (to save space) or ASCII format\n",
"word2vec_model.wv.save_word2vec_format(\"word2vec_model\", binary=True) # binary format\n",
"word2vec_model.wv.save_word2vec_format(\"word2vec_model\", binary=False) # ASCII format"
"word2vec_model.wv.save_word2vec_format(SAVE_FILES_PATH+\"word2vec_model\", binary=True) # binary format\n",
"word2vec_model.wv.save_word2vec_format(SAVE_FILES_PATH+\"word2vec_model\", binary=False) # ASCII format"
]
},
{
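Because the vectors are now written under SAVE_FILES_PATH, they can be read back with gensim's standard word2vec loader. A minimal sketch, assuming the binary file produced by the cell above and the gensim 3.x KeyedVectors API used elsewhere in this notebook:

    from gensim.models import KeyedVectors

    # Path assembled from the diff: SAVE_FILES_PATH + "word2vec_model", written with binary=True.
    w2v = KeyedVectors.load_word2vec_format(
        "../../../data/trained_word_embeddings/word2vec_model", binary=True
    )
    print(w2v["apple"][:5])   # first five dimensions of the "apple" vector
    print(len(w2v.vocab))     # vocabulary size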
@@ -276,23 +272,31 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Embedding for apple: [-0.19466035 0.02329457 0.11905755 0.43202105 0.29234868 -0.4173747\n",
" -0.42871934 -0.587514 -0.24620762 -0.30886024 -0.04068367 0.20132142\n",
" -0.1593995 -0.34693947 -0.05454068 0.21118519 0.20061074 0.33920124\n",
" 0.13465068 -0.16492505 -0.01792471 0.3517471 -0.42507643 -0.14185262\n",
" 0.6766511 -0.35682997 0.38852996 0.08338872 -0.16927068 0.00101932\n",
" 0.01033709 -0.00513317 -0.15251048 -0.07668231 0.02508747 -0.16725563\n",
" 0.13578647 0.5188022 0.4219404 -0.29186445 -0.35036987 0.04769979\n",
" -0.23967543 -0.03550959 -0.4072291 0.4920213 0.30146047 -0.569966\n",
" 0.12033249 -0.24960376 -0.20398642 -0.37427858 0.04139522 0.28986236\n",
" -0.31172943 0.7363574 -0.43040937 0.24302956 -0.2891899 -0.12707426\n",
" -0.26763597 -0.3471016 0.08912586 -0.20722611 0.1529707 0.39230242\n",
" -0.23503402 -0.00332095 -0.04347242 -0.00989339 0.08801552 -0.36916256\n",
" -0.13720557 0.40390077 -0.21936806 -0.10426865 -0.18858872 0.15547332\n",
" -0.3519439 0.00505178 0.1029634 -0.00991125 0.41537017 -0.10500967\n",
" 0.43521944 0.26955605 -0.23591378 0.14193945 0.08484828 0.57761383\n",
" -0.31014645 0.63834554 -0.15213463 -0.46310434 0.10502262 -0.03921723\n",
" 0.21358919 -0.17636251 0.14675795 0.15879233]\n",
"Embedding for apple: [-2.1927688e-01 2.9813698e-02 6.7616858e-02 3.6836052e-01\n",
" 2.9166859e-01 -4.3027815e-01 -4.3850473e-01 -5.5472869e-01\n",
" -2.4860071e-01 -2.8481758e-01 -8.5550338e-02 2.0373566e-01\n",
" -8.8941768e-02 -3.5824496e-01 -7.3820040e-02 1.9162497e-01\n",
" 1.9164029e-01 3.2222369e-01 1.7169371e-01 -1.8063694e-01\n",
" -2.5478544e-02 3.8527763e-01 -4.4661409e-01 -1.9077049e-01\n",
" 6.3831955e-01 -3.4981030e-01 3.6546609e-01 7.3591776e-02\n",
" -1.7809562e-01 -3.0694399e-02 -6.5486156e-04 2.8458415e-02\n",
" -1.4853548e-01 -1.1247496e-01 2.6613681e-02 -1.5886196e-01\n",
" 1.0738261e-01 5.2269661e-01 4.1452998e-01 -2.4978566e-01\n",
" -3.6866227e-01 4.5613028e-02 -2.5554851e-01 -2.9870963e-02\n",
" -3.4256181e-01 4.1204464e-01 3.3703518e-01 -5.3163689e-01\n",
" 2.7413066e-02 -3.2481736e-01 -2.1018679e-01 -3.5171476e-01\n",
" 5.6522321e-02 3.2140371e-01 -3.0404109e-01 7.3594677e-01\n",
" -4.7126335e-01 2.5894231e-01 -2.6430738e-01 -1.1617108e-01\n",
" -2.7015641e-01 -3.2107431e-01 8.0991395e-02 -1.8977067e-01\n",
" 1.6966967e-01 3.6855596e-01 -2.0167376e-01 -1.6917199e-02\n",
" -4.0029153e-02 8.3818562e-02 8.8887364e-02 -3.4052727e-01\n",
" -1.5159512e-01 4.2969501e-01 -1.8632193e-01 -4.8835874e-02\n",
" -1.9202119e-01 1.5949497e-01 -3.4046504e-01 4.6990579e-03\n",
" 9.2628546e-02 1.6060786e-02 3.8600260e-01 -8.4986687e-02\n",
" 4.4739038e-01 2.1059968e-01 -1.9877617e-01 1.8113001e-01\n",
" 9.4012588e-02 5.5849826e-01 -3.2842401e-01 6.3832772e-01\n",
" -1.1614193e-01 -4.4778910e-01 1.4173931e-01 -2.4079295e-02\n",
" 1.8156306e-01 -1.9836307e-01 1.4190227e-01 1.5471222e-01]\n",
"\n",
"First 30 vocabulary words: ['a', 'plane', 'is', 'taking', 'off', '.', 'man', 'playing', 'large', 'flute', 'spreading', 'cheese', 'on', 'pizza', 'three', 'men', 'are', 'the', 'some', 'fighting']\n"
]
@@ -306,8 +310,8 @@
"print(\"\\nFirst 30 vocabulary words:\", list(fastText_model.wv.vocab)[:20])\n",
"\n",
"# 3. Save the word embeddings. We can save as binary format (to save space) or ASCII format\n",
"fastText_model.wv.save_word2vec_format(\"fastText_model\", binary=True) # binary format\n",
"fastText_model.wv.save_word2vec_format(\"fastText_model\", binary=False) # ASCII format"
"fastText_model.wv.save_word2vec_format(SAVE_FILES_PATH+\"fastText_model\", binary=True) # binary format\n",
"fastText_model.wv.save_word2vec_format(SAVE_FILES_PATH+\"fastText_model\", binary=False) # ASCII format"
]
},
{
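The fastText vectors are exported in the same word2vec format, which stores whole-word vectors only; the subword (character n-gram) information that lets fastText embed out-of-vocabulary words is not preserved by this export. A short sketch of reading the exported file back, with the path mirroring the diff:

    from gensim.models import KeyedVectors

    ft_wv = KeyedVectors.load_word2vec_format(
        "../../../data/trained_word_embeddings/fastText_model", binary=True
    )
    print(ft_wv["apple"].shape)  # whole-word vector only; unseen words raise a KeyError here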
@@ -359,7 +363,7 @@
"outputs": [],
"source": [
"#save our corpus as tokens delimited by spaces with new line characters in between sentences\n",
"with open('sentences.txt', 'w', encoding='utf8') as file:\n",
"with open(BASE_DATA_PATH+'/clean/stsbenchmark/training-corpus-cleaned.txt', 'w', encoding='utf8') as file:\n",
" for sent in sentences:\n",
" file.write(\" \".join(sent) + \"\\n\")"
]
@@ -375,7 +379,7 @@
"2. max-vocab: upper bound on the number of vocabulary words to keep\n",
"3. verbose: 0, 1, or 2 (default)\n",
"\n",
"Then provide the path to the text file we created in Step 0 (<\"sentences.txt\">) followed by a file path that we'll save the vocabulary to (\"glove/build/vocab.txt\")"
"Then provide the path to the text file we created in Step 0 followed by a file path that we'll save the vocabulary to "
]
},
{
@@ -397,7 +401,7 @@
}
],
"source": [
"!\"glove/build/vocab_count\" -min-count 5 -verbose 2 <\"sentences.txt\"> \"glove/build/vocab.txt\""
"!\"glove/build/vocab_count\" -min-count 5 -verbose 2 <\"../../../data/clean/stsbenchmark/training-corpus-cleaned.txt\"> \"../../../data/trained_word_embeddings/vocab.txt\""
]
},
{
@@ -414,7 +418,7 @@
"5. memory: soft limit for memory consumption, default 4\n",
"6. max-product: limit the size of dense co-occurrence array by specifying the max product (integer) of the frequency counts of the two co-occurring words\n",
"\n",
"Then provide the path to the text file we created in Step 0 (<\"sentences.txt\">) followed by a file path that we'll save the co-occurrences to (\"glove/build/cooccurrence.bin\")"
"Then provide the path to the text file we created in Step 0 followed by a file path that we'll save the co-occurrences to"
]
},
{
@@ -431,7 +435,7 @@
"context: symmetric\n",
"max product: 13752509\n",
"overflow length: 38028356\n",
"Reading vocab from file \"glove/build/vocab.txt\"...loaded 3166 words.\n",
"Reading vocab from file \"../../../data/trained_word_embeddings/vocab.txt\"...loaded 3166 words.\n",
"Building lookup table...table contains 10023557 elements.\n",
"Processing token: 0100000Processed 129989 tokens.\n",
"Writing cooccurrences to disk.......2 files in total.\n",
@@ -441,7 +445,7 @@
}
],
"source": [
"!\"glove/build/cooccur\" -memory 4 -vocab-file \"glove/build/vocab.txt\" -verbose 2 -window-size 15 <\"sentences.txt\"> \"glove/build/cooccurrence.bin\""
"!\"glove/build/cooccur\" -memory 4 -vocab-file \"../../../data/trained_word_embeddings/vocab.txt\" -verbose 2 -window-size 15 <\"../../../data/clean/stsbenchmark/training-corpus-cleaned.txt\"> \"../../../data/trained_word_embeddings/cooccurrence.bin\""
]
},
{
@@ -455,7 +459,7 @@
"2. memory: soft limit for memory consumption, default 4\n",
"3. array-size: limit to the length of the buffer which stores chunks of data to shuffle before writing to disk\n",
"\n",
"Then provide the path to the co-occurrence file we created in Step 2 (<\"glove/build/cooccurrence.bin\">) followed by a file path that we'll save the shuffled co-occurrences to (\"glove/build/cooccurrence.shuf.bin\")"
"Then provide the path to the co-occurrence file we created in Step 2 followed by a file path that we'll save the shuffled co-occurrences to"
]
},
{
@@ -477,7 +481,7 @@
}
],
"source": [
"!\"glove/build/shuffle\" -memory 4 -verbose 2 <\"glove/build/cooccurrence.bin\"> \"glove/build/cooccurrence.shuf.bin\""
"!\"glove/build/shuffle\" -memory 4 -verbose 2 <\"../../../data/trained_word_embeddings/cooccurrence.bin\"> \"../../../data/trained_word_embeddings/cooccurrence.shuf.bin\""
]
},
{
@@ -515,28 +519,28 @@
"vocab size: 3166\n",
"x_max: 10.000000\n",
"alpha: 0.750000\n",
"04/29/19 - 01:26.33PM, iter: 001, cost: 0.098453\n",
"04/29/19 - 01:26.33PM, iter: 002, cost: 0.084751\n",
"04/29/19 - 01:26.33PM, iter: 003, cost: 0.074604\n",
"04/29/19 - 01:26.33PM, iter: 004, cost: 0.071038\n",
"04/29/19 - 01:26.33PM, iter: 005, cost: 0.067709\n",
"04/29/19 - 01:26.33PM, iter: 006, cost: 0.064181\n",
"04/29/19 - 01:26.33PM, iter: 007, cost: 0.059996\n",
"04/29/19 - 01:26.33PM, iter: 008, cost: 0.055268\n",
"04/29/19 - 01:26.33PM, iter: 009, cost: 0.050708\n",
"04/29/19 - 01:26.33PM, iter: 010, cost: 0.046754\n",
"04/29/19 - 01:26.33PM, iter: 011, cost: 0.043402\n",
"04/29/19 - 01:26.33PM, iter: 012, cost: 0.040575\n",
"04/29/19 - 01:26.33PM, iter: 013, cost: 0.038056\n",
"04/29/19 - 01:26.33PM, iter: 014, cost: 0.035843\n",
"04/29/19 - 01:26.33PM, iter: 015, cost: 0.033807\n"
"04/30/19 - 10:33.02AM, iter: 001, cost: 0.098433\n",
"04/30/19 - 10:33.02AM, iter: 002, cost: 0.084675\n",
"04/30/19 - 10:33.02AM, iter: 003, cost: 0.074585\n",
"04/30/19 - 10:33.02AM, iter: 004, cost: 0.071048\n",
"04/30/19 - 10:33.02AM, iter: 005, cost: 0.067768\n",
"04/30/19 - 10:33.02AM, iter: 006, cost: 0.064212\n",
"04/30/19 - 10:33.02AM, iter: 007, cost: 0.060040\n",
"04/30/19 - 10:33.02AM, iter: 008, cost: 0.055310\n",
"04/30/19 - 10:33.02AM, iter: 009, cost: 0.050727\n",
"04/30/19 - 10:33.02AM, iter: 010, cost: 0.046803\n",
"04/30/19 - 10:33.02AM, iter: 011, cost: 0.043456\n",
"04/30/19 - 10:33.02AM, iter: 012, cost: 0.040570\n",
"04/30/19 - 10:33.02AM, iter: 013, cost: 0.038074\n",
"04/30/19 - 10:33.02AM, iter: 014, cost: 0.035818\n",
"04/30/19 - 10:33.02AM, iter: 015, cost: 0.033807\n"
]
}
],
"source": [
"!\"glove/build/glove\" -save-file \"glove/build/GloVe_vectors\" -threads 8 -input-file \\\n",
"\"glove/build/cooccurrence.shuf.bin\" -x-max 10 -iter 15 -vector-size 50 -binary 2 \\\n",
"-vocab-file \"glove/build/vocab.txt\" -verbose 2"
"!\"glove/build/glove\" -save-file \"../../../data/trained_word_embeddings/GloVe_vectors\" -threads 8 -input-file \\\n",
"\"../../../data/trained_word_embeddings/cooccurrence.shuf.bin\" -x-max 10 -iter 15 -vector-size 50 -binary 2 \\\n",
"-vocab-file \"../../../data/trained_word_embeddings/vocab.txt\" -verbose 2"
]
},
{
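With -binary 2 the GloVe tool writes both binary and text vector files, so GloVe_vectors.txt appears next to the .bin output. As an alternative to the manual parsing in the next cell, the text file can be converted to word2vec format and loaded through gensim; a sketch assuming gensim's bundled glove2word2vec script and the output path used above (the converted filename is arbitrary):

    from gensim.models import KeyedVectors
    from gensim.scripts.glove2word2vec import glove2word2vec

    glove_txt = "../../../data/trained_word_embeddings/GloVe_vectors.txt"
    w2v_txt = "../../../data/trained_word_embeddings/GloVe_vectors.w2v.txt"

    # Prepends the "<vocab size> <dimensions>" header that the word2vec text format expects.
    glove2word2vec(glove_txt, w2v_txt)
    glove_wv = KeyedVectors.load_word2vec_format(w2v_txt, binary=False)
    print(glove_wv["apple"][:5])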
@@ -561,7 +565,7 @@
"source": [
"#load in the saved word vectors\n",
"glove_wv = {}\n",
"with open(\"glove/build/GloVe_vectors.txt\", encoding='utf-8') as f:\n",
"with open(\"../../../data/trained_word_embeddings/GloVe_vectors.txt\", encoding='utf-8') as f:\n",
" for line in f:\n",
" split_line = line.split(\" \")\n",
" glove_wv[split_line[0]] = [float(i) for i in split_line[1:]]"
@@ -576,7 +580,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Embedding for apple: [-0.015303, -0.0512, -0.011988, 0.429914, 0.246523, 0.009762, 0.153154, -0.178636, 0.061723, 0.108515, -0.166807, -0.033258, -0.046394, 0.081953, -0.209458, 0.194758, 0.179153, 0.23262, -0.118717, -0.053151, -0.018892, -0.037714, -0.067396, 0.057499, 0.179459, 0.004552, -0.203058, 0.243629, -0.294976, 0.123971, 0.368613, 0.190665, -0.16738, -0.0599, 0.119195, -0.030108, -0.254778, -0.007862, -0.036998, 0.060919, -0.210459, 0.293917, 0.045603, -0.01104, 0.075651, -0.120635, -0.133497, -0.372606, -0.152981, 0.009014]\n",
"Embedding for apple: [0.007199, -0.055337, -0.048813, 0.463647, 0.233898, -0.020051, 0.18876, -0.19439, 0.014477, 0.122465, -0.145506, -0.056616, -0.076315, 0.051205, -0.197457, 0.197818, 0.191692, 0.259758, -0.088431, -0.101713, -0.024687, -0.083431, -0.056415, 0.08024, 0.150831, 0.030778, -0.176252, 0.291561, -0.298596, 0.111546, 0.385694, 0.184508, -0.133928, 0.007924, 0.088849, 0.016869, -0.195535, 0.002015, -0.053591, 0.043867, -0.195157, 0.270429, -0.003891, -0.033436, 0.077898, -0.083324, -0.135095, -0.419319, -0.140611, 0.000322]\n",
"\n",
"First 30 vocabulary words: ['.', 'a', 'the', 'in', ',', 'is', 'to', 'of', 'and', 'on', 'man', '-', \"'s\", 'with', 'for', 'at', 'woman', 'are', 'that', 'two']\n"
]