Updated notebook with latest results and refactored imports

2019-06-25 16:41:54 -04:00 · 2019-06-25 16:41:54 -04:00 · 078466259f
--- a/scenarios/sentence_similarity/gensen_local.ipynb
+++ b/scenarios/sentence_similarity/gensen_local.ipynb
@ -159,13 +159,6 @@
    }
   },
   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 92.3k/92.3k [00:04<00:00, 21.8kKB/s]\n"
-     ]
-    },
    {
     "data": {
      "text/html": [
@ -573,7 +566,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
@ -601,7 +594,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 7,
   "metadata": {
    "nbpresent": {
     "id": "641a9c74-974c-4aac-8c16-3b44d686f0f3"
@ -610,6 +603,9 @@
   },
   "outputs": [],
   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "\n",
    "config_filepath = 'gensen_config.json'\n",
    "clf = GenSenClassifier(config_file = config_filepath, \n",
    "                       pretrained_embedding_path = pretrained_embedding_path,\n",
@ -630,7 +626,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
   "metadata": {
    "nbpresent": {
     "id": "6ea45671-c7a5-4fe8-a450-8b54161f26c5"
@ -657,20 +653,7 @@
      "/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/horovod/torch/__init__.py:163: UserWarning: optimizer.step(synchronize=True) called after optimizer.synchronize(). This can cause training slowdown. You may want to consider using optimizer.step(synchronize=False) if you use optimizer.synchronize() in your code.\n",
      "  warnings.warn(\"optimizer.step(synchronize=True) called after \"\n",
      "../../scenarios/sentence_similarity/gensen_train.py:238: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
-      "  f.softmax(class_logits).data.cpu().numpy().argmax(axis=-1)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "NLI Dev Acc : 0.32869\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
+      "  f.softmax(class_logits).data.cpu().numpy().argmax(axis=-1)\n",
      "../../scenarios/sentence_similarity/gensen_train.py:257: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
      "  f.softmax(class_logits).data.cpu().numpy().argmax(axis=-1)\n"
     ]
@ -679,107 +662,8 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "NLI Test Acc : 0.32767\n",
-      "NLI Dev Acc : 0.32869\n",
-      "NLI Test Acc : 0.32767\n",
-      "NLI Dev Acc : 0.34485\n",
-      "NLI Test Acc : 0.34334\n",
-      "NLI Dev Acc : 0.33306\n",
-      "NLI Test Acc : 0.32950\n",
-      "NLI Dev Acc : 0.33824\n",
-      "NLI Test Acc : 0.34283\n",
-      "NLI Dev Acc : 0.36507\n",
-      "NLI Test Acc : 0.36482\n",
-      "NLI Dev Acc : 0.34983\n",
-      "NLI Test Acc : 0.34762\n",
-      "NLI Dev Acc : 0.35216\n",
-      "NLI Test Acc : 0.34792\n",
-      "NLI Dev Acc : 0.33824\n",
-      "NLI Test Acc : 0.34283\n",
-      "NLI Dev Acc : 0.33835\n",
-      "NLI Test Acc : 0.34294\n",
-      "NLI Dev Acc : 0.33306\n",
-      "NLI Test Acc : 0.32950\n",
-      "NLI Dev Acc : 0.33306\n",
-      "NLI Test Acc : 0.32950\n",
-      "NLI Dev Acc : 0.34302\n",
-      "NLI Test Acc : 0.34161\n",
-      "NLI Dev Acc : 0.36385\n",
-      "NLI Test Acc : 0.37154\n",
-      "NLI Dev Acc : 0.38295\n",
-      "NLI Test Acc : 0.38386\n",
-      "NLI Dev Acc : 0.38793\n",
-      "NLI Test Acc : 0.38742\n",
-      "NLI Dev Acc : 0.39138\n",
-      "NLI Test Acc : 0.38976\n",
-      "NLI Dev Acc : 0.35135\n",
-      "NLI Test Acc : 0.35393\n",
-      "NLI Dev Acc : 0.34007\n",
-      "NLI Test Acc : 0.33744\n",
-      "NLI Dev Acc : 0.33306\n",
-      "NLI Test Acc : 0.32950\n",
-      "NLI Dev Acc : 0.33306\n",
-      "NLI Test Acc : 0.32950\n",
-      "NLI Dev Acc : 0.33804\n",
-      "NLI Test Acc : 0.34263\n",
-      "NLI Dev Acc : 0.34617\n",
-      "NLI Test Acc : 0.35413\n",
-      "NLI Dev Acc : 0.35034\n",
-      "NLI Test Acc : 0.34772\n",
-      "NLI Dev Acc : 0.33306\n",
-      "NLI Test Acc : 0.32950\n",
-      "NLI Dev Acc : 0.33560\n",
-      "NLI Test Acc : 0.33184\n",
-      "NLI Dev Acc : 0.35298\n",
-      "NLI Test Acc : 0.35922\n",
-      "NLI Dev Acc : 0.34363\n",
-      "NLI Test Acc : 0.34009\n",
-      "NLI Dev Acc : 0.36365\n",
-      "NLI Test Acc : 0.36238\n",
-      "NLI Dev Acc : 0.35145\n",
-      "NLI Test Acc : 0.35077\n",
-      "NLI Dev Acc : 0.33367\n",
-      "NLI Test Acc : 0.33092\n",
-      "NLI Dev Acc : 0.36141\n",
-      "NLI Test Acc : 0.35882\n",
-      "NLI Dev Acc : 0.35369\n",
-      "NLI Test Acc : 0.35678\n",
-      "NLI Dev Acc : 0.32869\n",
-      "NLI Test Acc : 0.32767\n",
-      "NLI Dev Acc : 0.32869\n",
-      "NLI Test Acc : 0.32767\n",
-      "NLI Dev Acc : 0.32869\n",
-      "NLI Test Acc : 0.32767\n",
-      "NLI Dev Acc : 0.32910\n",
-      "NLI Test Acc : 0.32807\n",
-      "NLI Dev Acc : 0.35470\n",
-      "NLI Test Acc : 0.35230\n",
-      "NLI Dev Acc : 0.40469\n",
-      "NLI Test Acc : 0.40869\n",
-      "NLI Dev Acc : 0.37106\n",
-      "NLI Test Acc : 0.36594\n",
-      "NLI Dev Acc : 0.37939\n",
-      "NLI Test Acc : 0.37246\n",
-      "NLI Dev Acc : 0.38254\n",
-      "NLI Test Acc : 0.37724\n",
-      "NLI Dev Acc : 0.37309\n",
-      "NLI Test Acc : 0.37449\n",
-      "NLI Dev Acc : 0.33936\n",
-      "NLI Test Acc : 0.33876\n",
-      "NLI Dev Acc : 0.34820\n",
-      "NLI Test Acc : 0.34864\n",
-      "NLI Dev Acc : 0.38387\n",
-      "NLI Test Acc : 0.38060\n",
-      "NLI Dev Acc : 0.40073\n",
-      "NLI Test Acc : 0.40564\n",
-      "NLI Dev Acc : 0.35867\n",
-      "NLI Test Acc : 0.36553\n",
-      "NLI Dev Acc : 0.35277\n",
-      "NLI Test Acc : 0.36044\n",
-      "NLI Dev Acc : 0.40205\n",
-      "NLI Test Acc : 0.39963\n",
-      "CPU times: user 1h 8min 52s, sys: 19min 2s, total: 1h 27min 55s\n",
-      "Wall time: 1h 28min\n"
+      "CPU times: user 29min 7s, sys: 7min 43s, total: 36min 51s\n",
+      "Wall time: 36min 59s\n"
     ]
    }
   ],
@ -797,9 +681,19 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "******** Similarity Score for sentences **************\n",
+      "         0        1\n",
+      "0  1.00000  0.96147\n",
+      "1  0.96147  1.00000\n"
+     ]
+    },
    {
     "data": {
      "application/papermill.record+json": {
@ -811,11 +705,11 @@
        "data": [
         [
          1,
-          0.9761484548147126
+          0.9614701535385097
         ],
         [
-          0.9761484548147126,
-          1
+          0.9614701535385097,
+          0.9999999999999998
         ]
        ],
        "index": [
@ -832,10 +726,12 @@
   "source": [
    "sentences = [\n",
    "        'the quick brown fox jumped over the lazy dog',\n",
-    "        'it is going to be a bright sunshiny day tomorrow'\n",
+    "        'bright sunshiny day tomorrow.'\n",
    "    ]\n",
    "\n",
    "results = clf.predict(sentences)\n",
+    "print(\"******** Similarity Score for sentences **************\")\n",
+    "print(results)\n",
    "pm.record(\"results\", results.to_dict(orient='split'))"
   ]
  },
@ -848,13 +744,6 @@
    "1. Subramanian, Sandeep and Trischler, Adam and Bengio, Yoshua and Pal, Christopher J, [*Learning general purpose distributed sentence representations via large scale multi-task learning*](https://arxiv.org/abs/1804.00079), ICLR, 2018.\n",
    "3. Semantic textual similarity. url: http://nlpprogress.com/english/semantic_textual_similarity.html"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
  }
 ],
 "metadata": {
--- a/scenarios/sentence_similarity/gensen_wrapper.py
+++ b/scenarios/sentence_similarity/gensen_wrapper.py
@ -2,8 +2,9 @@
 # Licensed under the MIT License.
 import json
 import os
-import pandas as pd
+
 import numpy as np
+import pandas as pd

 from scenarios.sentence_similarity.gensen_train import train
 from utils_nlp.gensen.create_gensen_model import (
--- a/utils_nlp/models/gensen/preprocess_utils.py
+++ b/utils_nlp/models/gensen/preprocess_utils.py
@ -90,13 +90,13 @@ def _split_and_cleanup(split_map, data_path):
            "snli_1.0_{}.txt.s2.tok".format(file_split),
        )
        with open(s1_tok_path, "r") as fin, open(
-                "{}.tmp".format(s1_tok_path), "w"
+            "{}.tmp".format(s1_tok_path), "w"
        ) as tmp:
            for line in fin:
                s = line.replace('"', "")
                tmp.write(s)
        with open(s2_tok_path, "r") as fin, open(
-                "{}.tmp".format(s2_tok_path), "w"
+            "{}.tmp".format(s2_tok_path), "w"
        ) as tmp:
            for line in fin:
                s = line.replace('"', "")