Training GenSen locally

1. Update env.yaml to install PyTorch and related dependencies. 2. Fix the config file path check used when validating params in the GenSenClassifier class. 3. Make minor changes to the notebook.
2019-05-16 13:55:25 -04:00 · 2019-05-16 13:55:25 -04:00 · a259d7683d
--- a/models/gensen/localcode/example_config.json
+++ b/models/gensen/localcode/example_config.json
@ -1,52 +0,0 @@
-{
-  "training": {
-    "optimizer": "adam",
-    "clip_c": 1,
-    "lrate": 0.0001,
-    "batch_size": 48,
-    "n_gpus": 1
-  },
-  "management": {
-    "monitor_loss": 9600,
-    "print_samples": 12800,
-    "checkpoint_freq": 640000,
-    "eval_freq": 4800000
-  },
-  "data": {"paths": [
-        {
-            "train_src": "data/corpora/nmt/training/train.nmt.de-en.en.tok",
-            "train_trg": "data/corpora/nmt/training/train.nmt.de-en.de.tok",
-            "val_src": "data/corpora/nmt/training/dev.nmt.de-en.en.tok",
-            "val_trg": "data/corpora/nmt/training/dev.nmt.de-en.de.tok",
-            "taskname": "de-en"
-        },
-        {
-            "train_src": "data/corpora/nmt/training/train.nmt.fr-en.en.tok",
-            "train_trg": "data/corpora/nmt/training/train.nmt.fr-en.fr.tok",
-            "val_src": "data/corpora/nmt/training/dev.nmt.fr-en.en.tok",
-            "val_trg": "data/corpora/nmt/training/dev.nmt.fr-en.fr.tok",
-            "taskname": "fr-en"
-        }
-    ],
-        "max_src_length": 90,
-        "max_trg_length": 90,
-        "task": "multi-seq2seq-nli",
-        "save_dir": "data/models/example",
-        "load_dir": "auto",
-        "nli_train": "data/corpora/allnli.train.txt.clean.noblank",
-        "nli_dev": "data/corpora/snli_1.0_dev.txt.clean.noblank",
-        "nli_test": "data/corpora/snli_1.0_test.txt.clean.noblank"
-	},
-    "model": {
-    	"dim_src": 2048,
-    	"dim_trg": 2048,
-    	"dim_word_src": 512,
-    	"dim_word_trg": 512,
-    	"n_words_src": 80000,
-    	"n_words_trg": 30000,
-    	"n_layers_src": 1,
-    	"bidirectional": true,
-        "layernorm": false,
-        "dropout": 0.3
-    }
-}
--- a/models/gensen/localcode/train.py
+++ b/models/gensen/localcode/train.py
@ -4,25 +4,25 @@
 import logging
 import os
 import sys
-
-import numpy as np
-
-sys.path.append(".")  # Required to run on the MILA machines with SLURM
-
-import json
 import time

+import numpy as np
 import torch
-import torch.nn as nn
-import torch.optim as optim
 import torch.backends.cudnn as cudnn
+import torch.nn as nn
 import torch.nn.functional as F
-
-from utils import BufferedDataIterator, NLIIterator, compute_validation_loss
-from models import MultitaskModel
-
+import torch.optim as optim
 from azureml.core.run import Run

+from models.gensen.localcode.models import MultitaskModel
+from models.gensen.localcode.utils import (
+    BufferedDataIterator,
+    NLIIterator,
+    compute_validation_loss,
+)
+
+sys.path.append(".")  # Required to run on the MILA machines with SLURM
+
 # parser = argparse.ArgumentParser()
 # parser.add_argument('--data_folder', type=str, help='data folder')
 #
@ -38,7 +38,7 @@ run = Run.get_context()

 cudnn.benchmark = True

-'''
+"""
 parser = argparse.ArgumentParser()
 parser.add_argument("--config", help="path to json config", required=True)
 parser.add_argument("--data_folder", type=str, help="data folder")
@ -46,27 +46,20 @@ parser.add_argument("--data_folder", type=str, help="data folder")
 parser.add_argument(
    "--learning_rate", type=float, default=0.0001, help="learning rate"
 )
-'''
+"""


-def read_config(json_file):
-    """Read JSON config."""
-    json_object = json.load(open(json_file, "r", encoding="utf-8"))
-    return json_object
-
-
-def train(config_file_path, data_folder, learning_rate=0.0001):
+def train(config, data_folder, learning_rate=0.0001):
    """ Train the Gensen model.

    Args:
-        config_file_path(str): Path to the config json file.
+        config(dict): Loaded json file as a python object.
        data_folder(str): Path to the folder containing the data.
        learning_rate(float): Learning rate for the model.

    """
    # os.chdir(data_folder)

-    config = read_config(config_file_path)
    save_dir = config["data"]["save_dir"]
    load_dir = config["data"]["load_dir"]

--- a/scenarios/sentence_similarity/02-model-deep-dive/gensen_train.py.ipynb
+++ b/scenarios/sentence_similarity/02-model-deep-dive/gensen_train.py.ipynb
@ -38,7 +38,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
@ -99,7 +99,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
@ -280,7 +280,7 @@
       "4  2267923837.jpg#2r1e     entailment    NaN    NaN    NaN    NaN  "
      ]
     },
-     "execution_count": 3,
+     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -304,7 +304,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@ -331,7 +331,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@ -358,7 +358,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
@ -464,7 +464,7 @@
       "4  [two, kids, at, a, ballgame, wash, their, hand...  "
      ]
     },
-     "execution_count": 8,
+     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -498,12 +498,12 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
-    "config_filepath = 'example_config.json'\n",
-    "clf = GenSenClassifier(config_filepath = config_filepath, learning_rate = 0.0001, cache_dir=BASE_DATA_PATH)"
+    "config_filepath = '../../../utils_nlp/model/local_config.json'\n",
+    "clf = GenSenClassifier(config_file = config_filepath, learning_rate = 0.0001, cache_dir=BASE_DATA_PATH)"
   ]
  },
  {
@ -515,23 +515,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
   "metadata": {},
-   "outputs": [
-    {
-     "ename": "FileNotFoundError",
-     "evalue": "Provided config file does not exist!",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
-      "\u001b[1;32m<ipython-input-10-6ac659b52cdf>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mclf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain_tok\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_tok\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[1;32mE:\\Projects\\NLP-BP\\NLP\\utils_nlp\\model\\gensen.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, train_df, dev_df)\u001b[0m\n\u001b[0;32m     63\u001b[0m         \"\"\"\n\u001b[0;32m     64\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 65\u001b[1;33m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_params\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     66\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_gensen_tokens\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain_df\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_df\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     67\u001b[0m         \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
-      "\u001b[1;32mE:\\Projects\\NLP-BP\\NLP\\utils_nlp\\model\\gensen.py\u001b[0m in \u001b[0;36m_validate_params\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m     35\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     36\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconfig_filepath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 37\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mFileNotFoundError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Provided config file does not exist!\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     38\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     39\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_get_gensen_tokens\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain_df\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_df\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtest_df\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
-      "\u001b[1;31mFileNotFoundError\u001b[0m: Provided config file does not exist!"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "clf.fit(train_tok, dev_tok)"
   ]
--- a/utils_nlp/model/gensen.py
+++ b/utils_nlp/model/gensen.py
@ -1,8 +1,9 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
+import json
 import os.path

-#from models.gensen.localcode import train
+from models.gensen.localcode import train
 from utils_nlp.model.gensen_utils import gensen_preprocess


@ -11,29 +12,29 @@ class GenSenClassifier:

    learning_rate (str): The learning rate for the model.

-    config_filepath (str) : Configuration file that is used to train the model. This
+    config_file (str) : Configuration file that is used to train the model. This
    specifies the batch size, directories to load and save the model.

    cache_dir (str) : Location of GenSen's data directory.

    """

-    def __init__(self, config_filepath, learning_rate=0.0001, cache_dir="."):
+    def __init__(self, config_file, learning_rate=0.0001, cache_dir="."):
        self.learning_rate = learning_rate
-        self.config_filepath = config_filepath
-        self.cache_dir = os.path.join(cache_dir, "clean/snli_1.0/")
+        self.config_file = config_file
+        self.cache_dir = cache_dir

    def _validate_params(self):
        """Validate input params."""

        if not isinstance(self.learning_rate, float) or (
-                self.learning_rate <= 0.0
+            self.learning_rate <= 0.0
        ):
            raise ValueError(
                "Learning rate must be of type float and greater than 0"
            )

-        if not os.path.exists(self.config_filepath):
+        if not os.path.isfile(os.path.join(os.getcwd(), self.config_file)):
            raise FileNotFoundError("Provided config file does not exist!")

    def _get_gensen_tokens(self, train_df=None, dev_df=None, test_df=None):
@ -52,6 +53,19 @@ class GenSenClassifier:
        """
        return gensen_preprocess(train_df, dev_df, test_df, self.cache_dir)

+    @staticmethod
+    def _read_config(config_file):
+        """ Read JSON config.
+
+        Args:
+            config_file: Path to the config file.
+
+        Returns(dict): The loaded json file as python object
+
+        """
+        json_object = json.load(open(config_file, "r", encoding="utf-8"))
+        return json_object
+
    def fit(self, train_df, dev_df):

        """ Method to train the Gensen model.
@ -63,19 +77,20 @@ class GenSenClassifier:
        """

        self._validate_params()
+        config = self._read_config(self.config_file)
        self._get_gensen_tokens(train_df, dev_df)
-        print(self.cache_dir)
-        # train.train(
-        #    data_folder=self.cache_dir,
-        #    config_file_path=self.config_filepath,
-        #    learning_rate=self.learning_rate,
-        # )

-    def eval(self, test_df):
+        train.train(
+            data_folder=self.cache_dir,
+            config=config,
+            learning_rate=self.learning_rate,
+        )
+
+    def predict(self, test_df):

        """

-        Method to evaluate the model on the test dataset. This uses SentEval utils.
+        Method to run predictions on the test dataset. This uses SentEval utils.
        Returns: None

        """
--- a/utils_nlp/model/local_config.json
+++ b/utils_nlp/model/local_config.json
@ -0,0 +1,45 @@
+{
+  "training": {
+    "optimizer": "adam",
+    "clip_c": 1,
+    "lrate": 0.0001,
+    "batch_size": 16,
+    "n_gpus": 1
+  },
+  "management": {
+    "monitor_loss": 9600,
+    "print_samples": 12800,
+    "checkpoint_freq": 640000,
+    "eval_freq": 4800000
+  },
+  "data": {"paths": [
+        {
+            "train_src": "snli_1.0_train.txt.s1.tok",
+            "train_trg": "snli_1.0_train.txt.s2.tok",
+            "val_src": "snli_1.0_dev.txt.s1.tok",
+            "val_trg": "snli_1.0_dev.txt.s2.tok",
+            "taskname": "sent-sim"
+        }
+    ],
+        "max_src_length": 90,
+        "max_trg_length": 90,
+        "task": "multi-seq2seq-nli",
+        "save_dir": ".",
+        "load_dir": "auto",
+        "nli_train": "snli_1.0_train.txt.clean",
+        "nli_dev": "snli_1.0_dev.txt.clean",
+        "nli_test": "snli_1.0_test.txt.clean"
+	},
+    "model": {
+    	"dim_src": 2048,
+    	"dim_trg": 2048,
+    	"dim_word_src": 512,
+    	"dim_word_trg": 512,
+    	"n_words_src": 80000,
+    	"n_words_trg": 30000,
+    	"n_layers_src": 1,
+    	"bidirectional": true,
+        "layernorm": false,
+        "dropout": 0.3
+    }
+}