1. Update env.yaml to install PyTorch and related dependencies.
2. Fix the config file path check used when validating params in the GenSenClassifier class.
3. Make minor changes to the notebook.
This commit is contained in:
Abhiram E 2019-05-16 13:55:25 -04:00
Родитель 5ca065c964
Коммит a259d7683d
5 изменённых файлов: 103 добавлений и 116 удалений

Просмотреть файл

@ -1,52 +0,0 @@
{
"training": {
"optimizer": "adam",
"clip_c": 1,
"lrate": 0.0001,
"batch_size": 48,
"n_gpus": 1
},
"management": {
"monitor_loss": 9600,
"print_samples": 12800,
"checkpoint_freq": 640000,
"eval_freq": 4800000
},
"data": {"paths": [
{
"train_src": "data/corpora/nmt/training/train.nmt.de-en.en.tok",
"train_trg": "data/corpora/nmt/training/train.nmt.de-en.de.tok",
"val_src": "data/corpora/nmt/training/dev.nmt.de-en.en.tok",
"val_trg": "data/corpora/nmt/training/dev.nmt.de-en.de.tok",
"taskname": "de-en"
},
{
"train_src": "data/corpora/nmt/training/train.nmt.fr-en.en.tok",
"train_trg": "data/corpora/nmt/training/train.nmt.fr-en.fr.tok",
"val_src": "data/corpora/nmt/training/dev.nmt.fr-en.en.tok",
"val_trg": "data/corpora/nmt/training/dev.nmt.fr-en.fr.tok",
"taskname": "fr-en"
}
],
"max_src_length": 90,
"max_trg_length": 90,
"task": "multi-seq2seq-nli",
"save_dir": "data/models/example",
"load_dir": "auto",
"nli_train": "data/corpora/allnli.train.txt.clean.noblank",
"nli_dev": "data/corpora/snli_1.0_dev.txt.clean.noblank",
"nli_test": "data/corpora/snli_1.0_test.txt.clean.noblank"
},
"model": {
"dim_src": 2048,
"dim_trg": 2048,
"dim_word_src": 512,
"dim_word_trg": 512,
"n_words_src": 80000,
"n_words_trg": 30000,
"n_layers_src": 1,
"bidirectional": true,
"layernorm": false,
"dropout": 0.3
}
}

Просмотреть файл

@ -4,25 +4,25 @@
import logging
import os
import sys
import numpy as np
sys.path.append(".") # Required to run on the MILA machines with SLURM
import json
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.nn.functional as F
from utils import BufferedDataIterator, NLIIterator, compute_validation_loss
from models import MultitaskModel
import torch.optim as optim
from azureml.core.run import Run
from models.gensen.localcode.models import MultitaskModel
from models.gensen.localcode.utils import (
BufferedDataIterator,
NLIIterator,
compute_validation_loss,
)
sys.path.append(".") # Required to run on the MILA machines with SLURM
# parser = argparse.ArgumentParser()
# parser.add_argument('--data_folder', type=str, help='data folder')
#
@ -38,7 +38,7 @@ run = Run.get_context()
cudnn.benchmark = True
'''
"""
parser = argparse.ArgumentParser()
parser.add_argument("--config", help="path to json config", required=True)
parser.add_argument("--data_folder", type=str, help="data folder")
@ -46,27 +46,20 @@ parser.add_argument("--data_folder", type=str, help="data folder")
parser.add_argument(
"--learning_rate", type=float, default=0.0001, help="learning rate"
)
'''
"""
def read_config(json_file):
    """Read a JSON config file and return its parsed contents.

    Args:
        json_file (str): Path to the JSON config file. It is read as UTF-8.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.
    """
    # Open inside a context manager so the file handle is always closed,
    # including when json.load raises on malformed content.
    with open(json_file, "r", encoding="utf-8") as config_handle:
        return json.load(config_handle)
def train(config_file_path, data_folder, learning_rate=0.0001):
def train(config, data_folder, learning_rate=0.0001):
""" Train the Gensen model.
Args:
config_file_path(str): Path to the config json file.
config(dict): Loaded json file as a python object.
data_folder(str): Path to the folder containing the data.
learning_rate(float): Learning rate for the model.
"""
# os.chdir(data_folder)
config = read_config(config_file_path)
save_dir = config["data"]["save_dir"]
load_dir = config["data"]["load_dir"]

Просмотреть файл

@ -38,7 +38,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [
{
@ -99,7 +99,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@ -280,7 +280,7 @@
"4 2267923837.jpg#2r1e entailment NaN NaN NaN NaN "
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@ -304,7 +304,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@ -331,7 +331,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@ -358,7 +358,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@ -464,7 +464,7 @@
"4 [two, kids, at, a, ballgame, wash, their, hand... "
]
},
"execution_count": 8,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@ -498,12 +498,12 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"config_filepath = 'example_config.json'\n",
"clf = GenSenClassifier(config_filepath = config_filepath, learning_rate = 0.0001, cache_dir=BASE_DATA_PATH)"
"config_filepath = '../../../utils_nlp/model/local_config.json'\n",
"clf = GenSenClassifier(config_file = config_filepath, learning_rate = 0.0001, cache_dir=BASE_DATA_PATH)"
]
},
{
@ -515,23 +515,9 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "Provided config file does not exist!",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-10-6ac659b52cdf>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mclf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain_tok\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_tok\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32mE:\\Projects\\NLP-BP\\NLP\\utils_nlp\\model\\gensen.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, train_df, dev_df)\u001b[0m\n\u001b[0;32m 63\u001b[0m \"\"\"\n\u001b[0;32m 64\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 65\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_params\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 66\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_gensen_tokens\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain_df\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_df\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 67\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mE:\\Projects\\NLP-BP\\NLP\\utils_nlp\\model\\gensen.py\u001b[0m in \u001b[0;36m_validate_params\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 36\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconfig_filepath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 37\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mFileNotFoundError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Provided config file does not exist!\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 38\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 39\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_get_gensen_tokens\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain_df\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_df\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtest_df\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mFileNotFoundError\u001b[0m: Provided config file does not exist!"
]
}
],
"outputs": [],
"source": [
"clf.fit(train_tok, dev_tok)"
]

Просмотреть файл

@ -1,8 +1,9 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import json
import os.path
#from models.gensen.localcode import train
from models.gensen.localcode import train
from utils_nlp.model.gensen_utils import gensen_preprocess
@ -11,29 +12,29 @@ class GenSenClassifier:
learning_rate (str): The learning rate for the model.
config_filepath (str) : Configuration file that is used to train the model. This
config_file (str) : Configuration file that is used to train the model. This
specifies the batch size, directories to load and save the model.
cache_dir (str) : Location of GenSen's data directory.
"""
def __init__(self, config_filepath, learning_rate=0.0001, cache_dir="."):
def __init__(self, config_file, learning_rate=0.0001, cache_dir="."):
self.learning_rate = learning_rate
self.config_filepath = config_filepath
self.cache_dir = os.path.join(cache_dir, "clean/snli_1.0/")
self.config_file = config_file
self.cache_dir = cache_dir
def _validate_params(self):
"""Validate input params."""
if not isinstance(self.learning_rate, float) or (
self.learning_rate <= 0.0
self.learning_rate <= 0.0
):
raise ValueError(
"Learning rate must be of type float and greater than 0"
)
if not os.path.exists(self.config_filepath):
if not os.path.isfile(os.path.join(os.getcwd(), self.config_file)):
raise FileNotFoundError("Provided config file does not exist!")
def _get_gensen_tokens(self, train_df=None, dev_df=None, test_df=None):
@ -52,6 +53,19 @@ class GenSenClassifier:
"""
return gensen_preprocess(train_df, dev_df, test_df, self.cache_dir)
@staticmethod
def _read_config(config_file):
""" Read JSON config.
Args:
config_file: Path to the config file.
Returns(dict): The loaded json file as python object
"""
json_object = json.load(open(config_file, "r", encoding="utf-8"))
return json_object
def fit(self, train_df, dev_df):
""" Method to train the Gensen model.
@ -63,19 +77,20 @@ class GenSenClassifier:
"""
self._validate_params()
config = self._read_config(self.config_file)
self._get_gensen_tokens(train_df, dev_df)
print(self.cache_dir)
# train.train(
# data_folder=self.cache_dir,
# config_file_path=self.config_filepath,
# learning_rate=self.learning_rate,
# )
def eval(self, test_df):
train.train(
data_folder=self.cache_dir,
config=config,
learning_rate=self.learning_rate,
)
def predict(self, test_df):
"""
Method to evaluate the model on the test dataset. This uses SentEval utils.
Method to predict the model on the test dataset. This uses SentEval utils.
Returns: None
"""

Просмотреть файл

@ -0,0 +1,45 @@
{
"training": {
"optimizer": "adam",
"clip_c": 1,
"lrate": 0.0001,
"batch_size": 16,
"n_gpus": 1
},
"management": {
"monitor_loss": 9600,
"print_samples": 12800,
"checkpoint_freq": 640000,
"eval_freq": 4800000
},
"data": {"paths": [
{
"train_src": "snli_1.0_train.txt.s1.tok",
"train_trg": "snli_1.0_train.txt.s2.tok",
"val_src": "snli_1.0_dev.txt.s1.tok",
"val_trg": "snli_1.0_dev.txt.s2.tok",
"taskname": "sent-sim"
}
],
"max_src_length": 90,
"max_trg_length": 90,
"task": "multi-seq2seq-nli",
"save_dir": ".",
"load_dir": "auto",
"nli_train": "snli_1.0_train.txt.clean",
"nli_dev": "snli_1.0_dev.txt.clean",
"nli_test": "snli_1.0_test.txt.clean"
},
"model": {
"dim_src": 2048,
"dim_trg": 2048,
"dim_word_src": 512,
"dim_word_trg": 512,
"n_words_src": 80000,
"n_words_trg": 30000,
"n_layers_src": 1,
"bidirectional": true,
"layernorm": false,
"dropout": 0.3
}
}