Training GenSen locally
1. Update env.yaml to install PyTorch and related dependencies. 2. Fix the config file path used when validating params in the GenSen class. 3. Minor changes to the notebook.
This commit is contained in:
Родитель
5ca065c964
Коммит
a259d7683d
|
@ -1,52 +0,0 @@
|
|||
{
|
||||
"training": {
|
||||
"optimizer": "adam",
|
||||
"clip_c": 1,
|
||||
"lrate": 0.0001,
|
||||
"batch_size": 48,
|
||||
"n_gpus": 1
|
||||
},
|
||||
"management": {
|
||||
"monitor_loss": 9600,
|
||||
"print_samples": 12800,
|
||||
"checkpoint_freq": 640000,
|
||||
"eval_freq": 4800000
|
||||
},
|
||||
"data": {"paths": [
|
||||
{
|
||||
"train_src": "data/corpora/nmt/training/train.nmt.de-en.en.tok",
|
||||
"train_trg": "data/corpora/nmt/training/train.nmt.de-en.de.tok",
|
||||
"val_src": "data/corpora/nmt/training/dev.nmt.de-en.en.tok",
|
||||
"val_trg": "data/corpora/nmt/training/dev.nmt.de-en.de.tok",
|
||||
"taskname": "de-en"
|
||||
},
|
||||
{
|
||||
"train_src": "data/corpora/nmt/training/train.nmt.fr-en.en.tok",
|
||||
"train_trg": "data/corpora/nmt/training/train.nmt.fr-en.fr.tok",
|
||||
"val_src": "data/corpora/nmt/training/dev.nmt.fr-en.en.tok",
|
||||
"val_trg": "data/corpora/nmt/training/dev.nmt.fr-en.fr.tok",
|
||||
"taskname": "fr-en"
|
||||
}
|
||||
],
|
||||
"max_src_length": 90,
|
||||
"max_trg_length": 90,
|
||||
"task": "multi-seq2seq-nli",
|
||||
"save_dir": "data/models/example",
|
||||
"load_dir": "auto",
|
||||
"nli_train": "data/corpora/allnli.train.txt.clean.noblank",
|
||||
"nli_dev": "data/corpora/snli_1.0_dev.txt.clean.noblank",
|
||||
"nli_test": "data/corpora/snli_1.0_test.txt.clean.noblank"
|
||||
},
|
||||
"model": {
|
||||
"dim_src": 2048,
|
||||
"dim_trg": 2048,
|
||||
"dim_word_src": 512,
|
||||
"dim_word_trg": 512,
|
||||
"n_words_src": 80000,
|
||||
"n_words_trg": 30000,
|
||||
"n_layers_src": 1,
|
||||
"bidirectional": true,
|
||||
"layernorm": false,
|
||||
"dropout": 0.3
|
||||
}
|
||||
}
|
|
@ -4,25 +4,25 @@
|
|||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
|
||||
sys.path.append(".") # Required to run on the MILA machines with SLURM
|
||||
|
||||
import json
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
import torch.backends.cudnn as cudnn
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from utils import BufferedDataIterator, NLIIterator, compute_validation_loss
|
||||
from models import MultitaskModel
|
||||
|
||||
import torch.optim as optim
|
||||
from azureml.core.run import Run
|
||||
|
||||
from models.gensen.localcode.models import MultitaskModel
|
||||
from models.gensen.localcode.utils import (
|
||||
BufferedDataIterator,
|
||||
NLIIterator,
|
||||
compute_validation_loss,
|
||||
)
|
||||
|
||||
sys.path.append(".") # Required to run on the MILA machines with SLURM
|
||||
|
||||
# parser = argparse.ArgumentParser()
|
||||
# parser.add_argument('--data_folder', type=str, help='data folder')
|
||||
#
|
||||
|
@ -38,7 +38,7 @@ run = Run.get_context()
|
|||
|
||||
cudnn.benchmark = True
|
||||
|
||||
'''
|
||||
"""
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--config", help="path to json config", required=True)
|
||||
parser.add_argument("--data_folder", type=str, help="data folder")
|
||||
|
@ -46,27 +46,20 @@ parser.add_argument("--data_folder", type=str, help="data folder")
|
|||
parser.add_argument(
|
||||
"--learning_rate", type=float, default=0.0001, help="learning rate"
|
||||
)
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def read_config(json_file):
|
||||
"""Read JSON config."""
|
||||
json_object = json.load(open(json_file, "r", encoding="utf-8"))
|
||||
return json_object
|
||||
|
||||
|
||||
def train(config_file_path, data_folder, learning_rate=0.0001):
|
||||
def train(config, data_folder, learning_rate=0.0001):
|
||||
""" Train the Gensen model.
|
||||
|
||||
Args:
|
||||
config_file_path(str): Path to the config json file.
|
||||
config(dict): Loaded json file as a python object.
|
||||
data_folder(str): Path to the folder containing the data.
|
||||
learning_rate(float): Learning rate for the model.
|
||||
|
||||
"""
|
||||
# os.chdir(data_folder)
|
||||
|
||||
config = read_config(config_file_path)
|
||||
save_dir = config["data"]["save_dir"]
|
||||
load_dir = config["data"]["load_dir"]
|
||||
|
||||
|
|
|
@ -38,7 +38,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -99,7 +99,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -280,7 +280,7 @@
|
|||
"4 2267923837.jpg#2r1e entailment NaN NaN NaN NaN "
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -304,7 +304,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -331,7 +331,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -358,7 +358,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -464,7 +464,7 @@
|
|||
"4 [two, kids, at, a, ballgame, wash, their, hand... "
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -498,12 +498,12 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"config_filepath = 'example_config.json'\n",
|
||||
"clf = GenSenClassifier(config_filepath = config_filepath, learning_rate = 0.0001, cache_dir=BASE_DATA_PATH)"
|
||||
"config_filepath = '../../../utils_nlp/model/local_config.json'\n",
|
||||
"clf = GenSenClassifier(config_file = config_filepath, learning_rate = 0.0001, cache_dir=BASE_DATA_PATH)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -515,23 +515,9 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "FileNotFoundError",
|
||||
"evalue": "Provided config file does not exist!",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[1;32m<ipython-input-10-6ac659b52cdf>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mclf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain_tok\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_tok\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[1;32mE:\\Projects\\NLP-BP\\NLP\\utils_nlp\\model\\gensen.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, train_df, dev_df)\u001b[0m\n\u001b[0;32m 63\u001b[0m \"\"\"\n\u001b[0;32m 64\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 65\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_params\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 66\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_gensen_tokens\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtrain_df\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_df\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 67\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[1;32mE:\\Projects\\NLP-BP\\NLP\\utils_nlp\\model\\gensen.py\u001b[0m in \u001b[0;36m_validate_params\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 36\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconfig_filepath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 37\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mFileNotFoundError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Provided config file does not exist!\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 38\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 39\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_get_gensen_tokens\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtrain_df\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdev_df\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtest_df\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[1;31mFileNotFoundError\u001b[0m: Provided config file does not exist!"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"clf.fit(train_tok, dev_tok)"
|
||||
]
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
import json
|
||||
import os.path
|
||||
|
||||
#from models.gensen.localcode import train
|
||||
from models.gensen.localcode import train
|
||||
from utils_nlp.model.gensen_utils import gensen_preprocess
|
||||
|
||||
|
||||
|
@ -11,29 +12,29 @@ class GenSenClassifier:
|
|||
|
||||
learning_rate (float): The learning rate for the model.
|
||||
|
||||
config_filepath (str) : Configuration file that is used to train the model. This
|
||||
config_file (str) : Configuration file that is used to train the model. This
|
||||
specifies the batch size, directories to load and save the model.
|
||||
|
||||
cache_dir (str) : Location of GenSen's data directory.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, config_filepath, learning_rate=0.0001, cache_dir="."):
|
||||
def __init__(self, config_file, learning_rate=0.0001, cache_dir="."):
|
||||
self.learning_rate = learning_rate
|
||||
self.config_filepath = config_filepath
|
||||
self.cache_dir = os.path.join(cache_dir, "clean/snli_1.0/")
|
||||
self.config_file = config_file
|
||||
self.cache_dir = cache_dir
|
||||
|
||||
def _validate_params(self):
|
||||
"""Validate input params."""
|
||||
|
||||
if not isinstance(self.learning_rate, float) or (
|
||||
self.learning_rate <= 0.0
|
||||
self.learning_rate <= 0.0
|
||||
):
|
||||
raise ValueError(
|
||||
"Learning rate must be of type float and greater than 0"
|
||||
)
|
||||
|
||||
if not os.path.exists(self.config_filepath):
|
||||
if not os.path.isfile(os.path.join(os.getcwd(), self.config_file)):
|
||||
raise FileNotFoundError("Provided config file does not exist!")
|
||||
|
||||
def _get_gensen_tokens(self, train_df=None, dev_df=None, test_df=None):
|
||||
|
@ -52,6 +53,19 @@ class GenSenClassifier:
|
|||
"""
|
||||
return gensen_preprocess(train_df, dev_df, test_df, self.cache_dir)
|
||||
|
||||
@staticmethod
|
||||
def _read_config(config_file):
|
||||
""" Read JSON config.
|
||||
|
||||
Args:
|
||||
config_file: Path to the config file.
|
||||
|
||||
Returns(dict): The loaded json file as python object
|
||||
|
||||
"""
|
||||
json_object = json.load(open(config_file, "r", encoding="utf-8"))
|
||||
return json_object
|
||||
|
||||
def fit(self, train_df, dev_df):
|
||||
|
||||
""" Method to train the Gensen model.
|
||||
|
@ -63,19 +77,20 @@ class GenSenClassifier:
|
|||
"""
|
||||
|
||||
self._validate_params()
|
||||
config = self._read_config(self.config_file)
|
||||
self._get_gensen_tokens(train_df, dev_df)
|
||||
print(self.cache_dir)
|
||||
# train.train(
|
||||
# data_folder=self.cache_dir,
|
||||
# config_file_path=self.config_filepath,
|
||||
# learning_rate=self.learning_rate,
|
||||
# )
|
||||
|
||||
def eval(self, test_df):
|
||||
train.train(
|
||||
data_folder=self.cache_dir,
|
||||
config=config,
|
||||
learning_rate=self.learning_rate,
|
||||
)
|
||||
|
||||
def predict(self, test_df):
|
||||
|
||||
"""
|
||||
|
||||
Method to evaluate the model on the test dataset. This uses SentEval utils.
|
||||
Method to run predictions with the model on the test dataset. This uses SentEval utils.
|
||||
Returns: None
|
||||
|
||||
"""
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
{
|
||||
"training": {
|
||||
"optimizer": "adam",
|
||||
"clip_c": 1,
|
||||
"lrate": 0.0001,
|
||||
"batch_size": 16,
|
||||
"n_gpus": 1
|
||||
},
|
||||
"management": {
|
||||
"monitor_loss": 9600,
|
||||
"print_samples": 12800,
|
||||
"checkpoint_freq": 640000,
|
||||
"eval_freq": 4800000
|
||||
},
|
||||
"data": {"paths": [
|
||||
{
|
||||
"train_src": "snli_1.0_train.txt.s1.tok",
|
||||
"train_trg": "snli_1.0_train.txt.s2.tok",
|
||||
"val_src": "snli_1.0_dev.txt.s1.tok",
|
||||
"val_trg": "snli_1.0_dev.txt.s2.tok",
|
||||
"taskname": "sent-sim"
|
||||
}
|
||||
],
|
||||
"max_src_length": 90,
|
||||
"max_trg_length": 90,
|
||||
"task": "multi-seq2seq-nli",
|
||||
"save_dir": ".",
|
||||
"load_dir": "auto",
|
||||
"nli_train": "snli_1.0_train.txt.clean",
|
||||
"nli_dev": "snli_1.0_dev.txt.clean",
|
||||
"nli_test": "snli_1.0_test.txt.clean"
|
||||
},
|
||||
"model": {
|
||||
"dim_src": 2048,
|
||||
"dim_trg": 2048,
|
||||
"dim_word_src": 512,
|
||||
"dim_word_trg": 512,
|
||||
"n_words_src": 80000,
|
||||
"n_words_trg": 30000,
|
||||
"n_layers_src": 1,
|
||||
"bidirectional": true,
|
||||
"layernorm": false,
|
||||
"dropout": 0.3
|
||||
}
|
||||
}
|
Загрузка…
Ссылка в новой задаче