This commit is contained in:
Abhiram E 2019-05-29 19:02:37 -04:00
Parent 9602e6b50d
Commit 34cfea300f
5 changed files: 25 additions and 515 deletions

View file

@@ -1,45 +0,0 @@
{
"training": {
"optimizer": "adam",
"clip_c": 1,
"lrate": 0.0001,
"batch_size": 48,
"n_gpus": 1
},
"management": {
"monitor_loss": 9600,
"print_samples": 12800,
"checkpoint_freq": 480000,
"eval_freq": 4800000
},
"data": {"paths": [
{
"train_src": "snli_1.0_train.txt.s1.tok",
"train_trg": "snli_1.0_train.txt.s2.tok",
"val_src": "snli_1.0_dev.txt.s1.tok",
"val_trg": "snli_1.0_dev.txt.s2.tok",
"taskname": "snli"
}
],
"max_src_length": 90,
"max_trg_length": 90,
"task": "multi-seq2seq-nli",
"save_dir": "model",
"load_dir": "auto",
"nli_train": "snli_1.0_train.txt.clean.noblank",
"nli_dev": "snli_1.0_dev.txt.clean.noblank",
"nli_test": "snli_1.0_test.txt.clean.noblank"
},
"model": {
"dim_src": 2048,
"dim_trg": 2048,
"dim_word_src": 512,
"dim_word_trg": 512,
"n_words_src": 80000,
"n_words_trg": 30000,
"n_layers_src": 1,
"bidirectional": true,
"layernorm": false,
"dropout": 0.3
}
}
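(For orientation, the `model` block above roughly corresponds to the encoder hyperparameters of the multitask model removed in the next file. The sketch below is a hypothetical reading of those fields, assuming the config is saved as `example_config.json`; the actual wiring lives in `train.py`, which is not part of this diff.)

```python
# Hypothetical sketch: read the config above and pull out a few fields.
# Key names match the JSON; the filename and the mapping are assumptions.
import json

with open("example_config.json") as f:
    config = json.load(f)

batch_size = config["training"]["batch_size"]        # 48
src_vocab_size = config["model"]["n_words_src"]      # 80000
src_emb_dim = config["model"]["dim_word_src"]        # 512
src_hidden_dim = config["model"]["dim_src"]          # 2048, bidirectional GRU
dropout = config["model"]["dropout"]                 # 0.3
```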

View file

@@ -1,342 +0,0 @@
"""Parent model for Multitask Training."""
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
class ConditionalGRU(nn.Module):
"""A Gated Recurrent Unit (GRU) cell with peepholes."""
def __init__(self, input_dim, hidden_dim, dropout=0.):
"""Initialize params."""
super(ConditionalGRU, self).__init__()
self.input_dim = input_dim
self.hidden_dim = hidden_dim
self.input_weights = nn.Linear(self.input_dim, 3 * self.hidden_dim)
self.hidden_weights = nn.Linear(self.hidden_dim, 3 * self.hidden_dim)
self.peep_weights = nn.Linear(self.hidden_dim, 3 * self.hidden_dim)
self.reset_parameters()
def reset_parameters(self):
"""Set params."""
stdv = 1.0 / math.sqrt(self.hidden_dim)
for weight in self.parameters():
weight.data.uniform_(-stdv, stdv)
def forward(self, input, hidden, ctx):
r"""Propogate input through the layer.
inputs:
input - batch size x target sequence length x embedding dimension
hidden - batch size x hidden dimension
ctx - batch size x source sequence length x hidden dimension
returns: output, hidden
output - batch size x target sequence length x hidden dimension
hidden - (batch size x hidden dimension, \
batch size x hidden dimension)
"""
def recurrence(input, hidden, ctx):
"""Recurrence helper."""
input_gate = self.input_weights(input)
hidden_gate = self.hidden_weights(hidden)
peep_gate = self.peep_weights(ctx)
i_r, i_i, i_n = input_gate.chunk(3, 1)
h_r, h_i, h_n = hidden_gate.chunk(3, 1)
p_r, p_i, p_n = peep_gate.chunk(3, 1)
resetgate = torch.sigmoid(i_r + h_r + p_r)
inputgate = torch.sigmoid(i_i + h_i + p_i)
newgate = torch.tanh(i_n + resetgate * h_n + p_n)
hy = newgate + inputgate * (hidden - newgate)
return hy
input = input.transpose(0, 1)
output = []
steps = range(input.size(0))
for i in steps:
hidden = recurrence(input[i], hidden, ctx)
if isinstance(hidden, tuple):
output.append(hidden[0])
else:
output.append(hidden)
output = torch.cat(output, 0).view(input.size(0), *output[0].size())
output = output.transpose(0, 1)
return output, hidden
class MultitaskModel(nn.Module):
r"""A Multi Task Sequence to Sequence (Seq2Seq) model with GRUs.
Auxiliary NLI task trained jointly as well.
Ref: Multi-Task Sequence to Sequence Learning
https://arxiv.org/pdf/1511.06114.pdf
"""
def __init__(
self, src_emb_dim, trg_emb_dim, src_vocab_size,
trg_vocab_size, src_hidden_dim, trg_hidden_dim,
pad_token_src, pad_token_trg, num_tasks, bidirectional=False,
nlayers_src=1, dropout=0., paired_tasks=None
):
"""Initialize Seq2Seq Model."""
super(MultitaskModel, self).__init__()
self.src_vocab_size = src_vocab_size
self.trg_vocab_size = trg_vocab_size
self.src_emb_dim = src_emb_dim
self.trg_emb_dim = trg_emb_dim
self.src_hidden_dim = src_hidden_dim
self.trg_hidden_dim = trg_hidden_dim
self.bidirectional = bidirectional
self.nlayers_src = nlayers_src
self.dropout = dropout
self.num_tasks = num_tasks
self.paired_tasks = paired_tasks
self.num_directions = 2 if bidirectional else 1
self.pad_token_src = pad_token_src
self.pad_token_trg = pad_token_trg
self.src_hidden_dim = src_hidden_dim // 2 \
if self.bidirectional else src_hidden_dim
self.decoder = ConditionalGRU
self.src_embedding = nn.Embedding(
src_vocab_size,
src_emb_dim,
self.pad_token_src,
)
self.encoder = nn.GRU(
src_emb_dim,
self.src_hidden_dim,
self.nlayers_src,
bidirectional=bidirectional,
batch_first=True,
dropout=self.dropout
)
self.enc_drp = nn.Dropout(self.dropout)
self.trg_embedding = nn.ModuleList([
nn.Embedding(
trg_vocab_size,
trg_emb_dim,
self.pad_token_trg,
)
for task in range(self.num_tasks)
])
self.decoders = nn.ModuleList([
self.decoder(
trg_emb_dim, trg_hidden_dim,
dropout=self.dropout
)
for task in range(self.num_tasks)
])
self.decoder2vocab = nn.ModuleList([
nn.Linear(trg_hidden_dim, trg_vocab_size)
for task in range(self.num_tasks)
])
self.nli_decoder = nn.Sequential(
nn.Dropout(0.3),
nn.Linear(4 * src_hidden_dim, 512),
nn.ReLU(),
nn.Linear(512, 3)
)
self.init_weights()
def init_weights(self):
"""Initialize weights."""
initrange = 0.1
self.src_embedding.weight.data.uniform_(-initrange, initrange)
for module in self.trg_embedding:
module.weight.data.uniform_(-initrange, initrange)
for module in self.decoder2vocab:
module.bias.data.fill_(0)
def set_pretrained_embeddings(self, embedding_matrix):
"""Set embedding weights."""
if (
embedding_matrix.shape[0] != self.src_embedding.weight.size(0) or
embedding_matrix.shape[1] != self.src_embedding.weight.size(1)
):
self.src_embedding = nn.Embedding(
embedding_matrix.shape[0],
embedding_matrix.shape[1]
)
self.src_vocab_size = embedding_matrix.shape[0]
self.src_emb_dim = embedding_matrix.shape[1]
try:
self.src_embedding.weight.data.set_(
torch.from_numpy(embedding_matrix)
)
except:
self.src_embedding.weight.data.set_(
torch.from_numpy(embedding_matrix).cuda()
)
self.src_embedding.cuda()
def forward(
self, minibatch, task_idx,
return_hidden=False, paired_trg=None
):
r"""Propogate input through the network.
Seq2Seq:
inputs: minibatch['input_src'], minibatch['input_trg']
input_src - batch size x source sequence length
input_trg - batch size x target sequence length
src_lengths - batch size (list)
paired_trg - batch size x target sequence length or None
returns: decoder_logit (pre-softmax over words)
decoder_logit - batch size x target sequence length x target vocab size
NLI:
sent1 - batch size x source sequence length
sent2 - batch size x target sequence length
sent1_lengths - batch size (list)
sent2_lengths - batch size (list)
rev_sent1 - batch size (LongTensor)
rev_sent2 - batch size (LongTensor)
returns: class_logits (pre-softmax over NLI classes)
decoder_logit - batch size x 3
"""
if minibatch['type'] == 'nli':
sent1_emb = self.src_embedding(minibatch['sent1'])
sent2_emb = self.src_embedding(minibatch['sent2'])
sent1_lengths = minibatch['sent1_lens'].data.view(-1).tolist()
sent1_emb = pack_padded_sequence(
sent1_emb, sent1_lengths, batch_first=True
)
sent1, sent1_h = self.encoder(sent1_emb)
sent2_lengths = minibatch['sent2_lens'].data.view(-1).tolist()
sent2_emb = pack_padded_sequence(
sent2_emb, sent2_lengths, batch_first=True
)
sent2, sent2_h = self.encoder(sent2_emb)
if self.bidirectional:
sent1_h = torch.cat((sent1_h[-1], sent1_h[-2]), 1)
sent2_h = torch.cat((sent2_h[-1], sent2_h[-2]), 1)
else:
sent1_h = sent1_h[-1]
sent2_h = sent2_h[-1]
sent1_h = sent1_h.index_select(0, minibatch['rev_sent1'])
sent2_h = sent2_h.index_select(0, minibatch['rev_sent2'])
features = torch.cat((
sent1_h, sent2_h,
torch.abs(sent1_h - sent2_h),
sent1_h * sent2_h
), 1)
if return_hidden:
return sent1_h, sent2_h, self.nli_decoder(features)
else:
return self.nli_decoder(features)
else:
src_emb = self.src_embedding(minibatch['input_src'])
trg_emb = self.trg_embedding[task_idx](minibatch['input_trg'])
src_lengths = minibatch['src_lens'].data.view(-1).tolist()
src_emb = pack_padded_sequence(
src_emb, src_lengths, batch_first=True
)
_, src_h_t = self.encoder(src_emb)
if self.bidirectional:
h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1)
else:
h_t = src_h_t[-1]
h_t = h_t.unsqueeze(0)
h_t = self.enc_drp(h_t)
print("INSIDE FORWARD:", h_t.shape)
# Debug with squeeze on error.
trg_h, _ = self.decoders[task_idx](
trg_emb, h_t.view(-1, self.trg_hidden_dim), h_t.view(-1, self.trg_hidden_dim)
)
trg_h_reshape = trg_h.contiguous().view(
trg_h.size(0) * trg_h.size(1), trg_h.size(2)
)
decoder_logit = self.decoder2vocab[task_idx](trg_h_reshape)
decoder_logit = decoder_logit.view(
trg_h.size(0), trg_h.size(1), decoder_logit.size(1)
)
if (
self.paired_tasks is not None and
task_idx in self.paired_tasks and
paired_trg is not None
):
other_task_idx = self.paired_tasks[task_idx]
trg_emb_2 = self.trg_embedding[other_task_idx](paired_trg)
trg_h_2, _ = self.decoders[other_task_idx](
trg_emb_2, h_t.squeeze(), h_t.squeeze()
)
trg_h_reshape_2 = trg_h_2.contiguous().view(
trg_h_2.size(0) * trg_h_2.size(1), trg_h_2.size(2)
)
decoder_logit_2 = self.decoder2vocab[other_task_idx](
trg_h_reshape_2
)
decoder_logit_2 = decoder_logit_2.view(
trg_h_2.size(0), trg_h_2.size(1), decoder_logit_2.size(1)
)
if return_hidden:
return decoder_logit, decoder_logit_2, h_t
else:
return decoder_logit, decoder_logit_2
if return_hidden:
return decoder_logit, h_t
else:
return decoder_logit
def decode(self, logits):
"""Return probability distribution over words."""
logits_reshape = logits.view(-1, logits.size(2))
word_probs = F.softmax(logits_reshape, dim=-1)
word_probs = word_probs.view(
logits.size(0), logits.size(1), logits.size(2)
)
return word_probs
def get_hidden(self, input_src, src_lengths, strategy='last'):
"""Return the encoder hidden state."""
src_emb = self.src_embedding(input_src)
src_lengths = src_lengths.data.view(-1).tolist()
src_emb = pack_padded_sequence(src_emb, src_lengths, batch_first=True)
src_h, src_h_t = self.encoder(src_emb)
if strategy == 'last':
if self.bidirectional:
h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1)
else:
h_t = src_h_t[-1]
else:
src_h, _ = pad_packed_sequence(src_h, batch_first=True)
print("INSIDE GET HIDDEN",torch.max(src_h, 1)[0].shape)
h_t = torch.max(src_h, 1)[0].squeeze()
return src_h, h_t
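(For orientation, a hedged smoke test of the seq2seq path of `MultitaskModel`. Shapes follow the `forward()` docstring; all values below are illustrative and this snippet is not part of the repo.)

```python
# Illustrative only: exercise MultitaskModel's seq2seq branch with toy tensors.
import torch

model = MultitaskModel(
    src_emb_dim=512, trg_emb_dim=512,
    src_vocab_size=1000, trg_vocab_size=1000,
    src_hidden_dim=2048, trg_hidden_dim=2048,   # equal, so h_t feeds the decoder
    pad_token_src=0, pad_token_trg=0,
    num_tasks=1, bidirectional=True, dropout=0.0,
)

minibatch = {
    "type": "seq2seq",                              # anything other than "nli"
    "input_src": torch.randint(1, 1000, (4, 10)),   # batch x src seq len
    "input_trg": torch.randint(1, 1000, (4, 12)),   # batch x trg seq len
    "src_lens": torch.tensor([10, 9, 7, 5]),        # must be sorted descending
}
logits = model(minibatch, task_idx=0)
print(logits.shape)  # (4, 12, 1000) = batch x trg seq len x trg vocab size
```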

View file

@@ -3,13 +3,14 @@
import json
import os
from sklearn.metrics.pairwise import cosine_similarity
from utils_nlp.model.gensen import train
from utils_nlp.model.gensen.gensen import GenSenSingle
from utils_nlp.model.gensen.gensen_utils import gensen_preprocess
from utils_nlp.model.gensen.create_gensen_model import (
create_multiseq2seq_model,
)
class GenSenClassifier:
@@ -25,11 +26,11 @@ class GenSenClassifier:
"""
def __init__(
self,
config_file,
pretrained_embedding_path,
learning_rate=0.0001,
cache_dir=".",
):
self.learning_rate = learning_rate
self.config_file = config_file
@@ -41,7 +42,7 @@ class GenSenClassifier:
"""Validate input params."""
if not isinstance(self.learning_rate, float) or (
self.learning_rate <= 0.0
):
raise ValueError(
"Learning rate must be of type float and greater than 0"
@@ -133,8 +134,6 @@ class GenSenClassifier:
"""
self._validate_params()
# Use only if you have the model trained and saved.
# self.cache_dir = os.path.join(self.cache_dir, "clean/snli_1.0")
self._create_multiseq2seq_model()

View file

@@ -9,133 +9,28 @@ ICLR 2018
### About
GenSen is a technique to learn general purpose, fixed-length representations of sentences via multi-task training. These representations are useful for transfer and low-resource learning. For details please refer to the ICLR [paper](https://openreview.net/forum?id=B18WgG-CZ&noteId=B18WgG-CZ).
### Code
We provide a distributed PyTorch implementation (using Horovod) of the paper, along with pre-trained models and code to evaluate these models on a variety of transfer learning benchmarks.
This code is based on the GitHub codebase from [Maluuba](https://github.com/Maluuba/gensen), but we have refactored it in the following ways:
1. Support distributed training in PyTorch with Horovod (a rough sketch follows this list)
2. Clean and refactor the original code into a more structured form
3. Change the training script (`train.py`) from running indefinitely to stopping when the validation loss reaches a local minimum
4. Update the code from Python 2.7 to 3+ and PyTorch from 0.2/0.3 to 1.0.1
5. Add necessary comments
6. Add code for training on the AzureML platform
7. Fix the bug where training raises an error when the batch size is set to 1
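Regarding item 1, the sketch below shows the usual Horovod data-parallel wiring in PyTorch. It is illustrative only: `build_model` is a placeholder, and the exact integration in `train.py` is not shown in this commit.

```python
# Minimal Horovod + PyTorch sketch (build_model() is a hypothetical factory).
import horovod.torch as hvd
import torch

hvd.init()                                  # one process per GPU
torch.cuda.set_device(hvd.local_rank())     # pin this process to its GPU

model = build_model().cuda()                # placeholder model factory
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4 * hvd.size())

# Average gradients across workers and keep all replicas in sync.
optimizer = hvd.DistributedOptimizer(
    optimizer, named_parameters=model.named_parameters()
)
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
```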
### Requirements
- Python 3+
- PyTorch 1.0.1
- nltk
- h5py
- numpy
- scikit-learn
#### Usage
##### Setting up Models & pre-trained word vectors
You can download the pre-trained models and set up the pre-trained word vectors for vocabulary expansion by running:
```bash
cd data/models
bash download_models.sh
cd ../embedding
bash glove2h5.sh
```
##### Using a pre-trained model to extract sentence representations.
You can use the pre-trained models to extract the last hidden state or all hidden states of the multi-task GRU. Additionally, you can concatenate the output of multiple models to replicate the numbers in the paper.
```python
from gensen import GenSen, GenSenSingle
gensen_1 = GenSenSingle(
model_folder='./data/models',
filename_prefix='nli_large_bothskip',
pretrained_emb='./data/embedding/glove.840B.300d.h5'
)
reps_h, reps_h_t = gensen_1.get_representation(
sentences, pool='last', return_numpy=True, tokenize=True
)
print(reps_h.shape, reps_h_t.shape)
```
- The input to `get_representation` is `sentences`, which should be a list of strings. If your strings are not pre-tokenized, then set `tokenize=True` to use the NLTK tokenizer before computing representations.
- `reps_h` (batch_size x seq_len x 2048) contains the hidden states for all words in all sentences (padded to the max length of sentences)
- `reps_h_t` (batch_size x 2048) contains only the last hidden state for all sentences in the minibatch
GenSenSingle will return the output of a single model `nli_large_bothskip (+STN +Fr +De +NLI +L +STP)`. You can concatenate the output of multiple models by creating a GenSen instance with multiple GenSenSingle instances, as follows:
```python
gensen_2 = GenSenSingle(
model_folder='./data/models',
filename_prefix='nli_large_bothskip_parse',
pretrained_emb='./data/embedding/glove.840B.300d.h5'
)
gensen = GenSen(gensen_1, gensen_2)
reps_h, reps_h_t = gensen.get_representation(
sentences, pool='last', return_numpy=True, tokenize=True
)
```
1) `reps_h` (batch_size x seq_len x 4096) contains the hidden states for all words in all sentences (padded to the max length of sentences)
2) `reps_h_t` (batch_size x 4096) contains only the last hidden state for all sentences in the minibatch
The model will produce a fixed-length vector for each sentence as well as the hidden states corresponding to each word in every sentence (padded to max sentence length). You can also return a numpy array instead of a `torch.FloatTensor` by setting `return_numpy=True`.
##### Vocabulary Expansion
If you have a specific domain for which you want to compute representations, you can call `vocab_expansion` on instances of the GenSenSingle or GenSen class simply by `gensen.vocab_expansion(vocab)`, where `vocab` is a list of unique words in the new domain. This will learn a linear mapping from the provided pretrained embeddings (which have a significantly larger vocabulary) to the space of GenSen's word vectors. For an example of how this is used in an actual setting, please refer to `gensen_senteval.py`.
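A hedged example of that flow (the corpus path and sentence are made up; the `GenSenSingle` arguments mirror the snippets above):

```python
# Illustrative only: build a domain vocabulary and expand the embeddings onto it.
from gensen import GenSenSingle

gensen = GenSenSingle(
    model_folder='./data/models',
    filename_prefix='nli_large_bothskip',
    pretrained_emb='./data/embedding/glove.840B.300d.h5'
)

with open('my_domain_corpus.txt') as f:     # hypothetical in-domain text file
    vocab = sorted({word for line in f for word in line.split()})

gensen.vocab_expansion(vocab)               # learns the linear mapping
reps_h, reps_h_t = gensen.get_representation(
    ['an in-domain sentence to encode'],
    pool='last', return_numpy=True, tokenize=True
)
```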
##### Training a model from scratch
To train a model from scratch, simply run `train.py` with an appropriate JSON config file. An example config is provided in `example_config.json`. To continue training, just relaunch the same script with `load_dir=auto` in the config file.
To download some of the data required to train a GenSen model, run:
```bash
bash get_data.sh
```
Note that this script can take a while to complete since it downloads, tokenizes and lowercases a fairly large En-Fr corpus. If you already have these parallel corpora processed, you can replace the paths to these files in the provided `example_config.json`.
Some of the data used in our work is no longer publicly available (BookCorpus - see http://yknzhu.wixsite.com/mbweb) or has an LDC license associated (Penn Treebank). As a result, the `example_config.json` script will only train on Multilingual NMT and NLI, since they are publicly available. To use models trained on all tasks, please use our available pre-trained models.
Additional Sequence-to-Sequence transduction tasks can be added trivially to the multi-task framework by editing the JSON config file with more tasks (see the sketch after the command below).
```bash
python train.py --config example_config.json
```
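As a hedged illustration of adding another task, the sketch below appends an entry to the `paths` list; field names mirror the example config removed earlier in this commit, and the `.tok` paths are placeholders.

```python
# Illustrative only: add one more seq2seq task to the config's data paths.
import json

with open("example_config.json") as f:
    config = json.load(f)

config["data"]["paths"].append({
    "train_src": "new_task_train.src.tok",   # placeholder tokenized source file
    "train_trg": "new_task_train.trg.tok",   # placeholder tokenized target file
    "val_src": "new_task_dev.src.tok",
    "val_trg": "new_task_dev.trg.tok",
    "taskname": "new_task",
})

with open("example_config.json", "w") as f:
    json.dump(config, f, indent=4)
```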
To use the default settings in `example_config.json` you will need a GPU with at least 16GB of memory (such as a P100). To train on smaller GPUs, you may need to reduce the batch size.
Note that if "load_dir" is set to auto, the script will resume from the last saved model in "save_dir".
##### Creating a GenSen model from a trained multi-task model
Once you have a trained model, you can throw away all of the decoders and just retain the encoder used to compute sentence representations.
You can do this by running:
```bash
python create_gensen.py -t <path_to_trained_model> -s <path_to_save_encoder> -n <name_of_encoder>
```
Once you have done this, you can load the model just like any of the pre-trained models by specifying `model_folder` as `<path_to_save_encoder>` and `filename_prefix` as `<name_of_encoder>`, as shown below.
```python
your_gensen = GenSenSingle(
model_folder='<path_to_save_encoder>',
filename_prefix='<name_of_encoder>',
pretrained_emb='./data/embedding/glove.840B.300d.h5'
)
```
### Transfer Learning Evaluations
We used the [SentEval](https://github.com/facebookresearch/SentEval) toolkit to run most of the transfer learning experiments. To replicate these numbers, clone their repository and follow the setup instructions. Once complete, copy `gensen_senteval.py` and `gensen.py` into their examples folder and run the following commands to reproduce different rows in Table 2 of the paper. Note: please set the path to the pretrained GloVe embeddings (`glove.840B.300d.h5`) and the model folder as appropriate.
```
(+STN +Fr +De +NLI +L +STP) python gensen_senteval.py --prefix_1 nli_large --prefix_2 nli_large_bothskip
(+STN +Fr +De +NLI +2L +STP) python gensen_senteval.py --prefix_1 nli_large_bothskip --prefix_2 nli_large_bothskip_2layer
(+STN +Fr +De +NLI +L +STP +Par) python gensen_senteval.py --prefix_1 nli_large_bothskip_parse --prefix_2 nli_large_bothskip
```
### Reference
```

View file

@@ -1,5 +1,8 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""Minibatching utilities."""
import itertools
import operator