Minor fix to gensen wrapper.
This commit is contained in:
Родитель
9602e6b50d
Коммит
34cfea300f
|
@ -1,45 +0,0 @@
|
|||
{
|
||||
"training": {
|
||||
"optimizer": "adam",
|
||||
"clip_c": 1,
|
||||
"lrate": 0.0001,
|
||||
"batch_size": 48,
|
||||
"n_gpus": 1
|
||||
},
|
||||
"management": {
|
||||
"monitor_loss": 9600,
|
||||
"print_samples": 12800,
|
||||
"checkpoint_freq": 480000,
|
||||
"eval_freq": 4800000
|
||||
},
|
||||
"data": {"paths": [
|
||||
{
|
||||
"train_src": "snli_1.0_train.txt.s1.tok",
|
||||
"train_trg": "snli_1.0_train.txt.s2.tok",
|
||||
"val_src": "snli_1.0_dev.txt.s1.tok",
|
||||
"val_trg": "snli_1.0_dev.txt.s2.tok",
|
||||
"taskname": "snli"
|
||||
}
|
||||
],
|
||||
"max_src_length": 90,
|
||||
"max_trg_length": 90,
|
||||
"task": "multi-seq2seq-nli",
|
||||
"save_dir": "model",
|
||||
"load_dir": "auto",
|
||||
"nli_train": "snli_1.0_train.txt.clean.noblank",
|
||||
"nli_dev": "snli_1.0_dev.txt.clean.noblank",
|
||||
"nli_test": "snli_1.0_test.txt.clean.noblank"
|
||||
},
|
||||
"model": {
|
||||
"dim_src": 2048,
|
||||
"dim_trg": 2048,
|
||||
"dim_word_src": 512,
|
||||
"dim_word_trg": 512,
|
||||
"n_words_src": 80000,
|
||||
"n_words_trg": 30000,
|
||||
"n_layers_src": 1,
|
||||
"bidirectional": true,
|
||||
"layernorm": false,
|
||||
"dropout": 0.3
|
||||
}
|
||||
}
|
|
@ -1,342 +0,0 @@
|
|||
"""Parent model for Multitask Training."""
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
|
||||
from torch.autograd import Variable
|
||||
|
||||
|
||||
class ConditionalGRU(nn.Module):
    """A GRU cell whose gates are also conditioned on a context vector.

    On top of the usual input and hidden projections, every gate receives
    a third ("peephole") projection of a context vector ``ctx`` that stays
    the same for all time steps.
    """

    def __init__(self, input_dim, hidden_dim, dropout=0.):
        """Create the three gate projections and initialize them.

        Args:
            input_dim: size of each input (embedding) vector.
            hidden_dim: size of the hidden state and of the context vector.
            dropout: accepted for interface compatibility; this module
                does not use it.
        """
        super(ConditionalGRU, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        # Each projection produces the reset, update and candidate
        # pre-activations stacked along the feature axis (3 * hidden_dim).
        self.input_weights = nn.Linear(self.input_dim, 3 * self.hidden_dim)
        self.hidden_weights = nn.Linear(self.hidden_dim, 3 * self.hidden_dim)
        self.peep_weights = nn.Linear(self.hidden_dim, 3 * self.hidden_dim)

        self.reset_parameters()

    def reset_parameters(self):
        """Draw every parameter from U(-1/sqrt(hidden_dim), 1/sqrt(hidden_dim))."""
        stdv = 1.0 / math.sqrt(self.hidden_dim)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def forward(self, input, hidden, ctx):
        r"""Propagate input through the layer.

        inputs:
            input - batch size x target sequence length x embedding dimension
            hidden - batch size x hidden dimension (initial state)
            ctx - batch size x hidden dimension (conditioning vector)

        returns: output, hidden
            output - batch size x target sequence length x hidden dimension
            hidden - batch size x hidden dimension (state after the last step)
        """
        # Iterate over time: make the sequence axis the leading one.
        steps = input.transpose(0, 1)

        if steps.size(0) == 0:
            # Nothing to unroll: return an empty output and the untouched
            # initial state instead of failing on an empty list.
            empty = hidden.new_zeros((hidden.size(0), 0, hidden.size(1)))
            return empty, hidden

        # The context does not change between steps, so its gate
        # contribution is projected once instead of once per step.
        p_r, p_i, p_n = self.peep_weights(ctx).chunk(3, 1)

        output = []
        for step_input in steps:
            i_r, i_i, i_n = self.input_weights(step_input).chunk(3, 1)
            h_r, h_i, h_n = self.hidden_weights(hidden).chunk(3, 1)

            resetgate = torch.sigmoid(i_r + h_r + p_r)
            inputgate = torch.sigmoid(i_i + h_i + p_i)
            newgate = torch.tanh(i_n + resetgate * h_n + p_n)
            # Interpolate between the candidate and the previous state.
            hidden = newgate + inputgate * (hidden - newgate)
            output.append(hidden)

        # (time, batch, hidden) -> (batch, time, hidden)
        output = torch.stack(output, 0).transpose(0, 1)
        return output, hidden
|
||||
|
||||
|
||||
class MultitaskModel(nn.Module):
    r"""A Multi Task Sequence to Sequence (Seq2Seq) model with GRUs.

    One shared GRU encoder feeds a separate conditional-GRU decoder (with
    its own target embedding and output projection) for every seq2seq
    task. An auxiliary NLI classifier is trained jointly on the same
    encoder.
    Ref: Multi-Task Sequence to Sequence Learning
    https://arxiv.org/pdf/1511.06114.pdf
    """

    def __init__(
        self, src_emb_dim, trg_emb_dim, src_vocab_size,
        trg_vocab_size, src_hidden_dim, trg_hidden_dim,
        pad_token_src, pad_token_trg, num_tasks, bidirectional=False,
        nlayers_src=1, dropout=0., paired_tasks=None
    ):
        """Initialize Seq2Seq Model.

        Args:
            src_emb_dim: source word embedding size.
            trg_emb_dim: target word embedding size.
            src_vocab_size: number of source vocabulary entries.
            trg_vocab_size: number of target vocabulary entries.
            src_hidden_dim: total encoder state size (split across both
                directions when ``bidirectional`` is set).
            trg_hidden_dim: decoder state size; the encoder state is
                reshaped to this width before it is fed to a decoder.
            pad_token_src: padding index of the source embedding.
            pad_token_trg: padding index of the target embeddings.
            num_tasks: number of seq2seq tasks (one decoder each).
            bidirectional: whether the encoder is bidirectional.
            nlayers_src: number of encoder layers.
            dropout: dropout rate for the encoder and its final state.
            paired_tasks: optional mapping from a task index to the index
                of a second task decoded from the same source.
        """
        super(MultitaskModel, self).__init__()
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.src_emb_dim = src_emb_dim
        self.trg_emb_dim = trg_emb_dim
        self.trg_hidden_dim = trg_hidden_dim
        self.bidirectional = bidirectional
        self.nlayers_src = nlayers_src
        self.dropout = dropout
        self.num_tasks = num_tasks
        self.paired_tasks = paired_tasks
        self.num_directions = 2 if bidirectional else 1
        self.pad_token_src = pad_token_src
        self.pad_token_trg = pad_token_trg
        # Each direction gets half of the units so that the concatenated
        # final state is ``src_hidden_dim`` wide.
        self.src_hidden_dim = (
            src_hidden_dim // 2 if self.bidirectional else src_hidden_dim
        )
        self.decoder = ConditionalGRU

        self.src_embedding = nn.Embedding(
            src_vocab_size,
            src_emb_dim,
            self.pad_token_src,
        )

        self.encoder = nn.GRU(
            src_emb_dim,
            self.src_hidden_dim,
            self.nlayers_src,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=self.dropout
        )

        self.enc_drp = nn.Dropout(self.dropout)

        self.trg_embedding = nn.ModuleList([
            nn.Embedding(
                trg_vocab_size,
                trg_emb_dim,
                self.pad_token_trg,
            )
            for task in range(self.num_tasks)
        ])

        self.decoders = nn.ModuleList([
            self.decoder(
                trg_emb_dim, trg_hidden_dim,
                dropout=self.dropout
            )
            for task in range(self.num_tasks)
        ])

        self.decoder2vocab = nn.ModuleList([
            nn.Linear(trg_hidden_dim, trg_vocab_size)
            for task in range(self.num_tasks)
        ])

        # Input is [u, v, |u - v|, u * v] over the full-width sentence
        # vectors, hence 4 * src_hidden_dim (the constructor argument).
        self.nli_decoder = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(4 * src_hidden_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 3)
        )

        self.init_weights()

    def init_weights(self):
        """Initialize embeddings uniformly and zero the output biases."""
        initrange = 0.1
        self.src_embedding.weight.data.uniform_(-initrange, initrange)
        for module in self.trg_embedding:
            module.weight.data.uniform_(-initrange, initrange)
        for module in self.decoder2vocab:
            module.bias.data.fill_(0)

    def set_pretrained_embeddings(self, embedding_matrix):
        """Set the source embedding weights from a numpy matrix.

        If the matrix shape differs from the current embedding table, the
        table is rebuilt to match and ``src_vocab_size`` / ``src_emb_dim``
        are updated. The embedding stays on the device it was on before
        the call, so this also works on CPU-only hosts.

        Args:
            embedding_matrix: numpy array of shape (vocab size, emb dim).
        """
        device = self.src_embedding.weight.device

        if (
            embedding_matrix.shape[0] != self.src_embedding.weight.size(0) or
            embedding_matrix.shape[1] != self.src_embedding.weight.size(1)
        ):
            # NOTE(review): the replacement table is created without a
            # padding index, unlike the one built in __init__ - confirm
            # this is intended.
            self.src_embedding = nn.Embedding(
                embedding_matrix.shape[0],
                embedding_matrix.shape[1]
            )
            self.src_vocab_size = embedding_matrix.shape[0]
            self.src_emb_dim = embedding_matrix.shape[1]

        self.src_embedding.to(device)
        # copy_ converts dtype and device as needed, so no CPU/CUDA
        # fallback is required.
        with torch.no_grad():
            self.src_embedding.weight.copy_(
                torch.from_numpy(embedding_matrix)
            )

    def _run_encoder(self, tokens, lengths):
        """Embed, pack and encode a batch; return (packed outputs, h_n)."""
        emb = self.src_embedding(tokens)
        lengths = lengths.data.view(-1).tolist()
        packed = pack_padded_sequence(emb, lengths, batch_first=True)
        return self.encoder(packed)

    def _last_state(self, h_n):
        """Return the final encoder state as batch size x hidden dim."""
        if self.bidirectional:
            # The last two entries of h_n belong to the top layer (one
            # per direction); concatenate them along the feature axis.
            return torch.cat((h_n[-1], h_n[-2]), 1)
        return h_n[-1]

    def _decode_task(self, task_idx, trg_tokens, h_t):
        """Run one task's decoder and return its vocabulary logits."""
        trg_emb = self.trg_embedding[task_idx](trg_tokens)
        # view (not squeeze) keeps the batch axis when batch size is 1.
        init_state = h_t.view(-1, self.trg_hidden_dim)
        trg_h, _ = self.decoders[task_idx](trg_emb, init_state, init_state)

        trg_h_reshape = trg_h.contiguous().view(
            trg_h.size(0) * trg_h.size(1), trg_h.size(2)
        )
        logit = self.decoder2vocab[task_idx](trg_h_reshape)
        return logit.view(trg_h.size(0), trg_h.size(1), logit.size(1))

    def _forward_nli(self, minibatch, return_hidden):
        """Encode both sentences and classify the pair."""
        _, sent1_h = self._run_encoder(
            minibatch['sent1'], minibatch['sent1_lens']
        )
        _, sent2_h = self._run_encoder(
            minibatch['sent2'], minibatch['sent2_lens']
        )
        sent1_h = self._last_state(sent1_h)
        sent2_h = self._last_state(sent2_h)

        # Reorder each batch with the indices supplied by the caller.
        sent1_h = sent1_h.index_select(0, minibatch['rev_sent1'])
        sent2_h = sent2_h.index_select(0, minibatch['rev_sent2'])

        features = torch.cat((
            sent1_h, sent2_h,
            torch.abs(sent1_h - sent2_h),
            sent1_h * sent2_h
        ), 1)
        class_logits = self.nli_decoder(features)

        if return_hidden:
            return sent1_h, sent2_h, class_logits
        return class_logits

    def forward(
        self, minibatch, task_idx,
        return_hidden=False, paired_trg=None
    ):
        r"""Propagate input through the network.

        Seq2Seq:
            inputs: minibatch['input_src'], minibatch['input_trg']
            input_src - batch size x source sequence length
            input_trg - batch size x target sequence length
            src_lengths - batch size (list)
            paired_trg - batch size x target sequence length or None
            returns: decoder_logit (pre-softmax over words)
            decoder_logit - batch size x target sequence length x target vocab size
            When the task is paired and paired_trg is given, a second
            logit tensor for the paired task is returned as well. With
            return_hidden the encoder state (1 x batch size x hidden
            dimension) is appended to the result.

        NLI:
            sent1 - batch size x source sequence length
            sent2 - batch size x target sequence length
            sent1_lengths - batch size (list)
            sent2_lengths - batch size (list)
            rev_sent1 - batch size (LongTensor)
            rev_sent2 - batch size (LongTensor)
            returns: class_logits (pre-softmax over NLI classes)
            decoder_logit - batch size x 3
            With return_hidden the two sentence vectors are returned
            before the logits.
        """
        if minibatch['type'] == 'nli':
            return self._forward_nli(minibatch, return_hidden)

        _, src_h_t = self._run_encoder(
            minibatch['input_src'], minibatch['src_lens']
        )
        h_t = self._last_state(src_h_t).unsqueeze(0)
        h_t = self.enc_drp(h_t)

        decoder_logit = self._decode_task(
            task_idx, minibatch['input_trg'], h_t
        )

        if (
            self.paired_tasks is not None and
            task_idx in self.paired_tasks and
            paired_trg is not None
        ):
            other_task_idx = self.paired_tasks[task_idx]
            decoder_logit_2 = self._decode_task(
                other_task_idx, paired_trg, h_t
            )
            if return_hidden:
                return decoder_logit, decoder_logit_2, h_t
            return decoder_logit, decoder_logit_2

        if return_hidden:
            return decoder_logit, h_t
        return decoder_logit

    def decode(self, logits):
        """Return probability distribution over words.

        Args:
            logits: batch size x sequence length x vocab size.

        Returns:
            Tensor of the same shape, normalized over the last axis.
        """
        return F.softmax(logits, dim=-1)

    def get_hidden(self, input_src, src_lengths, strategy='last'):
        """Return the encoder outputs and a pooled sentence vector.

        Args:
            input_src: batch size x source sequence length token ids.
            src_lengths: tensor holding the length of every sequence.
            strategy: 'last' uses the final encoder state; any other
                value max-pools the encoder outputs over time.

        Returns:
            (src_h, h_t). With 'last', src_h is the packed encoder output
            as returned by the GRU; otherwise it is padded to
            batch size x sequence length x hidden dimension. h_t is
            always batch size x hidden dimension, including batch size 1.
        """
        src_h, src_h_t = self._run_encoder(input_src, src_lengths)
        if strategy == 'last':
            h_t = self._last_state(src_h_t)
        else:
            src_h, _ = pad_packed_sequence(src_h, batch_first=True)
            # Reducing over the time axis already yields batch x hidden;
            # an extra squeeze would drop the batch axis for one sentence.
            h_t = torch.max(src_h, 1)[0]

        return src_h, h_t
|
|
@ -3,13 +3,14 @@
|
|||
import json
|
||||
import os
|
||||
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
from utils_nlp.model.gensen import train
|
||||
from utils_nlp.model.gensen.gensen import GenSenSingle
|
||||
from utils_nlp.model.gensen.gensen_utils import gensen_preprocess
|
||||
from utils_nlp.model.gensen.create_gensen_model import (
|
||||
create_multiseq2seq_model,
|
||||
)
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
from utils_nlp.model.gensen.gensen import GenSenSingle
|
||||
from utils_nlp.model.gensen.gensen_utils import gensen_preprocess
|
||||
|
||||
|
||||
class GenSenClassifier:
|
||||
|
@ -25,11 +26,11 @@ class GenSenClassifier:
|
|||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config_file,
|
||||
pretrained_embedding_path,
|
||||
learning_rate=0.0001,
|
||||
cache_dir=".",
|
||||
self,
|
||||
config_file,
|
||||
pretrained_embedding_path,
|
||||
learning_rate=0.0001,
|
||||
cache_dir=".",
|
||||
):
|
||||
self.learning_rate = learning_rate
|
||||
self.config_file = config_file
|
||||
|
@ -41,7 +42,7 @@ class GenSenClassifier:
|
|||
"""Validate input params."""
|
||||
|
||||
if not isinstance(self.learning_rate, float) or (
|
||||
self.learning_rate <= 0.0
|
||||
self.learning_rate <= 0.0
|
||||
):
|
||||
raise ValueError(
|
||||
"Learning rate must be of type float and greater than 0"
|
||||
|
@ -133,8 +134,6 @@ class GenSenClassifier:
|
|||
|
||||
"""
|
||||
|
||||
self._validate_params()
|
||||
|
||||
# Use only if you have the model trained and saved.
|
||||
# self.cache_dir = os.path.join(self.cache_dir, "clean/snli_1.0")
|
||||
self._create_multiseq2seq_model()
|
|
@ -9,133 +9,28 @@ ICLR 2018
|
|||
|
||||
### About
|
||||
|
||||
GenSen is a technique to learn general purpose, fixed-length representations of sentences via multi-task training. These representations are useful for transfer and low-resource learning. For details please refer to our ICLR [paper](https://openreview.net/forum?id=B18WgG-CZ&noteId=B18WgG-CZ).
|
||||
GenSen is a technique to learn general purpose, fixed-length representations of sentences via multi-task training. These representations are useful for transfer and low-resource learning. For details please refer to the ICLR [paper](https://openreview.net/forum?id=B18WgG-CZ&noteId=B18WgG-CZ).
|
||||
|
||||
### Code
|
||||
|
||||
We provide a PyTorch implementation of our paper along with pre-trained models as well as code to evaluate these models on a variety of transfer learning benchmarks.
|
||||
|
||||
We provide a distributed PyTorch implementation of the paper (using Horovod), along with pre-trained models and code to evaluate these models on a variety of transfer learning benchmarks.
|
||||
This code is based on the GitHub codebase from [Maluuba](https://github.com/Maluuba/gensen), but we have refactored the code in the following ways:
|
||||
1. Support a distributed PyTorch with Horovod
|
||||
2. Clean and refactor the original code in a more structured form
|
||||
3. Change the training script (`train.py`) from running indefinitely to stopping when the validation loss reaches a local minimum
|
||||
4. Update the code from Python 2.7 to 3+ and PyTorch from 0.2 or 0.3 to 1.0.1
|
||||
5. Add some necessary comments
|
||||
6. Add some code for training on AzureML platform
|
||||
7. Fix the bug where training raises an error when the batch size is set to 1
|
||||
### Requirements
|
||||
|
||||
- Python 2.7 (Python 3 compatibility coming soon)
|
||||
- PyTorch 0.2 or 0.3
|
||||
- Python 3+
|
||||
- PyTorch 1.0.1
|
||||
- nltk
|
||||
- h5py
|
||||
- numpy
|
||||
- scikit-learn
|
||||
|
||||
#### Usage
|
||||
|
||||
##### Setting up Models & pre-trained word vectors
|
||||
|
||||
You can download our pre-trained models and set up pre-trained word vectors for vocabulary expansion by running:
|
||||
|
||||
```bash
|
||||
cd data/models
|
||||
bash download_models.sh
|
||||
cd ../embedding
|
||||
bash glove2h5.sh
|
||||
```
|
||||
|
||||
##### Using a pre-trained model to extract sentence representations.
|
||||
|
||||
You can use our pre-trained models to extract the last hidden state or all hidden states of our multi-task GRU. Additionally, you can concatenate the output of multiple models to replicate the numbers in our paper.
|
||||
|
||||
```python
|
||||
from gensen import GenSen, GenSenSingle
|
||||
|
||||
gensen_1 = GenSenSingle(
|
||||
model_folder='./data/models',
|
||||
filename_prefix='nli_large_bothskip',
|
||||
pretrained_emb='./data/embedding/glove.840B.300d.h5'
|
||||
)
|
||||
reps_h, reps_h_t = gensen_1.get_representation(
|
||||
sentences, pool='last', return_numpy=True, tokenize=True
|
||||
)
|
||||
print reps_h.shape, reps_h_t.shape
|
||||
```
|
||||
|
||||
- The input to `get_representation` is `sentences`, which should be a list of strings. If your strings are not pre-tokenized, then set `tokenize=True` to use the NLTK tokenizer before computing representations.
|
||||
- `reps_h` (batch_size x seq_len x 2048) contains the hidden states for all words in all sentences (padded to the max length of sentences)
|
||||
- `reps_h_t` (batch_size x 2048) contains only the last hidden state for all sentences in the minibatch
|
||||
|
||||
GenSenSingle will return the output of a single model `nli_large_bothskip (+STN +Fr +De +NLI +L +STP)`. You can concatenate the output of multiple models by creating a GenSen instance with multiple GenSenSingle instances, as follows:
|
||||
|
||||
```python
|
||||
gensen_2 = GenSenSingle(
|
||||
model_folder='./data/models',
|
||||
filename_prefix='nli_large_bothskip_parse',
|
||||
pretrained_emb='./data/embedding/glove.840B.300d.h5'
|
||||
)
|
||||
gensen = GenSen(gensen_1, gensen_2)
|
||||
reps_h, reps_h_t = gensen.get_representation(
|
||||
sentences, pool='last', return_numpy=True, tokenize=True
|
||||
)
|
||||
```
|
||||
|
||||
1) `reps_h` (batch_size x seq_len x 4096) contains the hidden states for all words in all sentences (padded to the max length of sentences)
|
||||
2) `reps_h_t` (batch_size x 4096) contains only the last hidden state for all sentences in the minibatch
|
||||
|
||||
The model will produce a fixed-length vector for each sentence as well as the hidden states corresponding to each word in every sentence (padded to max sentence length). You can also return a numpy array instead of a `torch.FloatTensor` by setting `return_numpy=True`.
|
||||
|
||||
##### Vocabulary Expansion
|
||||
|
||||
If you have a specific domain for which you want to compute representations, you can call `vocab_expansion` on instances of the GenSenSingle or GenSen class simply by `gensen.vocab_expansion(vocab)` where vocab is a list of unique words in the new domain. This will learn a linear mapping from the provided pretrained embeddings (which have a significantly larger vocabulary) provided to the space of gensen's word vectors. For an example of how this is used in an actual setting, please refer to `gensen_senteval.py`.
|
||||
|
||||
##### Training a model from scratch
|
||||
|
||||
To train a model from scratch, simply run `train.py` with an appropriate JSON config file. An example config is provided in `example_config.json`. To continue training, just relaunch the same script with `load_dir=auto` in the config file.
|
||||
|
||||
To download some of the data required to train a GenSen model, run:
|
||||
|
||||
```bash
|
||||
bash get_data.sh
|
||||
```
|
||||
|
||||
Note that this script can take a while to complete since it downloads, tokenizes and lowercases a fairly large En-Fr corpus. If you already have these parallel corpora processed, you can replace the paths to these files in the provided `example_config.json`
|
||||
|
||||
Some of the data used in our work is no longer publicly available (BookCorpus - see http://yknzhu.wixsite.com/mbweb) or has an LDC license associated (Penn Treebank). As a result, the `example_config.json` script will only train on Multilingual NMT and NLI, since they are publicly available. To use models trained on all tasks, please use our available pre-trained models.
|
||||
|
||||
Additional Sequence-to-Sequence transduction tasks can be added trivially to the multi-task framework by editing the JSON config file with more tasks.
|
||||
|
||||
```bash
|
||||
python train.py --config example_config.json
|
||||
```
|
||||
|
||||
To use the default settings in `example_config.json` you will need a GPU with at least 16GB of memory (such as a P100). To train on smaller GPUs, you may need to reduce the batch size.
|
||||
|
||||
Note that if "load_dir" is set to auto, the script will resume from the last saved model in "save_dir".
|
||||
|
||||
##### Creating a GenSen model from a trained multi-task model
|
||||
|
||||
Once you have a trained model, we can throw away all of the decoders and just retain the encoder used to compute sentence representations.
|
||||
|
||||
You can do this by running
|
||||
|
||||
```bash
|
||||
python create_gensen.py -t <path_to_trained_model> -s <path_to_save_encoder> -n <name_of_encoder>
|
||||
```
|
||||
|
||||
Once you have done this, you can load this model just like any of the pre-trained models by specifying the model_folder as `path_to_save_encoder` and filename_prefix as `name_of_encoder` in the above command.
|
||||
|
||||
```python
|
||||
your_gensen = GenSenSingle(
|
||||
model_folder='<path_to_save_encoder>',
|
||||
filename_prefix='<name_of_encoder>',
|
||||
pretrained_emb='./data/embedding/glove.840B.300d.h5'
|
||||
)
|
||||
```
|
||||
|
||||
### Transfer Learning Evaluations
|
||||
|
||||
We used the [SentEval](https://github.com/facebookresearch/SentEval) toolkit to run most of our transfer learning experiments. To replicate these numbers, clone their repository and follow setup instructions. Once complete, copy `gensen_senteval.py` and `gensen.py` into their examples folder and run the following commands to reproduce different rows in Table 2 of our paper. Note: Please set the path to the pretrained glove embeddings (`glove.840B.300d.h5`) and model folder as appropriate.
|
||||
|
||||
```
|
||||
(+STN +Fr +De +NLI +L +STP) python gensen_senteval.py --prefix_1 nli_large --prefix_2 nli_large_bothskip
|
||||
(+STN +Fr +De +NLI +2L +STP) python gensen_senteval.py --prefix_1 nli_large_bothskip --prefix_2 nli_large_bothskip_2layer
|
||||
(+STN +Fr +De +NLI +L +STP +Par) python gensen_senteval.py --prefix_1 nli_large_bothskip_parse --prefix_2 nli_large_bothskip
|
||||
```
|
||||
|
||||
### Reference
|
||||
|
||||
```
|
||||
|
|
|
@ -1,5 +1,8 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Minibatching utilities."""
|
||||
import itertools
|
||||
import operator
|
||||
|
|
Загрузка…
Ссылка в новой задаче