This commit is contained in:
Abhiram E 2019-05-29 19:02:37 -04:00
Parent 9602e6b50d
Commit 34cfea300f
5 changed files: 25 additions and 515 deletions

View file

@@ -1,45 +0,0 @@
{
"training": {
"optimizer": "adam",
"clip_c": 1,
"lrate": 0.0001,
"batch_size": 48,
"n_gpus": 1
},
"management": {
"monitor_loss": 9600,
"print_samples": 12800,
"checkpoint_freq": 480000,
"eval_freq": 4800000
},
"data": {"paths": [
{
"train_src": "snli_1.0_train.txt.s1.tok",
"train_trg": "snli_1.0_train.txt.s2.tok",
"val_src": "snli_1.0_dev.txt.s1.tok",
"val_trg": "snli_1.0_dev.txt.s2.tok",
"taskname": "snli"
}
],
"max_src_length": 90,
"max_trg_length": 90,
"task": "multi-seq2seq-nli",
"save_dir": "model",
"load_dir": "auto",
"nli_train": "snli_1.0_train.txt.clean.noblank",
"nli_dev": "snli_1.0_dev.txt.clean.noblank",
"nli_test": "snli_1.0_test.txt.clean.noblank"
},
"model": {
"dim_src": 2048,
"dim_trg": 2048,
"dim_word_src": 512,
"dim_word_trg": 512,
"n_words_src": 80000,
"n_words_trg": 30000,
"n_layers_src": 1,
"bidirectional": true,
"layernorm": false,
"dropout": 0.3
}
}
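(For orientation, the `model` block above roughly corresponds to the encoder hyperparameters of the multitask model removed in the next file. The sketch below is a hypothetical reading of those fields, assuming the config is saved as `example_config.json`; the actual wiring lives in `train.py`, which is not part of this diff.)

```python
# Hypothetical sketch: read the config above and pull out a few fields.
# Key names match the JSON; the filename and the mapping are assumptions.
import json

with open("example_config.json") as f:
    config = json.load(f)

batch_size = config["training"]["batch_size"]        # 48
src_vocab_size = config["model"]["n_words_src"]      # 80000
src_emb_dim = config["model"]["dim_word_src"]        # 512
src_hidden_dim = config["model"]["dim_src"]          # 2048, bidirectional GRU
dropout = config["model"]["dropout"]                 # 0.3
```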

View file

@@ -1,342 +0,0 @@
"""Parent model for Multitask Training."""
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
class ConditionalGRU(nn.Module):
"""A Gated Recurrent Unit (GRU) cell with peepholes."""
def __init__(self, input_dim, hidden_dim, dropout=0.):
"""Initialize params."""
super(ConditionalGRU, self).__init__()
self.input_dim = input_dim
self.hidden_dim = hidden_dim
self.input_weights = nn.Linear(self.input_dim, 3 * self.hidden_dim)
self.hidden_weights = nn.Linear(self.hidden_dim, 3 * self.hidden_dim)
self.peep_weights = nn.Linear(self.hidden_dim, 3 * self.hidden_dim)
self.reset_parameters()
def reset_parameters(self):
"""Set params."""
stdv = 1.0 / math.sqrt(self.hidden_dim)
for weight in self.parameters():
weight.data.uniform_(-stdv, stdv)
def forward(self, input, hidden, ctx):
r"""Propogate input through the layer.
inputs:
input - batch size x target sequence length x embedding dimension
hidden - batch size x hidden dimension
ctx - batch size x source sequence length x hidden dimension
returns: output, hidden
output - batch size x target sequence length x hidden dimension
hidden - (batch size x hidden dimension, \
batch size x hidden dimension)
"""
def recurrence(input, hidden, ctx):
"""Recurrence helper."""
input_gate = self.input_weights(input)
hidden_gate = self.hidden_weights(hidden)
peep_gate = self.peep_weights(ctx)
i_r, i_i, i_n = input_gate.chunk(3, 1)
h_r, h_i, h_n = hidden_gate.chunk(3, 1)
p_r, p_i, p_n = peep_gate.chunk(3, 1)
resetgate = torch.sigmoid(i_r + h_r + p_r)
inputgate = torch.sigmoid(i_i + h_i + p_i)
newgate = torch.tanh(i_n + resetgate * h_n + p_n)
hy = newgate + inputgate * (hidden - newgate)
return hy
input = input.transpose(0, 1)
output = []
steps = range(input.size(0))
for i in steps:
hidden = recurrence(input[i], hidden, ctx)
if isinstance(hidden, tuple):
output.append(hidden[0])
else:
output.append(hidden)
output = torch.cat(output, 0).view(input.size(0), *output[0].size())
output = output.transpose(0, 1)
return output, hidden
class MultitaskModel(nn.Module):
r"""A Multi Task Sequence to Sequence (Seq2Seq) model with GRUs.
Auxiliary NLI task trained jointly as well.
Ref: Multi-Task Sequence to Sequence Learning
https://arxiv.org/pdf/1511.06114.pdf
"""
def __init__(
self, src_emb_dim, trg_emb_dim, src_vocab_size,
trg_vocab_size, src_hidden_dim, trg_hidden_dim,
pad_token_src, pad_token_trg, num_tasks, bidirectional=False,
nlayers_src=1, dropout=0., paired_tasks=None
):
"""Initialize Seq2Seq Model."""
super(MultitaskModel, self).__init__()
self.src_vocab_size = src_vocab_size
self.trg_vocab_size = trg_vocab_size
self.src_emb_dim = src_emb_dim
self.trg_emb_dim = trg_emb_dim
self.src_hidden_dim = src_hidden_dim
self.trg_hidden_dim = trg_hidden_dim
self.bidirectional = bidirectional
self.nlayers_src = nlayers_src
self.dropout = dropout
self.num_tasks = num_tasks
self.paired_tasks = paired_tasks
self.num_directions = 2 if bidirectional else 1
self.pad_token_src = pad_token_src
self.pad_token_trg = pad_token_trg
self.src_hidden_dim = src_hidden_dim // 2 \
if self.bidirectional else src_hidden_dim
self.decoder = ConditionalGRU
self.src_embedding = nn.Embedding(
src_vocab_size,
src_emb_dim,
self.pad_token_src,
)
self.encoder = nn.GRU(
src_emb_dim,
self.src_hidden_dim,
self.nlayers_src,
bidirectional=bidirectional,
batch_first=True,
dropout=self.dropout
)
self.enc_drp = nn.Dropout(self.dropout)
self.trg_embedding = nn.ModuleList([
nn.Embedding(
trg_vocab_size,
trg_emb_dim,
self.pad_token_trg,
)
for task in range(self.num_tasks)
])
self.decoders = nn.ModuleList([
self.decoder(
trg_emb_dim, trg_hidden_dim,
dropout=self.dropout
)
for task in range(self.num_tasks)
])
self.decoder2vocab = nn.ModuleList([
nn.Linear(trg_hidden_dim, trg_vocab_size)
for task in range(self.num_tasks)
])
self.nli_decoder = nn.Sequential(
nn.Dropout(0.3),
nn.Linear(4 * src_hidden_dim, 512),
nn.ReLU(),
nn.Linear(512, 3)
)
self.init_weights()
def init_weights(self):
"""Initialize weights."""
initrange = 0.1
self.src_embedding.weight.data.uniform_(-initrange, initrange)
for module in self.trg_embedding:
module.weight.data.uniform_(-initrange, initrange)
for module in self.decoder2vocab:
module.bias.data.fill_(0)
def set_pretrained_embeddings(self, embedding_matrix):
"""Set embedding weights."""
if (
embedding_matrix.shape[0] != self.src_embedding.weight.size(0) or
embedding_matrix.shape[1] != self.src_embedding.weight.size(1)
):
self.src_embedding = nn.Embedding(
embedding_matrix.shape[0],
embedding_matrix.shape[1]
)
self.src_vocab_size = embedding_matrix.shape[0]
self.src_emb_dim = embedding_matrix.shape[1]
try:
self.src_embedding.weight.data.set_(
torch.from_numpy(embedding_matrix)
)
except:
self.src_embedding.weight.data.set_(
torch.from_numpy(embedding_matrix).cuda()
)
self.src_embedding.cuda()
def forward(
self, minibatch, task_idx,
return_hidden=False, paired_trg=None
):
r"""Propogate input through the network.
Seq2Seq:
inputs: minibatch['input_src'], minibatch['input_trg']
input_src - batch size x source sequence length
input_trg - batch size x target sequence length
src_lengths - batch size (list)
paired_trg - batch size x target sequence length or None
returns: decoder_logit (pre-softmax over words)
decoder_logit - batch size x target sequence length x target vocab size
NLI:
sent1 - batch size x source sequence length
sent2 - batch size x target sequence length
sent1_lengths - batch size (list)
sent2_lengths - batch size (list)
rev_sent1 - batch size (LongTensor)
rev_sent2 - batch size (LongTensor)
returns: class_logits (pre-softmax over NLI classes)
decoder_logit - batch size x 3
"""
if minibatch['type'] == 'nli':
sent1_emb = self.src_embedding(minibatch['sent1'])
sent2_emb = self.src_embedding(minibatch['sent2'])
sent1_lengths = minibatch['sent1_lens'].data.view(-1).tolist()
sent1_emb = pack_padded_sequence(
sent1_emb, sent1_lengths, batch_first=True
)
sent1, sent1_h = self.encoder(sent1_emb)
sent2_lengths = minibatch['sent2_lens'].data.view(-1).tolist()
sent2_emb = pack_padded_sequence(
sent2_emb, sent2_lengths, batch_first=True
)
sent2, sent2_h = self.encoder(sent2_emb)
if self.bidirectional:
sent1_h = torch.cat((sent1_h[-1], sent1_h[-2]), 1)
sent2_h = torch.cat((sent2_h[-1], sent2_h[-2]), 1)
else:
sent1_h = sent1_h[-1]
sent2_h = sent2_h[-1]
sent1_h = sent1_h.index_select(0, minibatch['rev_sent1'])
sent2_h = sent2_h.index_select(0, minibatch['rev_sent2'])
features = torch.cat((
sent1_h, sent2_h,
torch.abs(sent1_h - sent2_h),
sent1_h * sent2_h
), 1)
if return_hidden:
return sent1_h, sent2_h, self.nli_decoder(features)
else:
return self.nli_decoder(features)
else:
src_emb = self.src_embedding(minibatch['input_src'])
trg_emb = self.trg_embedding[task_idx](minibatch['input_trg'])
src_lengths = minibatch['src_lens'].data.view(-1).tolist()
src_emb = pack_padded_sequence(
src_emb, src_lengths, batch_first=True
)
_, src_h_t = self.encoder(src_emb)
if self.bidirectional:
h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1)
else:
h_t = src_h_t[-1]
h_t = h_t.unsqueeze(0)
h_t = self.enc_drp(h_t)
print("INSIDE FORWARD:", h_t.shape)
# Debug with squeeze on error.
trg_h, _ = self.decoders[task_idx](
trg_emb, h_t.view(-1, self.trg_hidden_dim), h_t.view(-1, self.trg_hidden_dim)
)
trg_h_reshape = trg_h.contiguous().view(
trg_h.size(0) * trg_h.size(1), trg_h.size(2)
)
decoder_logit = self.decoder2vocab[task_idx](trg_h_reshape)
decoder_logit = decoder_logit.view(
trg_h.size(0), trg_h.size(1), decoder_logit.size(1)
)
if (
self.paired_tasks is not None and
task_idx in self.paired_tasks and
paired_trg is not None
):
other_task_idx = self.paired_tasks[task_idx]
trg_emb_2 = self.trg_embedding[other_task_idx](paired_trg)
trg_h_2, _ = self.decoders[other_task_idx](
trg_emb_2, h_t.squeeze(), h_t.squeeze()
)
trg_h_reshape_2 = trg_h_2.contiguous().view(
trg_h_2.size(0) * trg_h_2.size(1), trg_h_2.size(2)
)
decoder_logit_2 = self.decoder2vocab[other_task_idx](
trg_h_reshape_2
)
decoder_logit_2 = decoder_logit_2.view(
trg_h_2.size(0), trg_h_2.size(1), decoder_logit_2.size(1)
)
if return_hidden:
return decoder_logit, decoder_logit_2, h_t
else:
return decoder_logit, decoder_logit_2
if return_hidden:
return decoder_logit, h_t
else:
return decoder_logit
def decode(self, logits):
"""Return probability distribution over words."""
logits_reshape = logits.view(-1, logits.size(2))
word_probs = F.softmax(logits_reshape, dim=-1)
word_probs = word_probs.view(
logits.size(0), logits.size(1), logits.size(2)
)
return word_probs
def get_hidden(self, input_src, src_lengths, strategy='last'):
"""Return the encoder hidden state."""
src_emb = self.src_embedding(input_src)
src_lengths = src_lengths.data.view(-1).tolist()
src_emb = pack_padded_sequence(src_emb, src_lengths, batch_first=True)
src_h, src_h_t = self.encoder(src_emb)
if strategy == 'last':
if self.bidirectional:
h_t = torch.cat((src_h_t[-1], src_h_t[-2]), 1)
else:
h_t = src_h_t[-1]
else:
src_h, _ = pad_packed_sequence(src_h, batch_first=True)
print("INSIDE GET HIDDEN",torch.max(src_h, 1)[0].shape)
h_t = torch.max(src_h, 1)[0].squeeze()
return src_h, h_t
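(For orientation, a hedged smoke test of the seq2seq path of `MultitaskModel`. Shapes follow the `forward()` docstring; all values below are illustrative and this snippet is not part of the repo.)

```python
# Illustrative only: exercise MultitaskModel's seq2seq branch with toy tensors.
import torch

model = MultitaskModel(
    src_emb_dim=512, trg_emb_dim=512,
    src_vocab_size=1000, trg_vocab_size=1000,
    src_hidden_dim=2048, trg_hidden_dim=2048,   # equal, so h_t feeds the decoder
    pad_token_src=0, pad_token_trg=0,
    num_tasks=1, bidirectional=True, dropout=0.0,
)

minibatch = {
    "type": "seq2seq",                              # anything other than "nli"
    "input_src": torch.randint(1, 1000, (4, 10)),   # batch x src seq len
    "input_trg": torch.randint(1, 1000, (4, 12)),   # batch x trg seq len
    "src_lens": torch.tensor([10, 9, 7, 5]),        # must be sorted descending
}
logits = model(minibatch, task_idx=0)
print(logits.shape)  # (4, 12, 1000) = batch x trg seq len x trg vocab size
```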

View file

@@ -3,13 +3,14 @@
import json
import os
from sklearn.metrics.pairwise import cosine_similarity
from utils_nlp.model.gensen import train
from utils_nlp.model.gensen.gensen import GenSenSingle
from utils_nlp.model.gensen.gensen_utils import gensen_preprocess
from utils_nlp.model.gensen.create_gensen_model import (
create_multiseq2seq_model,
)
class GenSenClassifier:
@@ -25,11 +26,11 @@ class GenSenClassifier:
"""
def __init__(
self,
config_file,
pretrained_embedding_path,
learning_rate=0.0001,
cache_dir=".",
):
self.learning_rate = learning_rate
self.config_file = config_file
@@ -41,7 +42,7 @@ class GenSenClassifier:
"""Validate input params."""
if not isinstance(self.learning_rate, float) or (
self.learning_rate <= 0.0
):
raise ValueError(
"Learning rate must be of type float and greater than 0"
@@ -133,8 +134,6 @@ class GenSenClassifier:
"""
self._validate_params()
# Use only if you have the model trained and saved.
# self.cache_dir = os.path.join(self.cache_dir, "clean/snli_1.0")
self._create_multiseq2seq_model()

View file

@@ -9,133 +9,28 @@ ICLR 2018
### About
GenSen is a technique to learn general purpose, fixed-length representations of sentences via multi-task training. These representations are useful for transfer and low-resource learning. For details please refer to the ICLR [paper](https://openreview.net/forum?id=B18WgG-CZ&noteId=B18WgG-CZ).
### Code
We provide a distributed PyTorch implementation (using Horovod) of the paper, along with pre-trained models and code to evaluate these models on a variety of transfer learning benchmarks.
This code is based on the GitHub codebase from [Maluuba](https://github.com/Maluuba/gensen), but we have refactored it in the following ways:
1. Support distributed training in PyTorch with Horovod (a rough sketch follows this list)
2. Clean and refactor the original code into a more structured form
3. Change the training script (`train.py`) from running indefinitely to stopping when the validation loss reaches a local minimum
4. Update the code from Python 2.7 to 3+ and PyTorch from 0.2/0.3 to 1.0.1
5. Add necessary comments
6. Add code for training on the AzureML platform
7. Fix the bug where training raises an error when the batch size is set to 1
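Regarding item 1, the sketch below shows the usual Horovod data-parallel wiring in PyTorch. It is illustrative only: `build_model` is a placeholder, and the exact integration in `train.py` is not shown in this commit.

```python
# Minimal Horovod + PyTorch sketch (build_model() is a hypothetical factory).
import horovod.torch as hvd
import torch

hvd.init()                                  # one process per GPU
torch.cuda.set_device(hvd.local_rank())     # pin this process to its GPU

model = build_model().cuda()                # placeholder model factory
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4 * hvd.size())

# Average gradients across workers and keep all replicas in sync.
optimizer = hvd.DistributedOptimizer(
    optimizer, named_parameters=model.named_parameters()
)
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
```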
### Requirements
- Python 3+
- PyTorch 1.0.1
- nltk
- h5py
- numpy
- scikit-learn
#### Usage
##### Setting up Models & pre-trained word vectors
You can download the pre-trained models and set up the pre-trained word vectors for vocabulary expansion by running:
```bash
cd data/models
bash download_models.sh
cd ../embedding
bash glove2h5.sh
```
##### Using a pre-trained model to extract sentence representations.
You can use the pre-trained models to extract the last hidden state or all hidden states of the multi-task GRU. Additionally, you can concatenate the output of multiple models to replicate the numbers in the paper.
```python
from gensen import GenSen, GenSenSingle
gensen_1 = GenSenSingle(
model_folder='./data/models',
filename_prefix='nli_large_bothskip',
pretrained_emb='./data/embedding/glove.840B.300d.h5'
)
reps_h, reps_h_t = gensen_1.get_representation(
sentences, pool='last', return_numpy=True, tokenize=True
)
print(reps_h.shape, reps_h_t.shape)
```
- The input to `get_representation` is `sentences`, which should be a list of strings. If your strings are not pre-tokenized, then set `tokenize=True` to use the NLTK tokenizer before computing representations.
- `reps_h` (batch_size x seq_len x 2048) contains the hidden states for all words in all sentences (padded to the max length of sentences)
- `reps_h_t` (batch_size x 2048) contains only the last hidden state for all sentences in the minibatch
GenSenSingle will return the output of a single model `nli_large_bothskip (+STN +Fr +De +NLI +L +STP)`. You can concatenate the output of multiple models by creating a GenSen instance with multiple GenSenSingle instances, as follows:
```python
gensen_2 = GenSenSingle(
model_folder='./data/models',
filename_prefix='nli_large_bothskip_parse',
pretrained_emb='./data/embedding/glove.840B.300d.h5'
)
gensen = GenSen(gensen_1, gensen_2)
reps_h, reps_h_t = gensen.get_representation(
sentences, pool='last', return_numpy=True, tokenize=True
)
```
1) `reps_h` (batch_size x seq_len x 4096) contains the hidden states for all words in all sentences (padded to the max length of sentences)
2) `reps_h_t` (batch_size x 4096) contains only the last hidden state for all sentences in the minibatch
The model will produce a fixed-length vector for each sentence as well as the hidden states corresponding to each word in every sentence (padded to max sentence length). You can also return a numpy array instead of a `torch.FloatTensor` by setting `return_numpy=True`.
##### Vocabulary Expansion
If you have a specific domain for which you want to compute representations, you can call `vocab_expansion` on instances of the GenSenSingle or GenSen class simply by `gensen.vocab_expansion(vocab)`, where `vocab` is a list of unique words in the new domain. This will learn a linear mapping from the provided pretrained embeddings (which have a significantly larger vocabulary) to the space of GenSen's word vectors. For an example of how this is used in an actual setting, please refer to `gensen_senteval.py`.
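A hedged example of that flow (the corpus path and sentence are made up; the `GenSenSingle` arguments mirror the snippets above):

```python
# Illustrative only: build a domain vocabulary and expand the embeddings onto it.
from gensen import GenSenSingle

gensen = GenSenSingle(
    model_folder='./data/models',
    filename_prefix='nli_large_bothskip',
    pretrained_emb='./data/embedding/glove.840B.300d.h5'
)

with open('my_domain_corpus.txt') as f:     # hypothetical in-domain text file
    vocab = sorted({word for line in f for word in line.split()})

gensen.vocab_expansion(vocab)               # learns the linear mapping
reps_h, reps_h_t = gensen.get_representation(
    ['an in-domain sentence to encode'],
    pool='last', return_numpy=True, tokenize=True
)
```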
##### Training a model from scratch
To train a model from scratch, simply run `train.py` with an appropriate JSON config file. An example config is provided in `example_config.json`. To continue training, just relaunch the same script with `load_dir=auto` in the config file.
To download some of the data required to train a GenSen model, run:
```bash
bash get_data.sh
```
Note that this script can take a while to complete since it downloads, tokenizes and lowercases a fairly large En-Fr corpus. If you already have these parallel corpora processed, you can replace the paths to these files in the provided `example_config.json`.
Some of the data used in our work is no longer publicly available (BookCorpus - see http://yknzhu.wixsite.com/mbweb) or has an LDC license associated (Penn Treebank). As a result, the `example_config.json` script will only train on Multilingual NMT and NLI, since they are publicly available. To use models trained on all tasks, please use our available pre-trained models.
Additional Sequence-to-Sequence transduction tasks can be added trivially to the multi-task framework by editing the JSON config file with more tasks (see the sketch after the command below).
```bash
python train.py --config example_config.json
```
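As a hedged illustration of adding another task, the sketch below appends an entry to the `paths` list; field names mirror the example config removed earlier in this commit, and the `.tok` paths are placeholders.

```python
# Illustrative only: add one more seq2seq task to the config's data paths.
import json

with open("example_config.json") as f:
    config = json.load(f)

config["data"]["paths"].append({
    "train_src": "new_task_train.src.tok",   # placeholder tokenized source file
    "train_trg": "new_task_train.trg.tok",   # placeholder tokenized target file
    "val_src": "new_task_dev.src.tok",
    "val_trg": "new_task_dev.trg.tok",
    "taskname": "new_task",
})

with open("example_config.json", "w") as f:
    json.dump(config, f, indent=4)
```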
To use the default settings in `example_config.json` you will need a GPU with at least 16GB of memory (such as a P100). To train on smaller GPUs, you may need to reduce the batch size.
Note that if "load_dir" is set to auto, the script will resume from the last saved model in "save_dir".
##### Creating a GenSen model from a trained multi-task model
Once you have a trained model, you can throw away all of the decoders and just retain the encoder used to compute sentence representations.
You can do this by running:
```bash
python create_gensen.py -t <path_to_trained_model> -s <path_to_save_encoder> -n <name_of_encoder>
```
Once you have done this, you can load the model just like any of the pre-trained models by specifying `model_folder` as `<path_to_save_encoder>` and `filename_prefix` as `<name_of_encoder>`, as shown below.
```python
your_gensen = GenSenSingle(
model_folder='<path_to_save_encoder>',
filename_prefix='<name_of_encoder>',
pretrained_emb='./data/embedding/glove.840B.300d.h5'
)
```
### Transfer Learning Evaluations
We used the [SentEval](https://github.com/facebookresearch/SentEval) toolkit to run most of the transfer learning experiments. To replicate these numbers, clone their repository and follow the setup instructions. Once complete, copy `gensen_senteval.py` and `gensen.py` into their examples folder and run the following commands to reproduce different rows in Table 2 of the paper. Note: please set the path to the pretrained GloVe embeddings (`glove.840B.300d.h5`) and the model folder as appropriate.
```
(+STN +Fr +De +NLI +L +STP) python gensen_senteval.py --prefix_1 nli_large --prefix_2 nli_large_bothskip
(+STN +Fr +De +NLI +2L +STP) python gensen_senteval.py --prefix_1 nli_large_bothskip --prefix_2 nli_large_bothskip_2layer
(+STN +Fr +De +NLI +L +STP +Par) python gensen_senteval.py --prefix_1 nli_large_bothskip_parse --prefix_2 nli_large_bothskip
```
### Reference
```

View file

@@ -1,5 +1,8 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""Minibatching utilities."""
import itertools
import operator