update summarization models
This commit is contained in:
Parent: 8f965d44e5
Commit: 78726ff471
@@ -5,34 +5,29 @@
 # This script reuses some code from https://github.com/huggingface/transformers/
 # Add to noticefile

-from collections import namedtuple
 import logging
 import os
 import pickle
-from tqdm import tqdm
+from collections import namedtuple

 import torch
-from torch.utils.data import (
-    DataLoader,
-    SequentialSampler,
-    RandomSampler,
-)
-
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
-from transformers import BertModel
+from tqdm import tqdm
+from transformers import AutoTokenizer, BertModel

 from utils_nlp.common.pytorch_utils import (
     compute_training_steps,
-    get_device,
+    get_amp,
+    get_device,
     move_model_to_device,
     parallelize_model,
 )
 from utils_nlp.eval import compute_rouge_python
-from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
 from utils_nlp.models.transformers.bertsum import model_builder
 from utils_nlp.models.transformers.bertsum.model_builder import AbsSummarizer
 from utils_nlp.models.transformers.bertsum.predictor import build_predictor
+from utils_nlp.models.transformers.common import Transformer

 MODEL_CLASS = {"bert-base-uncased": BertModel}

-
@@ -134,8 +129,11 @@ class BertSumAbsProcessor:
         """
         self.model_name = model_name
-        self.tokenizer = TOKENIZER_CLASS[self.model_name].from_pretrained(
-            self.model_name, do_lower_case=to_lower, cache_dir=cache_dir
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            do_lower_case=to_lower,
+            cache_dir=cache_dir,
+            output_loading_info=False,
         )

         self.symbols = {
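The processor now obtains its tokenizer through AutoTokenizer instead of the repo's TOKENIZER_CLASS mapping. A minimal standalone sketch of the equivalent lookup, assuming only the transformers package (model name and keyword values are illustrative, not taken from the commit):

# Sketch: resolve the tokenizer the same way BertSumAbsProcessor now does.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",   # any key of MODEL_CLASS should resolve the same way
    do_lower_case=True,    # forwarded to the underlying BERT tokenizer
    cache_dir=".",         # where downloaded vocabulary files are cached
)
print(type(tokenizer).__name__)  # e.g. BertTokenizer or BertTokenizerFast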
@@ -156,7 +154,7 @@ class BertSumAbsProcessor:
     @staticmethod
     def list_supported_models():
-        return list(MODEL_CLASS.keys())
+        return list(MODEL_CLASS)

     @property
     def model_name(self):
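The list(MODEL_CLASS) simplification relies on dict iteration yielding keys, so it returns the same model names as list(MODEL_CLASS.keys()). A quick check with a stand-in mapping:

# Stand-in mapping; the real MODEL_CLASS maps model names to model classes.
MODEL_CLASS = {"bert-base-uncased": object}
assert list(MODEL_CLASS) == list(MODEL_CLASS.keys()) == ["bert-base-uncased"]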
@@ -184,7 +182,7 @@ class BertSumAbsProcessor:
                 also contains the target ids and the number of tokens
                 in the target and target text.
             device (torch.device): A PyTorch device.
-            model_name (bool, optional): Model name used to format the inputs.
+            model_name (bool): Model name used to format the inputs.
             train_mode (bool, optional): Training mode flag.
                 Defaults to True.
@@ -403,7 +401,8 @@ class BertSumAbs(Transformer):
                 check MODEL_CLASS for supported models. Defaults to "bert-base-uncased".
             finetune_bert (bool, option): Whether the bert model in the encoder is
                 finetune or not. Defaults to True.
-            cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
+            cache_dir (str, optional): Directory to cache the tokenizer.
+                Defaults to ".".
             label_smoothing (float, optional): The amount of label smoothing.
                 Value range is [0, 1]. Defaults to 0.1.
             test (bool, optional): Whether the class is initiated for test or not.
@@ -412,13 +411,11 @@ class BertSumAbs(Transformer):
             max_pos_length (int, optional): maximum postional embedding length for the
                 input. Defaults to 768.
         """
-        super().__init__(
-            model_class=MODEL_CLASS,
-            model_name=model_name,
-            num_labels=0,
-            cache_dir=cache_dir,
+        model = MODEL_CLASS[model_name].from_pretrained(
+            model_name, cache_dir=cache_dir, num_labels=0, output_loading_info=False
         )
+        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)

         if model_name not in self.list_supported_models():
             raise ValueError(
                 "Model name {} is not supported by BertSumAbs. "
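The constructor now instantiates the backbone itself with from_pretrained and passes the ready model to the Transformer base class, rather than handing the base class a class map to resolve. A hedged sketch of that pattern in isolation (only BertModel and the kwargs visible in the diff are assumed; the surrounding names are illustrative):

# Sketch: load the encoder first, then hand the instance to the wrapper class.
from transformers import BertModel

MODEL_CLASS = {"bert-base-uncased": BertModel}
model_name = "bert-base-uncased"

model = MODEL_CLASS[model_name].from_pretrained(
    model_name,
    cache_dir=".",              # same cache directory the tokenizer uses
    output_loading_info=False,  # return only the model, not the loading report
)
# BertSumAbs then forwards model_name, model, and cache_dir to Transformer.__init__.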
@@ -616,10 +613,7 @@ class BertSumAbs(Transformer):
         )

         train_dataloader = DataLoader(
-            train_dataset,
-            sampler=sampler,
-            batch_size=batch_size,
-            collate_fn=collate_fn,
+            train_dataset, sampler=sampler, batch_size=batch_size, collate_fn=collate_fn
         )

         # compute the max number of training steps

@@ -1,36 +1,34 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.

-import os
 import json
 import logging
-from tqdm import tqdm
+import os
 import random

 import torch
-from torch.utils.data import DataLoader, SequentialSampler, Dataset
+from torch.utils.data import DataLoader, Dataset, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm
+from transformers import BertConfig, RobertaConfig

-from transformers import RobertaConfig, BertConfig
-
-from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
+import s2s_ft
+import s2s_ft.s2s_loader as seq2seq_loader
+from s2s_ft.config import BertForSeq2SeqConfig
+from s2s_ft.configuration_unilm import UNILM_PRETRAINED_CONFIG_ARCHIVE_MAP, UnilmConfig
+from s2s_ft.modeling import (
+    UNILM_PRETRAINED_MODEL_ARCHIVE_MAP,
+    BertForSequenceToSequence,
+)
+from s2s_ft.modeling_decoding import BertForSeq2SeqDecoder
+from s2s_ft.tokenization_unilm import UnilmTokenizer
+from s2s_ft.utils import Seq2seqDatasetForBert, batch_list_to_batch_tensors
 from utils_nlp.common.pytorch_utils import (
     get_device,
     move_model_to_device,
     parallelize_model,
 )
-import s2s_ft
-from s2s_ft.utils import (
-    Seq2seqDatasetForBert,
-    batch_list_to_batch_tensors,
-)
-from s2s_ft.modeling import BertForSequenceToSequence
-from s2s_ft.modeling import UNILM_PRETRAINED_MODEL_ARCHIVE_MAP
-from s2s_ft.tokenization_unilm import UnilmTokenizer
-from s2s_ft.configuration_unilm import UnilmConfig, UNILM_PRETRAINED_CONFIG_ARCHIVE_MAP
-from s2s_ft.config import BertForSeq2SeqConfig
-import s2s_ft.s2s_loader as seq2seq_loader
-from s2s_ft.modeling_decoding import BertForSeq2SeqDecoder
+from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer

 SUPPORTED_BERT_MODELS = ["bert-large-uncased", "bert-base-cased", "bert-large-cased"]
 SUPPORTED_ROBERTA_MODELS = ["roberta-base", "roberta-large"]
@@ -115,9 +113,7 @@ class S2SAbsSumProcessor:
             Defaults to ".".
         """

-    def __init__(
-        self, model_name="unilm-base-cased", to_lower=False, cache_dir=".",
-    ):
+    def __init__(self, model_name="unilm-base-cased", to_lower=False, cache_dir="."):

         self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
             model_name, do_lower_case=to_lower, cache_dir=cache_dir
@@ -513,7 +509,7 @@ class S2SAbstractiveSummarizer(Transformer):
         model_config = config_class.from_pretrained(
             self._model_name, cache_dir=cache_dir
         )
-
+
         self.model_config = model_config

         # Convert regular model config to sequence to sequence config
@@ -888,15 +884,15 @@ class S2SAbstractiveSummarizer(Transformer):
         state_dict = self.model.state_dict()

         bert_config = s2s_ft.modeling_decoding.BertConfig(
-            len(list(vocab.keys())) if not is_roberta else 50265,
-            type_vocab_size=type_vocab_size,
-            max_position_embeddings=self.max_seq_length,
-            ffn_type=s2s_config.ffn_type,
-            num_qkv=s2s_config.num_qkv,
-            seg_emb=s2s_config.seg_emb,
-            is_roberta=is_roberta,
-            no_segment_embedding=no_segment_embedding
-        )
+            len(list(vocab.keys())) if not is_roberta else 50265,
+            type_vocab_size=type_vocab_size,
+            max_position_embeddings=self.max_seq_length,
+            ffn_type=s2s_config.ffn_type,
+            num_qkv=s2s_config.num_qkv,
+            seg_emb=s2s_config.seg_emb,
+            is_roberta=is_roberta,
+            no_segment_embedding=no_segment_embedding,
+        )

         model = BertForSeq2SeqDecoder.from_pretrained(
             self._bert_model_name,
@@ -1035,7 +1031,7 @@ class S2SAbstractiveSummarizer(Transformer):
         if fp16:
             optim_to_save["amp"] = self.amp_state_dict
         torch.save(
-            optim_to_save, os.path.join(output_dir, "optim.{}.bin".format(global_step)),
+            optim_to_save, os.path.join(output_dir, "optim.{}.bin".format(global_step))
         )

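For context, the optimizer checkpoint written here is a plain torch.save of a dict, so it can be restored with a matching torch.load when training resumes. A minimal hedged sketch (directory, step number, and dict keys are illustrative stand-ins):

# Sketch: write and re-read an optimizer checkpoint the way the trainer does.
import os
import torch

output_dir = "./outputs"
global_step = 1000
optim_to_save = {"optimizer": {}, "lr_scheduler": {}}  # stand-ins for real state dicts

os.makedirs(output_dir, exist_ok=True)
torch.save(optim_to_save, os.path.join(output_dir, "optim.{}.bin".format(global_step)))
restored = torch.load(os.path.join(output_dir, "optim.{}.bin".format(global_step)))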
@@ -1087,7 +1083,7 @@ def load_and_cache_examples(
         else:
             source_tokens = tokenizer.tokenize(example["src"])
         features.append(
-            {"source_ids": tokenizer.convert_tokens_to_ids(source_tokens),}
+            {"source_ids": tokenizer.convert_tokens_to_ids(source_tokens)}
         )

     if shuffle:

@@ -12,14 +12,20 @@ from multiprocessing import Pool, cpu_count
 import numpy as np
 import torch
-from torch.utils.data import (
-    DataLoader,
-    SequentialSampler,
-    RandomSampler,
-)
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
-from transformers import BertModel, DistilBertModel
+from transformers import AutoTokenizer, BertModel, DistilBertModel

+from utils_nlp.common.pytorch_utils import (
+    compute_training_steps,
+    get_device,
+    move_model_to_device,
+    parallelize_model,
+)
+from utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection
+from utils_nlp.models.transformers.abstractive_summarization_bertsum import (
+    fit_to_block_size,
+)
 from utils_nlp.models.transformers.bertsum import model_builder
 from utils_nlp.models.transformers.bertsum.data_loader import (
@@ -32,17 +38,7 @@ from utils_nlp.models.transformers.bertsum.dataset import (
     ExtSumProcessedIterableDataset,
 )
 from utils_nlp.models.transformers.bertsum.model_builder import BertSumExt
-from utils_nlp.common.pytorch_utils import (
-    compute_training_steps,
-    get_device,
-    move_model_to_device,
-    parallelize_model,
-)
-from utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection
-from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
-from utils_nlp.models.transformers.abstractive_summarization_bertsum import (
-    fit_to_block_size,
-)
+from utils_nlp.models.transformers.common import Transformer

 MODEL_CLASS = {
     "bert-base-uncased": BertModel,
@@ -302,7 +298,7 @@ def parallel_preprocess(input_data, preprocess, num_pool=-1):
     p = Pool(num_pool)

     results = p.map(
-        preprocess, input_data, chunksize=min(1, int(len(input_data) / num_pool)),
+        preprocess, input_data, chunksize=min(1, int(len(input_data) / num_pool))
     )
     p.close()
     p.join()
@@ -347,8 +343,11 @@ class ExtSumProcessor:
         """
         self.model_name = model_name
-        self.tokenizer = TOKENIZER_CLASS[self.model_name].from_pretrained(
-            self.model_name, do_lower_case=to_lower, cache_dir=cache_dir
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            do_lower_case=to_lower,
+            cache_dir=cache_dir,
+            output_loading_info=False,
         )
         self.sep_vid = self.tokenizer.vocab["[SEP]"]
         self.cls_vid = self.tokenizer.vocab["[CLS]"]
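Right after loading the tokenizer, the processor caches the vocabulary ids of [SEP] and [CLS]. A small sketch of that lookup on its own (model name illustrative; the printed ids assume the stock bert-base-uncased vocabulary):

# Sketch: the special-token ids ExtSumProcessor stores as sep_vid / cls_vid.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
sep_vid = tokenizer.vocab["[SEP]"]
cls_vid = tokenizer.vocab["[CLS]"]
assert sep_vid == tokenizer.convert_tokens_to_ids("[SEP]")
assert cls_vid == tokenizer.convert_tokens_to_ids("[CLS]")
print(cls_vid, sep_vid)  # 101 102 for the standard bert-base-uncased vocab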
@@ -361,7 +360,7 @@ class ExtSumProcessor:
     @staticmethod
     def list_supported_models():
-        return list(TOKENIZER_CLASS.keys())
+        return list(MODEL_CLASS)

     @property
     def model_name(self):
@@ -389,7 +388,7 @@ class ExtSumProcessor:
                 text. If train_model is True, it also contains the labels and target
                 text.
             device (torch.device): A PyTorch device.
-            model_name (bool, optional): Model name used to format the inputs.
+            model_name (bool): Model name used to format the inputs.
             train_mode (bool, optional): Training mode flag.
                 Defaults to True.
@@ -500,7 +499,6 @@ class ExtSumProcessor:
         if len(src) == 0:
             raise ValueError("source doesn't have any sentences")
-            return None

         original_src_txt = [" ".join(s) for s in src]
         # no filtering for prediction
@@ -588,12 +586,11 @@ class ExtractiveSummarizer(Transformer):
             Defaults to ".".
         """

-        super().__init__(
-            model_class=MODEL_CLASS,
-            model_name=model_name,
-            num_labels=0,
-            cache_dir=cache_dir,
+        model = MODEL_CLASS[model_name].from_pretrained(
+            model_name, cache_dir=cache_dir, num_labels=0, output_loading_info=False
         )
+        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)
+
         if model_name not in self.list_supported_models():
             raise ValueError(
                 "Model name {} is not supported by ExtractiveSummarizer. "
@@ -621,7 +618,7 @@ class ExtractiveSummarizer(Transformer):
     @staticmethod
     def list_supported_models():
-        return list(MODEL_CLASS.keys())
+        return list(MODEL_CLASS)

     def fit(
         self,