This commit is contained in:
saidbleik 2020-02-11 05:27:23 +00:00
Parent 6769aa765d
Commit 67b1d8a8ea
3 changed files: 265 additions and 165 deletions

View file

@@ -69,23 +69,19 @@
"source": [
"import os\n",
"import sys\n",
"import scrapbook as sb\n",
"\n",
"import scrapbook as sb\n",
"import torch\n",
"\n",
"nlp_path = os.path.abspath('../../')\n",
"if nlp_path not in sys.path:\n",
" sys.path.insert(0, nlp_path)\n",
"\n",
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
"from utils_nlp.common.timer import Timer\n",
"from utils_nlp.dataset.squad import load_pandas_df\n",
"from utils_nlp.eval.question_answering import evaluate_qa\n",
"from utils_nlp.models.transformers.datasets import QADataset\n",
"from utils_nlp.models.transformers.question_answering import (\n",
" AnswerExtractor,\n",
" QAProcessor,\n",
" AnswerExtractor\n",
")\n",
" \n",
"from utils_nlp.eval.question_answering import evaluate_qa\n",
"from utils_nlp.common.timer import Timer"
")"
]
},
{
@@ -559,7 +555,7 @@
"* Pad the concatenated token sequence to `max_seq_length` if it's shorter.\n",
"* Convert the tokens into token indices corresponding to the tokenizer's vocabulary.\n",
"\n",
"`QAProcessor.preprocess` returns a Pytorch Dataloader. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required by postprocessing the predicted answer start and end indices to get the final answer text. You can change the default file directory by specifying `feature_cache_dir`. "
"`QAProcessor.preprocess` returns a Pytorch DataSet. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required by postprocessing the predicted answer start and end indices to get the final answer text. You can change the default file directory by specifying `feature_cache_dir`. "
]
},
{
@@ -577,24 +573,28 @@
],
"source": [
"qa_processor = QAProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE)\n",
"train_dataloader = qa_processor.preprocess(\n",
" train_dataset, \n",
" batch_size=PER_GPU_BATCH_SIZE,\n",
" num_gpus=NUM_GPUS,\n",
"train_dataset = qa_processor.preprocess(\n",
" train_dataset,\n",
" is_training=True,\n",
" max_question_length=MAX_QUESTION_LENGTH,\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" doc_stride=DOC_STRIDE\n",
" doc_stride=DOC_STRIDE,\n",
")\n",
"\n",
"dev_dataloader = qa_processor.preprocess(\n",
" dev_dataset, \n",
" batch_size=PER_GPU_BATCH_SIZE,\n",
" num_gpus=NUM_GPUS,\n",
"# we keep a copy of the oroginal dev_dataset as it is needed for evaluation\n",
"dev_dataset_processed = qa_processor.preprocess(\n",
" dev_dataset,\n",
" is_training=False,\n",
" max_question_length=MAX_QUESTION_LENGTH,\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" doc_stride=DOC_STRIDE\n",
" doc_stride=DOC_STRIDE,\n",
")\n",
"\n",
"train_dataloader = dataloader_from_dataset(\n",
" train_dataset, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n",
")\n",
"dev_dataloader = dataloader_from_dataset(\n",
" dev_dataset_processed, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n",
")"
]
},
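
For reference, a minimal sketch of the updated notebook flow in plain Python. It assumes `train_dataset` and `dev_dataset` are `QADataset` objects built earlier in the notebook and that the upper-case constants are defined as in the notebook; `preprocess` now returns a PyTorch dataset, and batching is handled separately by `dataloader_from_dataset`.

from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.models.transformers.question_answering import AnswerExtractor, QAProcessor

qa_processor = QAProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE)

# preprocess() no longer takes batch_size/num_gpus and returns a dataset
train_ds = qa_processor.preprocess(
    train_dataset,
    is_training=True,
    max_question_length=MAX_QUESTION_LENGTH,
    max_seq_length=MAX_SEQ_LENGTH,
    doc_stride=DOC_STRIDE,
)
dev_ds = qa_processor.preprocess(
    dev_dataset,
    is_training=False,
    max_question_length=MAX_QUESTION_LENGTH,
    max_seq_length=MAX_SEQ_LENGTH,
    doc_stride=DOC_STRIDE,
)

# batching and multi-GPU scaling are handled by dataloader_from_dataset()
train_dataloader = dataloader_from_dataset(
    train_ds, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True
)
dev_dataloader = dataloader_from_dataset(
    dev_ds, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False
)

# fine-tune and score
qa_extractor = AnswerExtractor(model_name=MODEL_NAME)
qa_extractor.fit(train_dataloader, num_gpus=NUM_GPUS)
results = qa_extractor.predict(dev_dataloader, num_gpus=NUM_GPUS)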

View file

@@ -65,22 +65,18 @@ def qa_test_data(qa_test_df, tmp_module):
qa_id_col=qa_test_df["qa_id_col"],
)
# bert
qa_processor_bert = QAProcessor(cache_dir=tmp_module)
train_features_bert = qa_processor_bert.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_features_bert = qa_processor_bert.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@@ -88,22 +84,18 @@ def qa_test_data(qa_test_df, tmp_module):
feature_cache_dir=tmp_module,
)
# xlnet
qa_processor_xlnet = QAProcessor(model_name="xlnet-base-cased", cache_dir=tmp_module)
train_features_xlnet = qa_processor_xlnet.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_features_xlnet = qa_processor_xlnet.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@@ -111,22 +103,20 @@ def qa_test_data(qa_test_df, tmp_module):
feature_cache_dir=tmp_module,
)
qa_processor_distilbert = QAProcessor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
# distilbert
qa_processor_distilbert = QAProcessor(
model_name="distilbert-base-uncased", cache_dir=tmp_module
)
train_features_distilbert = qa_processor_distilbert.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_features_distilbert = qa_processor_distilbert.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@@ -151,11 +141,21 @@ def qa_test_data(qa_test_df, tmp_module):
@pytest.mark.gpu
def test_QAProcessor(qa_test_data, tmp_module):
for model_name in ["bert-base-cased", "xlnet-base-cased", "distilbert-base-uncased"]:
for model_name in [
"bert-base-cased",
"xlnet-base-cased",
"distilbert-base-uncased",
]:
qa_processor = QAProcessor(model_name=model_name, cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module)
qa_processor.preprocess(
qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module,
)
qa_processor.preprocess(
qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module,
)
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module,
)
# test unsupported model type
with pytest.raises(ValueError):
@@ -163,18 +163,24 @@ def test_QAProcessor(qa_test_data, tmp_module):
# test training data has no ground truth exception
with pytest.raises(Exception):
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module
)
# test when answer start is a list, but answer text is not
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["train_dataset_start_text_mismatch"], is_training=True, feature_cache_dir=tmp_module,
qa_test_data["train_dataset_start_text_mismatch"],
is_training=True,
feature_cache_dir=tmp_module,
)
# test when training data has multiple answers
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["train_dataset_multi_answers"], is_training=True, feature_cache_dir=tmp_module,
qa_test_data["train_dataset_multi_answers"],
is_training=True,
feature_cache_dir=tmp_module,
)
@@ -190,7 +196,9 @@ def test_AnswerExtractor(qa_test_data, tmp_module):
assert os.path.exists(os.path.join(model_output_dir, "pytorch_model.bin"))
assert os.path.exists(os.path.join(model_output_dir, "config.json"))
qa_extractor_from_cache = AnswerExtractor(cache_dir=tmp_module, load_model_from_dir=model_output_dir)
qa_extractor_from_cache = AnswerExtractor(
cache_dir=tmp_module, load_model_from_dir=model_output_dir
)
qa_extractor_from_cache.predict(test_loader_bert, verbose=False)
# xlnet
@@ -202,8 +210,12 @@ def test_AnswerExtractor(qa_test_data, tmp_module):
# distilbert
train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_distilbert"])
test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_distilbert"], shuffle=False)
qa_extractor_distilbert = AnswerExtractor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
test_loader_xlnet = dataloader_from_dataset(
qa_test_data["test_features_distilbert"], shuffle=False
)
qa_extractor_distilbert = AnswerExtractor(
model_name="distilbert-base-uncased", cache_dir=tmp_module
)
qa_extractor_distilbert.fit(train_loader_xlnet, verbose=False, cache_model=False)
qa_extractor_distilbert.predict(test_loader_xlnet, verbose=False)

View file

@@ -27,19 +27,41 @@ import jsonlines
import torch
from torch.utils.data import TensorDataset
from tqdm import tqdm
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForQuestionAnswering
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForQuestionAnswering
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForQuestionAnswering
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForQuestionAnswering
from transformers.modeling_albert import (
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
AlbertForQuestionAnswering,
)
from transformers.modeling_bert import (
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
BertForQuestionAnswering,
)
from transformers.modeling_distilbert import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForQuestionAnswering,
)
from transformers.modeling_xlnet import (
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
XLNetForQuestionAnswering,
)
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
from utils_nlp.common.pytorch_utils import (
compute_training_steps,
get_device,
move_model_to_device,
)
from utils_nlp.models.transformers.common import (
MAX_SEQ_LEN,
TOKENIZER_CLASS,
Transformer,
)
MODEL_CLASS = {}
MODEL_CLASS.update({k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
{k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update({k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
# cached files during preprocessing
@@ -62,25 +84,29 @@ class QAProcessor:
Class for preprocessing and postprocessing question answering data.
Args:
model_name (str, optional): Name of the model. Call QAProcessor.list_supported_models() to
get all supported models. Defaults to "bert-base-cased".
model_name (str, optional): Name of the model.
Call QAProcessor.list_supported_models() to get all supported models.
Defaults to "bert-base-cased".
to_lower (bool, optional): Whether to convert all letters to lower case during
tokenization. This is determined by if a cased model is used. Defaults to False,
which corresponds to a cased model.
custom_tokenize (function, optional): A custom tokenize function used to tokenize the
input text. If not provided, the default tokenizer corresponding to the model_name
is loaded and its `tokenize` method is used. NOTE that even this function is
provided, the numerical token ids are still generated by the `convert_tokens_to_ids`
method of the default tokenizer, so there is a risk that tokens generated by the
custom_tokenize function don't have correponding token ids in the default toeknizer.
tokenization. This is determined by whether a cased model is used.
Defaults to False, which corresponds to a cased model.
custom_tokenize (function, optional): A custom tokenize function
used to tokenize the input text. If not provided, the default tokenizer
corresponding to the model_name is loaded and its `tokenize` method is used.
NOTE that even if this function is provided, the numerical token ids are still
generated by the `convert_tokens_to_ids` method of the default tokenizer,
so there is a risk that tokens generated by the custom_tokenize
function don't have corresponding token ids in the default tokenizer.
Defaults to None.
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
"""
def __init__(self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."):
def __init__(
self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir=".",
):
self.model_name = model_name
self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False,
)
self.do_lower_case = to_lower
self.custom_tokenize = custom_tokenize
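As a usage note, a minimal sketch of constructing `QAProcessor` with a custom tokenize function; the tokenizer below is hypothetical, and, as the docstring warns, token ids are still produced by the default tokenizer's `convert_tokens_to_ids`.

def whitespace_lower_tokenize(text):
    # hypothetical custom tokenizer, for illustration only
    return text.lower().split()

qa_processor = QAProcessor(
    model_name="bert-base-uncased",
    to_lower=True,
    custom_tokenize=whitespace_lower_tokenize,
    cache_dir=".",
)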
@@ -149,9 +175,6 @@ class QAProcessor:
self,
qa_dataset,
is_training,
batch_size=32,
num_gpus=None,
distributed=False,
max_question_length=64,
max_seq_length=MAX_SEQ_LEN,
doc_stride=128,
@@ -161,28 +184,31 @@ class QAProcessor:
Preprocesses raw question answering data and generates train/test features.
Args:
qa_dataset (:class:`utils_nlp.dataset.pytorch.QADataset`): Question answering data in
standard QADataset format.
qa_dataset (:class:`utils_nlp.dataset.pytorch.QADataset`):
Question answering data in standard QADataset format.
is_training (bool): Whether the input data is training data.
max_question_length (int, optional): Maximum number of tokens of the question sequence
after tokenization, so the number of words in the raw question is usually less than
max_question_length. Defaults to 64.
max_seq_length (int, optional): Maximum number of tokens of the entire feature token
sequence after tokenization. The entire feature token sequence is composed
of [CLS] + [Question tokens] + [SEP] + [Document tokens] + [SEP] for models other
than XLNet, and [Document tokens] + [SEP] + [Question tokens] + [SEP] + [CLS} for
max_question_length (int, optional): Maximum number of tokens
of the question sequence after tokenization, so the number of words
in the raw question is usually less than max_question_length.
Defaults to 64.
max_seq_length (int, optional): Maximum number of tokens of the entire
feature token sequence after tokenization. The entire feature token
sequence is composed of:
[CLS] + [Question tokens] + [SEP] + [Document tokens] + [SEP]
for models other than XLNet,
and [Document tokens] + [SEP] + [Question tokens] + [SEP] + [CLS] for
XLNet. Defaults to MAX_SEQ_LEN.
doc_stride (int, optional): Size (number of tokens) of the sliding window when
breaking down a long document paragraph in to multiple document spans. Defaults
to 128.
feature_cache_dir (int, optional): Directory to save some intermediate preprocessing
results.
doc_stride (int, optional): Size (number of tokens) of the sliding window
when breaking down a long document paragraph into multiple document
spans. Defaults to 128.
feature_cache_dir (str, optional): Directory to save some intermediate
preprocessing results.
If `is_training` is True, CACHED_EXAMPLES_TRAIN_FILE and
CACHED_FEATURES_TRAIN_FILE are saved to this directory. Otherwise,
CACHED_EXAMPLES_TEST_FILE and CACHED_FEATURES_TEST_FILE are saved to this
directory. These files are required during postprocessing to generate the final
answer texts from predicted answer start and answer end indices. Defaults to
"./cached_qa_features".
CACHED_EXAMPLES_TEST_FILE and CACHED_FEATURES_TEST_FILE are saved
to this directory. These files are required during postprocessing to
generate the final answer texts from predicted answer start and answer
end indices. Defaults to "./cached_qa_features".
Returns:
DataSet: A Pytorch DataSet.
"""
@@ -217,7 +243,9 @@
qa_examples.append(qa_example_cur)
qa_examples_json.append({"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens})
qa_examples_json.append(
{"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens}
)
features_cur = _create_qa_features(
qa_example_cur,
@@ -257,17 +285,25 @@
input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.long)
if is_training:
start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
qa_dataset = TensorDataset(
input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask,
input_ids,
input_mask,
segment_ids,
start_positions,
end_positions,
cls_index,
p_mask,
)
else:
unique_id_all = torch.tensor(unique_id_all, dtype=torch.long)
qa_dataset = TensorDataset(input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all)
qa_dataset = TensorDataset(
input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all
)
return qa_dataset
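For orientation, a hedged sketch of the field order in the returned `TensorDataset`, mirroring the `TensorDataset` construction above (`qa_processor`, `train_dataset`, and `dev_dataset` are assumed to be set up as in the notebook):

# training features: seven tensors per example
train_ds = qa_processor.preprocess(train_dataset, is_training=True)
input_ids, input_mask, segment_ids, start_pos, end_pos, cls_index, p_mask = train_ds[0]

# test features: unique_id replaces the answer start/end positions
dev_ds = qa_processor.preprocess(dev_dataset, is_training=False)
input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id = dev_ds[0]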
@@ -397,7 +433,14 @@ class QAResult(QAResult_):
QAResultExtended_ = collections.namedtuple(
"QAResultExtended",
["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits",],
[
"unique_id",
"start_top_log_probs",
"start_top_index",
"end_top_log_probs",
"end_top_index",
"cls_logits",
],
)
@@ -481,36 +524,45 @@ class AnswerExtractor(Transformer):
num_epochs (int, optional): Number of training epochs. Defaults to 1.
max_steps (int, optional): Total number of training steps.
If set to a positive value, it overrides num_epochs.
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
Otherwise, it's determined by the dataset length,
gradient_accumulation_steps, and num_epochs.
Defaults to -1.
gradient_accumulation_steps (int, optional): Number of steps to accumulate
before performing a backward/update pass.
Defaults to 1.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will be used.
num_gpus (int, optional): The number of GPUs to use.
If None, all available GPUs will be used.
If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each parameter update.
Defaults to 0.0.
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
5e-5.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
to `learning rate`. Defaults to 0.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
cache_model (bool, optional): Whether to save the fine-tuned model. If True,
the fine-tuned model is saved to a `fine_tuned` folder under of the `cache_dir`
of AnswerExtractor. Defaults to True.
local_rank (int, optional): Local_rank for distributed training on GPUs.
Defaults to -1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each
parameter update. Defaults to 0.0.
learning_rate (float, optional): Learning rate of the AdamW optimizer.
Defaults to 5e-5.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer.
Defaults to 1e-8.
warmup_steps (int, optional): Number of steps taken to increase
learning rate from 0 to `learning rate`.
Defaults to 0.
verbose (bool, optional): Whether to print out the training log.
Defaults to True.
seed (int, optional): Random seed used to improve reproducibility.
Defaults to None.
cache_model (bool, optional): Whether to save the fine-tuned model.
If True, the fine-tuned model is saved to a `fine_tuned` folder
under the `cache_dir` of AnswerExtractor.
Defaults to True.
"""
# init optimizer
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
optimizer = Transformer.get_default_optimizer(
self.model, weight_decay, learning_rate, adam_epsilon
)
# compute the max number of training steps
max_steps = compute_training_steps(
@@ -522,7 +574,7 @@ class AnswerExtractor(Transformer):
# init scheduler
scheduler = Transformer.get_default_scheduler(
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps
)
# fine tune
@@ -530,7 +582,7 @@ class AnswerExtractor(Transformer):
train_dataloader=train_dataloader,
get_inputs=QAProcessor.get_inputs,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
gpu_ids=gpu_ids,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
optimizer=optimizer,
@@ -550,13 +602,15 @@ class AnswerExtractor(Transformer):
Args:
test_dataloader (DataLoader): DataLoader for scoring the data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
num_gpus (int, optional): The number of GPUs to use.
If None, all available GPUs will be used.
If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
verbose (bool, optional): Whether to print out the predicting log. Defaults to True.
verbose (bool, optional): Whether to print out the predicting log.
Defaults to True.
Returns:
list: List of :class:`QAResult` or :class:`QAResultExtended`.
@@ -589,7 +643,9 @@
)
else:
result = QAResult(
unique_id=u_id.item(), start_logits=_to_list(outputs[0][i]), end_logits=_to_list(outputs[1][i]),
unique_id=u_id.item(),
start_logits=_to_list(outputs[0][i]),
end_logits=_to_list(outputs[1][i]),
)
all_results.append(result)
torch.cuda.empty_cache()
@@ -612,53 +668,61 @@ def postprocess_bert_answer(
verbose_logging=False,
):
"""
Postprocesses start and end logits generated by :meth:`AnswerExtractor.fit` for BERT.
Postprocesses start and end logits
generated by :meth:`AnswerExtractor.predict` for BERT.
Args:
results (list): List of :class:`QAResult`.
examples_file (str): One of the files cached by :meth:`QAProcessor.preprocess`. This file
contains the original document tokens that are used to generate the final answers
from the predicted start and end positions.
features_file (str): One of the files cached by :meth:`QAProcessor.preprocess`. This file
contains the mapping from indices in the processed token list to the original
document tokens that are used to generate the final predicted answers.
do_lower_case (bool): Whether an uncased tokenizer was used during data preprocessing.
This is required during answer finalization by comparing the predicted answer text
and the original text span in :func:`_get_final_text`.
unanswerable_exists (bool, optional): Whether there are unanswerable questions in the
data. If True, the start and end logits of the [CLS] token, which indicate the
probability of the answer being empty, are included in the candidate answer list.
examples_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.
This file contains the original document tokens that are used to generate
the final answers from the predicted start and end positions.
features_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.
This file contains the mapping from indices in the processed token list
to the original document tokens that are used to generate the final
predicted answers.
do_lower_case (bool): Whether an uncased tokenizer was used during
data preprocessing. This is required during answer finalization
by comparing the predicted answer text and the original
text span in :func:`_get_final_text`.
unanswerable_exists (bool, optional): Whether there are unanswerable
questions in the data. If True, the start and end logits of the [CLS]
token, which indicate the probability of the answer being empty,
are included in the candidate answer list.
Defaults to False.
n_best_size (int, optional): The number of candidates to choose from each QAResult to
generate the final prediction. It's also the maximum number of n-best answers to
output for each question. Note that the number of n-best answers can be smaller than
`n_best_size` because some unqualified answers, e.g. answer that are too long,
are removed.
n_best_size (int, optional): The number of candidates to choose from each
QAResult to generate the final prediction. It's also the maximum number
of n-best answers to output for each question.
Note that the number of n-best answers can be smaller than `n_best_size`
because some unqualified answers,
e.g. answers that are too long, are removed.
max_answer_length (int, optional): Maximum length of the answer. Defaults to 30.
output_prediction_file (str, optional): Path of the file to save the predicted answers.
Defaults to "./qa_predictions.json".
output_nbest_file (str, optional): Path of the file to save the n-best answers. Defaults
to "./nbest_predictions.json".
output_null_log_odds_file (str, optional): If unanswerable_exists is True, the score
difference between empty prediction and best non-empty prediction are saved to this
file. These scores can be used to find the best threshold for predicting an empty
answer. Defaults to "./null_odds.json".
null_score_diff_threshold (float, optional): If unanswerable_exists=True and the score
difference between empty prediction and best non-empty prediction is higher than this
threshold, the final predicted answer is empty. Defaults to 0.0.
verbose_logging (bool, optional): Whether to log details of answer postprocessing.
Defaults to False.
output_prediction_file (str, optional): Path of the file to save the
predicted answers. Defaults to "./qa_predictions.json".
output_nbest_file (str, optional): Path of the file to save the n-best answers.
Defaults to "./nbest_predictions.json".
output_null_log_odds_file (str, optional): If unanswerable_exists is True,
the score difference between empty prediction and best non-empty prediction
is saved to this file. These scores can be used to find the best threshold
for predicting an empty answer. Defaults to "./null_odds.json".
null_score_diff_threshold (float, optional): If unanswerable_exists=True
and the score difference between empty prediction and best non-empty
prediction is higher than this threshold, the final predicted
answer is empty.
Defaults to 0.0.
verbose_logging (bool, optional): Whether to log details of
answer postprocessing. Defaults to False.
Returns:
tuple: (OrderedDict, OrderedDict, OrderedDict)
The keys of the dictionaries are the `qa_id` in the original
:class:`utils_nlp.dataset.pytorch.QADataset`
The values of the first dictionary are the predicted answer texts in string type.
The values of the second dictionary are the softmax probabilities of the predicted
answers.
The values of the third dictionary are the n-best answers for each qa_id. Note that
the number of n-best answers can be smaller than `n_best_size` because some
unqualified answers, e.g. answers that are too long, are removed.
The values of the first dictionary are the predicted answer texts
in string type. The values of the second dictionary are the softmax
probabilities of the predicted answers.
The values of the third dictionary are the n-best answers for each qa_id.
Note that the number of n-best answers can be smaller than `n_best_size`
because some unqualified answers, e.g. answers that are too long,
are removed.
"""
with jsonlines.open(examples_file) as reader:
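For context, a hedged usage sketch: the result list comes from `AnswerExtractor.predict`, and the file paths assume the default `feature_cache_dir` used by `QAProcessor.preprocess`.

results = qa_extractor.predict(dev_dataloader)
answers, probs, nbest_answers = postprocess_bert_answer(
    results=results,
    examples_file="./cached_qa_features/cached_examples_test.jsonl",
    features_file="./cached_qa_features/cached_features_test.jsonl",
    do_lower_case=DO_LOWER_CASE,
    unanswerable_exists=False,
)
# each returned dict is keyed by the qa_id of the original QADataset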
@@ -753,7 +817,9 @@ def postprocess_bert_answer(
# Sort by the sum of the start and end logits in ascending order,
# so that the first element is the most probable answer
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
)
seen_predictions = {}
nbest = []
@@ -786,11 +852,19 @@ def postprocess_bert_answer(
final_text = ""
seen_predictions[final_text] = True
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
)
)
# if we didn't include the empty option in the n-best, include it
if unanswerable_exists:
if "" not in seen_predictions:
nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
nbest.append(
_NbestPrediction(
text="", start_logit=null_start_logit, end_logit=null_end_logit
)
)
# In very rare edge cases we could only have a single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
@@ -834,7 +908,9 @@ def postprocess_bert_answer(
all_probs[example["qa_id"]] = nbest_json[0]["probability"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
score_diff = (
score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
)
scores_diff_json[example["qa_id"]] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example["qa_id"]] = ""
@@ -1000,7 +1076,9 @@ def postprocess_xlnet_answer(
)
)
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
)
seen_predictions = {}
nbest = []
@@ -1031,14 +1109,20 @@ def postprocess_xlnet_answer(
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = _get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging)
final_text = _get_final_text(
tok_text, orig_text, tokenizer.do_lower_case, verbose_logging
)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
)
)
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
@@ -1185,7 +1269,9 @@ def _create_qa_example(qa_input, is_training):
actual_text = " ".join(d_tokens[start_position : (end_position + 1)])
cleaned_answer_text = " ".join(whitespace_tokenize(a_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
logger.warning(
"Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text,
)
return
else:
start_position = -1
@@ -1408,7 +1494,7 @@ def _create_qa_features(
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, example.orig_answer_text
all_doc_tokens, tok_start_position, tok_end_position, example.orig_answer_text,
)
# The -3 accounts for [CLS], [SEP] and [SEP]
@@ -1579,7 +1665,7 @@ def _create_qa_features(
# -------------------------------------------------------------------------------------------------
# Post processing helper functions
_PrelimPrediction = collections.namedtuple(
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"],
)
_NbestPrediction = collections.namedtuple("NbestPrediction", ["text", "start_logit", "end_logit"])
@@ -1644,7 +1730,9 @@ def _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
logger.info(
"Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text,
)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using