Merge branch 'staging' into hlu/unilm_abstractive_summarization

Hong Lu 2020-02-13 18:52:24 -05:00 committed by GitHub
Parents e5d31ecf46 10598d0c8e
Commit 2f6aaf7a41
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 1445 additions and 238 deletions

View file

@ -14,4 +14,5 @@
# E722: do not use bare except
# E231: missing white space after "," --> black generates autoformat [,] which fails flake8
ignore = E203, E266, W503, F403, F405, E402, E731, F821, E722, E231
max-line-length = 100
max-line-length = 88

View file

@ -87,7 +87,7 @@ The following is a list of related repositories that we like and think are usefu
|[MASS](https://github.com/microsoft/MASS)|MASS: Masked Sequence to Sequence Pre-training for Language Generation.|
|[MT-DNN](https://github.com/namisan/mt-dnn)|Multi-Task Deep Neural Networks for Natural Language Understanding.|
|[UniLM](https://github.com/microsoft/unilm)|Unified Language Model Pre-training.|
|[DialoGPT](https://github.com/microsoft/DialoGPT)|DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation|
## Build Status

View file

@ -53,7 +53,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"scrolled": false
},
@ -76,6 +76,7 @@
"\n",
"from utils_nlp.models.transformers.sequence_classification import Processor, SequenceClassifier\n",
"from utils_nlp.dataset.multinli import load_pandas_df\n",
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
"from utils_nlp.common.timer import Timer"
]
},
@ -201,6 +202,7 @@
"metadata": {},
"outputs": [],
"source": [
"# sample\n",
"train_df = train_df.sample(frac=TRAIN_DATA_USED_FRACTION).reset_index(drop=True)\n",
"dev_df_matched = dev_df_matched.sample(frac=DEV_DATA_USED_FRACTION).reset_index(drop=True)\n",
"dev_df_mismatched = dev_df_mismatched.sample(frac=DEV_DATA_USED_FRACTION).reset_index(drop=True)"
@ -224,7 +226,7 @@
"source": [
"## Tokenize and Preprocess\n",
"Before training, we tokenize and preprocess the sentence texts to convert them into the format required by transformer model classes. \n",
"The `create_dataloader_from_df` method of the `Processor` class performs the following preprocessing steps and returns a Pytorch `DataLoader`\n",
"The `dataset_from_dataframe` method of the `Processor` class performs the following preprocessing steps and returns a Pytorch `DataSet`\n",
"* Tokenize input texts using the tokenizer of the pre-trained model specified by `model_name`. \n",
"* Convert the tokens into token indices corresponding to the tokenizer's vocabulary.\n",
"* Pad or truncate the token lists to the specified max length."
@ -237,30 +239,35 @@
"outputs": [],
"source": [
"processor = Processor(model_name=MODEL_NAME, cache_dir=CACHE_DIR, to_lower=TO_LOWER)\n",
"train_dataloader = processor.create_dataloader_from_df(\n",
"\n",
"train_dataset = processor.dataset_from_dataframe(\n",
" df=train_df,\n",
" text_col=TEXT_COL_1,\n",
" label_col=LABEL_COL_NUM,\n",
" shuffle=True,\n",
" text2_col=TEXT_COL_2,\n",
" max_len=MAX_SEQ_LENGTH,\n",
" batch_size=BATCH_SIZE,\n",
")\n",
"dev_dataloader_matched = processor.create_dataloader_from_df(\n",
"dev_dataset_matched = processor.dataset_from_dataframe(\n",
" df=dev_df_matched,\n",
" text_col=TEXT_COL_1,\n",
" shuffle=False,\n",
" text_col=TEXT_COL_1, \n",
" text2_col=TEXT_COL_2,\n",
" max_len=MAX_SEQ_LENGTH,\n",
" batch_size=BATCH_SIZE,\n",
")\n",
"dev_dataloader_mismatched = processor.create_dataloader_from_df(\n",
"dev_dataset_mismatched = processor.dataset_from_dataframe(\n",
" df=dev_df_mismatched,\n",
" text_col=TEXT_COL_1,\n",
" shuffle=False,\n",
" text_col=TEXT_COL_1, \n",
" text2_col=TEXT_COL_2,\n",
" max_len=MAX_SEQ_LENGTH,\n",
" batch_size=BATCH_SIZE,\n",
")\n",
"\n",
"train_dataloader = dataloader_from_dataset(\n",
" train_dataset, batch_size=BATCH_SIZE, shuffle=True\n",
")\n",
"dev_dataloader_matched = dataloader_from_dataset(\n",
" dev_dataset_matched, batch_size=BATCH_SIZE, shuffle=False\n",
")\n",
"dev_dataloader_mismatched = dataloader_from_dataset(\n",
" dev_dataset_mismatched, batch_size=BATCH_SIZE, shuffle=False\n",
")"
]
},

View file

@ -69,23 +69,19 @@
"source": [
"import os\n",
"import sys\n",
"import scrapbook as sb\n",
"\n",
"import scrapbook as sb\n",
"import torch\n",
"\n",
"nlp_path = os.path.abspath('../../')\n",
"if nlp_path not in sys.path:\n",
" sys.path.insert(0, nlp_path)\n",
"\n",
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
"from utils_nlp.common.timer import Timer\n",
"from utils_nlp.dataset.squad import load_pandas_df\n",
"from utils_nlp.eval.question_answering import evaluate_qa\n",
"from utils_nlp.models.transformers.datasets import QADataset\n",
"from utils_nlp.models.transformers.question_answering import (\n",
" AnswerExtractor,\n",
" QAProcessor,\n",
" AnswerExtractor\n",
")\n",
" \n",
"from utils_nlp.eval.question_answering import evaluate_qa\n",
"from utils_nlp.common.timer import Timer"
")"
]
},
{
@ -559,7 +555,7 @@
"* Pad the concatenated token sequence to `max_seq_length` if it's shorter.\n",
"* Convert the tokens into token indices corresponding to the tokenizer's vocabulary.\n",
"\n",
"`QAProcessor.preprocess` returns a Pytorch Dataloader. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required by postprocessing the predicted answer start and end indices to get the final answer text. You can change the default file directory by specifying `feature_cache_dir`. "
"`QAProcessor.preprocess` returns a Pytorch DataSet. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required by postprocessing the predicted answer start and end indices to get the final answer text. You can change the default file directory by specifying `feature_cache_dir`. "
]
},
{
@ -577,24 +573,28 @@
],
"source": [
"qa_processor = QAProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE)\n",
"train_dataloader = qa_processor.preprocess(\n",
" train_dataset, \n",
" batch_size=PER_GPU_BATCH_SIZE,\n",
" num_gpus=NUM_GPUS,\n",
"train_dataset = qa_processor.preprocess(\n",
" train_dataset,\n",
" is_training=True,\n",
" max_question_length=MAX_QUESTION_LENGTH,\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" doc_stride=DOC_STRIDE\n",
" doc_stride=DOC_STRIDE,\n",
")\n",
"\n",
"dev_dataloader = qa_processor.preprocess(\n",
" dev_dataset, \n",
" batch_size=PER_GPU_BATCH_SIZE,\n",
" num_gpus=NUM_GPUS,\n",
"# we keep a copy of the oroginal dev_dataset as it is needed for evaluation\n",
"dev_dataset_processed = qa_processor.preprocess(\n",
" dev_dataset,\n",
" is_training=False,\n",
" max_question_length=MAX_QUESTION_LENGTH,\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" doc_stride=DOC_STRIDE\n",
" doc_stride=DOC_STRIDE,\n",
")\n",
"\n",
"train_dataloader = dataloader_from_dataset(\n",
" train_dataset, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n",
")\n",
"dev_dataloader = dataloader_from_dataset(\n",
" dev_dataset_processed, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n",
")"
]
},

View file

@ -525,7 +525,7 @@
"source": [
"with Timer() as t:\n",
" preds = model.predict(\n",
" eval_dataloader=test_dataloader,\n",
" test_dataloader=test_dataloader,\n",
" num_gpus=CONFIG['num_gpus'],\n",
" verbose=CONFIG['verbose']\n",
" )\n",

View file

@ -0,0 +1,768 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Distributed Training For Extractive Summarization on CNN/DM Dataset\n",
"\n",
"## Summary\n",
"This notebook demonstrates how to use Azure Machine Learning to run distributed training using Distributed Data Parallel in Pytorch for extractive summarization. For more detailed model related information, please see [extractive_summarization_cnndm_transformer.ipynb](extractive_summarization_cnndm_transformer.ipynb)\n",
"\n",
"## Prerequisites\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, refer to the [Configuration Notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) first if you haven't already to establish your connection to the AzureML Workspace. Prerequisites are:\n",
"\n",
"- Azure subscription\n",
"- Azure Machine Learning Workspace\n",
"- Azure Machine Learning SDK\n",
"\n",
"To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](summarization_evaluation.ipynb). "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import Libraries"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"from tempfile import TemporaryDirectory\n",
"import torch\n",
"\n",
"import azureml.core\n",
"from azureml.core import Experiment, Workspace, Run\n",
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"from azureml.train.dnn import PyTorch\n",
"from azureml.train.dnn import Nccl\n",
"from azureml.widgets import RunDetails\n",
"\n",
"nlp_path = os.path.abspath(\"../../\")\n",
"if nlp_path not in sys.path:\n",
" sys.path.insert(0, nlp_path)\n",
"from utils_nlp.azureml.azureml_utils import get_or_create_workspace\n",
"from utils_nlp.dataset.cnndm import CNNDMBertSumProcessedData, CNNDMSummarizationDataset\n",
"\n",
"# Check core SDK version number\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configuration "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# for Azure ML Workspacen\n",
"SUBSRIPTION_ID = \"YOUR_SUBSCRIPTION_ID\"\n",
"LOCATION = \"YOUR_RESOURCE_GROUP_NAME\" # example \"eastus2\"\n",
"RESOURCE_GROUP = \"YOUR_WORKSPACE_NAME\" # modifiy to use your own\n",
"WORKSPACE_NAME = \"YOUR_WORKSPACE_REGION\" # modifiy to use your own\n",
"\n",
"\n",
"# for creating Azure ML Compute Cluster\n",
"AMLCOMPUTE_CLUSTER_NAME = \"extsum5\" # modifiy to use your own\n",
"NODE_COUNT = 4\n",
"VM_SIZE = \"STANDARD_NC6\" # this should be the VM that's supported by Azure and Azure ML\n",
"\n",
"\n",
"# for creating Azure ML Experiment\n",
"EXPERIMENT_NAME = \"NLP-ExtSum\" # modifiy to use your own\n",
"\n",
"\n",
"# local folder to save the downloaded data\n",
"LOCAL_DATA_FOLDER = (\n",
" \"./bertsumdata/\"\n",
") # modify to use your own; the parent folder must already exist\n",
"\n",
"# Training related parameter\n",
"MODEL_NAME = \"distilbert-base-uncased\" # limited choice\n",
"ENCODER = \"transformer\"\n",
"# folder in the workspace where the data is uploaded to\n",
"TARGET_DATA_FOLDER = \"/bertsum_processed_data\" # modify to use your own\n",
"TARGET_OUTPUT_DIR = f\"output/{EXPERIMENT_NAME}/\"\n",
"# cache dir in the workspace\n",
"TARGET_CACHE_DIR = f\"cache/{EXPERIMENT_NAME}/\"\n",
"# file name for saving the prediction\n",
"SUMMARY_FILENAME = \"generated_summaries.txt\"\n",
"# file name for saving the trained model\n",
"MODEL_FILENAME = \"dist_extsum.pt\"\n",
"\n",
"\n",
"# local path to download the output from the cluster\n",
"LOCAL_OUTPUT_DIR = \"./output\" # modifiy to use your own, the penultimate level folder\n",
"\n",
"\n",
"# local folder to store all the related files to be copied to the workspace\n",
"PROJECT_FOLDER = \"./azureml_exp\"\n",
"# conda environment name, the yaml file will be copied to the workspace\n",
"CONDA_ENV_NAME = \"nlp_gpu\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create an AML Workspace"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Create the workspace using the specified parameters\n",
"ws = get_or_create_workspace(\n",
" workspace_name=WORKSPACE_NAME,\n",
" subscription_id=SUBSRIPTION_ID,\n",
" resource_group=RESOURCE_GROUP,\n",
" workspace_region=LOCATION,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\n",
" \"Workspace name: \" + ws.name,\n",
" \"Azure region: \" + ws.location,\n",
" \"Subscription id: \" + ws.subscription_id,\n",
" \"Resource group: \" + ws.resource_group,\n",
" sep=\"\\n\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create an AML GPU Compute Cluster"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found existing compute target.\n",
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-01-30T22:43:39.856000+00:00', 'errors': None, 'creationTime': '2020-01-23T04:50:26.160743+00:00', 'modifiedTime': '2020-01-23T20:31:35.349184+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT1200S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
]
}
],
"source": [
"try:\n",
" gpu_compute_target = ComputeTarget(workspace=ws, name=AMLCOMPUTE_CLUSTER_NAME)\n",
" print(\"Found existing compute target.\")\n",
"except ComputeTargetException:\n",
" print(\"Creating a new compute target...\")\n",
" compute_config = AmlCompute.provisioning_configuration(\n",
" vm_size=VM_SIZE, max_nodes=NODE_COUNT, NodeIdleTimeBeforeScaleDown=\"PT1200S\"\n",
" )\n",
"\n",
" # create the cluster\n",
" gpu_compute_target = ComputeTarget.create(\n",
" ws, AMLCOMPUTE_CLUSTER_NAME, compute_config\n",
" )\n",
"\n",
" gpu_compute_target.wait_for_completion(show_output=True)\n",
"\n",
"# use get_status() to get a detailed status for the current AmlCompute.\n",
"print(gpu_compute_target.get_status().serialize())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create an Experiment"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"experiment = Experiment(ws, name=EXPERIMENT_NAME)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Dataset to Local File System"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"!mkdir -p {LOCAL_DATA_FOLDER}"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"bertsum_data.zip: 869MB [00:29, 29.7MB/s] \n"
]
},
{
"data": {
"text/plain": [
"'./bertsumdata/'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"CNNDMBertSumProcessedData.download(local_path=LOCAL_DATA_FOLDER)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Upload the Downloaded Dataset to AML Workspace"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"ds = ws.get_default_datastore()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"ds.upload(src_dir=LOCAL_DATA_FOLDER, target_path=TARGET_DATA_FOLDER)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare for the Experiment Run\n",
"Prepare the local project folder which is mirror to the workspace for the experiment"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Generated conda file: nlp_gpu.yaml\r\n",
"\r\n",
"To create the conda environment:\r\n",
"$ conda env create -f nlp_gpu.yaml\r\n",
"\r\n",
"To update the conda environment:\r\n",
"$ conda env update -f nlp_gpu.yaml\r\n",
"\r\n",
"To register the conda environment in Jupyter:\r\n",
"$ conda activate nlp_gpu\r\n",
"$ python -m ipykernel install --user --name nlp_gpu --display-name \"Python (nlp_gpu)\"\r\n",
"\r\n"
]
}
],
"source": [
"ENTRY_SCRIPT = \"extractive_summarization_cnndm_distributed_train.py\"\n",
"!mkdir -p {PROJECT_FOLDER}\n",
"!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n",
"!cp ./nlp_gpu.yaml {PROJECT_FOLDER}\n",
"!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n",
"!cp -r ../../utils_nlp {PROJECT_FOLDER}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Submit Run"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"os.makedirs(LOCAL_OUTPUT_DIR, exist_ok=True)\n",
"os.makedirs(os.path.join(LOCAL_OUTPUT_DIR, EXPERIMENT_NAME), exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:azureml.train.estimator._framework_base_estimator:If environment_definition or conda_dependencies_file is specified, Azure ML will not install any framework related packages on behalf of the user.\n",
"WARNING:azureml.train.estimator._framework_base_estimator:framework_version is not specified, defaulting to version 1.3.\n"
]
}
],
"source": [
"NcclConfig=Nccl()\n",
"estimator = PyTorch(source_directory=PROJECT_FOLDER,\n",
" compute_target=gpu_compute_target,\n",
" script_params={\n",
" \"--dist_url\": \"$AZ_BATCHAI_PYTORCH_INIT_METHOD\",\n",
" \"--rank\": \"$AZ_BATCHAI_TASK_INDEX\",\n",
" \"--node_count\": NODE_COUNT,\n",
" \"--data_dir\":ds.path(f'{TARGET_DATA_FOLDER}').as_mount(),\n",
" \"--cache_dir\": ds.path(f'{TARGET_CACHE_DIR}').as_mount(),\n",
" '--output_dir':ds.path(f'{TARGET_OUTPUT_DIR}').as_mount(),\n",
" \"--quick_run\": 'true',\n",
" \"--summary_filename\": f'{SUMMARY_FILENAME}',\n",
" \"--model_filename\": f'{MODEL_FILENAME}',\n",
" \"--model_name\": MODEL_NAME,\n",
" \"--encoder\": ENCODER\n",
" },\n",
" entry_script= ENTRY_SCRIPT,\n",
" node_count=NODE_COUNT,\n",
" distributed_training=NcclConfig,\n",
" conda_dependencies_file=f'{CONDA_ENV_NAME}.yaml',\n",
" use_gpu=True)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"run = experiment.submit(estimator)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "97f3678284a44f7aab5c27fa3e19bb11",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"RunDetails(run).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"If you stop the notebook and come back, \n",
"you'll need to use the run_id in the output of the previous cell \n",
"to get run details.\n",
"\"\"\"\n",
"# fetched_run = Run(experiment, \"NLP-ExtSum_1579816237_ea238f69\")\n",
"# RunDetails(fetched_run).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Generated Summaries "
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# need to clear the local output dir as the ds.download won't download if the path exists\n",
"!rm -rf {LOCAL_OUTPUT_DIR}/* "
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading output/NLP-ExtSum/generated_summaries.txt\n",
"Downloaded output/NLP-ExtSum/generated_summaries.txt, 1 files out of an estimated total of 1\n"
]
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ds.download(target_path=LOCAL_OUTPUT_DIR,\n",
" prefix=f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}',\n",
" show_progress=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"from utils_nlp.eval.evaluate_summarization import get_rouge\n",
"from utils_nlp.models.transformers.extractive_summarization import ExtSumProcessedData\n",
"import pickle\n",
"from utils_nlp.models.transformers.extractive_summarization import ExtractiveSummarizer"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"train_dataset, test_dataset = ExtSumProcessedData().splits(root=LOCAL_DATA_FOLDER)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"target = [i['tgt_txt'] for i in test_dataset]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"prediction = []\n",
"with open(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}'), \"r\") as filehandle:\n",
" for cnt, line in enumerate(filehandle):\n",
" prediction.append(line[0:-1]) # remove the ending \"\\n\""
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading output/NLP-ExtSum/dist_extsum.pt\n",
"Downloaded output/NLP-ExtSum/dist_extsum.pt, 1 files out of an estimated total of 1\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 546/546 [00:00<00:00, 306489.56B/s]\n",
"100%|██████████| 267967963/267967963 [00:04<00:00, 63548158.24B/s]\n",
"Scoring: 100%|██████████| 90/90 [00:41<00:00, 3.68it/s]\n"
]
}
],
"source": [
"## you can also download the saved model and run prediction if you are running the notebook on a gpu machine\n",
"#\"\"\"\n",
"ds.download(target_path=LOCAL_OUTPUT_DIR,\n",
" prefix=f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}',\n",
" show_progress=True)\n",
"summarizer = ExtractiveSummarizer(MODEL_NAME, ENCODER, LOCAL_OUTPUT_DIR)\n",
"summarizer.model.load_state_dict(\n",
" torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'))\n",
")\n",
"prediction = summarizer.predict(test_dataset, num_gpus=torch.cuda.device_count(), batch_size=128)\n",
"#\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['a university of iowa student has died nearly three months after a fall in rome in a suspected robbery attack in rome .',\n",
" 'andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program in italy when the incident happened in january .',\n",
" 'he was flown back to chicago via air ambulance on march 20 , but he died on sunday .',\n",
" 'andrew mogni , 20 , from glen ellyn , illinois , a university of iowa student has died nearly three months after a fall in rome in a suspected robbery',\n",
" 'he was taken to a medical facility in the chicago area , close to his family home in glen ellyn .',\n",
" \"he died on sunday at northwestern memorial hospital - medical examiner 's office spokesman frank shuftan says a cause of death wo n't be released until monday at the earliest .\",\n",
" 'initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed .',\n",
" \"on sunday , his cousin abby wrote online : ` this morning my cousin andrew 's soul was lifted up to heaven .\",\n",
" 'initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed',\n",
" '` at the beginning of january he went to rome to study aboard and on the way home from a party he was brutally attacked and thrown off a 40ft bridge and hit the concrete below .',\n",
" \"` he was in a coma and in critical condition for months . '\",\n",
" 'paula barnett , who said she is a close family friend , told my suburban life , that mogni had only been in the country for six hours when the incident happened .',\n",
" 'she said he was was alone at the time of the alleged assault and personal items were stolen .',\n",
" 'she added that he was in a non-medically induced coma , having suffered serious infection and internal bleeding .',\n",
" 'mogni was a third-year finance major from glen ellyn , ill. , who was participating in a semester-long program at john cabot university .',\n",
" \"mogni belonged to the school 's chapter of the sigma nu fraternity , reports the chicago tribune who posted a sign outside a building reading ` pray for mogni . '\",\n",
" \"the fraternity 's iowa chapter announced sunday afternoon via twitter that a memorial service will be held on campus to remember mogni .\"]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_dataset[0]['src_txt']"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'he was flown back to chicago via air ambulance on march 20 , but he died on sunday .<q>andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program in italy when the incident happened in january .<q>a university of iowa student has died nearly three months after a fall in rome in a suspected robbery attack in rome .'"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prediction[0]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program when the incident happened in january<q>he was flown back to chicago via air on march 20 but he died on sunday<q>initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed<q>his cousin claims he was attacked and thrown 40ft from a bridge'"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target[0]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"RESULT_DIR = TemporaryDirectory().name"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"11489\n",
"11489\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2020-01-31 03:38:44,480 [MainThread ] [INFO ] Writing summaries.\n",
"2020-01-31 03:38:44,481 [MainThread ] [INFO ] Processing summaries. Saving system files to /tmp/tmpz9fasy6o/tmpnhtvedhc/system and model files to /tmp/tmpz9fasy6o/tmpnhtvedhc/model.\n",
"2020-01-31 03:38:44,481 [MainThread ] [INFO ] Processing files in /tmp/tmpz9fasy6o/rouge-tmp-2020-01-31-03-38-43/candidate/.\n",
"2020-01-31 03:38:45,679 [MainThread ] [INFO ] Saved processed files to /tmp/tmpz9fasy6o/tmpnhtvedhc/system.\n",
"2020-01-31 03:38:45,681 [MainThread ] [INFO ] Processing files in /tmp/tmpz9fasy6o/rouge-tmp-2020-01-31-03-38-43/reference/.\n",
"2020-01-31 03:38:46,904 [MainThread ] [INFO ] Saved processed files to /tmp/tmpz9fasy6o/tmpnhtvedhc/model.\n",
"2020-01-31 03:38:46,989 [MainThread ] [INFO ] Written ROUGE configuration to /tmp/tmpz9fasy6o/tmpicwmu3se/rouge_conf.xml\n",
"2020-01-31 03:38:46,989 [MainThread ] [INFO ] Running ROUGE with command /dadendev/pyrouge/tools/ROUGE-1.5.5/ROUGE-1.5.5.pl -e /dadendev/pyrouge/tools/ROUGE-1.5.5/data -c 95 -m -r 1000 -n 2 -a /tmp/tmpz9fasy6o/tmpicwmu3se/rouge_conf.xml\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"---------------------------------------------\n",
"1 ROUGE-1 Average_R: 0.52322 (95%-conf.int. 0.52036 - 0.52598)\n",
"1 ROUGE-1 Average_P: 0.35403 (95%-conf.int. 0.35171 - 0.35628)\n",
"1 ROUGE-1 Average_F: 0.40840 (95%-conf.int. 0.40623 - 0.41040)\n",
"---------------------------------------------\n",
"1 ROUGE-2 Average_R: 0.23066 (95%-conf.int. 0.22789 - 0.23341)\n",
"1 ROUGE-2 Average_P: 0.15558 (95%-conf.int. 0.15365 - 0.15744)\n",
"1 ROUGE-2 Average_F: 0.17947 (95%-conf.int. 0.17740 - 0.18147)\n",
"---------------------------------------------\n",
"1 ROUGE-L Average_R: 0.47554 (95%-conf.int. 0.47274 - 0.47834)\n",
"1 ROUGE-L Average_P: 0.32224 (95%-conf.int. 0.32001 - 0.32440)\n",
"1 ROUGE-L Average_F: 0.37150 (95%-conf.int. 0.36935 - 0.37361)\n",
"\n"
]
}
],
"source": [
"rouge_score = get_rouge(prediction, target, RESULT_DIR)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleanup"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"if os.path.exists(LOCAL_DATA_FOLDER):\n",
" shutil.rmtree(LOCAL_DATA_FOLDER, ignore_errors=True)\n",
"if os.path.exists(LOCAL_OUTPUT_DIR):\n",
" shutil.rmtree(LOCAL_OUTPUT_DIR, ignore_errors=True)\n",
"if os.path.exists(PROJECT_FOLDER):\n",
" shutil.rmtree(PROJECT_FOLDER, ignore_errors=True)\n",
"if os.path.exists(RESULT_DIR):\n",
" shutil.rmtree(RESULT_DIR, ignore_errors=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (nlp_gpu)",
"language": "python",
"name": "nlp_gpu"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,165 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
import os
import sys
import time
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
nlp_path = os.path.abspath("../../")
if nlp_path not in sys.path:
sys.path.insert(0, nlp_path)
from utils_nlp.dataset.cnndm import CNNDMSummarizationDataset
from utils_nlp.models.transformers.extractive_summarization import (
ExtractiveSummarizer,
ExtSumProcessedData,
ExtSumProcessor,
)
# os.environ["NCCL_BLOCKING_WAIT"] = "1"
os.environ["NCCL_IB_DISABLE"] = "0"
parser = argparse.ArgumentParser()
parser.add_argument("--rank", type=int, default=0,
help="The rank of the current node in the cluster")
parser.add_argument("--dist_url", type=str, default="tcp://127.0.0.1:29500",
help="URL specifying how to initialize the process groupi.")
parser.add_argument("--node_count", type=int, default=1,
help="Number of nodes in the cluster.")
parser.add_argument("--cache_dir", type=str, default="./",
help="Directory to cache the tokenizer.")
parser.add_argument("--data_dir", type=str, default="./",
help="Directory to download the preprocessed data.")
parser.add_argument("--output_dir", type=str, default="./",
help="Directory to save the output model and prediction results.")
parser.add_argument("--quick_run", type=str.lower, default='false', choices=['true', 'false'],
help="Whether to have a quick run")
parser.add_argument("--model_name", type=str, default="distilbert-base-uncased",
help="Transformer model used in the extractive summarization, only \
\"bert-uncased\" and \"distilbert-base-uncased\" are supported.")
parser.add_argument("--encoder", type=str.lower, default='transformer',
choices=['baseline', 'classifier', 'transformer', 'rnn'],
help="Encoder types in the extractive summarizer.")
parser.add_argument("--learning_rate", type=float, default=1e-3,
help="Learning rate.")
parser.add_argument("--batch_size", type=int, default=3000,
help="batch size in terms of input token numbers in training")
parser.add_argument("--max_steps", type=int, default=1e4,
help="Maximum number of training steps run in training. If quick_run is set,\
it's not used.")
parser.add_argument("--warmup_steps", type=int, default=5e3,
help="Warm-up number of training steps run in training. If quick_run is set,\
it's not used.")
parser.add_argument("--top_n", type=int, default=3,
help="Number of sentences selected in prediction for evaluation.")
parser.add_argument("--summary_filename", type=str, default="generated_summaries.txt",
help="Summary file name generated by prediction for evaluation.")
parser.add_argument("--model_filename", type=str, default="dist_extsum_model.pt",
help="model file name saved for evaluation.")
def cleanup():
dist.destroy_process_group()
# How often the statistics reports show up in training, unit is step.
REPORT_EVERY = 100
SAVE_EVERY = 1000
def main():
print("NCCL_IB_DISABLE: {}".format(os.getenv("NCCL_IB_DISABLE")))
args = parser.parse_args()
print("quick_run is {}".format(args.quick_run))
print("output_dir is {}".format(args.output_dir))
print("data_dir is {}".format(args.data_dir))
print("cache_dir is {}".format(args.cache_dir))
#shutil.rmtree(args.output_dir)
os.makedirs(args.output_dir, exist_ok=True)
os.makedirs(args.cache_dir, exist_ok=True)
ngpus_per_node = torch.cuda.device_count()
summarizer = ExtractiveSummarizer(args.model_name, args.encoder, args.cache_dir)
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, summarizer, args))
def main_worker(local_rank, ngpus_per_node, summarizer, args):
rank = args.rank * ngpus_per_node + local_rank
world_size = args.node_count * ngpus_per_node
print("init_method: {}".format(args.dist_url))
print("ngpus_per_node: {}".format(ngpus_per_node))
print("rank: {}".format(rank))
print("local_rank: {}".format(local_rank))
print("world_size: {}".format(world_size))
torch.distributed.init_process_group(
backend="nccl",
init_method=args.dist_url,
world_size=world_size,
rank=rank,
)
train_dataset, test_dataset = ExtSumProcessedData().splits(root=args.data_dir)
# total number of steps for training
MAX_STEPS = 1e3
# number of steps for warm up
WARMUP_STEPS = 5e2
if args.quick_run.lower() == "false":
MAX_STEPS = args.max_steps
WARMUP_STEPS = args.warmup_steps
print("max steps is {}".format(MAX_STEPS))
print("warmup steps is {}".format(WARMUP_STEPS))
start = time.time()
if rank not in [-1, 0]:
save_every = -1
else:
save_every = SAVE_EVERY
summarizer.fit(
train_dataset,
num_gpus=world_size,
batch_size=args.batch_size,
gradient_accumulation_steps=2,
max_steps=MAX_STEPS / world_size,
learning_rate=args.learning_rate,
warmup_steps=WARMUP_STEPS,
verbose=True,
report_every=REPORT_EVERY,
clip_grad_norm=False,
local_rank=rank,
save_every=save_every,
world_size=world_size
)
end = time.time()
print("rank {0}, duration {1:.6f}s".format(rank, end - start))
if rank in [-1, 0]:
summarizer.save_model(os.path.join(args.output_dir, args.model_filename))
prediction = summarizer.predict(test_dataset, num_gpus=ngpus_per_node, batch_size=128)
def _write_list_to_file(list_items, filename):
with open(filename, "w") as filehandle:
# for cnt, line in enumerate(filehandle):
for item in list_items:
filehandle.write("%s\n" % item)
print("writing generated summaries")
_write_list_to_file(prediction, os.path.join(args.output_dir, args.summary_filename))
# only use the following line when you use your own cluster.
# AML distributed training does the cleanup for you.
# cleanup()
if __name__ == "__main__":
main()
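For reference, a single-node invocation of this script might look like the following. The flags are the ones defined by the argparse section above, the paths are illustrative, and the script is assumed to be launched from its own folder so that the relative "../../" path to utils_nlp resolves; mp.spawn still starts one worker per local GPU.

python extractive_summarization_cnndm_distributed_train.py \
    --rank 0 --node_count 1 --dist_url tcp://127.0.0.1:29500 \
    --data_dir ./bertsumdata/ --cache_dir ./cache/ --output_dir ./output/ \
    --quick_run true --model_name distilbert-base-uncased --encoder transformer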

View file

@ -1,2 +1,2 @@
[tool.black]
line-length = 100
line-length = 88

View file

@ -0,0 +1,28 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import pytest
from utils_nlp.models.transformers.extractive_summarization import IterableDistributedSampler
@pytest.mark.cpu
def test_sampler():
sampler = IterableDistributedSampler(1, -1)
samples = list(sampler.iter('abcdefg'))
assert ''.join(samples) == 'abcdefg'
sampler = IterableDistributedSampler(2, -1)
samples = list(sampler.iter('abcdefg'))
assert ''.join(samples) == 'abcdefg'
sampler = IterableDistributedSampler(4, 1)
samples = list(sampler.iter('abcdefg'))
assert ''.join(samples) == 'bf'
sampler = IterableDistributedSampler(4, 2)
samples = list(sampler.iter('abcdefg'))
assert ''.join(samples) == 'cg'
sampler = IterableDistributedSampler(4, 3)
samples = list(sampler.iter('abcdefg'))
assert ''.join(samples) == 'd'
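These assertions follow directly from the sampler's use of itertools.islice(iterable, rank, None, world_size) (see the implementation later in this diff); for example, the world_size=4, rank=1 case can be checked by hand:

import itertools

# rank 1 of 4 keeps every 4th element starting at index 1: "b" and "f"
assert "".join(itertools.islice("abcdefg", 1, None, 4)) == "bf"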

View file

@ -65,22 +65,18 @@ def qa_test_data(qa_test_df, tmp_module):
qa_id_col=qa_test_df["qa_id_col"],
)
# bert
qa_processor_bert = QAProcessor(cache_dir=tmp_module)
train_features_bert = qa_processor_bert.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_features_bert = qa_processor_bert.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@ -88,22 +84,18 @@ def qa_test_data(qa_test_df, tmp_module):
feature_cache_dir=tmp_module,
)
# xlnet
qa_processor_xlnet = QAProcessor(model_name="xlnet-base-cased", cache_dir=tmp_module)
train_features_xlnet = qa_processor_xlnet.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_features_xlnet = qa_processor_xlnet.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@ -111,22 +103,20 @@ def qa_test_data(qa_test_df, tmp_module):
feature_cache_dir=tmp_module,
)
qa_processor_distilbert = QAProcessor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
# distilbert
qa_processor_distilbert = QAProcessor(
model_name="distilbert-base-uncased", cache_dir=tmp_module
)
train_features_distilbert = qa_processor_distilbert.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_features_distilbert = qa_processor_distilbert.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@ -151,11 +141,21 @@ def qa_test_data(qa_test_df, tmp_module):
@pytest.mark.gpu
def test_QAProcessor(qa_test_data, tmp_module):
for model_name in ["bert-base-cased", "xlnet-base-cased", "distilbert-base-uncased"]:
for model_name in [
"bert-base-cased",
"xlnet-base-cased",
"distilbert-base-uncased",
]:
qa_processor = QAProcessor(model_name=model_name, cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module)
qa_processor.preprocess(
qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module,
)
qa_processor.preprocess(
qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module,
)
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module,
)
# test unsupported model type
with pytest.raises(ValueError):
@ -163,18 +163,24 @@ def test_QAProcessor(qa_test_data, tmp_module):
# test training data has no ground truth exception
with pytest.raises(Exception):
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module
)
# test when answer start is a list, but answer text is not
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["train_dataset_start_text_mismatch"], is_training=True, feature_cache_dir=tmp_module,
qa_test_data["train_dataset_start_text_mismatch"],
is_training=True,
feature_cache_dir=tmp_module,
)
# test when training data has multiple answers
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["train_dataset_multi_answers"], is_training=True, feature_cache_dir=tmp_module,
qa_test_data["train_dataset_multi_answers"],
is_training=True,
feature_cache_dir=tmp_module,
)
@ -190,7 +196,9 @@ def test_AnswerExtractor(qa_test_data, tmp_module):
assert os.path.exists(os.path.join(model_output_dir, "pytorch_model.bin"))
assert os.path.exists(os.path.join(model_output_dir, "config.json"))
qa_extractor_from_cache = AnswerExtractor(cache_dir=tmp_module, load_model_from_dir=model_output_dir)
qa_extractor_from_cache = AnswerExtractor(
cache_dir=tmp_module, load_model_from_dir=model_output_dir
)
qa_extractor_from_cache.predict(test_loader_bert, verbose=False)
# xlnet
@ -202,8 +210,12 @@ def test_AnswerExtractor(qa_test_data, tmp_module):
# distilbert
train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_distilbert"])
test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_distilbert"], shuffle=False)
qa_extractor_distilbert = AnswerExtractor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
test_loader_xlnet = dataloader_from_dataset(
qa_test_data["test_features_distilbert"], shuffle=False
)
qa_extractor_distilbert = AnswerExtractor(
model_name="distilbert-base-uncased", cache_dir=tmp_module
)
qa_extractor_distilbert.fit(train_loader_xlnet, verbose=False, cache_model=False)
qa_extractor_distilbert.predict(test_loader_xlnet, verbose=False)

View file

@ -4,6 +4,7 @@
# This script reuses some code from
# https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py
import datetime
import logging
import os
import random
@ -191,6 +192,7 @@ class Transformer:
verbose=True,
seed=None,
report_every=10,
save_every=-1,
clip_grad_norm=True,
):
@ -251,9 +253,14 @@ class Transformer:
if global_step % report_every == 0 and verbose:
end = time.time()
endtime_string = datetime.datetime.fromtimestamp(end).strftime(
"%d/%m/%Y %H:%M:%S"
)
print(
"loss:{0:.6f}, time:{1:f}, examples:{2:.0f}, "
"step:{3:.0f}/{4:.0f}".format(
"""timestamp: {0:s}, loss: {1:.6f}, time duration: {2:f},
number of examples in current step: {3:.0f}, step {4:.0f}
out of total {5:.0f}""".format(
endtime_string,
accum_loss / report_every,
end - start,
len(batch),
@ -268,7 +275,10 @@ class Transformer:
if scheduler:
scheduler.step()
self.model.zero_grad()
if save_every != -1 and global_step % save_every == 0 and verbose:
self.save_model(
os.path.join(self.cache_dir, f"{self.model_name}_step_{global_step}.pt")
)
if global_step > max_steps:
epoch_iterator.close()
break
@ -292,16 +302,31 @@ class Transformer:
logits = outputs[0]
yield logits.detach().cpu().numpy()
def save_model(self):
output_model_dir = os.path.join(self.cache_dir, "fine_tuned")
def save_model(self, full_name=None):
"""
Save the trained model.
os.makedirs(self.cache_dir, exist_ok=True)
os.makedirs(output_model_dir, exist_ok=True)
Args:
full_name (str, optional): File name for saving the model's `state_dict()`, which can
later be loaded with `torch.load()`. If it's None, the trained model, configuration
and tokenizer are saved with `save_pretrained()` under the "fine_tuned" folder of
the object's cache directory. Defaults to None.
"""
logger.info("Saving model checkpoint to %s", output_model_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = (
self.model.module if hasattr(self.model, "module") else self.model
) # Take care of distributed/parallel training
model_to_save.save_pretrained(output_model_dir)
if full_name:
logger.info("Saving model checkpoint to %s", full_name)
torch.save(model_to_save.state_dict(), full_name)
else:
output_model_dir = os.path.join(self.cache_dir, "fine_tuned")
os.makedirs(self.cache_dir, exist_ok=True)
os.makedirs(output_model_dir, exist_ok=True)
logger.info("Saving model checkpoint to %s", output_model_dir)
model_to_save.save_pretrained(output_model_dir)

View file

@ -3,9 +3,11 @@
# This script reuses some code from https://github.com/nlpyang/BertSum
import gc
import itertools
import logging
import os
import pickle
import random
import numpy as np
@ -15,8 +17,8 @@ from torch.utils.data import DataLoader, Dataset, IterableDataset, SequentialSam
# from torch.utils.data.distributed import DistributedSampler
from transformers import BertModel, DistilBertModel
from bertsum.models import data_loader, model_builder
from bertsum.models.data_loader import Batch
from bertsum.models import model_builder
from bertsum.models.data_loader import Batch, DataIterator
from bertsum.models.model_builder import Summarizer
from utils_nlp.common.pytorch_utils import compute_training_steps, get_device
from utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection
@ -27,6 +29,79 @@ MODEL_CLASS = {"bert-base-uncased": BertModel, "distilbert-base-uncased": Distil
logger = logging.getLogger(__name__)
# https://pytorch.org/docs/1.1.0/_modules/torch/utils/data/dataloader.html
class IterableDistributedSampler(object):
""" Distributed sampler for iterable dataset.
Args:
world_size (int): Total number of GPUs that will be used. Defaults to 1.
rank (int): Rank of the current GPU. Defaults to -1.
"""
def __init__(self, world_size=1, rank=-1):
self.world_size = world_size
self.rank = rank
def iter(self, iterable):
if self.rank != -1:
return itertools.islice(iterable, self.rank, None, self.world_size)
else:
return iterable
class ChunkDataLoader(object):
""" Data Loader for Chunked Dataset.
Args:
datasets (list): list of data item list.
batch_size (int): Number of tokens per batch.
shuffle (bool): Whether the data is shuffled.
is_labeled (bool): Whether the data is labeled.
sampler (obj): Data sampler.
"""
def __init__(self, datasets, batch_size, shuffle, is_labeled, sampler):
self.datasets = datasets
self.batch_size = batch_size
self.shuffle = shuffle
self.is_labeled = is_labeled
self.cur_iter = self._next_dataset_iterator(datasets)
assert self.cur_iter is not None
self.sampler = sampler
def eachiter(self):
dataset_iter = (d for d in self.datasets)
while self.cur_iter is not None:
for batch in self.cur_iter:
yield batch
self.cur_iter = self._next_dataset_iterator(dataset_iter)
def __iter__(self):
return self.sampler.iter(self.eachiter())
def _next_dataset_iterator(self, dataset_iter):
try:
# Drop the current dataset for decreasing memory
if hasattr(self, "cur_dataset"):
self.cur_dataset = None
gc.collect()
del self.cur_dataset
gc.collect()
self.cur_dataset = next(dataset_iter)
except StopIteration:
return None
return DataIterator(
dataset=self.cur_dataset,
batch_size=self.batch_size,
shuffle=self.shuffle,
is_labeled=self.is_labeled,
)
class Bunch(object):
""" Class which convert a dictionary to an object """
@ -34,21 +109,28 @@ class Bunch(object):
self.__dict__.update(adict)
def get_dataloader(data_iter, shuffle=True, is_labeled=False, batch_size=3000):
def get_dataloader(
data_iter, shuffle=True, is_labeled=False, batch_size=3000, world_size=1, rank=-1
):
"""
Function to get data iterator over a list of data objects.
Args:
data_iter (generator): data generator.
shuffle (bool): whether the data is shuffled.
is_labeled (bool): specifies whether the data objects are labeled data.
batch_size (int): number of tokens per batch.
data_iter (generator): Data generator.
shuffle (bool): Whether the data is shuffled. Defaults to True.
is_labeled (bool): Whether the data objects are labeled data.
Defaults to False.
batch_size (int): Number of tokens per batch. Defaults to 3000.
world_size (int): Total number of GPUs that will be used. Defaults to 1.
rank (int): Rank of the current GPU. Defaults to -1.
Returns:
DataIterator
"""
return data_loader.Dataloader(data_iter, batch_size, shuffle=shuffle, is_labeled=is_labeled)
sampler = IterableDistributedSampler(world_size, rank)
return ChunkDataLoader(
data_iter, batch_size, shuffle=shuffle, is_labeled=is_labeled, sampler=sampler
)
def get_dataset(file):
@ -77,7 +159,9 @@ class ExtSumProcessedIterableDataset(IterableDataset):
if self.is_shuffle:
return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(self.file_list)))
else:
return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(random.shuffle(self.file_list))))
return itertools.chain.from_iterable(
map(get_dataset, itertools.cycle(random.shuffle(self.file_list)))
)
def __iter__(self):
return self.get_stream()
@ -96,12 +180,12 @@ class ExtSumProcessedDataset(Dataset):
files is shuffled when the dataset is loaded. Defaults to False.
"""
self.file_list = file_list
self.file_list = sorted(file_list)
if is_shuffle:
random.shuffle(file_list)
random.shuffle(self.file_list)
self.data = []
for file in file_list:
self.data.extend(torch.load(file))
for f in self.file_list:
self.data.extend(torch.load(f))
def __len__(self):
return len(self.data)
@ -110,7 +194,9 @@ class ExtSumProcessedDataset(Dataset):
return self.data[idx]
def get_pred(example, sent_scores, cal_lead=False, sentence_separator="<q>", block_trigram=True, top_n=3):
def get_pred(
example, sent_scores, cal_lead=False, sentence_separator="<q>", block_trigram=True, top_n=3
):
"""
Get the summarization prediction for the paragraph example based on the scores
returned by the transformer summarization model.
@ -223,7 +309,9 @@ class ExtSumProcessedData:
def _get_files(self, root):
train_files = []
test_files = []
files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))]
files = [
os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
]
for fname in files:
if fname.find("train") != -1:
train_files.append(fname)
@ -484,7 +572,9 @@ class ExtractiveSummarizer(Transformer):
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
"""
super().__init__(model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir)
super().__init__(
model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir
)
if model_name not in self.list_supported_models():
raise ValueError(
"Model name {} is not supported by ExtractiveSummarizer. "
@ -530,6 +620,8 @@ class ExtractiveSummarizer(Transformer):
report_every=50,
verbose=True,
seed=None,
save_every=-1,
world_size=1,
**kwargs,
):
"""
@ -582,11 +674,19 @@ class ExtractiveSummarizer(Transformer):
)
# batch_size is the number of tokens in a batch
train_dataloader = get_dataloader(train_dataset.get_stream(), is_labeled=True, batch_size=batch_size)
train_dataloader = get_dataloader(
train_dataset.get_stream(),
is_labeled=True,
batch_size=batch_size,
world_size=world_size,
rank=local_rank,
)
# compute the max number of training steps
max_steps = compute_training_steps(
train_dataloader, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps,
train_dataloader,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
)
super().fine_tune(
@ -603,6 +703,7 @@ class ExtractiveSummarizer(Transformer):
seed=seed,
report_every=report_every,
clip_grad_norm=False,
save_every=save_every,
)
def predict(
@ -647,32 +748,22 @@ class ExtractiveSummarizer(Transformer):
# tuple_batch = [list(col) for col in zip(*[d.values() for d in dict_list]
if dict_list is None or len(dict_list) <= 0:
return None
is_labeled = False
if "labels" in dict_list[0]:
is_labeled = True
tuple_batch = [list(d.values()) for d in dict_list]
## generate mask and mask_cls, and only select tensors for the model input
batch = Batch(tuple_batch, is_labeled=True)
if is_labeled:
return {
"src": batch.src,
"segs": batch.segs,
"clss": batch.clss,
"mask": batch.mask,
"mask_cls": batch.mask_cls,
"labels": batch.labels,
}
else:
return {
"src": batch.src,
"segs": batch.segs,
"clss": batch.clss,
"mask": batch.mask,
"mask_cls": batch.mask_cls,
}
## the labels are never used in prediction, so set is_labeled to False
batch = Batch(tuple_batch, is_labeled=False)
return {
"src": batch.src,
"segs": batch.segs,
"clss": batch.clss,
"mask": batch.mask,
"mask_cls": batch.mask_cls,
}
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(
test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn
)
sent_scores = self.predict_scores(test_dataloader, num_gpus=num_gpus, gpu_ids=gpu_ids)
sent_scores_list = list(sent_scores)
scores_list = []
@ -722,12 +813,34 @@ class ExtractiveSummarizer(Transformer):
)
return preds
def save_model(self, name):
output_model_dir = os.path.join(self.cache_dir, "fine_tuned")
def save_model(self, full_name=None):
"""
Save the trained model.
os.makedirs(self.cache_dir, exist_ok=True)
os.makedirs(output_model_dir, exist_ok=True)
Args:
full_name (str, optional): File name for saving the model's `state_dict()`. If it's None,
the model is saved under the "fine_tuned" folder of the object's cache directory.
Defaults to None.
"""
model_to_save = (
self.model.module if hasattr(self.model, "module") else self.model
) # Take care of distributed/parallel training
if full_name is None:
output_model_dir = os.path.join(self.cache_dir, "fine_tuned")
os.makedirs(self.cache_dir, exist_ok=True)
os.makedirs(output_model_dir, exist_ok=True)
full_name = os.path.join(output_model_dir, name)
full_name = os.path.join(output_model_dir, name)
logger.info("Saving model checkpoint to %s", full_name)
torch.save(self.model, name)
try:
print("saving through pytorch")
torch.save(model_to_save.state_dict(), full_name)
except OSError:
try:
print("saving as pickle")
pickle.dump(model_to_save.state_dict(), open(full_name, "wb"))
except Exception:
raise
except Exception:
raise
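A minimal round-trip sketch of the new full_name path, mirroring what the AML notebook earlier in this diff does when it reloads the downloaded checkpoint (model name, encoder, and paths are illustrative):

import torch
from utils_nlp.models.transformers.extractive_summarization import ExtractiveSummarizer

summarizer = ExtractiveSummarizer("distilbert-base-uncased", "transformer", "./cache")
summarizer.save_model("./dist_extsum.pt")  # torch.save of the model's state_dict()
summarizer.model.load_state_dict(torch.load("./dist_extsum.pt"))  # reload the weights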

View file

@ -27,19 +27,41 @@ import jsonlines
import torch
from torch.utils.data import TensorDataset
from tqdm import tqdm
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForQuestionAnswering
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForQuestionAnswering
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForQuestionAnswering
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForQuestionAnswering
from transformers.modeling_albert import (
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
AlbertForQuestionAnswering,
)
from transformers.modeling_bert import (
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
BertForQuestionAnswering,
)
from transformers.modeling_distilbert import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForQuestionAnswering,
)
from transformers.modeling_xlnet import (
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
XLNetForQuestionAnswering,
)
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
from utils_nlp.common.pytorch_utils import (
compute_training_steps,
get_device,
move_model_to_device,
)
from utils_nlp.models.transformers.common import (
MAX_SEQ_LEN,
TOKENIZER_CLASS,
Transformer,
)
MODEL_CLASS = {}
MODEL_CLASS.update({k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
{k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update({k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
# cached files during preprocessing
@ -62,25 +84,29 @@ class QAProcessor:
Class for preprocessing and postprocessing question answering data.
Args:
model_name (str, optional): Name of the model. Call QAProcessor.list_supported_models() to
get all supported models. Defaults to "bert-base-cased".
model_name (str, optional): Name of the model.
Call QAProcessor.list_supported_models() to get all supported models.
Defaults to "bert-base-cased".
to_lower (bool, optional): Whether to convert all letters to lower case during
tokenization. This is determined by if a cased model is used. Defaults to False,
which corresponds to a cased model.
custom_tokenize (function, optional): A custom tokenize function used to tokenize the
input text. If not provided, the default tokenizer corresponding to the model_name
is loaded and its `tokenize` method is used. NOTE that even this function is
provided, the numerical token ids are still generated by the `convert_tokens_to_ids`
method of the default tokenizer, so there is a risk that tokens generated by the
custom_tokenize function don't have correponding token ids in the default toeknizer.
tokenization. This is determined by whether a cased model is used.
Defaults to False, which corresponds to a cased model.
custom_tokenize (function, optional): A custom tokenize function
used to tokenize the input text. If not provided, the default tokenizer
corresponding to the model_name is loaded and its `tokenize` method is used.
NOTE that even this function is provided, the numerical token ids are still
generated by the `convert_tokens_to_ids` method of the default tokenizer,
so there is a risk that tokens generated by the custom_tokenize
function don't have correponding token ids in the default toeknizer.
Defaults to None.
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
"""
def __init__(self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."):
def __init__(
self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir=".",
):
self.model_name = model_name
self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False,
)
self.do_lower_case = to_lower
self.custom_tokenize = custom_tokenize
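
A minimal sketch of constructing the processor described above. The whitespace tokenizer is purely illustrative of the optional custom_tokenize hook; as the docstring notes, token ids are still produced by the default tokenizer's `convert_tokens_to_ids`:

# Sketch only: instantiate the processor for a cased BERT checkpoint.
def simple_whitespace_tokenize(text):
    return text.split()  # hypothetical custom tokenizer; omit to use the default

qa_processor = QAProcessor(
    model_name="bert-base-cased",
    to_lower=False,  # cased model
    custom_tokenize=simple_whitespace_tokenize,
    cache_dir=".",
)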
@ -149,9 +175,6 @@ class QAProcessor:
self,
qa_dataset,
is_training,
batch_size=32,
num_gpus=None,
distributed=False,
max_question_length=64,
max_seq_length=MAX_SEQ_LEN,
doc_stride=128,
@ -161,28 +184,31 @@ class QAProcessor:
Preprocesses raw question answering data and generates train/test features.
Args:
qa_dataset (:class:`utils_nlp.dataset.pytorch.QADataset`): Question answering data in
standard QADataset format.
qa_dataset (:class:`utils_nlp.dataset.pytorch.QADataset`):
Question answering data in standard QADataset format.
is_training (bool): Whether the input data is training data.
max_question_length (int, optional): Maximum number of tokens of the question sequence
after tokenization, so the number of words in the raw question is usually less than
max_question_length. Defaults to 64.
max_seq_length (int, optional): Maximum number of tokens of the entire feature token
sequence after tokenization. The entire feature token sequence is composed
of [CLS] + [Question tokens] + [SEP] + [Document tokens] + [SEP] for models other
than XLNet, and [Document tokens] + [SEP] + [Question tokens] + [SEP] + [CLS} for
max_question_length (int, optional): Maximum number of tokens
of the question sequence after tokenization. Since tokenization
can split a word into several tokens, the raw question should
usually contain fewer than max_question_length words.
Defaults to 64.
max_seq_length (int, optional): Maximum number of tokens of the entire
feature token sequence after tokenization. The entire feature token
sequence is composed of:
[CLS] + [Question tokens] + [SEP] + [Document tokens] + [SEP]
for models other than XLNet,
and [Document tokens] + [SEP] + [Question tokens] + [SEP] + [CLS] for
XLNet. Defaults to MAX_SEQ_LEN.
doc_stride (int, optional): Size (number of tokens) of the sliding window when
breaking down a long document paragraph in to multiple document spans. Defaults
to 128.
feature_cache_dir (int, optional): Directory to save some intermediate preprocessing
results.
doc_stride (int, optional): Size (number of tokens) of the sliding window
when breaking down a long document paragraph into multiple document
spans. Defaults to 128.
feature_cache_dir (str, optional): Directory to save some intermediate
preprocessing results.
If `is_training` is True, CACHED_EXAMPLES_TRAIN_FILE and
CACHED_FEATURES_TRAIN_FILE are saved to this directory. Otherwise,
CACHED_EXAMPLES_TEST_FILE and CACHED_FEATURES_TEST_FILE are saved to this
directory. These files are required during postprocessing to generate the final
answer texts from predicted answer start and answer end indices. Defaults to
"./cached_qa_features".
CACHED_EXAMPLES_TEST_FILE and CACHED_FEATURES_TEST_FILE are saved
to this directory. These files are required during postprocessing to
generate the final answer texts from predicted answer start and answer
end indices. Defaults to "./cached_qa_features".
Returns:
DataSet: A Pytorch DataSet.
"""
@ -217,7 +243,9 @@ class QAProcessor:
qa_examples.append(qa_example_cur)
qa_examples_json.append({"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens})
qa_examples_json.append(
{"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens}
)
features_cur = _create_qa_features(
qa_example_cur,
@ -257,17 +285,25 @@ class QAProcessor:
input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.long)
if is_training:
start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
qa_dataset = TensorDataset(
input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask,
input_ids,
input_mask,
segment_ids,
start_positions,
end_positions,
cls_index,
p_mask,
)
else:
unique_id_all = torch.tensor(unique_id_all, dtype=torch.long)
qa_dataset = TensorDataset(input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all)
qa_dataset = TensorDataset(
input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all
)
return qa_dataset
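
Because batching arguments were dropped from `preprocess` (see the earlier hunk removing `batch_size`/`num_gpus`), the returned TensorDataset is wrapped in a DataLoader by the caller. A sketch with plain PyTorch; the batch size and the `train_features`/`test_features` names are illustrative:

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Sketch only: build loaders outside preprocess; shuffle training data, keep test order.
train_dataloader = DataLoader(train_features, sampler=RandomSampler(train_features), batch_size=8)
test_dataloader = DataLoader(test_features, sampler=SequentialSampler(test_features), batch_size=8)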
@ -397,7 +433,14 @@ class QAResult(QAResult_):
QAResultExtended_ = collections.namedtuple(
"QAResultExtended",
["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits",],
[
"unique_id",
"start_top_log_probs",
"start_top_index",
"end_top_log_probs",
"end_top_index",
"cls_logits",
],
)
@ -481,36 +524,45 @@ class AnswerExtractor(Transformer):
num_epochs (int, optional): Number of training epochs. Defaults to 1.
max_steps (int, optional): Total number of training steps.
If set to a positive value, it overrides num_epochs.
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
Otherwise, it's determined by the dataset length,
gradient_accumulation_steps, and num_epochs.
Defaults to -1.
gradient_accumulation_steps (int, optional): Number of steps to accumulate
before performing a backward/update pass.
Defaults to 1.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will be used.
num_gpus (int, optional): The number of GPUs to use.
If None, all available GPUs will be used.
If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each parameter update.
Defaults to 0.0.
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
5e-5.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
to `learning rate`. Defaults to 0.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
cache_model (bool, optional): Whether to save the fine-tuned model. If True,
the fine-tuned model is saved to a `fine_tuned` folder under of the `cache_dir`
of AnswerExtractor. Defaults to True.
local_rank (int, optional): Local rank for distributed training on GPUs.
Defaults to -1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each
parameter update. Defaults to 0.0.
learning_rate (float, optional): Learning rate of the AdamW optimizer.
Defaults to 5e-5.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer.
Defaults to 1e-8.
warmup_steps (int, optional): Number of steps taken to increase
learning rate from 0 to `learning rate`.
Defaults to 0.
verbose (bool, optional): Whether to print out the training log.
Defaults to True.
seed (int, optional): Random seed used to improve reproducibility.
Defaults to None.
cache_model (bool, optional): Whether to save the fine-tuned model.
If True, the fine-tuned model is saved to a `fine_tuned` folder
under the `cache_dir` of AnswerExtractor.
Defaults to True.
"""
# init optimizer
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
optimizer = Transformer.get_default_optimizer(
self.model, weight_decay, learning_rate, adam_epsilon
)
# compute the max number of training steps
max_steps = compute_training_steps(
@ -522,7 +574,7 @@ class AnswerExtractor(Transformer):
# init scheduler
scheduler = Transformer.get_default_scheduler(
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps
)
# fine tune
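
A minimal fine-tuning sketch for the fit method documented above; the AnswerExtractor constructor arguments and the hyperparameter values are illustrative rather than prescribed:

# Sketch only: fine-tune on the preprocessed training loader.
qa_extractor = AnswerExtractor(model_name="bert-base-cased", cache_dir=".")
qa_extractor.fit(
    train_dataloader=train_dataloader,
    num_epochs=1,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
    warmup_steps=0,
    cache_model=True,  # saves the fine-tuned model under cache_dir/fine_tuned
)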
@ -530,7 +582,7 @@ class AnswerExtractor(Transformer):
train_dataloader=train_dataloader,
get_inputs=QAProcessor.get_inputs,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
gpu_ids=gpu_ids,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
optimizer=optimizer,
@ -550,13 +602,15 @@ class AnswerExtractor(Transformer):
Args:
test_dataloader (DataLoader): DataLoader for scoring the data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
num_gpus (int, optional): The number of GPUs to use.
If None, all available GPUs will be used.
If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
verbose (bool, optional): Whether to print out the predicting log. Defaults to True.
verbose (bool, optional): Whether to print out the predicting log.
Defaults to True.
Returns:
list: List of :class:`QAResult` or :class:`QAResultExtended`.
@ -589,7 +643,9 @@ class AnswerExtractor(Transformer):
)
else:
result = QAResult(
unique_id=u_id.item(), start_logits=_to_list(outputs[0][i]), end_logits=_to_list(outputs[1][i]),
unique_id=u_id.item(),
start_logits=_to_list(outputs[0][i]),
end_logits=_to_list(outputs[1][i]),
)
all_results.append(result)
torch.cuda.empty_cache()
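
A scoring sketch for the method documented above, assuming it is exposed as `AnswerExtractor.predict`; each returned element is a QAResult (or QAResultExtended for XLNet):

# Sketch only: score the test loader; results feed the postprocessing functions below.
qa_results = qa_extractor.predict(test_dataloader, num_gpus=1, verbose=True)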
@ -612,53 +668,61 @@ def postprocess_bert_answer(
verbose_logging=False,
):
"""
Postprocesses start and end logits generated by :meth:`AnswerExtractor.fit` for BERT.
Postprocesses start and end logits
generated by :meth:`AnswerExtractor.predict` for BERT.
Args:
results (list): List of :class:`QAResult`.
examples_file (str): One of the files cached by :meth:`QAProcessor.preprocess`. This file
contains the original document tokens that are used to generate the final answers
from the predicted start and end positions.
features_file (str): One of the files cached by :meth:`QAProcessor.preprocess`. This file
contains the mapping from indices in the processed token list to the original
document tokens that are used to generate the final predicted answers.
do_lower_case (bool): Whether an uncased tokenizer was used during data preprocessing.
This is required during answer finalization by comparing the predicted answer text
and the original text span in :func:`_get_final_text`.
unanswerable_exists (bool, optional): Whether there are unanswerable questions in the
data. If True, the start and end logits of the [CLS] token, which indicate the
probability of the answer being empty, are included in the candidate answer list.
examples_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.
This file contains the original document tokens that are used to generate
the final answers from the predicted start and end positions.
features_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.
This file contains the mapping from indices in the processed token list
to the original document tokens that are used to generate the final
predicted answers.
do_lower_case (bool): Whether an uncased tokenizer was used during
data preprocessing. This is required during answer finalization
by comparing the predicted answer text and the original
text span in :func:`_get_final_text`.
unanswerable_exists (bool, optional): Whether there are unanswerable
questions in the data. If True, the start and end logits of the [CLS]
token, which indicate the probability of the answer being empty,
are included in the candidate answer list.
Defaults to False.
n_best_size (int, optional): The number of candidates to choose from each QAResult to
generate the final prediction. It's also the maximum number of n-best answers to
output for each question. Note that the number of n-best answers can be smaller than
`n_best_size` because some unqualified answers, e.g. answer that are too long,
are removed.
n_best_size (int, optional): The number of candidates to choose from each
QAResult to generate the final prediction. It's also the maximum number
of n-best answers to output for each question.
Note that the number of n-best answers can be smaller than `n_best_size`
because some unqualified answers,
e.g. answers that are too long, are removed.
max_answer_length (int, optional): Maximum length of the answer. Defaults to 30.
output_prediction_file (str, optional): Path of the file to save the predicted answers.
Defaults to "./qa_predictions.json".
output_nbest_file (str, optional): Path of the file to save the n-best answers. Defaults
to "./nbest_predictions.json".
output_null_log_odds_file (str, optional): If unanswerable_exists is True, the score
difference between empty prediction and best non-empty prediction are saved to this
file. These scores can be used to find the best threshold for predicting an empty
answer. Defaults to "./null_odds.json".
null_score_diff_threshold (float, optional): If unanswerable_exists=True and the score
difference between empty prediction and best non-empty prediction is higher than this
threshold, the final predicted answer is empty. Defaults to 0.0.
verbose_logging (bool, optional): Whether to log details of answer postprocessing.
Defaults to False.
output_prediction_file (str, optional): Path of the file to save the
predicted answers. Defaults to "./qa_predictions.json".
output_nbest_file (str, optional): Path of the file to save the n-best answers.
Defaults to "./nbest_predictions.json".
output_null_log_odds_file (str, optional): If unanswerable_exists is True,
the score difference between empty prediction and best non-empty prediction
is saved to this file. These scores can be used to find the best threshold
for predicting an empty answer. Defaults to "./null_odds.json".
null_score_diff_threshold (float, optional): If unanswerable_exists=True
and the score difference between empty prediction and best non-empty
prediction is higher than this threshold, the final predicted
answer is empty.
Defaults to 0.0.
verbose_logging (bool, optional): Whether to log details of
answer postprocessing. Defaults to False.
Returns:
tuple: (OrderedDict, OrderedDict, OrderedDict)
The keys of the dictionaries are the `qa_id` in the original
:class:`utils_nlp.dataset.pytorch.QADataset`
The values of the first dictionary are the predicted answer texts in string type.
The values of the second dictionary are the softmax probabilities of the predicted
answers.
The values of the third dictionary are the n-best answers for each qa_id. Note that
the number of n-best answers can be smaller than `n_best_size` because some
unqualified answers, e.g. answers that are too long, are removed.
The values of the first dictionary are the predicted answer texts
in string type. The values of the second dictionary are the softmax
probabilities of the predicted answers.
The values of the third dictionary are the n-best answers for each qa_id.
Note that the number of n-best answers can be smaller than `n_best_size`
because some unqualified answers, e.g. answers that are too long,
are removed.
"""
with jsonlines.open(examples_file) as reader:
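
A usage sketch for the postprocessing step, assuming the cached example/feature files were written by `QAProcessor.preprocess` with the default `feature_cache_dir`; joining that directory with the module-level file-name constants is an assumption about how the paths are laid out:

import os

# Sketch only: convert predicted logits into final answer texts.
examples_file = os.path.join("./cached_qa_features", CACHED_EXAMPLES_TEST_FILE)
features_file = os.path.join("./cached_qa_features", CACHED_FEATURES_TEST_FILE)
answers, answer_probs, nbest_answers = postprocess_bert_answer(
    results=qa_results,
    examples_file=examples_file,
    features_file=features_file,
    do_lower_case=False,
    unanswerable_exists=False,
)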
@ -753,7 +817,9 @@ def postprocess_bert_answer(
# Sort by the sum of the start and end logits in ascending order,
# so that the first element is the most probable answer
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
)
seen_predictions = {}
nbest = []
@ -786,11 +852,19 @@ def postprocess_bert_answer(
final_text = ""
seen_predictions[final_text] = True
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
)
)
# if we didn't include the empty option in the n-best, include it
if unanswerable_exists:
if "" not in seen_predictions:
nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
nbest.append(
_NbestPrediction(
text="", start_logit=null_start_logit, end_logit=null_end_logit
)
)
# In very rare edge cases we could only have single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
@ -834,7 +908,9 @@ def postprocess_bert_answer(
all_probs[example["qa_id"]] = nbest_json[0]["probability"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
score_diff = (
score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
)
scores_diff_json[example["qa_id"]] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example["qa_id"]] = ""
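
To make the threshold rule above concrete, a tiny sketch with illustrative numbers:

# Sketch only: predict an empty answer when the null score beats the best span by more
# than null_score_diff_threshold (0.0 here).
score_null = 2.5  # start+end logits of the [CLS] (empty) prediction
best_start_logit, best_end_logit = 1.0, 0.8
score_diff = score_null - best_start_logit - best_end_logit  # 0.7
predict_empty = score_diff > 0.0  # True -> final answer is ""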
@ -1000,7 +1076,9 @@ def postprocess_xlnet_answer(
)
)
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
)
seen_predictions = {}
nbest = []
@ -1031,14 +1109,20 @@ def postprocess_xlnet_answer(
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = _get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging)
final_text = _get_final_text(
tok_text, orig_text, tokenizer.do_lower_case, verbose_logging
)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
)
)
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
@ -1185,7 +1269,9 @@ def _create_qa_example(qa_input, is_training):
actual_text = " ".join(d_tokens[start_position : (end_position + 1)])
cleaned_answer_text = " ".join(whitespace_tokenize(a_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
logger.warning(
"Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text,
)
return
else:
start_position = -1
@ -1408,7 +1494,7 @@ def _create_qa_features(
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, example.orig_answer_text
all_doc_tokens, tok_start_position, tok_end_position, example.orig_answer_text,
)
# The -3 accounts for [CLS], [SEP] and [SEP]
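
A small arithmetic sketch of the span budget implied by the comment above; the numbers are illustrative:

# Sketch only: with max_seq_length=384 and a 20-token query, each document span can
# hold 384 - 20 - 3 = 361 tokens; [CLS] and the two [SEP] markers take the remaining 3.
# Longer paragraphs are split into overlapping spans that advance by doc_stride tokens.
max_tokens_for_doc = 384 - 20 - 3  # 361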
@ -1579,7 +1665,7 @@ def _create_qa_features(
# -------------------------------------------------------------------------------------------------
# Post processing helper functions
_PrelimPrediction = collections.namedtuple(
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"],
)
_NbestPrediction = collections.namedtuple("NbestPrediction", ["text", "start_logit", "end_logit"])
@ -1644,7 +1730,9 @@ def _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
logger.info(
"Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text,
)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using