Merge pull request #586 from microsoft/bleik/add-models
update utils and examples
This commit is contained in:
Commit
e02e3b5525
@@ -15,37 +15,6 @@
"# Named Entity Recognition Using Transformer Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Before You Start\n",
"\n",
"The running time shown in this notebook is on a Standard_NC6 Azure Deep Learning Virtual Machine with 1 NVIDIA Tesla K80 GPU. \n",
"> **Tip**: If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. \n",
"\n",
"The table below provides some reference running times on different machine configurations. \n",
"\n",
"|QUICK_RUN|Machine Configurations|Running time|\n",
"|:---------|:----------------------|:------------|\n",
"|True|4 **CPU**s, 14GB memory| ~ 2 minutes|\n",
"|False|4 **CPU**s, 14GB memory| ~1.5 hours|\n",
"|True|1 NVIDIA Tesla K80 GPU, 12GB GPU memory| ~ 1 minute|\n",
"|False|1 NVIDIA Tesla K80 GPU, 12GB GPU memory| ~ 7 minutes |\n",
"\n",
"If you run into a CUDA out-of-memory error or the Jupyter kernel dies constantly, try reducing `BATCH_SIZE` and `MAX_SEQ_LENGTH`, but note that model performance will be compromised. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\n",
"QUICK_RUN = False"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -61,29 +30,30 @@
"<img src=\"https://nlpbp.blob.core.windows.net/images/bert_architecture.png\">"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preparation"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"import random\n",
"import string\n",
"import sys\n",
"from tempfile import TemporaryDirectory\n",
"\n",
"import pandas as pd\n",
"import scrapbook as sb\n",
"import torch\n",
"\n",
"from tempfile import TemporaryDirectory\n",
"from utils_nlp.dataset import wikigold\n",
"from utils_nlp.common.timer import Timer\n",
"from seqeval.metrics import classification_report\n",
"from utils_nlp.models.transformers.named_entity_recognition import TokenClassifier"
"from sklearn.model_selection import train_test_split\n",
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
"from utils_nlp.common.timer import Timer\n",
"from utils_nlp.dataset import wikigold\n",
"from utils_nlp.dataset.ner_utils import read_conll_file\n",
"from utils_nlp.dataset.url_utils import maybe_download\n",
"from utils_nlp.models.transformers.named_entity_recognition import (\n",
"    TokenClassificationProcessor, TokenClassifier)\n"
]
},
{
@@ -93,9 +63,38 @@
"## Configuration"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The running time shown in this notebook is on a Standard_NC12 Azure Virtual Machine with 2 NVIDIA Tesla K80 GPUs. \n",
"> **Tip**: If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. \n",
"\n",
"The table below provides some reference running times on different machine configurations. \n",
"\n",
"|QUICK_RUN|Machine Configurations|Running time|\n",
"|:---------|:----------------------|:------------|\n",
"|True|4 CPUs, 14GB memory| ~ 2 minutes|\n",
"|False|4 CPUs, 14GB memory| ~1.5 hours|\n",
"|True|1 NVIDIA Tesla K80 GPU, 12GB GPU memory| ~ 1 minute|\n",
"|False|1 NVIDIA Tesla K80 GPU, 12GB GPU memory| ~ 7 minutes |\n",
"\n",
"If you run into a CUDA out-of-memory error or the Jupyter kernel dies constantly, try reducing `BATCH_SIZE` and `MAX_SEQ_LENGTH`, but note that model performance will be compromised. "
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\n",
"QUICK_RUN = False"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"tags": [
"parameters"
@@ -103,22 +102,17 @@
},
"outputs": [],
"source": [
"# Wikigold dataset\n",
"DATA_URL = (\n",
"    \"https://raw.githubusercontent.com/juand-r/entity-recognition-datasets\"\n",
"    \"/master/data/wikigold/CONLL-format/data/wikigold.conll.txt\"\n",
")\n",
"\n",
"# fraction of the dataset used for testing\n",
"TEST_DATA_FRACTION = 0.3\n",
"\n",
"# sub-sampling ratio for training\n",
"TRAIN_SAMPLE_RATIO = 1\n",
"\n",
"# sub-sampling ratio for testing\n",
"TEST_SAMPLE_RATIO = 1\n",
"\n",
"NUM_TRAIN_EPOCHS = 5\n",
"\n",
"# update variables for quick run option\n",
"if QUICK_RUN:\n",
"    TRAIN_SAMPLE_RATIO = 0.1\n",
"    TEST_SAMPLE_RATIO = 0.1\n",
"    NUM_TRAIN_EPOCHS = 1\n",
"# sub-sampling ratio\n",
"SAMPLE_RATIO = 1\n",
"\n",
"# the data path used to save the downloaded data file\n",
"DATA_PATH = TemporaryDirectory().name\n",
@@ -131,16 +125,18 @@
"torch.manual_seed(RANDOM_SEED)\n",
"\n",
"# model configurations\n",
"NUM_TRAIN_EPOCHS = 5\n",
"MODEL_NAME = \"bert-base-cased\"\n",
"DO_LOWER_CASE = False\n",
"MAX_SEQ_LENGTH = 200\n",
"TRAILING_PIECE_TAG = \"X\"\n",
"DEVICE = \"cuda\"\n",
"NUM_GPUS = None  # uses all if available\n",
"BATCH_SIZE = 16\n",
"\n",
"if torch.cuda.is_available():\n",
"    BATCH_SIZE = 16\n",
"else:\n",
"    BATCH_SIZE = 8\n"
"# update variables for quick run option\n",
"if QUICK_RUN:\n",
"    SAMPLE_RATIO = 0.1\n",
"    NUM_TRAIN_EPOCHS = 1"
]
},
{
@@ -151,45 +147,275 @@
"\n",
"The dataset used in this notebook is the [wikigold dataset](https://www.aclweb.org/anthology/W09-3302). The wikigold dataset consists of 145 manually labelled Wikipedia articles, including 1841 sentences and 40k tokens in total. The dataset can be directly downloaded from [here](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold). \n",
"\n",
"A helper function `load_dataset` downloads the raw wikigold data, splits it into training and testing datasets (also sub-sampling if the sampling ratio is smaller than 1.0), and then processes it for the transformer model. Everything is done in one function call, and you can use the processed training and testing PyTorch datasets to fine-tune the model and evaluate its performance."
"In the following cell, we download the data file, parse the tokens and labels, sample a given number of sentences, and split the dataset for training and testing."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Maximum sequence length is: 144\n"
]
}
],
"source": [
"train_dataloader, test_dataloader, label_map, test_dataset = wikigold.load_dataset(\n",
"    local_path=DATA_PATH,\n",
"    test_fraction=TEST_DATA_FRACTION,\n",
"    random_seed=RANDOM_SEED,\n",
"    train_sample_ratio=TRAIN_SAMPLE_RATIO,\n",
"    test_sample_ratio=TEST_SAMPLE_RATIO,\n",
"    model_name=MODEL_NAME,\n",
"    to_lower=DO_LOWER_CASE,\n",
"    cache_dir=CACHE_DIR,\n",
"    max_len=MAX_SEQ_LENGTH,\n",
"    trailing_piece_tag=TRAILING_PIECE_TAG,\n",
"    batch_size=BATCH_SIZE,\n",
"    num_gpus=None\n",
"# download data\n",
"file_name = DATA_URL.split(\"/\")[-1]  # a name for the downloaded file\n",
"maybe_download(DATA_URL, file_name, DATA_PATH)\n",
"data_file = os.path.join(DATA_PATH, file_name)\n",
"\n",
"# parse CoNLL file\n",
"sentence_list, labels_list = read_conll_file(data_file, sep=\" \")\n",
"\n",
"# sub-sample (optional)\n",
"random.seed(RANDOM_SEED)\n",
"sample_size = int(SAMPLE_RATIO * len(sentence_list))\n",
"sentence_list, labels_list = list(\n",
"    zip(*random.sample(list(zip(sentence_list, labels_list)), k=sample_size))\n",
")\n",
"\n",
"# train-test split\n",
"train_sentence_list, test_sentence_list, train_labels_list, test_labels_list = train_test_split(\n",
"    sentence_list, labels_list, test_size=TEST_DATA_FRACTION, random_state=RANDOM_SEED\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following is an example input sentence of the training set."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
"    .dataframe tbody tr th:only-of-type {\n",
"        vertical-align: middle;\n",
"    }\n",
"\n",
"    .dataframe tbody tr th {\n",
"        vertical-align: top;\n",
"    }\n",
"\n",
"    .dataframe thead th {\n",
"        text-align: right;\n",
"    }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: right;\">\n",
"      <th></th>\n",
"      <th>token</th>\n",
"      <th>label</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <th>0</th>\n",
"      <td>In</td>\n",
"      <td>O</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>1</th>\n",
"      <td>1999</td>\n",
"      <td>O</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>2</th>\n",
"      <td>,</td>\n",
"      <td>O</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>3</th>\n",
"      <td>the</td>\n",
"      <td>O</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>4</th>\n",
"      <td>Caloi</td>\n",
"      <td>I-PER</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>5</th>\n",
"      <td>family</td>\n",
"      <td>O</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>6</th>\n",
"      <td>sold</td>\n",
"      <td>O</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>7</th>\n",
"      <td>the</td>\n",
"      <td>O</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>8</th>\n",
"      <td>majority</td>\n",
"      <td>O</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>9</th>\n",
"      <td>of</td>\n",
"      <td>O</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <th>10</th>\n",
"      <td>Caloi</td>\n",
"      <td>I-ORG</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"       token  label\n",
"0         In      O\n",
"1       1999      O\n",
"2          ,      O\n",
"3        the      O\n",
"4      Caloi  I-PER\n",
"5     family      O\n",
"6       sold      O\n",
"7        the      O\n",
"8   majority      O\n",
"9         of      O\n",
"10     Caloi  I-ORG"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame({\"token\": train_sentence_list[0], \"label\": train_labels_list[0]}).head(11)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> If your data is unlabeled, try using an annotation tool to simplify the process of labeling. The example [here](../annotation/Doccano.md) introduces [Doccano](https://github.com/chakki-works/doccano) and shows how it can be used for NER annotation."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create PyTorch Datasets and Dataloaders\n",
"Given the tokenized input and corresponding labels, we use a custom processor to convert our input lists into a PyTorch dataset that can be used with our token classifier. Next, we create PyTorch dataloaders for training and testing."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:Token lists with length > 512 will be truncated\n",
"WARNING:root:Token lists with length > 512 will be truncated\n"
]
}
],
"source": [
"processor = TokenClassificationProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE, cache_dir=CACHE_DIR)\n",
"\n",
"label_map = TokenClassificationProcessor.create_label_map(\n",
"    label_lists=labels_list, trailing_piece_tag=TRAILING_PIECE_TAG\n",
")\n",
"\n",
"train_dataset = processor.preprocess(\n",
"    text=train_sentence_list,\n",
"    max_len=MAX_SEQ_LENGTH,\n",
"    labels=train_labels_list,\n",
"    label_map=label_map,\n",
"    trailing_piece_tag=TRAILING_PIECE_TAG,\n",
")\n",
"train_dataloader = dataloader_from_dataset(\n",
"    train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True, distributed=False\n",
")\n",
"\n",
"test_dataset = processor.preprocess(\n",
"    text=test_sentence_list,\n",
"    max_len=MAX_SEQ_LENGTH,\n",
"    labels=test_labels_list,\n",
"    label_map=label_map,\n",
"    trailing_piece_tag=TRAILING_PIECE_TAG,\n",
")\n",
"test_dataloader = dataloader_from_dataset(\n",
"    test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False, distributed=False\n",
")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train Model\n",
"\n",
"There are two steps to train an NER model using a pretrained transformer model: 1) instantiate a TokenClassifier class, which is a wrapper of the transformer using the BERT architecture, and 2) fit the model using the preprocessed training dataset. The member method `fit` of the TokenClassifier class is used to fine-tune the model."
"There are two steps to train an NER model using a pretrained transformer model: 1) Instantiate a TokenClassifier class, which is a wrapper of a transformer-based network, and 2) Fit the model using the preprocessed training dataloader. The member method `fit` of the TokenClassifier class is used to fine-tune the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d7c19dfe849b4bb3b195e792b0ccc809",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, description='Downloading', max=435779157, style=ProgressStyle(description_…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/media/bleik2/backup/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"  warnings.warn('Was asked to gather along dimension 0, but all '\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training time : 0.075 hrs\n"
]
}
],
"source": [
"# Instantiate a TokenClassifier class for NER using a pretrained transformer model\n",
"model = TokenClassifier(\n",
@@ -203,13 +429,13 @@
"    model.fit(\n",
"        train_dataloader=train_dataloader,\n",
"        num_epochs=NUM_TRAIN_EPOCHS,\n",
"        num_gpus=None,\n",
"        num_gpus=NUM_GPUS,\n",
"        local_rank=-1,\n",
"        weight_decay=0.0,\n",
"        learning_rate=5e-5,\n",
"        adam_epsilon=1e-8,\n",
"        warmup_steps=0,\n",
"        verbose=True,\n",
"        verbose=False,\n",
"        seed=RANDOM_SEED\n",
"    )\n",
"\n",
@@ -227,9 +453,31 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Scoring: 100%|██████████| 18/18 [00:08<00:00,  2.49it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prediction time : 0.002 hrs\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"with Timer() as t:\n",
"    preds = model.predict(\n",
@@ -245,12 +493,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the true token labels of the testing dataset. "
"Get the true token labels of the testing dataset:"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -266,9 +514,26 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"           precision    recall  f1-score   support\n",
"\n",
"      MISC       0.68      0.67      0.68       221\n",
"       LOC       0.79      0.85      0.82       317\n",
"       ORG       0.73      0.81      0.76       274\n",
"       PER       0.92      0.93      0.92       257\n",
"\n",
"micro avg       0.78      0.82      0.80      1069\n",
"macro avg       0.78      0.82      0.80      1069\n",
"\n"
]
}
],
"source": [
"predicted_labels = model.get_predicted_token_labels(\n",
"    predictions=preds,\n",
@@ -284,6 +549,94 @@
"print(report)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Score Example Sentences\n",
"Finally, we test the model on some random input sentences."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:Token lists with length > 512 will be truncated\n",
"Scoring: 100%|██████████| 1/1 [00:00<00:00,  7.56it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" Is it true that Jane works at Microsoft?\n",
"       tokens labels\n",
"0          Is      O\n",
"1          it      O\n",
"2        true      O\n",
"3        that      O\n",
"4        Jane  I-PER\n",
"5       works      O\n",
"6          at      O\n",
"7  Microsoft?  I-ORG\n",
"\n",
" Joe now lives in Copenhagen.\n",
"        tokens labels\n",
"0          Joe  I-PER\n",
"1          now      O\n",
"2        lives      O\n",
"3           in      O\n",
"4  Copenhagen.  I-LOC\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# test\n",
"sample_text = [\n",
"    \"Is it true that Jane works at Microsoft?\",\n",
"    \"Joe now lives in Copenhagen.\"\n",
"]\n",
"sample_tokens = [x.split() for x in sample_text]\n",
"\n",
"sample_dataset = processor.preprocess(\n",
"    text=sample_tokens,\n",
"    max_len=MAX_SEQ_LENGTH,\n",
"    labels=None,\n",
"    label_map=label_map,\n",
"    trailing_piece_tag=TRAILING_PIECE_TAG,\n",
")\n",
"sample_dataloader = dataloader_from_dataset(\n",
"    sample_dataset, batch_size=BATCH_SIZE, num_gpus=None, shuffle=False, distributed=False\n",
")\n",
"preds = model.predict(\n",
"    test_dataloader=sample_dataloader,\n",
"    num_gpus=None,\n",
"    verbose=True\n",
")\n",
"predicted_labels = model.get_predicted_token_labels(\n",
"    predictions=preds,\n",
"    label_map=label_map,\n",
"    dataset=sample_dataset\n",
")\n",
"\n",
"for i in range(len(sample_text)):\n",
"    print(\"\\n\", sample_text[i])\n",
"    print(pd.DataFrame({\"tokens\": sample_tokens[i], \"labels\": predicted_labels[i]}))"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -293,9 +646,64 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.78,
"encoder": "json",
"name": "precision",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "precision"
}
},
"output_type": "display_data"
},
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.82,
"encoder": "json",
"name": "recall",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "recall"
}
},
"output_type": "display_data"
},
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.8,
"encoder": "json",
"name": "f1",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "f1"
}
},
"output_type": "display_data"
}
],
"source": [
"report_splits = report.split('\\n')[-2].split()\n",
"\n",
@@ -306,11 +714,10 @@
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3.6 - AzureML",
"display_name": "nlp_gpu",
"language": "python",
"name": "python3-azureml"
"name": "nlp_gpu"
},
"language_info": {
"codemirror_mode": {
@@ -326,5 +733,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

@@ -112,7 +112,7 @@ def test_wikigold(tmp_path):


def test_ner_utils(ner_utils_test_data):
    output = preprocess_conll(ner_utils_test_data["input"])
    output = preprocess_conll(ner_utils_test_data["input"], sep=" ")
    assert output == ner_utils_test_data["expected_output"]

@@ -85,7 +85,9 @@ def qa_test_data(qa_test_df, tmp_module):
    )

    # xlnet
    qa_processor_xlnet = QAProcessor(model_name="xlnet-base-cased", cache_dir=tmp_module)
    qa_processor_xlnet = QAProcessor(
        model_name="xlnet-base-cased", cache_dir=tmp_module
    )
    train_features_xlnet = qa_processor_xlnet.preprocess(
        train_dataset,
        is_training=True,
@@ -148,13 +150,19 @@ def test_QAProcessor(qa_test_data, tmp_module):
    ]:
        qa_processor = QAProcessor(model_name=model_name, cache_dir=tmp_module)
        qa_processor.preprocess(
            qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module,
            qa_test_data["train_dataset"],
            is_training=True,
            feature_cache_dir=tmp_module,
        )
        qa_processor.preprocess(
            qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module,
            qa_test_data["train_dataset_list"],
            is_training=True,
            feature_cache_dir=tmp_module,
        )
        qa_processor.preprocess(
            qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module,
            qa_test_data["test_dataset"],
            is_training=False,
            feature_cache_dir=tmp_module,
        )

    # test unsupported model type
@@ -188,7 +196,9 @@ def test_AnswerExtractor(qa_test_data, tmp_module):
    # bert
    qa_extractor_bert = AnswerExtractor(cache_dir=tmp_module)
    train_loader_bert = dataloader_from_dataset(qa_test_data["train_features_bert"])
    test_loader_bert = dataloader_from_dataset(qa_test_data["test_features_bert"], shuffle=False)
    test_loader_bert = dataloader_from_dataset(
        qa_test_data["test_features_bert"], shuffle=False
    )
    qa_extractor_bert.fit(train_loader_bert, verbose=False, cache_model=True)

    # test saving fine-tuned model
@@ -203,13 +213,19 @@ def test_AnswerExtractor(qa_test_data, tmp_module):

    # xlnet
    train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_xlnet"])
    test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_xlnet"], shuffle=False)
    qa_extractor_xlnet = AnswerExtractor(model_name="xlnet-base-cased", cache_dir=tmp_module)
    test_loader_xlnet = dataloader_from_dataset(
        qa_test_data["test_features_xlnet"], shuffle=False
    )
    qa_extractor_xlnet = AnswerExtractor(
        model_name="xlnet-base-cased", cache_dir=tmp_module
    )
    qa_extractor_xlnet.fit(train_loader_xlnet, verbose=False, cache_model=False)
    qa_extractor_xlnet.predict(test_loader_xlnet, verbose=False)

    # distilbert
    train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_distilbert"])
    train_loader_xlnet = dataloader_from_dataset(
        qa_test_data["train_features_distilbert"]
    )
    test_loader_xlnet = dataloader_from_dataset(
        qa_test_data["test_features_distilbert"], shuffle=False
    )
@@ -23,7 +23,7 @@ def test_token_classifier_fit_predict(tmpdir, ner_test_data):
    )

    # test fit, no warmup
    train_dataset = processor.preprocess_for_bert(
    train_dataset = processor.preprocess(
        text=ner_test_data["INPUT_TEXT"],
        max_len=max_seq_len,
        labels=ner_test_data["INPUT_LABELS"],
@@ -81,7 +81,7 @@ PIP_BASE = {
        "https://github.com/explosion/spacy-models/releases/download/"
        "en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz"
    ),
    "transformers": "transformers==2.5.0",
    "transformers": "transformers==2.9.0",
    "gensim": "gensim>=3.7.0",
    "nltk": "nltk>=3.4",
    "seqeval": "seqeval>=0.0.12",
@@ -4,10 +4,9 @@
"""Common helper functions for preprocessing Named Entity Recognition (NER) datasets."""


def preprocess_conll(text, data_type=""):
def preprocess_conll(text, sep="\t"):
    """
    Helper function converting data in conll format to word lists
    and token label lists.
    Converts data in CoNLL format to word and label lists.

    Args:
        text (str): Text string in CoNLL format, e.g.
@@ -20,8 +19,8 @@ def preprocess_conll(text, data_type=""):
            of I-ORG
            Minnesota I-ORG
            . O"
        data_type (str, optional): String that briefly describes the data,
            e.g. "train"
        sep (str, optional): Column separator.
            Defaults to "\t".
    Returns:
        tuple:
            (list of word lists, list of token label lists)
@@ -37,11 +36,29 @@
        # split each sentence string into "word label" pairs
        s_split = s.split("\n")
        # split "word label" pairs
        s_split_split = [t.split() for t in s_split]
        s_split_split = [t.split(sep) for t in s_split]
        sentence_list.append([t[0] for t in s_split_split if len(t) > 1])
        labels_list.append([t[1] for t in s_split_split if len(t) > 1])

        if len(s_split_split) > max_seq_len:
            max_seq_len = len(s_split_split)
    print("Maximum sequence length in the {0} data is: {1}".format(data_type, max_seq_len))
    print("Maximum sequence length is: {0}".format(max_seq_len))
    return sentence_list, labels_list


def read_conll_file(file_path, sep="\t", encoding=None):
    """
    Reads a data file in CoNLL format and returns word and label lists.

    Args:
        file_path (str): Data file path.
        sep (str, optional): Column separator. Defaults to "\t".
        encoding (str): File encoding used when reading the file.
            Defaults to None.

    Returns:
        (list, list): A tuple of word and label lists (list of lists).
    """
    with open(file_path, encoding=encoding) as f:
        data = f.read()
    return preprocess_conll(data, sep=sep)
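A quick sketch of how the new `sep` parameter is used. The CoNLL snippet below is a made-up example, and it assumes sentences are separated by blank lines; the wikigold file uses space-separated columns, while the default stays tab-separated:

from utils_nlp.dataset.ner_utils import preprocess_conll

# Two space-separated "word label" columns per line, blank line between sentences.
conll_text = "The O\nUnited I-LOC\nStates I-LOC\n. O\n\nJane I-PER\nworks O"
sentences, labels = preprocess_conll(conll_text, sep=" ")
# sentences -> [["The", "United", "States", "."], ["Jane", "works"]]
# labels    -> [["O", "I-LOC", "I-LOC", "O"], ["I-PER", "O"]]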
@@ -18,7 +18,9 @@ from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.dataset.ner_utils import preprocess_conll
from utils_nlp.dataset.url_utils import maybe_download
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
from utils_nlp.models.transformers.named_entity_recognition import TokenClassificationProcessor
from utils_nlp.models.transformers.named_entity_recognition import (
    TokenClassificationProcessor,
)

URL = (
    "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets"
@@ -68,7 +70,9 @@ def load_train_test_dfs(local_cache_path="./", test_fraction=0.5, random_seed=No
    train_sentence_list = sentence_list[test_sentence_count:]
    train_labels_list = labels_list[test_sentence_count:]

    train_df = pd.DataFrame({"sentence": train_sentence_list, "labels": train_labels_list})
    train_df = pd.DataFrame(
        {"sentence": train_sentence_list, "labels": train_labels_list}
    )

    test_df = pd.DataFrame({"sentence": test_sentence_list, "labels": test_labels_list})
@@ -152,7 +156,9 @@ def load_dataset(
    """

    train_df, test_df = load_train_test_dfs(
        local_cache_path=local_path, test_fraction=test_fraction, random_seed=random_seed
        local_cache_path=local_path,
        test_fraction=test_fraction,
        random_seed=random_seed,
    )

    if train_sample_ratio > 1.0:
@@ -160,7 +166,9 @@
        logging.warning("Setting the training sample ratio to 1.0")
    elif train_sample_ratio < 0:
        logging.error("Invalid training sample ratio: {}".format(train_sample_ratio))
        raise ValueError("Invalid training sample ratio: {}".format(train_sample_ratio))
        raise ValueError(
            "Invalid training sample ratio: {}".format(train_sample_ratio)
        )

    if test_sample_ratio > 1.0:
        test_sample_ratio = 1.0
@@ -174,7 +182,9 @@
    if test_sample_ratio < 1.0:
        test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)

    processor = TokenClassificationProcessor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
    processor = TokenClassificationProcessor(
        model_name=model_name, to_lower=to_lower, cache_dir=cache_dir
    )

    label_map = TokenClassificationProcessor.create_label_map(
        label_lists=train_df["labels"], trailing_piece_tag=trailing_piece_tag
@@ -197,11 +207,19 @@
    )

    train_dataloader = dataloader_from_dataset(
        train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True, distributed=False
        train_dataset,
        batch_size=batch_size,
        num_gpus=num_gpus,
        shuffle=True,
        distributed=False,
    )

    test_dataloader = dataloader_from_dataset(
        test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False, distributed=False
        test_dataset,
        batch_size=batch_size,
        num_gpus=num_gpus,
        shuffle=False,
        distributed=False,
    )

    return (train_dataloader, test_dataloader, label_map, test_dataset)
@@ -5,34 +5,29 @@
# This script reuses some code from https://github.com/huggingface/transformers/
# Add to noticefile

from collections import namedtuple
import logging
import os
import pickle
from tqdm import tqdm
from collections import namedtuple

import torch
from torch.utils.data import (
    DataLoader,
    SequentialSampler,
    RandomSampler,
)

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers import BertModel
from tqdm import tqdm
from transformers import AutoTokenizer, BertModel

from utils_nlp.common.pytorch_utils import (
    compute_training_steps,
    get_device,
    get_amp,
    get_device,
    move_model_to_device,
    parallelize_model,
)
from utils_nlp.eval import compute_rouge_python
from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
from utils_nlp.models.transformers.bertsum import model_builder
from utils_nlp.models.transformers.bertsum.model_builder import AbsSummarizer
from utils_nlp.models.transformers.bertsum.predictor import build_predictor
from utils_nlp.models.transformers.common import Transformer

MODEL_CLASS = {"bert-base-uncased": BertModel}
@@ -134,8 +129,11 @@ class BertSumAbsProcessor:

        """
        self.model_name = model_name
        self.tokenizer = TOKENIZER_CLASS[self.model_name].from_pretrained(
            self.model_name, do_lower_case=to_lower, cache_dir=cache_dir
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=to_lower,
            cache_dir=cache_dir,
            output_loading_info=False,
        )

        self.symbols = {
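The tokenizer changes in this commit all follow the same pattern: per-model TOKENIZER_CLASS lookup tables are replaced by transformers' AutoTokenizer, which resolves the concrete tokenizer class from the checkpoint name. A minimal sketch of the call, where the model name is only an example:

from transformers import AutoTokenizer

# Resolves to the matching tokenizer class (e.g. BertTokenizer for a BERT
# checkpoint) without maintaining a mapping dict by hand.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased", do_lower_case=True, cache_dir="."
)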
@@ -156,7 +154,7 @@ class BertSumAbsProcessor:

    @staticmethod
    def list_supported_models():
        return list(MODEL_CLASS.keys())
        return list(MODEL_CLASS)

    @property
    def model_name(self):
@@ -184,7 +182,7 @@ class BertSumAbsProcessor:
                also contains the target ids and the number of tokens
                in the target and target text.
            device (torch.device): A PyTorch device.
            model_name (str, optional): Model name used to format the inputs.
            model_name (str): Model name used to format the inputs.
            train_mode (bool, optional): Training mode flag.
                Defaults to True.
|
@ -403,7 +401,8 @@ class BertSumAbs(Transformer):
|
|||
check MODEL_CLASS for supported models. Defaults to "bert-base-uncased".
|
||||
finetune_bert (bool, option): Whether the bert model in the encoder is
|
||||
finetune or not. Defaults to True.
|
||||
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
|
||||
cache_dir (str, optional): Directory to cache the tokenizer.
|
||||
Defaults to ".".
|
||||
label_smoothing (float, optional): The amount of label smoothing.
|
||||
Value range is [0, 1]. Defaults to 0.1.
|
||||
test (bool, optional): Whether the class is initiated for test or not.
|
||||
|
@@ -412,13 +411,11 @@ class BertSumAbs(Transformer):
            max_pos_length (int, optional): maximum positional embedding length for the
                input. Defaults to 768.
        """

        super().__init__(
            model_class=MODEL_CLASS,
            model_name=model_name,
            num_labels=0,
            cache_dir=cache_dir,
        model = MODEL_CLASS[model_name].from_pretrained(
            model_name, cache_dir=cache_dir, num_labels=0, output_loading_info=False
        )
        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)

        if model_name not in self.list_supported_models():
            raise ValueError(
                "Model name {} is not supported by BertSumAbs. "
@@ -616,10 +613,7 @@ class BertSumAbs(Transformer):
        )

        train_dataloader = DataLoader(
            train_dataset,
            sampler=sampler,
            batch_size=batch_size,
            collate_fn=collate_fn,
            train_dataset, sampler=sampler, batch_size=batch_size, collate_fn=collate_fn
        )

        # compute the max number of training steps
@@ -13,7 +13,7 @@ from torch.utils.data.distributed import DistributedSampler

from transformers import RobertaConfig, BertConfig

from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
from utils_nlp.models.transformers.common import Transformer
from utils_nlp.common.pytorch_utils import (
    get_device,
    move_model_to_device,
@@ -52,8 +52,7 @@ MODEL_CLASS.update(
    {k: BertForSequenceToSequence for k in MINILM_PRETRAINED_MODEL_ARCHIVE_MAP}
)


TOKENIZER_CLASS = {}
TOKENIZER_CLASS.update({k: UnilmTokenizer for k in UNILM_PRETRAINED_CONFIG_ARCHIVE_MAP})
TOKENIZER_CLASS.update({k: MinilmTokenizer for k in MINILM_PRETRAINED_CONFIG_ARCHIVE_MAP})
@@ -14,32 +14,14 @@ import numpy as np
import torch
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers.tokenization_bert import BertTokenizer
from transformers.tokenization_distilbert import DistilBertTokenizer
from transformers.tokenization_roberta import RobertaTokenizer
from transformers.tokenization_xlnet import XLNetTokenizer

from utils_nlp.common.pytorch_utils import (
    get_amp,
    get_device,
    move_model_to_device,
    get_amp,
    parallelize_model,
)

TOKENIZER_CLASS = {}
TOKENIZER_CLASS.update({k: BertTokenizer for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
TOKENIZER_CLASS.update(
    {k: RobertaTokenizer for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP}
)
TOKENIZER_CLASS.update({k: XLNetTokenizer for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
TOKENIZER_CLASS.update(
    {k: DistilBertTokenizer for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)

MAX_SEQ_LEN = 512

logger = logging.getLogger(__name__)
@@ -48,35 +30,14 @@ logger = logging.getLogger(__name__)
class Transformer:
    def __init__(
        self,
        model_class,
        model_name="bert-base-cased",
        num_labels=2,
        cache_dir=".",
        load_model_from_dir=None,
        model_name,
        model,
        cache_dir,
    ):

        if model_name not in self.list_supported_models():
            raise ValueError(
                "Model name {0} is not supported by {1}. "
                "Call '{1}.list_supported_models()' to get all supported model "
                "names.".format(model_name, self.__class__.__name__)
            )
        self._model_name = model_name
        self._model_type = model_name.split("-")[0]
        self.model = model
        self.cache_dir = cache_dir
        self.load_model_from_dir = load_model_from_dir
        if load_model_from_dir is None:
            self.model = model_class[model_name].from_pretrained(
                model_name,
                cache_dir=cache_dir,
                num_labels=num_labels,
                output_loading_info=False,
            )
        else:
            logger.info("Loading cached model from {}".format(load_model_from_dir))
            self.model = model_class[model_name].from_pretrained(
                load_model_from_dir, num_labels=num_labels, output_loading_info=False
            )

    @property
    def model_name(self):
@@ -241,7 +202,8 @@ class Transformer:
                if isinstance(outputs, tuple):
                    loss = outputs[0]
                else:
                    # Accommodate models based on older versions of Transformers, e.g. UniLM
                    # Accommodate models based on older versions of Transformers,
                    # e.g. UniLM
                    loss = outputs

                if num_gpus > 1:
@@ -317,7 +279,7 @@ class Transformer:
                    saved_model_path = os.path.join(
                        self.cache_dir, f"{self.model_name}_step_{global_step}.pt"
                    )
                    self.save_model(global_step, saved_model_path)
                    self.save_model(saved_model_path)
                    if validation_function:
                        validation_log = validation_function(self)
                        logger.info(validation_log)
@@ -327,7 +289,7 @@ class Transformer:
                break
        if fp16 and amp:
            self.amp_state_dict = amp.state_dict()

        # release GPU memories
        self.model.cpu()
        torch.cuda.empty_cache()
@@ -12,14 +12,20 @@ from multiprocessing import Pool, cpu_count

import numpy as np
import torch
from torch.utils.data import (
    DataLoader,
    SequentialSampler,
    RandomSampler,
)
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers import BertModel, DistilBertModel
from transformers import AutoTokenizer, BertModel, DistilBertModel

from utils_nlp.common.pytorch_utils import (
    compute_training_steps,
    get_device,
    move_model_to_device,
    parallelize_model,
)
from utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection
from utils_nlp.models.transformers.abstractive_summarization_bertsum import (
    fit_to_block_size,
)

from utils_nlp.models.transformers.bertsum import model_builder
from utils_nlp.models.transformers.bertsum.data_loader import (
@@ -32,17 +38,7 @@ from utils_nlp.models.transformers.bertsum.dataset import (
    ExtSumProcessedIterableDataset,
)
from utils_nlp.models.transformers.bertsum.model_builder import BertSumExt
from utils_nlp.common.pytorch_utils import (
    compute_training_steps,
    get_device,
    move_model_to_device,
    parallelize_model,
)
from utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection
from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
from utils_nlp.models.transformers.abstractive_summarization_bertsum import (
    fit_to_block_size,
)
from utils_nlp.models.transformers.common import Transformer

MODEL_CLASS = {
    "bert-base-uncased": BertModel,
@@ -302,7 +298,7 @@ def parallel_preprocess(input_data, preprocess, num_pool=-1):
    p = Pool(num_pool)

    results = p.map(
        preprocess, input_data, chunksize=min(1, int(len(input_data) / num_pool)),
        preprocess, input_data, chunksize=min(1, int(len(input_data) / num_pool))
    )
    p.close()
    p.join()
@@ -347,8 +343,11 @@ class ExtSumProcessor:

        """
        self.model_name = model_name
        self.tokenizer = TOKENIZER_CLASS[self.model_name].from_pretrained(
            self.model_name, do_lower_case=to_lower, cache_dir=cache_dir
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=to_lower,
            cache_dir=cache_dir,
            output_loading_info=False,
        )
        self.sep_vid = self.tokenizer.vocab["[SEP]"]
        self.cls_vid = self.tokenizer.vocab["[CLS]"]
@@ -361,7 +360,7 @@ class ExtSumProcessor:

    @staticmethod
    def list_supported_models():
        return list(TOKENIZER_CLASS.keys())
        return list(MODEL_CLASS)

    @property
    def model_name(self):
@@ -389,7 +388,7 @@ class ExtSumProcessor:
                text. If train_mode is True, it also contains the labels and target
                text.
            device (torch.device): A PyTorch device.
            model_name (str, optional): Model name used to format the inputs.
            model_name (str): Model name used to format the inputs.
            train_mode (bool, optional): Training mode flag.
                Defaults to True.
@@ -500,7 +499,6 @@ class ExtSumProcessor:

        if len(src) == 0:
            raise ValueError("source doesn't have any sentences")
            return None

        original_src_txt = [" ".join(s) for s in src]
        # no filtering for prediction
@@ -588,12 +586,11 @@ class ExtractiveSummarizer(Transformer):
            Defaults to ".".
        """

        super().__init__(
            model_class=MODEL_CLASS,
            model_name=model_name,
            num_labels=0,
            cache_dir=cache_dir,
        model = MODEL_CLASS[model_name].from_pretrained(
            model_name, cache_dir=cache_dir, num_labels=0, output_loading_info=False
        )
        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)

        if model_name not in self.list_supported_models():
            raise ValueError(
                "Model name {} is not supported by ExtractiveSummarizer. "
@@ -621,7 +618,7 @@ class ExtractiveSummarizer(Transformer):

    @staticmethod
    def list_supported_models():
        return list(MODEL_CLASS.keys())
        return list(MODEL_CLASS)

    def fit(
        self,
@@ -7,32 +7,21 @@ from collections import Iterable
import numpy as np
import torch
from torch.utils.data import TensorDataset
from transformers.modeling_bert import (
    BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    BertForTokenClassification,
)
from transformers.modeling_distilbert import (
    DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    DistilBertForTokenClassification,
from transformers import (
    MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
)

from utils_nlp.common.pytorch_utils import compute_training_steps
from utils_nlp.models.transformers.common import (
    MAX_SEQ_LEN,
    TOKENIZER_CLASS,
    Transformer,
)
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, Transformer

TC_MODEL_CLASS = {}
TC_MODEL_CLASS.update(
    {k: BertForTokenClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
TC_MODEL_CLASS.update(
    {
        k: DistilBertForTokenClassification
        for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
    }
)
supported_models = [
    list(x.pretrained_config_archive_map)
    for x in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
]
supported_models = sorted([x for y in supported_models for x in y])


class TokenClassificationProcessor:
@@ -52,7 +41,7 @@ class TokenClassificationProcessor:
        self.model_name = model_name
        self.to_lower = to_lower
        self.cache_dir = cache_dir
        self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=to_lower,
            cache_dir=cache_dir,
@@ -68,7 +57,7 @@ class TokenClassificationProcessor:
            batch (tuple): A tuple containing input ids, attention mask,
                segment ids, and labels tensors.
            device (torch.device): A PyTorch device.
            model_name (str, optional): Model name used to format the inputs.
            model_name (str): Model name used to format the inputs.
            train_mode (bool, optional): Training mode flag.
                Defaults to True.
@@ -77,7 +66,7 @@ class TokenClassificationProcessor:
            Labels are only returned when train_mode is True.
        """
        batch = tuple(t.to(device) for t in batch)
        if model_name.split("-")[0] in ["bert", "distilbert"]:
        if model_name in supported_models:
            if train_mode:
                inputs = {
                    "input_ids": batch[0],
@@ -110,17 +99,15 @@ class TokenClassificationProcessor:
            dict: A dictionary object to map a label (str) to an ID (int).
        """

        label_set = set()
        for labels in label_lists:
            label_set.update(labels)
        unique_labels = sorted(set([x for y in label_lists for x in y]))
        label_map = {label: i for i, label in enumerate(unique_labels)}

        label_map = {label: i for i, label in enumerate(label_set)}
        if trailing_piece_tag not in unique_labels:
            label_map[trailing_piece_tag] = len(unique_labels)

        if trailing_piece_tag not in label_set:
            label_map[trailing_piece_tag] = len(label_set)
        return label_map

    def preprocess_for_bert(
    def preprocess(
        self,
        text,
        max_len=MAX_SEQ_LEN,
@@ -187,6 +174,10 @@
            )
            max_len = MAX_SEQ_LEN

        logging.warn(
            "Token lists with length > {} will be truncated".format(MAX_SEQ_LEN)
        )

        if not _is_iterable_but_not_string(text):
            # The input text must be a non-string Iterable
            raise ValueError("Input text must be an iterable and not a string.")
@@ -233,11 +224,6 @@
                    new_tokens.append(sub_word)

            if len(new_tokens) > max_len:
                logging.warn(
                    "Text after tokenization with length {} has been truncated".format(
                        len(new_tokens)
                    )
                )
            new_tokens = new_tokens[:max_len]
            new_labels = new_labels[:max_len]
            input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens)
@@ -269,16 +255,16 @@

        if label_available:
            td = TensorDataset(
                torch.tensor(input_ids_all, dtype=torch.long),
                torch.tensor(input_mask_all, dtype=torch.long),
                torch.tensor(trailing_token_mask_all, dtype=torch.long),
                torch.tensor(label_ids_all, dtype=torch.long),
                torch.LongTensor(input_ids_all),
                torch.LongTensor(input_mask_all),
                torch.LongTensor(trailing_token_mask_all),
                torch.LongTensor(label_ids_all),
            )
        else:
            td = TensorDataset(
                torch.tensor(input_ids_all, dtype=torch.long),
                torch.tensor(input_mask_all, dtype=torch.long),
                torch.tensor(trailing_token_mask_all, dtype=torch.long),
                torch.LongTensor(input_ids_all),
                torch.LongTensor(input_mask_all),
                torch.LongTensor(trailing_token_mask_all),
            )
        return td
@@ -297,16 +283,17 @@ class TokenClassifier(Transformer):
    """

    def __init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."):
        super().__init__(
            model_class=TC_MODEL_CLASS,
            model_name=model_name,
            num_labels=num_labels,
            cache_dir=cache_dir,
        config = AutoConfig.from_pretrained(
            model_name, num_labels=num_labels, cache_dir=cache_dir
        )
        model = AutoModelForTokenClassification.from_pretrained(
            model_name, cache_dir=cache_dir, config=config, output_loading_info=False
        )
        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)

    @staticmethod
    def list_supported_models():
        return list(TC_MODEL_CLASS)
        return supported_models

    def fit(
        self,
|
@ -398,7 +385,9 @@ class TokenClassifier(Transformer):
|
|||
|
||||
# init scheduler
|
||||
scheduler = Transformer.get_default_scheduler(
|
||||
optimizer=self.optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps
|
||||
optimizer=self.optimizer,
|
||||
warmup_steps=warmup_steps,
|
||||
num_training_steps=max_steps,
|
||||
)
|
||||
|
||||
# fine tune
|
||||
|
|
|
@@ -27,6 +27,8 @@ import jsonlines
import torch
from torch.utils.data import TensorDataset
from tqdm import tqdm

from transformers import AutoTokenizer
from transformers.modeling_albert import (
    ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    AlbertForQuestionAnswering,
@@ -51,19 +53,21 @@ from utils_nlp.common.pytorch_utils import (
    move_model_to_device,
    parallelize_model,
)
from utils_nlp.models.transformers.common import (
    MAX_SEQ_LEN,
    TOKENIZER_CLASS,
    Transformer,
)
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, Transformer

MODEL_CLASS = {}
MODEL_CLASS.update({k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
    {k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update(
    {k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update(
    {k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update({k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
    {k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)

# cached files during preprocessing
# these are used in postprocessing to generate the final answer texts
@@ -103,11 +107,18 @@ class QAProcessor:
    """

    def __init__(
        self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir=".",
        self,
        model_name="bert-base-cased",
        to_lower=False,
        custom_tokenize=None,
        cache_dir=".",
    ):
        self.model_name = model_name
        self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
            model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False,
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=to_lower,
            cache_dir=cache_dir,
            output_loading_info=False,
        )
        self.do_lower_case = to_lower
        self.custom_tokenize = custom_tokenize
@@ -218,7 +229,9 @@ class QAProcessor:
            os.makedirs(feature_cache_dir)

        if is_training and not qa_dataset.actual_answer_available:
            raise Exception("answer_start and answer_text must be provided for training data.")
            raise Exception(
                "answer_start and answer_text must be provided for training data."
            )

        if is_training:
            examples_file = os.path.join(feature_cache_dir, CACHED_EXAMPLES_TRAIN_FILE)
@@ -245,7 +258,10 @@ class QAProcessor:
            qa_examples.append(qa_example_cur)

            qa_examples_json.append(
                {"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens}
                {
                    "qa_id": qa_example_cur.qa_id,
                    "doc_tokens": qa_example_cur.doc_tokens,
                }
            )

            features_cur = _create_qa_features(
@@ -289,8 +305,12 @@ class QAProcessor:
        p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.long)

        if is_training:
            start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            start_positions = torch.tensor(
                [f.start_position for f in features], dtype=torch.long
            )
            end_positions = torch.tensor(
                [f.end_position for f in features], dtype=torch.long
            )
            qa_dataset = TensorDataset(
                input_ids,
                input_mask,
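
For reference, TensorDataset simply zips same-length tensors into an indexable dataset; a toy sketch of how the QA feature tensors above fit together (shapes are illustrative, not the real feature layout):

import torch
from torch.utils.data import TensorDataset

input_ids = torch.randint(0, 30000, (4, 16))   # 4 examples, seq length 16
input_mask = torch.ones_like(input_ids)
start_positions = torch.tensor([1, 3, 0, 5])
end_positions = torch.tensor([2, 4, 1, 7])

qa_dataset = TensorDataset(input_ids, input_mask, start_positions, end_positions)
print(len(qa_dataset))         # 4
print(qa_dataset[0][0].shape)  # torch.Size([16])
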
@@ -421,7 +441,9 @@ class QAProcessor:
        return final_answers, answer_probs, nbest_answers


QAResult_ = collections.namedtuple("QAResult", ["unique_id", "start_logits", "end_logits"])
QAResult_ = collections.namedtuple(
    "QAResult", ["unique_id", "start_logits", "end_logits"]
)


# create a wrapper class so that we can add docstrings
@@ -503,15 +525,15 @@ class AnswerExtractor(Transformer):

    """

    def __init__(self, model_name="bert-base-cased", cache_dir=".", load_model_from_dir=None):
        super().__init__(
            model_class=MODEL_CLASS,
            model_name=model_name,
            num_labels=2,
            cache_dir=cache_dir,
            load_model_from_dir=load_model_from_dir,
        )
    def __init__(
        self, model_name="bert-base-cased", cache_dir=".", load_model_from_dir=None
    ):
        model = MODEL_CLASS[model_name].from_pretrained(
            model_name if load_model_from_dir is None else load_model_from_dir,
            cache_dir=cache_dir,
            output_loading_info=False,
        )
        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)

    @staticmethod
    def list_supported_models():
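
The reworked constructor loads weights directly via from_pretrained, using a fine-tuned checkpoint directory when one is given. A sketch of that branch (the directory path is hypothetical):

from transformers.modeling_bert import BertForQuestionAnswering

model_name = "bert-base-cased"
load_model_from_dir = None  # e.g. "./fine_tuned_qa" after fine-tuning and saving
# fall back to the hub checkpoint when no local directory is supplied
model = BertForQuestionAnswering.from_pretrained(
    model_name if load_model_from_dir is None else load_model_from_dir
)
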
@@ -613,7 +635,9 @@ class AnswerExtractor(Transformer):

        # init scheduler
        scheduler = Transformer.get_default_scheduler(
            optimizer=self.optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps
            optimizer=self.optimizer,
            warmup_steps=warmup_steps,
            num_training_steps=max_steps,
        )

        # fine tune
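
get_default_scheduler wraps the standard warmup-then-linear-decay schedule; a minimal sketch assuming transformers' get_linear_schedule_with_warmup helper:

import torch
from transformers import get_linear_schedule_with_warmup

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.AdamW([param], lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=1000
)
for _ in range(5):        # one scheduler step per optimizer step
    optimizer.step()
    scheduler.step()
print(scheduler.get_lr())  # lr ramps up linearly during the warmup phase
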
@@ -668,13 +692,19 @@ class AnswerExtractor(Transformer):

        # parallelize model
        self.model = parallelize_model(
            model=self.model, device=device, num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=-1,
            model=self.model,
            device=device,
            num_gpus=num_gpus,
            gpu_ids=gpu_ids,
            local_rank=-1,
        )

        all_results = []
        for batch in tqdm(test_dataloader, desc="Evaluating", disable=not verbose):
            with torch.no_grad():
                inputs = QAProcessor.get_inputs(batch, device, self.model_name, train_mode=False)
                inputs = QAProcessor.get_inputs(
                    batch, device, self.model_name, train_mode=False
                )
                outputs = self.model(**inputs)
                unique_id_tensor = batch[5]

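
The prediction loop above is the usual no-grad evaluation pattern; a toy equivalent:

import torch

model = torch.nn.Linear(4, 2)
model.eval()
batches = [torch.randn(3, 4) for _ in range(2)]

all_results = []
with torch.no_grad():      # no autograd bookkeeping while scoring
    for batch in batches:
        all_results.append(model(batch))
print(len(all_results))    # 2
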
@@ -865,7 +895,9 @@ def postprocess_bert_answer(
    # Sort by the sum of the start and end logits in descending order,
    # so that the first element is the most probable answer
    prelim_predictions = sorted(
        prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
        prelim_predictions,
        key=lambda x: (x.start_logit + x.end_logit),
        reverse=True,
    )

    seen_predictions = {}
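
A toy illustration of the ranking step: with reverse=True the candidates are sorted by start+end logit sum in descending order, so index 0 is the most probable span.

import collections

Pred = collections.namedtuple("Pred", ["start_logit", "end_logit"])
preds = [Pred(0.1, 0.2), Pred(1.5, 0.9), Pred(0.7, 0.7)]
ranked = sorted(preds, key=lambda x: x.start_logit + x.end_logit, reverse=True)
print(ranked[0])  # Pred(start_logit=1.5, end_logit=0.9)
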
@@ -890,7 +922,9 @@ def postprocess_bert_answer(
            tok_text = " ".join(tok_text.split())
            orig_text = " ".join(orig_tokens)

            final_text = _get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
            final_text = _get_final_text(
                tok_text, orig_text, do_lower_case, verbose_logging
            )
            if final_text in seen_predictions:
                continue

@@ -901,7 +935,9 @@ def postprocess_bert_answer(

        nbest.append(
            _NbestPrediction(
                text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
                text=final_text,
                start_logit=pred.start_logit,
                end_logit=pred.end_logit,
            )
        )
    # if we didn't include the empty option in the n-best, include it
|
@ -916,7 +952,9 @@ def postprocess_bert_answer(
|
|||
# In very rare edge cases we could only have single null prediction.
|
||||
# So we just create a nonce prediction in this case to avoid failure.
|
||||
if len(nbest) == 1:
|
||||
nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
|
||||
nbest.insert(
|
||||
0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)
|
||||
)
|
||||
|
||||
# In very rare edge cases we could have no valid predictions. So we
|
||||
# just create a nonce prediction in this case to avoid failure.
|
||||
|
@@ -956,7 +994,9 @@ def postprocess_bert_answer(
    else:
        # predict "" iff the null score - the score of best non-null > threshold
        score_diff = (
            score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
            score_null
            - best_non_null_entry.start_logit
            - (best_non_null_entry.end_logit)
        )
        scores_diff_json[example["qa_id"]] = score_diff
        if score_diff > null_score_diff_threshold:
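
A worked example of the SQuAD 2.0-style null-answer rule above: the empty answer wins iff the null score minus the best non-null span score exceeds the threshold (numbers are made up):

score_null = 1.0
best_start_logit, best_end_logit = 2.5, 1.0   # best non-null span
null_score_diff_threshold = 0.0

score_diff = score_null - best_start_logit - best_end_logit  # -2.5
prediction = "" if score_diff > null_score_diff_threshold else "the span"
print(prediction)  # "the span": the non-null answer is kept
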
@@ -1129,7 +1169,9 @@ def postprocess_xlnet_answer(
    )

    prelim_predictions = sorted(
        prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
        prelim_predictions,
        key=lambda x: (x.start_logit + x.end_logit),
        reverse=True,
    )

    seen_predictions = {}
@@ -1172,7 +1214,9 @@ def postprocess_xlnet_answer(

        nbest.append(
            _NbestPrediction(
                text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
                text=final_text,
                start_logit=pred.start_logit,
                end_logit=pred.end_logit,
            )
        )

@@ -1300,7 +1344,9 @@ def _create_qa_example(qa_input, is_training):

    if _is_iterable_but_not_string(a_start):
        if not _is_iterable_but_not_string(a_text):
            raise Exception("The answer text must be a list when answer start is a list.")
            raise Exception(
                "The answer text must be a list when answer start is a list."
            )
        if len(a_start) != 1 and is_training and not impossible:
            raise Exception("For training, each question should have exactly 1 answer.")
        a_start = a_start[0]
@@ -1323,7 +1369,9 @@ def _create_qa_example(qa_input, is_training):
            cleaned_answer_text = " ".join(whitespace_tokenize(a_text))
            if actual_text.find(cleaned_answer_text) == -1:
                logger.warning(
                    "Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text,
                    "Could not find answer: '%s' vs. '%s'",
                    actual_text,
                    cleaned_answer_text,
                )
                return
        else:
@@ -1549,7 +1597,10 @@ def _create_qa_features(
        else:
            tok_end_position = len(all_doc_tokens) - 1
        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, example.orig_answer_text,
            all_doc_tokens,
            tok_start_position,
            tok_end_position,
            example.orig_answer_text,
        )

    # The -3 accounts for [CLS], [SEP] and [SEP]
@@ -1583,7 +1634,8 @@ def _create_qa_features(

    # p_mask: mask with 1 for tokens that cannot be in the answer
    # (0 for tokens which can be in an answer)
    # Original TF implem also keeps the classification token (set to 0) (not sure why...)
    # Original TF implem also keeps the classification token (set to 0)
    # (not sure why...)
    # TODO: Should we set p_mask = 1 for cls token?
    p_mask = []

@@ -1612,9 +1664,11 @@ def _create_qa_features(
            split_token_index = doc_span.start + i
            token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]

            ## TODO: maybe this can be improved to compute
            # TODO: maybe this can be improved to compute
            # is_max_context for each token only once.
            is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index)
            is_max_context = _check_is_max_context(
                doc_spans, doc_span_index, split_token_index
            )
            token_is_max_context[len(tokens)] = is_max_context
            tokens.append(all_doc_tokens[split_token_index])
            if model_type == "xlnet":
@@ -1720,10 +1774,13 @@ def _create_qa_features(
# -------------------------------------------------------------------------------------------------
# Post processing helper functions
_PrelimPrediction = collections.namedtuple(
    "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"],
    "PrelimPrediction",
    ["feature_index", "start_index", "end_index", "start_logit", "end_logit"],
)

_NbestPrediction = collections.namedtuple("NbestPrediction", ["text", "start_logit", "end_logit"])
_NbestPrediction = collections.namedtuple(
    "NbestPrediction", ["text", "start_logit", "end_logit"]
)


def _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
@@ -1786,7 +1843,9 @@ def _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info(
                "Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text,
                "Length not equal after stripping spaces: '%s' vs '%s'",
                orig_ns_text,
                tok_ns_text,
            )
            return orig_text

@@ -2,54 +2,22 @@
# Licensed under the MIT License.

import numpy as np
from transformers.modeling_albert import (
    ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    AlbertForSequenceClassification,
)
from transformers.modeling_bert import (
    BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    BertForSequenceClassification,
)
from transformers.modeling_distilbert import (
    DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    DistilBertForSequenceClassification,
)
from transformers.modeling_roberta import (
    ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
    RobertaForSequenceClassification,
)
from transformers.modeling_xlnet import (
    XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
    XLNetForSequenceClassification,
from transformers import (
    MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

from utils_nlp.common.pytorch_utils import compute_training_steps
from utils_nlp.models.transformers.common import (
    MAX_SEQ_LEN,
    TOKENIZER_CLASS,
    Transformer,
)
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, Transformer
from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet

MODEL_CLASS = {}
MODEL_CLASS.update(
    {k: BertForSequenceClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update(
    {k: RobertaForSequenceClassification for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update(
    {k: XLNetForSequenceClassification for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update(
    {
        k: DistilBertForSequenceClassification
        for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
    }
)
MODEL_CLASS.update(
    {k: AlbertForSequenceClassification for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
supported_models = [
    list(x.pretrained_config_archive_map)
    for x in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
]
supported_models = sorted([x for y in supported_models for x in y])


class Processor:
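
The new supported_models construction collects one list of checkpoint names per config class in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING (pretrained_config_archive_map is a transformers 2.x-era config attribute) and flattens it. The double comprehension works like this toy version:

groups = [["bert-base-cased", "bert-large-cased"], ["roberta-base"]]
# flatten the list of lists, then sort the names
flat = sorted(x for y in groups for x in y)
print(flat)  # ['bert-base-cased', 'bert-large-cased', 'roberta-base']
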
@@ -68,7 +36,10 @@ class Processor:
    """

    def __init__(self, model_name="bert-base-cased", to_lower=False, cache_dir="."):
        self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
        self.model_name = model_name
        self.to_lower = to_lower
        self.cache_dir = cache_dir
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=to_lower,
            cache_dir=cache_dir,
@@ -84,7 +55,7 @@ class Processor:
            batch (tuple): A tuple containing input ids, attention mask,
                segment ids, and labels tensors.
            device (torch.device): A PyTorch device.
            model_name (bool, optional): Model name used to format the inputs.
            model_name (str): Model name used to format the inputs.
            train_mode (bool, optional): Training mode flag.
                Defaults to True.

@@ -93,13 +64,7 @@ class Processor:
            Labels are only returned when train_mode is True.
        """
        batch = tuple(t.to(device) for t in batch)
        if model_name.split("-")[0] in [
            "bert",
            "xlnet",
            "roberta",
            "distilbert",
            "albert",
        ]:
        if model_name in supported_models:
            if train_mode:
                inputs = {
                    "input_ids": batch[0],
@@ -109,8 +74,8 @@ class Processor:
            else:
                inputs = {"input_ids": batch[0], "attention_mask": batch[1]}

            # distilbert doesn't support segment ids
            if model_name.split("-")[0] not in ["distilbert"]:
            # distilbert, bart don't support segment ids
            if model_name.split("-")[0] not in ["distilbert", "bart"]:
                inputs["token_type_ids"] = batch[2]

            return inputs
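
A toy version of the segment-id gating above: DistilBERT and BART forwards do not accept token_type_ids, so the key is added only for other model families.

import torch

batch = tuple(torch.zeros(2, 8, dtype=torch.long) for _ in range(3))
for model_name in ["bert-base-cased", "distilbert-base-uncased"]:
    inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
    if model_name.split("-")[0] not in ["distilbert", "bart"]:
        inputs["token_type_ids"] = batch[2]   # only for segment-aware models
    print(model_name, sorted(inputs))
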
@@ -244,16 +209,17 @@ class Processor:

class SequenceClassifier(Transformer):
    def __init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."):
        super().__init__(
            model_class=MODEL_CLASS,
            model_name=model_name,
            num_labels=num_labels,
            cache_dir=cache_dir,
        config = AutoConfig.from_pretrained(
            model_name, num_labels=num_labels, cache_dir=cache_dir
        )
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name, cache_dir=cache_dir, config=config, output_loading_info=False
        )
        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)

    @staticmethod
    def list_supported_models():
        return list(MODEL_CLASS)
        return supported_models

    def fit(
        self,
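
The new constructor follows the standard Auto* recipe: AutoConfig carries num_labels, and AutoModelForSequenceClassification instantiates the matching architecture with a classification head of that size. A minimal sketch:

from transformers import AutoConfig, AutoModelForSequenceClassification

config = AutoConfig.from_pretrained("bert-base-cased", num_labels=3)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased", config=config
)
print(model.num_labels)  # 3
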
@@ -345,7 +311,9 @@ class SequenceClassifier(Transformer):

        # init scheduler
        scheduler = Transformer.get_default_scheduler(
            optimizer=self.optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps
            optimizer=self.optimizer,
            warmup_steps=warmup_steps,
            num_training_steps=max_steps,
        )

        # fine tune