Merge pull request #529 from microsoft/bleik/optim-patch

bleik/common transformers utils update
Said Bleik 2020-01-24 20:06:37 -05:00 committed by GitHub
Parents abeb88acb2 6b35c4917a
Commit 7dcdc32399
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
28 changed files with 1110 additions and 1224 deletions

View file

@ -233,7 +233,7 @@
"source": [
"with Timer() as t:\n",
" preds = model.predict(\n",
" eval_dataloader=test_dataloader,\n",
" test_dataloader=test_dataloader,\n",
" num_gpus=None,\n",
" verbose=True\n",
" )\n",

View file

@ -32,6 +32,7 @@
"from sklearn.preprocessing import LabelEncoder\n",
"from tqdm import tqdm\n",
"from utils_nlp.common.timer import Timer\n",
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
"from utils_nlp.dataset.multinli import load_pandas_df\n",
"from utils_nlp.models.transformers.sequence_classification import (\n",
" Processor, SequenceClassifier)"
@ -93,7 +94,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 222k/222k [01:25<00:00, 2.60kKB/s] \n"
"100%|██████████| 222k/222k [01:20<00:00, 2.74kKB/s] \n"
]
}
],
@ -196,7 +197,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/media/bleik2/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
" FutureWarning)\n"
]
}
@ -232,11 +233,11 @@
{
"data": {
"text/plain": [
"telephone 1055\n",
"slate 1003\n",
"travel 961\n",
"fiction 952\n",
"government 938\n",
"telephone 1043\n",
"slate 989\n",
"fiction 968\n",
"travel 964\n",
"government 945\n",
"Name: genre, dtype: int64"
]
},
@ -385,32 +386,108 @@
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>roberta-base</td>\n",
" <td>bert-base-japanese</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>roberta-large</td>\n",
" <td>bert-base-japanese-whole-word-masking</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>roberta-large-mnli</td>\n",
" <td>bert-base-japanese-char</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>xlnet-base-cased</td>\n",
" <td>bert-base-japanese-char-whole-word-masking</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>xlnet-large-cased</td>\n",
" <td>bert-base-finnish-cased-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>distilbert-base-uncased</td>\n",
" <td>bert-base-finnish-uncased-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>roberta-base</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>roberta-large</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>roberta-large-mnli</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>distilroberta-base</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>roberta-base-openai-detector</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>roberta-large-openai-detector</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>xlnet-base-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>xlnet-large-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>distilbert-base-uncased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>distilbert-base-uncased-distilled-squad</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>distilbert-base-german-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>distilbert-base-multilingual-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>albert-base-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>albert-large-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>albert-xlarge-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>albert-xxlarge-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>albert-base-v2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>albert-large-v2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>albert-xlarge-v2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>albert-xxlarge-v2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
@ -432,13 +509,32 @@
"12 bert-base-cased-finetuned-mrpc\n",
"13 bert-base-german-dbmdz-cased\n",
"14 bert-base-german-dbmdz-uncased\n",
"15 roberta-base\n",
"16 roberta-large\n",
"17 roberta-large-mnli\n",
"18 xlnet-base-cased\n",
"19 xlnet-large-cased\n",
"20 distilbert-base-uncased\n",
"21 distilbert-base-uncased-distilled-squad"
"15 bert-base-japanese\n",
"16 bert-base-japanese-whole-word-masking\n",
"17 bert-base-japanese-char\n",
"18 bert-base-japanese-char-whole-word-masking\n",
"19 bert-base-finnish-cased-v1\n",
"20 bert-base-finnish-uncased-v1\n",
"21 roberta-base\n",
"22 roberta-large\n",
"23 roberta-large-mnli\n",
"24 distilroberta-base\n",
"25 roberta-base-openai-detector\n",
"26 roberta-large-openai-detector\n",
"27 xlnet-base-cased\n",
"28 xlnet-large-cased\n",
"29 distilbert-base-uncased\n",
"30 distilbert-base-uncased-distilled-squad\n",
"31 distilbert-base-german-cased\n",
"32 distilbert-base-multilingual-cased\n",
"33 albert-base-v1\n",
"34 albert-large-v1\n",
"35 albert-xlarge-v1\n",
"36 albert-xxlarge-v1\n",
"37 albert-base-v2\n",
"38 albert-large-v2\n",
"39 albert-xlarge-v2\n",
"40 albert-xxlarge-v2"
]
},
"execution_count": 10,
@ -492,18 +588,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 231508/231508 [00:00<00:00, 15545441.79B/s]\n",
"100%|██████████| 492/492 [00:00<00:00, 560455.61B/s]\n",
"100%|██████████| 267967963/267967963 [00:04<00:00, 61255588.46B/s]\n",
"/media/bleik2/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
" warnings.warn('Was asked to gather along dimension 0, but all '\n",
"100%|██████████| 898823/898823 [00:00<00:00, 23932308.55B/s]\n",
"100%|██████████| 456318/456318 [00:00<00:00, 23321916.66B/s]\n",
"100%|██████████| 473/473 [00:00<00:00, 477015.10B/s]\n",
"100%|██████████| 501200538/501200538 [00:07<00:00, 64332558.45B/s]\n",
"100%|██████████| 798011/798011 [00:00<00:00, 25002433.16B/s]\n",
"100%|██████████| 641/641 [00:00<00:00, 695974.34B/s]\n",
"100%|██████████| 467042463/467042463 [00:08<00:00, 55154509.21B/s]\n"
"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
" warnings.warn('Was asked to gather along dimension 0, but all '\n"
]
}
],
@ -518,11 +604,17 @@
" to_lower=model_name.endswith(\"uncased\"),\n",
" cache_dir=CACHE_DIR,\n",
" )\n",
" train_dataloader = processor.create_dataloader_from_df(\n",
" df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n",
" train_dataset = processor.dataset_from_dataframe(\n",
" df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN\n",
" )\n",
" test_dataloader = processor.create_dataloader_from_df(\n",
" df_test, TEXT_COL, LABEL_COL, max_len=MAX_LEN, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n",
" train_dataloader = dataloader_from_dataset(\n",
" train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n",
" )\n",
" test_dataset = processor.dataset_from_dataframe(\n",
" df_test, TEXT_COL, LABEL_COL, max_len=MAX_LEN\n",
" )\n",
" test_dataloader = dataloader_from_dataset(\n",
" test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n",
" )\n",
"\n",
" # fine-tune\n",
@ -531,17 +623,12 @@
" )\n",
" with Timer() as t:\n",
" classifier.fit(\n",
" train_dataloader,\n",
" num_epochs=NUM_EPOCHS,\n",
" num_gpus=NUM_GPUS,\n",
" verbose=False,\n",
" train_dataloader, num_epochs=NUM_EPOCHS, num_gpus=NUM_GPUS, verbose=False,\n",
" )\n",
" train_time = t.interval / 3600\n",
"\n",
" # predict\n",
" preds = classifier.predict(\n",
" test_dataloader, num_gpus=NUM_GPUS, verbose=False\n",
" )\n",
" preds = classifier.predict(test_dataloader, num_gpus=NUM_GPUS, verbose=False)\n",
"\n",
" # eval\n",
" accuracy = accuracy_score(df_test[LABEL_COL], preds)\n",
@ -600,21 +687,21 @@
" <tbody>\n",
" <tr>\n",
" <th>accuracy</th>\n",
" <td>0.895477</td>\n",
" <td>0.879584</td>\n",
" <td>0.894866</td>\n",
" <td>0.889364</td>\n",
" <td>0.885697</td>\n",
" <td>0.886308</td>\n",
" </tr>\n",
" <tr>\n",
" <th>f1-score</th>\n",
" <td>0.896656</td>\n",
" <td>0.881218</td>\n",
" <td>0.896108</td>\n",
" <td>0.885225</td>\n",
" <td>0.880926</td>\n",
" <td>0.881819</td>\n",
" </tr>\n",
" <tr>\n",
" <th>time(hrs)</th>\n",
" <td>0.021865</td>\n",
" <td>0.035351</td>\n",
" <td>0.046295</td>\n",
" <td>0.023326</td>\n",
" <td>0.044209</td>\n",
" <td>0.052801</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@ -622,9 +709,9 @@
],
"text/plain": [
" distilbert-base-uncased roberta-base xlnet-base-cased\n",
"accuracy 0.895477 0.879584 0.894866\n",
"f1-score 0.896656 0.881218 0.896108\n",
"time(hrs) 0.021865 0.035351 0.046295"
"accuracy 0.889364 0.885697 0.886308\n",
"f1-score 0.885225 0.880926 0.881819\n",
"time(hrs) 0.023326 0.044209 0.052801"
]
},
"execution_count": 13,
@ -645,7 +732,7 @@
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.8899755501222494,
"data": 0.887123064384678,
"encoder": "json",
"name": "accuracy",
"version": 1
@ -663,7 +750,7 @@
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.8913273009038569,
"data": 0.8826569624491233,
"encoder": "json",
"name": "f1",
"version": 1
@ -688,9 +775,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "nlp_gpu",
"display_name": "Python 3.6.8 64-bit ('nlp_gpu': conda)",
"language": "python",
"name": "nlp_gpu"
"name": "python36864bitnlpgpucondaa579511bcea84c65877ff3dca4205921"
},
"language_info": {
"codemirror_mode": {

View file

@ -13,7 +13,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -69,7 +69,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {
"tags": [
"parameters"
@ -183,32 +183,108 @@
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>roberta-base</td>\n",
" <td>bert-base-japanese</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>roberta-large</td>\n",
" <td>bert-base-japanese-whole-word-masking</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>roberta-large-mnli</td>\n",
" <td>bert-base-japanese-char</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>xlnet-base-cased</td>\n",
" <td>bert-base-japanese-char-whole-word-masking</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>xlnet-large-cased</td>\n",
" <td>bert-base-finnish-cased-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>distilbert-base-uncased</td>\n",
" <td>bert-base-finnish-uncased-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>roberta-base</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>roberta-large</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>roberta-large-mnli</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>distilroberta-base</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>roberta-base-openai-detector</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>roberta-large-openai-detector</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>xlnet-base-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>xlnet-large-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>distilbert-base-uncased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>distilbert-base-uncased-distilled-squad</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>distilbert-base-german-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>distilbert-base-multilingual-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>albert-base-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>albert-large-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>albert-xlarge-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>albert-xxlarge-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>albert-base-v2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>albert-large-v2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>albert-xlarge-v2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>albert-xxlarge-v2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
@ -230,13 +306,32 @@
"12 bert-base-cased-finetuned-mrpc\n",
"13 bert-base-german-dbmdz-cased\n",
"14 bert-base-german-dbmdz-uncased\n",
"15 roberta-base\n",
"16 roberta-large\n",
"17 roberta-large-mnli\n",
"18 xlnet-base-cased\n",
"19 xlnet-large-cased\n",
"20 distilbert-base-uncased\n",
"21 distilbert-base-uncased-distilled-squad"
"15 bert-base-japanese\n",
"16 bert-base-japanese-whole-word-masking\n",
"17 bert-base-japanese-char\n",
"18 bert-base-japanese-char-whole-word-masking\n",
"19 bert-base-finnish-cased-v1\n",
"20 bert-base-finnish-uncased-v1\n",
"21 roberta-base\n",
"22 roberta-large\n",
"23 roberta-large-mnli\n",
"24 distilroberta-base\n",
"25 roberta-base-openai-detector\n",
"26 roberta-large-openai-detector\n",
"27 xlnet-base-cased\n",
"28 xlnet-large-cased\n",
"29 distilbert-base-uncased\n",
"30 distilbert-base-uncased-distilled-squad\n",
"31 distilbert-base-german-cased\n",
"32 distilbert-base-multilingual-cased\n",
"33 albert-base-v1\n",
"34 albert-large-v1\n",
"35 albert-xlarge-v1\n",
"36 albert-xxlarge-v1\n",
"37 albert-base-v2\n",
"38 albert-large-v2\n",
"39 albert-xlarge-v2\n",
"40 albert-xxlarge-v2"
]
},
"execution_count": 3,
@ -264,7 +359,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -281,7 +376,7 @@
" 'num_train_epochs': 5,\n",
" 'num_gpus': 2,\n",
" 'batch_size': 16,\n",
" 'verbose': True,\n",
" 'verbose': False,\n",
" 'load_dataset_func': None,\n",
" 'get_labels_func': None\n",
"}\n",
@ -325,9 +420,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 80.1k/80.1k [00:02<00:00, 30.8kKB/s]\n",
"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
" FutureWarning)\n"
]
}
],
"source": [
"train_dataloader, test_dataloader, label_encoder, test_labels = CONFIG['load_dataset_func'](\n",
" local_path=CONFIG['local_path'],\n",
@ -354,11 +459,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
" warnings.warn('Was asked to gather along dimension 0, but all '\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training time : 0.190 hrs\n"
]
}
],
"source": [
"model = SequenceClassifier(\n",
" model_name=CONFIG['model_name'],\n",
@ -390,9 +511,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prediction time : 0.021 hrs\n"
]
}
],
"source": [
"with Timer() as t:\n",
" preds = model.predict(\n",
@ -422,11 +551,11 @@
"text": [
" precision recall f1-score support\n",
"\n",
" culture 0.89 0.89 0.89 843\n",
" diverse 0.99 0.99 0.99 1738\n",
" economy 0.96 0.96 0.96 661\n",
" politics 0.94 0.94 0.94 530\n",
" sports 0.87 0.87 0.87 580\n",
" culture 0.93 0.94 0.93 548\n",
" diverse 0.94 0.94 0.94 640\n",
" economy 0.90 0.88 0.89 570\n",
" politics 0.87 0.88 0.88 809\n",
" sports 0.99 0.98 0.99 1785\n",
"\n",
" micro avg 0.94 0.94 0.94 4352\n",
" macro avg 0.93 0.93 0.93 4352\n",
@ -449,9 +578,64 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.94,
"encoder": "json",
"name": "precision",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "precision"
}
},
"output_type": "display_data"
},
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.94,
"encoder": "json",
"name": "recall",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "recall"
}
},
"output_type": "display_data"
},
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.94,
"encoder": "json",
"name": "f1",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "f1"
}
},
"output_type": "display_data"
}
],
"source": [
"# for testing\n",
"report_splits = report.split('\\n')[-2].split()\n",
@ -463,11 +647,10 @@
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.6.8 64-bit ('nlp_gpu': conda)",
"language": "python",
"name": "python3"
"name": "python36864bitnlpgpucondaa579511bcea84c65877ff3dca4205921"
},
"language_info": {
"codemirror_mode": {

View file

@ -1,14 +1,10 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import json
import shutil
import pytest
import papermill as pm
import pytest
import scrapbook as sb
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
from tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK
ABS_TOL = 0.02
@ -31,13 +27,10 @@ def test_extractive_summarization_cnndm_transformers(notebooks, tmp):
CACHE_DIR=tmp,
BATCH_SIZE=3000,
REPORT_EVERY=50,
MAX_STEPS=1e3,
MAX_STEPS=1000,
WARMUP_STEPS=5e2,
MODEL_NAME="distilbert-base-uncased",
),
)
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
print(result)
assert pytest.approx(result["rouge_2_f_score"], 0.1, abs=ABS_TOL)

View file

@ -33,8 +33,8 @@ def test_tc_mnli_transformers(notebooks, tmp):
),
)
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
assert pytest.approx(result["accuracy"], 0.89, abs=ABS_TOL)
assert pytest.approx(result["f1"], 0.89, abs=ABS_TOL)
assert pytest.approx(result["accuracy"], 0.885, abs=ABS_TOL)
assert pytest.approx(result["f1"], 0.885, abs=ABS_TOL)
@pytest.mark.integration

View file

@ -9,4 +9,3 @@ import torch
@pytest.mark.gpu
def test_machine_is_gpu_machine():
assert torch.cuda.is_available() is True

View file

@ -1,79 +0,0 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import pytest
from utils_nlp.models.bert.token_classification import (
BERTTokenClassifier,
postprocess_token_labels,
)
def test_token_classifier_num_labels():
with pytest.raises(ValueError):
BERTTokenClassifier(num_labels=1)
def test_token_classifier_fit_predict(tmp_path, ner_test_data):
token_classifier = BERTTokenClassifier(num_labels=6, cache_dir=tmp_path)
# test fit, no warmup
token_classifier.fit(
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
input_mask=ner_test_data["INPUT_MASK"],
labels=ner_test_data["INPUT_LABEL_IDS"],
)
# test fit, with warmup
token_classifier.fit(
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
input_mask=ner_test_data["INPUT_MASK"],
labels=ner_test_data["INPUT_LABEL_IDS"],
warmup_proportion=0.1,
)
# test predict, no labels
token_classifier.predict(
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
input_mask=ner_test_data["INPUT_MASK"],
)
# test predict, with labels
token_classifier.predict(
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
input_mask=ner_test_data["INPUT_MASK"],
labels=ner_test_data["INPUT_LABEL_IDS"],
)
# test output probabilities
predictions = token_classifier.predict(
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
input_mask=ner_test_data["INPUT_MASK"],
labels=ner_test_data["INPUT_LABEL_IDS"],
probabilities=True,
)
assert len(predictions.classes) == predictions.probabilities.shape[0]
def test_postprocess_token_labels(ner_test_data):
labels_no_padding = postprocess_token_labels(
labels=ner_test_data["PREDICTED_LABELS"],
input_mask=ner_test_data["INPUT_MASK"],
label_map=ner_test_data["LABEL_MAP"],
)
assert labels_no_padding == ner_test_data["EXPECTED_TOKENS_NO_PADDING"]
def test_postprocess_token_labels_remove_trailing(ner_test_data):
labels_no_padding_no_trailing = postprocess_token_labels(
labels=ner_test_data["PREDICTED_LABELS"],
input_mask=ner_test_data["INPUT_MASK"],
label_map=ner_test_data["LABEL_MAP"],
remove_trailing_word_pieces=True,
trailing_token_mask=ner_test_data["TRAILING_TOKEN_MASK"],
)
assert (
labels_no_padding_no_trailing
== ner_test_data["EXPECTED_TOKENS_NO_PADDING_NO_TRAILING"]
)

View file

@ -1,14 +1,15 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""PyTorch utils tests."""
import pytest
import torch
import torch.nn as nn
from torch.nn.parallel.data_parallel import DataParallel
from torch.nn.modules.container import Sequential
from torch.nn.parallel.data_parallel import DataParallel
from utils_nlp.common.pytorch_utils import get_device, move_to_device
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
@pytest.fixture
@ -55,49 +56,47 @@ def test_get_device_local_rank():
def test_move_to_device_cpu(model):
# test when device.type="cpu"
model_cpu = move_to_device(model, torch.device("cpu"))
model_cpu = move_model_to_device(model, torch.device("cpu"))
assert isinstance(model_cpu, nn.modules.container.Sequential)
def test_move_to_device_cpu_parallelized(model):
# test when input model is parallelized
model_parallelized = nn.DataParallel(model)
model_parallelized_output = move_to_device(model_parallelized, torch.device("cpu"))
model_parallelized_output = move_model_to_device(model_parallelized, torch.device("cpu"))
assert isinstance(model_parallelized_output, nn.modules.container.Sequential)
def test_move_to_device_exception_not_torch_device(model):
# test when device is not torch.device
with pytest.raises(ValueError):
move_to_device(model, "abc")
move_model_to_device(model, "abc")
def test_move_to_device_exception_wrong_type(model):
# test when device.type is not "cuda" or "cpu"
with pytest.raises(Exception):
move_to_device(model, torch.device("opengl"))
move_model_to_device(model, torch.device("opengl"))
@pytest.mark.skipif(
torch.cuda.is_available(), reason="Skip if we are executing the cpu tests on a gpu machine"
)
@pytest.mark.skipif(torch.cuda.is_available(), reason="Skip if we are executing the cpu tests on a gpu machine")
def test_move_to_device_exception_gpu_model_on_cpu_machine(model):
# test when the model is moved to a gpu but it is a cpu machine
with pytest.raises(Exception):
move_to_device(model, torch.device("cuda"))
move_model_to_device(model, torch.device("cuda"))
@pytest.mark.gpu
def test_move_to_device_exception_cuda_zero_gpus(model):
# test when device.type is cuda, but num_gpus is 0
with pytest.raises(ValueError):
move_to_device(model, torch.device("cuda"), num_gpus=0)
move_model_to_device(model, torch.device("cuda"), num_gpus=0)
@pytest.mark.gpu
def test_move_to_device_gpu(model):
# test when device.type="cuda"
model_cuda = move_to_device(model, torch.device("cuda"))
model_cuda = move_model_to_device(model, torch.device("cuda"))
num_cuda_devices = torch.cuda.device_count()
if num_cuda_devices > 1:
@ -105,18 +104,16 @@ def test_move_to_device_gpu(model):
else:
assert isinstance(model_cuda, Sequential)
model_cuda_1_gpu = move_to_device(model, torch.device("cuda"), num_gpus=1)
model_cuda_1_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=1)
assert isinstance(model_cuda_1_gpu, Sequential)
model_cuda_1_more_gpu = move_to_device(
model, torch.device("cuda"), num_gpus=num_cuda_devices + 1
)
model_cuda_1_more_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices + 1)
if num_cuda_devices > 1:
assert isinstance(model_cuda_1_more_gpu, DataParallel)
else:
assert isinstance(model_cuda_1_more_gpu, Sequential)
model_cuda_same_gpu = move_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices)
model_cuda_same_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices)
if num_cuda_devices > 1:
assert isinstance(model_cuda_same_gpu, DataParallel)
else:

View file

@ -1,14 +1,12 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import nltk
nltk.download("punkt")
from nltk import tokenize
import pytest
import os
import shutil
import nltk
nltk.download("punkt")
import pytest
from nltk import tokenize
from utils_nlp.models.transformers.datasets import SummarizationDataset
from utils_nlp.models.transformers.extractive_summarization import (
@ -17,6 +15,9 @@ from utils_nlp.models.transformers.extractive_summarization import (
ExtSumProcessor,
)
# @pytest.fixture()
def source_data():
return (
@ -48,18 +49,10 @@ def data_to_file(tmp_module):
f.write(target)
f.close()
train_dataset = SummarizationDataset(
source_file,
target_file,
[tokenize.sent_tokenize],
[tokenize.sent_tokenize],
nltk.word_tokenize,
source_file, target_file, [tokenize.sent_tokenize], [tokenize.sent_tokenize], nltk.word_tokenize,
)
test_dataset = SummarizationDataset(
source_file,
target_file,
[tokenize.sent_tokenize],
[tokenize.sent_tokenize],
nltk.word_tokenize,
source_file, target_file, [tokenize.sent_tokenize], [tokenize.sent_tokenize], nltk.word_tokenize,
)
processor = ExtSumProcessor(
@ -70,20 +63,12 @@ def data_to_file(tmp_module):
min_nsents=0,
min_src_ntokens=1,
)
ext_sum_train = processor.preprocess(
train_dataset, train_dataset.get_target(), oracle_mode="greedy"
)
ext_sum_test = processor.preprocess(
test_dataset, test_dataset.get_target(), oracle_mode="greedy"
)
ext_sum_train = processor.preprocess(train_dataset, train_dataset.get_target(), oracle_mode="greedy")
ext_sum_test = processor.preprocess(test_dataset, test_dataset.get_target(), oracle_mode="greedy")
save_path = os.path.join(tmp_module, "processed")
train_files = ExtSumProcessedData.save_data(
ext_sum_train, is_test=False, save_path=save_path, chunk_size=2000
)
test_files = ExtSumProcessedData.save_data(
ext_sum_test, is_test=True, save_path=save_path, chunk_size=2000
)
train_files = ExtSumProcessedData.save_data(ext_sum_train, is_test=False, save_path=save_path, chunk_size=2000)
test_files = ExtSumProcessedData.save_data(ext_sum_test, is_test=True, save_path=save_path, chunk_size=2000)
print(train_files)
print(test_files)
assert os.path.exists(train_files[0])
@ -96,10 +81,10 @@ def test_bert_training(data_to_file, tmp_module):
CACHE_DIR = tmp_module
ENCODER = "transformer"
BATCH_SIZE = 200
BATCH_SIZE = 128
LEARNING_RATE = 2e-3
REPORT_EVERY = 100
MAX_STEPS = 5e2
REPORT_EVERY = 50
MAX_STEPS = 2e2
WARMUP_STEPS = 1e2
DATA_SAVED_PATH = data_to_file
result_base_path = "./results"

View file

@ -1,18 +1,20 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import pytest
import os
import pytest
import torch
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.models.transformers.datasets import QADataset
from utils_nlp.models.transformers.question_answering import (
QAProcessor,
AnswerExtractor,
CACHED_EXAMPLES_TEST_FILE,
CACHED_FEATURES_TEST_FILE,
AnswerExtractor,
QAProcessor,
)
import torch
NUM_GPUS = max(1, torch.cuda.device_count())
BATCH_SIZE = 8
@ -109,9 +111,7 @@ def qa_test_data(qa_test_df, tmp_module):
feature_cache_dir=tmp_module,
)
qa_processor_distilbert = QAProcessor(
model_name="distilbert-base-uncased", cache_dir=tmp_module
)
qa_processor_distilbert = QAProcessor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
train_features_distilbert = qa_processor_distilbert.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
@ -153,15 +153,9 @@ def qa_test_data(qa_test_df, tmp_module):
def test_QAProcessor(qa_test_data, tmp_module):
for model_name in ["bert-base-cased", "xlnet-base-cased", "distilbert-base-uncased"]:
qa_processor = QAProcessor(model_name=model_name, cache_dir=tmp_module)
qa_processor.preprocess(
qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module
)
qa_processor.preprocess(
qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module
)
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module
)
qa_processor.preprocess(qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module)
# test unsupported model type
with pytest.raises(ValueError):
@ -169,51 +163,49 @@ def test_QAProcessor(qa_test_data, tmp_module):
# test training data has no ground truth exception
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module
)
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module)
# test when answer start is a list, but answer text is not
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["train_dataset_start_text_mismatch"],
is_training=True,
feature_cache_dir=tmp_module,
qa_test_data["train_dataset_start_text_mismatch"], is_training=True, feature_cache_dir=tmp_module,
)
# test when training data has multiple answers
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["train_dataset_multi_answers"],
is_training=True,
feature_cache_dir=tmp_module,
qa_test_data["train_dataset_multi_answers"], is_training=True, feature_cache_dir=tmp_module,
)
def test_AnswerExtractor(qa_test_data, tmp_module):
# test bert
# bert
qa_extractor_bert = AnswerExtractor(cache_dir=tmp_module)
qa_extractor_bert.fit(qa_test_data["train_features_bert"], cache_model=True)
train_loader_bert = dataloader_from_dataset(qa_test_data["train_features_bert"])
test_loader_bert = dataloader_from_dataset(qa_test_data["test_features_bert"], shuffle=False)
qa_extractor_bert.fit(train_loader_bert, verbose=False, cache_model=True)
# test saving fine-tuned model
model_output_dir = os.path.join(tmp_module, "fine_tuned")
assert os.path.exists(os.path.join(model_output_dir, "pytorch_model.bin"))
assert os.path.exists(os.path.join(model_output_dir, "config.json"))
qa_extractor_from_cache = AnswerExtractor(
cache_dir=tmp_module, load_model_from_dir=model_output_dir
)
qa_extractor_from_cache.predict(qa_test_data["test_features_bert"])
qa_extractor_from_cache = AnswerExtractor(cache_dir=tmp_module, load_model_from_dir=model_output_dir)
qa_extractor_from_cache.predict(test_loader_bert, verbose=False)
# xlnet
train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_xlnet"])
test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_xlnet"], shuffle=False)
qa_extractor_xlnet = AnswerExtractor(model_name="xlnet-base-cased", cache_dir=tmp_module)
qa_extractor_xlnet.fit(qa_test_data["train_features_xlnet"], cache_model=False)
qa_extractor_xlnet.predict(qa_test_data["test_features_xlnet"])
qa_extractor_xlnet.fit(train_loader_xlnet, verbose=False, cache_model=False)
qa_extractor_xlnet.predict(test_loader_xlnet, verbose=False)
qa_extractor_distilbert = AnswerExtractor(
model_name="distilbert-base-uncased", cache_dir=tmp_module
)
qa_extractor_distilbert.fit(qa_test_data["train_features_distilbert"], cache_model=False)
qa_extractor_distilbert.predict(qa_test_data["test_features_distilbert"])
# distilbert
train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_distilbert"])
test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_distilbert"], shuffle=False)
qa_extractor_distilbert = AnswerExtractor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
qa_extractor_distilbert.fit(train_loader_xlnet, verbose=False, cache_model=False)
qa_extractor_distilbert.predict(test_loader_xlnet, verbose=False)
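The rewritten test shows that AnswerExtractor.fit and AnswerExtractor.predict now consume DataLoaders built with dataloader_from_dataset rather than raw feature datasets. A condensed sketch of the new call pattern, reusing names from the test above (cache_dir stands in for a real directory):

train_loader = dataloader_from_dataset(train_features)
test_loader = dataloader_from_dataset(test_features, shuffle=False)
extractor = AnswerExtractor(cache_dir=cache_dir)
extractor.fit(train_loader, verbose=False, cache_model=False)
predictions = extractor.predict(test_loader, verbose=False)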
def test_postprocess_bert_answer(qa_test_data, tmp_module):
@ -226,8 +218,9 @@ def test_postprocess_bert_answer(qa_test_data, tmp_module):
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_loader = dataloader_from_dataset(test_features, shuffle=False)
qa_extractor = AnswerExtractor(cache_dir=tmp_module)
predictions = qa_extractor.predict(test_features)
predictions = qa_extractor.predict(test_loader)
qa_processor.postprocess(
results=predictions,
@ -260,8 +253,9 @@ def test_postprocess_xlnet_answer(qa_test_data, tmp_module):
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_loader = dataloader_from_dataset(test_features, shuffle=False)
qa_extractor = AnswerExtractor(model_name="xlnet-base-cased", cache_dir=tmp_module)
predictions = qa_extractor.predict(test_features)
predictions = qa_extractor.predict(test_loader)
qa_processor.postprocess(
results=predictions,

tests/unit/test_transformers_sequence_classification.py (21 changes) Normal file → Executable file
View file

@ -5,6 +5,7 @@ import pytest
import pandas as pd
from utils_nlp.models.transformers.sequence_classification import SequenceClassifier, Processor
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
@pytest.fixture()
@ -19,12 +20,11 @@ def test_classifier(data, tmpdir):
num_labels = len(pd.unique(data[1]))
model_name = "bert-base-uncased"
processor = Processor(model_name=model_name, cache_dir=tmpdir)
train_dataloader = processor.create_dataloader_from_df(
df, "text", "label", batch_size=2, num_gpus=0
)
ds = processor.dataset_from_dataframe(df, "text", "label")
dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=0, shuffle=True)
classifier = SequenceClassifier(model_name=model_name, num_labels=num_labels, cache_dir=tmpdir)
classifier.fit(train_dataloader=train_dataloader, num_epochs=1, num_gpus=0, verbose=False)
preds = classifier.predict(train_dataloader, num_gpus=0, verbose=False)
classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=0, verbose=False)
preds = classifier.predict(dl, num_gpus=0, verbose=False)
assert len(preds) == len(data[1])
@ -35,17 +35,16 @@ def test_classifier_gpu_train_cpu_predict(data, tmpdir):
num_labels = len(pd.unique(data[1]))
model_name = "bert-base-uncased"
processor = Processor(model_name=model_name, cache_dir=tmpdir)
train_dataloader = processor.create_dataloader_from_df(
df, "text", "label", batch_size=2, num_gpus=1
)
ds = processor.dataset_from_dataframe(df, "text", "label")
dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=1, shuffle=True)
classifier = SequenceClassifier(model_name=model_name, num_labels=num_labels, cache_dir=tmpdir)
classifier.fit(train_dataloader=train_dataloader, num_epochs=1, num_gpus=1, verbose=False)
classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=1, verbose=False)
assert next(classifier.model.parameters()).is_cuda is True
# gpu prediction, no model move
preds = classifier.predict(train_dataloader, num_gpus=1, verbose=False)
preds = classifier.predict(dl, num_gpus=1, verbose=False)
assert len(preds) == len(data[1])
# cpu prediction, need model move
assert next(classifier.model.parameters()).is_cuda is True
preds = classifier.predict(train_dataloader, num_gpus=0, verbose=False)
preds = classifier.predict(dl, num_gpus=0, verbose=False)
assert next(classifier.model.parameters()).is_cuda is False

View file

@ -0,0 +1,23 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import pytest
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.models.transformers.named_entity_recognition import TokenClassificationProcessor, TokenClassifier
@pytest.mark.cpu
def test_token_classifier_fit_predict(tmpdir, ner_test_data):
token_classifier = TokenClassifier(model_name="bert-base-uncased", num_labels=6, cache_dir=tmpdir)
processor = TokenClassificationProcessor(model_name="bert-base-uncased", cache_dir=tmpdir)
# test fit, no warmup
train_dataset = processor.preprocess_for_bert(
text=ner_test_data["INPUT_TEXT"], labels=ner_test_data["INPUT_LABELS"], label_map=ner_test_data["LABEL_MAP"],
)
train_dataloader = dataloader_from_dataset(train_dataset)
token_classifier.fit(train_dataloader)
# test predict, no labels
_ = token_classifier.predict(train_dataloader, verbose=False)

View file

@ -1,11 +1,11 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""Common PyTorch utilities that facilitate building Pytorch models."""
"""Common PyTorch utilities that facilitate building PyTorch models."""
import torch
import torch.nn as nn
import warnings
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
def get_device(
@ -17,11 +17,7 @@ def get_device(
# init_method="file:///distributed",
):
if local_rank == -1:
num_gpus = (
min(num_gpus, torch.cuda.device_count())
if num_gpus is not None
else torch.cuda.device_count()
)
num_gpus = min(num_gpus, torch.cuda.device_count()) if num_gpus is not None else torch.cuda.device_count()
device = torch.device("cuda" if torch.cuda.is_available() and num_gpus > 0 else "cpu")
else:
torch.cuda.set_device(local_rank)
@ -32,59 +28,109 @@ def get_device(
return device, num_gpus
def move_to_device(model, device, num_gpus=None):
def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank=-1):
"""Moves a model to the specified device (cpu or gpu/s)
and implements data parallelism when multiple gpus are specified.
Args:
model (Module): A PyTorch model
device (torch.device): A PyTorch device
num_gpus (int): The number of GPUs to be used. Defaults to None,
all gpus are used.
model (Module): A PyTorch model.
device (torch.device): A PyTorch device.
num_gpus (int): The number of GPUs to be used.
If set to None, all available GPUs will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If None, the first num_gpus GPUs will be used.
If not None, overrides num_gpus.
Defaults to None.
local_rank (int): Local GPU ID within a node. Used in distributed environments.
If not -1, num_gpus and gpu_ids are ignored.
Defaults to -1.
Returns:
Module, DataParallel, DistributedDataParallel: A PyTorch Module or
a DataParallel/DistributedDataParallel wrapper (when multiple gpus are used).
"""
if not isinstance(device, torch.device):
raise ValueError("device must be of type torch.device.")
# unwrap model
if isinstance(model, torch.nn.DataParallel):
model = model.module
# wrap in DataParallel or DistributedDataParallel
if local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True,
)
else:
if device.type == "cuda":
if num_gpus is not None:
if num_gpus < 1:
raise ValueError("num_gpus must be at least 1 or None")
num_cuda_devices = torch.cuda.device_count()
if num_cuda_devices < 1:
raise Exception("CUDA devices are not available.")
if gpu_ids is None:
num_gpus = num_cuda_devices if num_gpus is None else min(num_gpus, num_cuda_devices)
gpu_ids = list(range(num_gpus))
if len(gpu_ids) > 1:
model = torch.nn.DataParallel(model, device_ids=gpu_ids)
# move to device
return model.to(device)
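A brief usage sketch of the new signature; the toy model below is illustrative:

import torch
import torch.nn as nn
from utils_nlp.common.pytorch_utils import move_model_to_device

model = nn.Linear(10, 2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# on a multi-GPU machine this returns a DataParallel wrapper over the first
# two GPUs; on CPU it is a plain .to(device) move and num_gpus is ignored
model = move_model_to_device(model, device, num_gpus=2)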
def dataloader_from_dataset(ds, batch_size=32, num_gpus=None, shuffle=False, distributed=False):
"""Creates a PyTorch DataLoader given a Dataset object.
Args:
ds (torch.utils.data.Dataset): A PyTorch dataset.
batch_size (int, optional): Batch size.
If more than 1 gpu is used, this would be the batch size per gpu.
Defaults to 32.
num_gpus (int, optional): The number of GPUs to be used. Defaults to None.
shuffle (bool, optional): If True, a RandomSampler is used. Defaults to False.
distributed (bool, optional): If True, a DistributedSampler is used. Defaults to False.
Returns:
Module, DataParallel: A PyTorch Module or
a DataParallel wrapper (when multiple gpus are used).
"""
if isinstance(model, nn.DataParallel):
model = model.module
if num_gpus is None:
num_gpus = torch.cuda.device_count()
if not isinstance(device, torch.device):
raise ValueError("device must be of type torch.device.")
if device.type == "cuda":
model.to(device) # inplace
if num_gpus == 0:
raise ValueError("num_gpus must be non-zero when device.type is 'cuda'")
elif num_gpus == 1:
return model
else:
# parallelize
num_cuda_devices = torch.cuda.device_count()
if num_cuda_devices < 1:
raise Exception("CUDA devices are not available.")
elif num_cuda_devices < 2:
print("Warning: Only 1 CUDA device is available. Data parallelism is not possible.")
return model
else:
if num_gpus is None:
# use all available devices
return nn.DataParallel(model, device_ids=None)
elif num_gpus > num_cuda_devices:
print(
"Warning: Only {0} devices are available. "
"Setting the number of gpus to {0}".format(num_cuda_devices)
)
return nn.DataParallel(model, device_ids=None)
else:
return nn.DataParallel(model, device_ids=list(range(num_gpus)))
elif device.type == "cpu":
if num_gpus != 0 and num_gpus is not None:
warnings.warn("Device type is 'cpu'. num_gpus is ignored.")
return model.to(device)
batch_size = batch_size * max(1, num_gpus)
if distributed:
sampler = DistributedSampler(ds)
else:
raise Exception(
"Device type '{}' not supported. Currently, only cpu "
"and cuda devices are supported.".format(device.type)
)
sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds)
return DataLoader(ds, sampler=sampler, batch_size=batch_size)
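A short sketch of the helper with a toy TensorDataset (shapes are illustrative):

import torch
from torch.utils.data import TensorDataset
from utils_nlp.common.pytorch_utils import dataloader_from_dataset

ds = TensorDataset(torch.randn(64, 8), torch.randint(0, 2, (64,)))
# batch_size is per GPU: with num_gpus=2 each yielded batch holds 16 * 2 = 32 rows
dl = dataloader_from_dataset(ds, batch_size=16, num_gpus=2, shuffle=True)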
def compute_training_steps(dataloader, num_epochs=1, max_steps=-1, gradient_accumulation_steps=1):
"""Computes the max training steps given a dataloader.
Args:
dataloader (Dataloader): A PyTorch DataLoader.
num_epochs (int, optional): Number of training epochs. Defaults to 1.
max_steps (int, optional): Total number of training steps.
If set to a positive value, it overrides num_epochs.
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
Defaults to -1.
gradient_accumulation_steps (int, optional): Number of steps to accumulate
before performing a backward/update pass.
Defaults to 1.
Returns:
int: The max number of steps to be used in a training loop.
"""
try:
dataset_length = len(dataloader)
except Exception:
dataset_length = -1
if max_steps <= 0:
if dataset_length != -1 and num_epochs > 0:
max_steps = dataset_length // gradient_accumulation_steps * num_epochs
if max_steps <= 0:
raise Exception("Max steps cannot be determined.")
return max_steps
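A worked example of the arithmetic, assuming a dataloader train_loader that yields 500 batches:

# 500 batches, 2 epochs, gradients accumulated over 4 batches:
# max_steps = 500 // 4 * 2 = 250 optimizer steps
max_steps = compute_training_steps(train_loader, num_epochs=2, gradient_accumulation_steps=4)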

View file

@ -7,24 +7,21 @@
https://github.com/NirantK/hindi2vec/releases/tag/bbc-hindi-v0.1
"""
import os
import pandas as pd
import logging
import numpy as np
import os
import tarfile
from tempfile import TemporaryDirectory
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.dataset.url_utils import maybe_download
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
from utils_nlp.models.transformers.sequence_classification import Processor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
URL = (
"https://github.com/NirantK/hindi2vec/releases/"
"download/bbc-hindi-v0.1/bbc-hindiv01.tar.gz"
)
URL = "https://github.com/NirantK/hindi2vec/releases/" "download/bbc-hindi-v0.1/bbc-hindiv01.tar.gz"
def load_pandas_df(local_cache_path=TemporaryDirectory().name):
@ -49,19 +46,9 @@ def load_pandas_df(local_cache_path=TemporaryDirectory().name):
train_csv_file_path = os.path.join(local_cache_path, "hindi-train.csv")
test_csv_file_path = os.path.join(local_cache_path, "hindi-test.csv")
train_df = pd.read_csv(
train_csv_file_path,
sep="\t",
encoding='utf-8',
header=None
)
train_df = pd.read_csv(train_csv_file_path, sep="\t", encoding="utf-8", header=None)
test_df = pd.read_csv(
test_csv_file_path,
sep="\t",
encoding='utf-8',
header=None
)
test_df = pd.read_csv(test_csv_file_path, sep="\t", encoding="utf-8", header=None)
train_df = train_df.fillna("")
test_df = test_df.fillna("")
@ -80,7 +67,7 @@ def load_tc_dataset(
cache_dir=TemporaryDirectory().name,
max_len=MAX_SEQ_LEN,
batch_size=32,
num_gpus=None
num_gpus=None,
):
"""
Load the BBC Hindi dataset and split into training and testing datasets.
@ -105,7 +92,7 @@ def load_tc_dataset(
cache_dir (str, optional): The default folder for saving cache files.
Defaults to TemporaryDirectory().name.
max_len (int, optional): Maximum length of the list of tokens. Lists longer
than this are truncated and shorter ones are padded with "O"s.
Default value is BERT_MAX_LEN=512.
batch_size (int, optional): The batch size for training and testing.
Defaults to 32.
@ -114,15 +101,15 @@ def load_tc_dataset(
Returns:
tuple. The tuple contains four elements:
train_dataload (DataLoader): a PyTorch DataLoader instance for training.
train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
test_dataload (DataLoader): a PyTorch DataLoader instance for testing.
label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values
can be retrieved by calling the `inverse_transform` function.
test_labels (Series): a Pandas Series of testing label (in label ID format). If
the labels are in raw label values format, we will need to transform it to
label IDs by using the label_encoder.transform function.
"""
@ -140,12 +127,8 @@ def load_tc_dataset(
if test_fraction < 0 or test_fraction >= 1.0:
logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction))
test_fraction = 0.25
train_df, test_df = train_test_split(
all_df,
train_size=(1.0 - test_fraction),
random_state=random_seed
)
train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed)
if train_sample_ratio > 1.0:
train_sample_ratio = 1.0
@ -153,7 +136,7 @@ def load_tc_dataset(
elif train_sample_ratio < 0:
logging.error("Invalid training sample ration: {}".format(train_sample_ratio))
raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio))
if test_sample_ratio > 1.0:
test_sample_ratio = 1.0
logging.warning("Setting the testing sample ratio to 1.0")
@ -171,42 +154,24 @@ def load_tc_dataset(
test_labels = label_encoder.transform(test_df[label_col])
test_df[label_col] = test_labels
processor = Processor(
model_name=model_name,
to_lower=to_lower,
cache_dir=cache_dir
)
processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
train_dataloader = processor.create_dataloader_from_df(
df=train_df,
text_col=text_col,
label_col=label_col,
max_len=max_len,
text2_col=None,
batch_size=batch_size,
num_gpus=num_gpus,
shuffle=True,
distributed=False
train_dataset = processor.dataset_from_dataframe(
df=train_df, text_col=text_col, label_col=label_col, max_len=max_len,
)
train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True)
test_dataloader = processor.create_dataloader_from_df(
df=test_df,
text_col=text_col,
label_col=label_col,
max_len=max_len,
text2_col=None,
batch_size=batch_size,
num_gpus=num_gpus,
shuffle=False,
distributed=False
test_dataset = processor.dataset_from_dataframe(
df=test_df, text_col=text_col, label_col=label_col, max_len=max_len,
)
test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False)
return (train_dataloader, test_dataloader, label_encoder, test_labels)
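A hedged usage sketch of the loader, passing only parameters that appear in the hunks above:

train_dl, test_dl, label_encoder, test_labels = load_tc_dataset(batch_size=32, num_gpus=1)
# label_encoder.inverse_transform recovers the raw label values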
def get_label_values(label_encoder, label_ids):
"""
Get the label values from label IDs.
Args:
label_encoder (LabelEncoder): a fitted sklearn LabelEncoder instance

View file

@ -8,18 +8,18 @@ paper link: ("https://www.mendeley.com/catalogue/
arabic-text-classification-using-deep-learning-technics/")
"""
import os
import pandas as pd
import logging
import numpy as np
import os
from tempfile import TemporaryDirectory
from utils_nlp.dataset.url_utils import extract_zip, maybe_download
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
from utils_nlp.models.transformers.sequence_classification import Processor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.dataset.url_utils import extract_zip, maybe_download
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
from utils_nlp.models.transformers.sequence_classification import Processor
URL = (
"https://data.mendeley.com/datasets/v524p5dhpj/2"
@ -58,7 +58,7 @@ def load_tc_dataset(
cache_dir=TemporaryDirectory().name,
max_len=MAX_SEQ_LEN,
batch_size=32,
num_gpus=None
num_gpus=None,
):
"""
Load the DAC dataset and split into training and testing datasets.
@ -92,9 +92,9 @@ def load_tc_dataset(
Returns:
tuple. The tuple contains four elements:
train_dataload (DataLoader): a PyTorch DataLoader instance for training.
train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
test_dataload (DataLoader): a PyTorch DataLoader instance for testing.
test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values
can be retrieved by calling the `inverse_transform` function.
@ -104,11 +104,8 @@ def load_tc_dataset(
label IDs by using the label_encoder.transform function.
"""
# download and load the original dataset
all_df = load_pandas_df(
local_cache_path=local_path,
num_rows=None
)
# download and load the original dataset
all_df = load_pandas_df(local_cache_path=local_path, num_rows=None)
# set the text and label columns
text_col = all_df.columns[0]
@ -123,12 +120,8 @@ def load_tc_dataset(
if test_fraction < 0 or test_fraction >= 1.0:
logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction))
test_fraction = 0.25
train_df, test_df = train_test_split(
all_df,
train_size=(1.0 - test_fraction),
random_state=random_seed
)
train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed)
if train_sample_ratio > 1.0:
train_sample_ratio = 1.0
@ -136,7 +129,7 @@ def load_tc_dataset(
elif train_sample_ratio < 0:
logging.error("Invalid training sample ration: {}".format(train_sample_ratio))
raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio))
if test_sample_ratio > 1.0:
test_sample_ratio = 1.0
logging.warning("Setting the testing sample ratio to 1.0")
@ -149,35 +142,17 @@ def load_tc_dataset(
if test_sample_ratio < 1.0:
test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)
processor = Processor(
model_name=model_name,
to_lower=to_lower,
cache_dir=cache_dir
)
processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
train_dataloader = processor.create_dataloader_from_df(
df=train_df,
text_col=text_col,
label_col=label_col,
max_len=max_len,
text2_col=None,
batch_size=batch_size,
num_gpus=num_gpus,
shuffle=True,
distributed=False
train_dataset = processor.dataset_from_dataframe(
df=train_df, text_col=text_col, label_col=label_col, max_len=max_len,
)
train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True)
test_dataloader = processor.create_dataloader_from_df(
df=test_df,
text_col=text_col,
label_col=label_col,
max_len=max_len,
text2_col=None,
batch_size=batch_size,
num_gpus=num_gpus,
shuffle=False,
distributed=False
test_dataset = processor.dataset_from_dataframe(
df=test_df, text_col=text_col, label_col=label_col, max_len=max_len,
)
test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False)
# the DAC dataset already converted the labels to label ID format
test_labels = test_df[label_col]

View file

@ -7,18 +7,19 @@
https://www.nyu.edu/projects/bowman/multinli/
"""
import logging
import os
from tempfile import TemporaryDirectory
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tempfile import TemporaryDirectory
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.dataset.data_loaders import DaskJSONLoader
from utils_nlp.dataset.url_utils import extract_zip, maybe_download
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
from utils_nlp.models.transformers.sequence_classification import Processor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
URL = "http://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip"
DATA_FILES = {
@ -63,9 +64,7 @@ def load_pandas_df(local_cache_path=".", file_split="train"):
return pd.read_json(os.path.join(local_cache_path, DATA_FILES[file_split]), lines=True)
def get_generator(
local_cache_path=".", file_split="train", block_size=10e6, batch_size=10e6, num_batches=None
):
def get_generator(local_cache_path=".", file_split="train", block_size=10e6, batch_size=10e6, num_batches=None):
""" Returns an extracted dataset as a random batch generator that
yields pandas dataframes.
Args:
@ -85,9 +84,7 @@ def get_generator(
except Exception as e:
raise e
loader = DaskJSONLoader(
os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size
)
loader = DaskJSONLoader(os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size)
return loader.get_sequential_batches(batch_size=int(batch_size), num_batches=num_batches)
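A hedged sketch of the generator (sizes are illustrative; per the docstring it yields pandas dataframes):

for batch_df in get_generator(local_cache_path=".", file_split="train", batch_size=10000, num_batches=2):
    print(batch_df.shape)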
@ -103,7 +100,7 @@ def load_tc_dataset(
cache_dir=TemporaryDirectory().name,
max_len=MAX_SEQ_LEN,
batch_size=32,
num_gpus=None
num_gpus=None,
):
"""
Load the multinli dataset and split into training and testing datasets.
@ -137,9 +134,9 @@ def load_tc_dataset(
Returns:
tuple. The tuple contains four elements:
train_dataload (DataLoader): a PyTorch DataLoader instance for training.
train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
test_dataload (DataLoader): a PyTorch DataLoader instance for testing.
test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values
can be retrieved by calling the `inverse_transform` function.
@ -150,10 +147,7 @@ def load_tc_dataset(
"""
# download and load the original dataset
all_df = load_pandas_df(
local_cache_path=local_path,
file_split="train"
)
all_df = load_pandas_df(local_cache_path=local_path, file_split="train")
# select the examples corresponding to one of the entailment labels (neutral
# in this case) to avoid duplicate rows, as the sentences are not unique,
@ -169,12 +163,8 @@ def load_tc_dataset(
if test_fraction < 0 or test_fraction >= 1.0:
logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction))
test_fraction = 0.25
train_df, test_df = train_test_split(
all_df,
train_size=(1.0 - test_fraction),
random_state=random_seed
)
train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed)
if train_sample_ratio > 1.0:
train_sample_ratio = 1.0
@ -182,7 +172,7 @@ def load_tc_dataset(
elif train_sample_ratio < 0:
logging.error("Invalid training sample ration: {}".format(train_sample_ratio))
raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio))
if test_sample_ratio > 1.0:
test_sample_ratio = 1.0
logging.warning("Setting the testing sample ratio to 1.0")
@ -200,35 +190,17 @@ def load_tc_dataset(
test_labels = label_encoder.transform(test_df[label_col])
test_df[label_col] = test_labels
processor = Processor(
model_name=model_name,
to_lower=to_lower,
cache_dir=cache_dir
)
processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
train_dataloader = processor.create_dataloader_from_df(
df=train_df,
text_col=text_col,
label_col=label_col,
max_len=max_len,
text2_col=None,
batch_size=batch_size,
num_gpus=num_gpus,
shuffle=True,
distributed=False
train_dataset = processor.dataset_from_dataframe(
df=train_df, text_col=text_col, label_col=label_col, max_len=max_len,
)
train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True)
test_dataloader = processor.create_dataloader_from_df(
df=test_df,
text_col=text_col,
label_col=label_col,
max_len=max_len,
text2_col=None,
batch_size=batch_size,
num_gpus=num_gpus,
shuffle=False,
distributed=False
test_dataset = processor.dataset_from_dataframe(
df=test_df, text_col=text_col, label_col=label_col, max_len=max_len,
)
test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False)
return (train_dataloader, test_dataloader, label_encoder, test_labels)
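The net effect of this hunk is that dataloader construction moves out of the Processor and into the shared dataloader_from_dataset helper. A minimal sketch of the new two-step flow, assuming `df` is an existing pandas DataFrame with a "text" column and an already-encoded "label" column:

from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.models.transformers.sequence_classification import Processor

# `df` is assumed to exist: a DataFrame with "text" and encoded "label" columns.
processor = Processor(model_name="bert-base-cased", to_lower=False, cache_dir=".")
# Step 1: build a dataset from the dataframe.
train_dataset = processor.dataset_from_dataframe(df=df, text_col="text", label_col="label", max_len=128)
# Step 2: wrap it in a dataloader with the shared helper.
train_dataloader = dataloader_from_dataset(train_dataset, batch_size=32, num_gpus=None, shuffle=True)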

View file

@ -7,18 +7,19 @@
https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold/CONLL-format/data
"""
import random
import os
import pandas as pd
import logging
import os
import random
from tempfile import TemporaryDirectory
from utils_nlp.dataset.url_utils import maybe_download
import pandas as pd
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.dataset.ner_utils import preprocess_conll
from utils_nlp.dataset.url_utils import maybe_download
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
from utils_nlp.models.transformers.named_entity_recognition import TokenClassificationProcessor
URL = (
"https://raw.githubusercontent.com/juand-r/entity-recognition-datasets"
"/master/data/wikigold/CONLL-format/data/wikigold.conll.txt"
@ -91,7 +92,7 @@ def load_dataset(
max_len=MAX_SEQ_LEN,
trailing_piece_tag="X",
batch_size=32,
num_gpus=None
num_gpus=None,
):
"""
Load the wikigold dataset and split into training and testing datasets.
@ -116,7 +117,7 @@ def load_dataset(
cache_dir (str, optional): The default folder for saving cache files.
Defaults to './temp'.
max_len (int, optional): Maximum length of the list of tokens. Lists longer
than this are truncated and shorter ones are padded with "O"s.
than this are truncated and shorter ones are padded with "O"s.
Default value is BERT_MAX_LEN=512.
trailing_piece_tag (str, optional): Tag used to label trailing word pieces.
For example, "criticize" is broken into "critic" and "##ize", "critic"
@ -129,16 +130,12 @@ def load_dataset(
Returns:
tuple. The tuple contains four elements.
train_dataload (DataLoader): a PyTorch DataLoader instance for training.
test_dataload (DataLoader): a PyTorch DataLoader instance for testing.
label_map (dict): A dictionary object to map a label (str) to an ID (int).
train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
label_map (dict): A dictionary object to map a label (str) to an ID (int).
test_dataset (TensorDataset): A TensorDataset containing the following four tensors.
1. input_ids_all: Tensor. Each sublist contains numerical values,
i.e. token ids, corresponding to the tokens in the input
text data.
i.e. token ids, corresponding to the tokens in the input text data.
2. input_mask_all: Tensor. Each sublist contains the attention
mask of the input token id list, 1 for input tokens and 0 for
padded tokens, so that padded tokens are not attended to.
@ -155,9 +152,7 @@ def load_dataset(
"""
train_df, test_df = load_train_test_dfs(
local_cache_path=local_path,
test_fraction=test_fraction,
random_seed=random_seed
local_cache_path=local_path, test_fraction=test_fraction, random_seed=random_seed
)
if train_sample_ratio > 1.0:
@ -166,7 +161,7 @@ def load_dataset(
elif train_sample_ratio < 0:
logging.error("Invalid training sample ration: {}".format(train_sample_ratio))
raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio))
if test_sample_ratio > 1.0:
test_sample_ratio = 1.0
logging.warning("Setting the testing sample ratio to 1.0")
@ -179,47 +174,34 @@ def load_dataset(
if test_sample_ratio < 1.0:
test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)
processor = TokenClassificationProcessor(
model_name=model_name,
to_lower=to_lower,
cache_dir=cache_dir
)
processor = TokenClassificationProcessor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
label_map = TokenClassificationProcessor.create_label_map(
label_lists=train_df['labels'],
trailing_piece_tag=trailing_piece_tag
label_lists=train_df["labels"], trailing_piece_tag=trailing_piece_tag
)
train_dataset = processor.preprocess_for_bert(
text=train_df['sentence'],
text=train_df["sentence"],
max_len=max_len,
labels=train_df['labels'],
labels=train_df["labels"],
label_map=label_map,
trailing_piece_tag=trailing_piece_tag
trailing_piece_tag=trailing_piece_tag,
)
test_dataset = processor.preprocess_for_bert(
text=test_df['sentence'],
text=test_df["sentence"],
max_len=max_len,
labels=test_df['labels'],
labels=test_df["labels"],
label_map=label_map,
trailing_piece_tag=trailing_piece_tag
trailing_piece_tag=trailing_piece_tag,
)
train_dataloader = processor.create_dataloader_from_dataset(
train_dataset,
shuffle=True,
batch_size=batch_size,
num_gpus=num_gpus,
distributed=False
train_dataloader = dataloader_from_dataset(
train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True, distributed=False
)
test_dataloader = processor.create_dataloader_from_dataset(
test_dataset,
shuffle=False,
batch_size=batch_size,
num_gpus=num_gpus,
distributed=False
test_dataloader = dataloader_from_dataset(
test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False, distributed=False
)
return (train_dataloader, test_dataloader, label_map, test_dataset)
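For downstream use, the four-element return value unpacks as in this sketch; the keyword values are illustrative and match the parameter names shown in the hunks above (defaults not visible in this diff may differ):

# Illustrative call of the refactored wikigold loader.
train_dataloader, test_dataloader, label_map, test_dataset = load_dataset(
    model_name="bert-base-cased",
    max_len=128,
    trailing_piece_tag="X",
    batch_size=32,
    num_gpus=None,
)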

View file

@ -3,22 +3,24 @@
import os
from random import random, seed
from bertsum.others.utils import test_rouge
def get_rouge(predictions, targets, temp_dir):
def get_rouge(predictions, targets, temp_dir, random_seed=42):
"""
function to get the rouge metric for the prediction and the reference.
Args:
predictions (list of strings): Predictions to be compared.
targets (list of strings): References.
temp_dir (str): Path where temporary folders are created to host the files
generated by ROUGE applicatoin.
temp_dir (str): Path where temporary folders are created to host the files
generated by ROUGE application.
random_seed (int, optional): Random seed. Defaults to 42.
Return:
dictionary: rouge metric
"""
def _write_list_to_file(list_items, filename):
@ -27,7 +29,7 @@ def get_rouge(predictions, targets, temp_dir):
for item in list_items:
filehandle.write("%s\n" % item)
seed(42)
seed(random_seed)
random_number = random()
os.makedirs(temp_dir, exist_ok=True)
candidate_path = os.path.join(temp_dir, "candidate" + str(random_number))
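A usage sketch of the updated signature; the prediction and reference strings are placeholders, and the new random_seed argument replaces the previously hard-coded seed(42):

# Placeholder inputs for illustration only.
predictions = ["the cat sat on the mat"]
targets = ["a cat was sitting on the mat"]
scores = get_rouge(predictions, targets, temp_dir="./rouge_tmp", random_seed=42)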

View file

@ -13,7 +13,7 @@ from pytorch_pretrained_bert.optimization import BertAdam
from tqdm import tqdm
from utils_nlp.models.bert.common import Language
from utils_nlp.common.pytorch_utils import get_device, move_to_device
from utils_nlp.common.pytorch_utils import get_device
from cached_property import cached_property
@ -91,7 +91,7 @@ class BERTSequenceClassifier:
device, num_gpus = get_device(num_gpus)
self.model = move_to_device(self.model, device, num_gpus)
self.model = move_model_to_device(self.model, device, num_gpus)
token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
@ -211,7 +211,7 @@ class BERTSequenceClassifier:
(classes, probabilities) if probabilities is True.
"""
device, num_gpus = get_device(num_gpus)
self.model = move_to_device(self.model, device, num_gpus)
self.model = move_model_to_device(self.model, device, num_gpus)
# score
self.model.eval()

View file

@ -14,7 +14,7 @@ from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from tqdm import tqdm
from utils_nlp.common.pytorch_utils import get_device, move_to_device
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
from utils_nlp.models.bert.common import Language
try:
@ -192,7 +192,7 @@ class BERTSequenceClassifier:
device, num_gpus = get_device(num_gpus)
self.model = move_to_device(self.model, device, num_gpus)
self.model = move_model_to_device(self.model, device, num_gpus)
if bert_optimizer is None:
bert_optimizer = self.create_optimizer(
@ -277,7 +277,7 @@ class BERTSequenceClassifier:
a dictionary with classes, target labels, probabilities) if probabilities is True.
"""
device, num_gpus = get_device(num_gpus)
self.model = move_to_device(self.model, device, num_gpus)
self.model = move_model_to_device(self.model, device, num_gpus)
# score
self.model.eval()
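Each of these call sites follows the same renamed pattern; a sketch, assuming `model` is an existing torch.nn.Module and that move_model_to_device keeps the (model, device, num_gpus) positional interface used above:

from utils_nlp.common.pytorch_utils import get_device, move_model_to_device

# Resolve the device, move the model (wrapping in DataParallel if needed), then score.
device, num_gpus = get_device(num_gpus=None)
model = move_model_to_device(model, device, num_gpus)
model.eval()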

View file

@ -4,19 +4,17 @@
# This script reuses code from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples
# /extract_features.py, with necessary modifications.
from pytorch_pretrained_bert.modeling import BertModel
from utils_nlp.common.pytorch_utils import get_device, move_to_device
from enum import Enum
import numpy as np
import pandas as pd
import os
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from utils_nlp.models.bert.common import Language, Tokenizer
from cached_property import cached_property
from pytorch_pretrained_bert.modeling import BertModel
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
from utils_nlp.models.bert.common import Language, Tokenizer
class PoolingStrategy(str, Enum):
@ -43,27 +41,21 @@ class BERTSentenceEncoder:
pooling_strategy=PoolingStrategy.MEAN,
):
"""Initialize the encoder's underlying model and tokenizer
Args:
bert_model: BERT model to use for encoding. Defaults to pretrained BertModel.
tokenizer: Tokenizer to use for preprocessing. Defaults to pretrained BERT tokenizer.
language: The pretrained model's language. Defaults to Language.ENGLISH.
num_gpus: The number of gpus to use. Defaults to None, which forces all available GPUs to be used.
num_gpus: The number of gpus to use. Defaults to None, which forces all available GPUs to be used.
cache_dir: Location of BERT's cache directory. Defaults to "."
to_lower: True to lowercase before tokenization. Defaults to False.
max_len: Maximum number of tokens.
layer_index: The layer from which to extract features.
layer_index: The layer from which to extract features.
Defaults to the last layer; can also be a list of integers for experimentation.
pooling_strategy: Pooling strategy to aggregate token embeddings into sentence embedding.
"""
self.model = (
bert_model.model.bert
if bert_model
else BertModel.from_pretrained(language, cache_dir=cache_dir)
)
self.tokenizer = (
tokenizer if tokenizer else Tokenizer(language, to_lower=to_lower, cache_dir=cache_dir)
)
self.model = bert_model.model.bert if bert_model else BertModel.from_pretrained(language, cache_dir=cache_dir)
self.tokenizer = tokenizer if tokenizer else Tokenizer(language, to_lower=to_lower, cache_dir=cache_dir)
self.num_gpus = num_gpus
self.max_len = max_len
self.layer_index = layer_index
@ -98,16 +90,17 @@ class BERTSentenceEncoder:
def get_hidden_states(self, text, batch_size=32):
"""Extract the hidden states from the pretrained model
Args:
text: List of documents to extract features from.
batch_size: Batch size, defaults to 32.
Returns:
pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float]).
pd.DataFrame with columns:
text_index (int), token (str), layer_index (int), values (list[float]).
"""
device, num_gpus = get_device(self.num_gpus)
self.model = move_to_device(self.model, device, self.num_gpus)
self.model = move_model_to_device(self.model, device, self.num_gpus)
self.model.eval()
@ -122,9 +115,7 @@ class BERTSentenceEncoder:
input_type_ids = torch.arange(input_ids.size(0), dtype=torch.long, device=device)
eval_data = TensorDataset(input_ids, input_mask, input_type_ids)
eval_dataloader = DataLoader(
eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size
)
eval_dataloader = DataLoader(eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size)
hidden_states = {"text_index": [], "token": [], "layer_index": [], "values": []}
for (input_ids_tensor, input_mask_tensor, example_indices_tensor) in eval_dataloader:
@ -142,9 +133,7 @@ class BERTSentenceEncoder:
hidden_states["text_index"].append(example_index.item())
hidden_states["token"].append(token)
hidden_states["layer_index"].append(layer_index)
hidden_states["values"].append(
[round(x.item(), 6) for x in layer_output[i]]
)
hidden_states["values"].append([round(x.item(), 6) for x in layer_output[i]])
# empty cache
del [input_ids_tensor, input_mask_tensor, example_indices_tensor]
@ -158,7 +147,7 @@ class BERTSentenceEncoder:
def pool(self, df):
"""Pooling to aggregate token-wise embeddings to sentence embeddings
Args:
df: pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float])
@ -167,31 +156,16 @@ class BERTSentenceEncoder:
"""
def max_pool(x):
values = np.array(
[
np.reshape(np.array(x.values[i]), self.embedding_dim)
for i in range(x.values.shape[0])
]
)
values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])])
m, _ = torch.max(torch.tensor(values, dtype=torch.float), 0)
return m.numpy()
def mean_pool(x):
values = np.array(
[
np.reshape(np.array(x.values[i]), self.embedding_dim)
for i in range(x.values.shape[0])
]
)
values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])])
return torch.mean(torch.tensor(values, dtype=torch.float), 0).numpy()
def cls_pool(x):
values = np.array(
[
np.reshape(np.array(x.values[i]), self.embedding_dim)
for i in range(x.values.shape[0])
]
)
values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])])
return values[0]
try:
@ -206,15 +180,11 @@ class BERTSentenceEncoder:
except ValueError as ve:
print(ve)
return (
df.groupby(["text_index", "layer_index"])["values"]
.apply(lambda x: pool_func(x))
.reset_index()
)
return df.groupby(["text_index", "layer_index"])["values"].apply(lambda x: pool_func(x)).reset_index()
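The three pooling closures above differ only in the aggregation applied to the stacked token vectors; a toy, self-contained sketch of the mean-pooling case (dimensions chosen for illustration, not the encoder's real output):

import numpy as np
import torch

# Four token embeddings of dimension 3 -> one sentence embedding of dimension 3.
token_values = np.array(
    [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9], [1.0, 1.1, 1.2]]
)
sentence_embedding = torch.mean(torch.tensor(token_values, dtype=torch.float), 0).numpy()
# -> array([0.55, 0.65, 0.75], dtype=float32)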
def encode(self, text, batch_size=32, as_numpy=False):
"""Computes sentence encodings
"""Computes sentence encodings
Args:
text: List of documents to encode.
batch_size: Batch size, defaults to 32.

View file

@ -16,7 +16,7 @@ from pytorch_pretrained_bert.optimization import BertAdam
from tqdm import tqdm, trange
from utils_nlp.models.bert.common import Language, create_data_loader
from utils_nlp.common.pytorch_utils import get_device, move_to_device
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
from cached_property import cached_property
@ -144,7 +144,7 @@ class BERTTokenClassifier:
device, num_gpus = get_device(num_gpus)
self.model = move_to_device(self.model, device, num_gpus)
self.model = move_model_to_device(self.model, device, num_gpus)
if num_gpus is None:
num_gpus_used = torch.cuda.device_count()
@ -228,7 +228,7 @@ class BERTTokenClassifier:
)
device, num_gpus = get_device(num_gpus)
self.model = move_to_device(self.model, device, num_gpus)
self.model = move_model_to_device(self.model, device, num_gpus)
self.model.eval()
eval_loss = 0

187
utils_nlp/models/transformers/common.py Normal file → Executable file
View file

@ -4,17 +4,16 @@
# This script reuses some code from
# https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py
from itertools import cycle
import logging
import numpy as np
import os
import random
import time
import torch
from tqdm import tqdm, trange
from itertools import cycle
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
import torch
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
@ -23,7 +22,8 @@ from transformers.tokenization_bert import BertTokenizer
from transformers.tokenization_distilbert import DistilBertTokenizer
from transformers.tokenization_roberta import RobertaTokenizer
from transformers.tokenization_xlnet import XLNetTokenizer
from utils_nlp.common.pytorch_utils import get_device
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
TOKENIZER_CLASS = {}
TOKENIZER_CLASS.update({k: BertTokenizer for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
@ -38,12 +38,7 @@ logger = logging.getLogger(__name__)
class Transformer:
def __init__(
self,
model_class,
model_name="bert-base-cased",
num_labels=2,
cache_dir=".",
load_model_from_dir=None,
self, model_class, model_name="bert-base-cased", num_labels=2, cache_dir=".", load_model_from_dir=None,
):
if model_name not in self.list_supported_models():
@ -82,22 +77,40 @@ class Transformer:
if cuda and torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
@staticmethod
def get_default_optimizer(model, weight_decay, learning_rate, adam_epsilon):
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": weight_decay,
},
{
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
return optimizer
@staticmethod
def get_default_scheduler(optimizer, warmup_steps, num_training_steps):
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps
)
return scheduler
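These two new static helpers lift the grouped-parameter AdamW setup and the linear warmup schedule out of fine_tune, so subclasses can build them up front; a usage sketch with the defaults fine_tune previously used (`model` is assumed defined, and 1000 is an illustrative step count):

# Build the default AdamW optimizer and linear warmup scheduler explicitly.
optimizer = Transformer.get_default_optimizer(model, weight_decay=0.0, learning_rate=5e-5, adam_epsilon=1e-8)
scheduler = Transformer.get_default_scheduler(optimizer, warmup_steps=0, num_training_steps=1000)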
def fine_tune(
self,
train_dataloader,
get_inputs,
num_gpus=None,
gpu_ids=None,
max_steps=-1,
num_train_epochs=1,
max_grad_norm=1.0,
gradient_accumulation_steps=1,
n_gpu=1,
move_batch_to_device=None,
optimizer=None,
scheduler=None,
weight_decay=0.0,
learning_rate=5e-5,
adam_epsilon=1e-8,
warmup_steps=0,
fp16=False,
fp16_opt_level="O1",
local_rank=-1,
@ -107,51 +120,12 @@ class Transformer:
clip_grad_norm=True,
):
device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1)
# get device
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)
if seed is not None:
Transformer.set_seed(seed, num_gpus > 0)
try:
dataset_length = len(train_dataloader)
except:
dataset_length = -1
if max_steps <= 0:
if dataset_length != -1 and num_train_epochs > 0:
max_steps = dataset_length // gradient_accumulation_steps * num_train_epochs
if max_steps <= 0:
raise Exception("Max steps cannot be determined for fine tuning!")
if optimizer is None:
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [
p
for n, p in self.model.named_parameters()
if not any(nd in n for nd in no_decay)
],
"weight_decay": weight_decay,
},
{
"params": [
p
for n, p in self.model.named_parameters()
if any(nd in n for nd in no_decay)
],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
if scheduler is None:
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps
)
if fp16:
try:
from apex import amp
@ -159,46 +133,22 @@ class Transformer:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex")
self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=fp16_opt_level)
if local_rank != -1:
self.model = torch.nn.parallel.DistributedDataParallel(
self.model,
device_ids=[local_rank],
output_device=local_rank,
find_unused_parameters=True,
)
else:
if isinstance(self.model, torch.nn.DataParallel):
self.model = self.model.module
if num_gpus > 1:
self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus)))
self.model.to(device)
self.model.train()
# move model
self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank)
# init training
global_step = 0
tr_loss = 0.0
self.model.zero_grad()
if move_batch_to_device is None:
def move_batch_to_device(batch, device):
return tuple(t.to(device) for t in batch)
start = time.time()
accum_loss = 0
self.model.train()
self.model.zero_grad()
while global_step < max_steps:
epoch_iterator = tqdm(
train_dataloader,
desc="Iteration",
disable=local_rank not in [-1, 0] or not verbose
)
# train
start = time.time()
while global_step < max_steps:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=local_rank not in [-1, 0] or not verbose)
for step, batch in enumerate(epoch_iterator):
batch = move_batch_to_device(batch, device)
inputs = get_inputs(batch, self.model_name)
inputs = get_inputs(batch, device, self.model_name)
outputs = self.model(**inputs)
loss = outputs[0]
@ -210,28 +160,26 @@ class Transformer:
if fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
if clip_grad_norm:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
else:
loss.backward()
if clip_grad_norm:
torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)
tr_loss += loss.item()
accum_loss += loss.item()
if (step + 1) % gradient_accumulation_steps == 0:
global_step += 1
if clip_grad_norm:
if fp16:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)
if global_step % report_every == 0 and verbose:
# tqdm.write("Loss:{:.6f}".format(loss))
end = time.time()
print(
"loss: {0:.6f}, time: {1:f}, number of examples in current step: {2:.0f}, step {3:.0f} out of total {4:.0f}".format(
accum_loss / report_every,
end - start,
len(batch),
global_step,
max_steps,
"loss:{0:.6f}, time:{1:f}, examples:{2:.0f}, step:{3:.0f}/{4:.0f}".format(
accum_loss / report_every, end - start, len(batch), global_step, max_steps,
)
)
accum_loss = 0
@ -246,31 +194,20 @@ class Transformer:
epoch_iterator.close()
break
# empty cache
torch.cuda.empty_cache()
return global_step, tr_loss / global_step
def predict(self, eval_dataloader, get_inputs, n_gpu=1, verbose=True, move_batch_to_device=None):
device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1)
if isinstance(self.model, torch.nn.DataParallel):
self.model = self.model.module
def predict(self, eval_dataloader, get_inputs, num_gpus, gpu_ids, verbose=True):
# get device
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)
if num_gpus > 1:
self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus)))
# move model
self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1)
self.model.to(device)
# predict
self.model.eval()
if move_batch_to_device is None:
def move_batch_to_device(batch, device):
return tuple(t.to(device) for t in batch)
for batch in tqdm(eval_dataloader, desc="Evaluating", disable=not verbose):
batch = move_batch_to_device(batch, device) #tuple(t.to(device) for t in batch)
for batch in tqdm(eval_dataloader, desc="Scoring", disable=not verbose):
with torch.no_grad():
inputs = get_inputs(batch, self.model_name, train_mode=False)
inputs = get_inputs(batch, device, self.model_name, train_mode=False)
outputs = self.model(**inputs)
logits = outputs[0]
yield logits.detach().cpu().numpy()
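predict now yields one numpy logits array per batch; callers materialize the generator and concatenate, as in this sketch (`classifier`, `test_dataloader`, and `MyProcessor` are placeholders for a fitted Transformer subclass, its scoring dataloader, and a processor exposing the new get_inputs(batch, device, model_name, ...) staticmethod):

import numpy as np

# Collect per-batch logits from the generator into a single array.
preds = classifier.predict(
    eval_dataloader=test_dataloader,
    get_inputs=MyProcessor.get_inputs,
    num_gpus=1,
    gpu_ids=None,
    verbose=True,
)
all_logits = np.concatenate(list(preds))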

View file

@ -5,24 +5,22 @@
import itertools
import logging
import numpy as np
import os
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, IterableDataset
from torch.utils.data import DataLoader, SequentialSampler
from torch.utils.data import DataLoader, Dataset, IterableDataset, SequentialSampler
# from torch.utils.data.distributed import DistributedSampler
from transformers import DistilBertModel, BertModel
from transformers import BertModel, DistilBertModel
from bertsum.models import model_builder, data_loader
from bertsum.models import data_loader, model_builder
from bertsum.models.data_loader import Batch
from bertsum.models.model_builder import Summarizer
from utils_nlp.common.pytorch_utils import get_device
from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
from utils_nlp.common.pytorch_utils import compute_training_steps, get_device
from utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection
from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
MODEL_CLASS = {"bert-base-uncased": BertModel, "distilbert-base-uncased": DistilBertModel}
@ -42,8 +40,8 @@ def get_dataloader(data_iter, shuffle=True, is_labeled=False, batch_size=3000):
Args:
data_iter (generator): data generator.
shuffle (bool): whether the data is shuffled
is_labeled (bool): it specifies whether the data objects are labeled data.
shuffle (bool): whether the data is shuffled.
is_labeled (bool): specifies whether the data objects are labeled data.
batch_size (int): number of tokens per batch.
Returns:
@ -79,9 +77,7 @@ class ExtSumProcessedIterableDataset(IterableDataset):
if self.is_shuffle:
return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(self.file_list)))
else:
return itertools.chain.from_iterable(
map(get_dataset, itertools.cycle(random.shuffle(self.file_list)))
)
random.shuffle(self.file_list)  # shuffle in place; random.shuffle returns None
return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(self.file_list)))
def __iter__(self):
return self.get_stream()
@ -114,9 +110,7 @@ class ExtSumProcessedDataset(Dataset):
return self.data[idx]
def get_pred(
example, sent_scores, cal_lead=False, sentence_separator="<q>", block_trigram=True, top_n=3
):
def get_pred(example, sent_scores, cal_lead=False, sentence_separator="<q>", block_trigram=True, top_n=3):
"""
Get the summarization prediction for the paragraph example based on the scores
returned by the transformer summarization model.
@ -229,9 +223,7 @@ class ExtSumProcessedData:
def _get_files(self, root):
train_files = []
test_files = []
files = [
os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
]
files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))]
for fname in files:
if fname.find("train") != -1:
train_files.append(fname)
@ -324,7 +316,7 @@ class ExtSumProcessor:
self._model_name = value
@staticmethod
def get_inputs(batch, model_name, train_mode=True):
def get_inputs(batch, device, model_name, train_mode=True):
"""
Creates an input dictionary given a model name.
@ -332,6 +324,7 @@ class ExtSumProcessor:
batch (object): A Batch containing input ids, segment ids, sentence class ids,
masks for the input ids, masks for sentence class ids and source text.
If train_model is True, it also contains the labels and target text.
device (torch.device): A PyTorch device.
model_name (str): Model name used to format the inputs.
train_mode (bool, optional): Training mode flag.
Defaults to True.
@ -344,6 +337,7 @@ class ExtSumProcessor:
if model_name.split("-")[0] in ["bert", "distilbert"]:
if train_mode:
batch = batch.to(device)
# labels must be the last
return {
"x": batch.src,
@ -354,12 +348,13 @@ class ExtSumProcessor:
"labels": batch.labels,
}
else:
batch = Bunch(batch)
return {
"x": batch.src,
"segs": batch.segs,
"clss": batch.clss,
"mask": batch.mask,
"mask_cls": batch.mask_cls,
"x": batch.src.to(device),
"segs": batch.segs.to(device),
"clss": batch.clss.to(device),
"mask": batch.mask.to(device),
"mask_cls": batch.mask_cls.to(device),
}
else:
raise ValueError("Model not supported: {}".format(model_name))
@ -476,7 +471,7 @@ class ExtractiveSummarizer(Transformer):
Args:
model_name (str, optional): Transformer model name used in preprocessing.
check MODEL_CLASS for supported models. Defaults to "distilbert-base-uncased".
encoder (str, optional): Encoder algorithm used by summarization layer.
encoder (str, optional): Encoder algorithm used by summarization layer.
There are four options:
- baseline: it uses a smaller transformer model to replace the bert model,
with a transformer summarization layer.
@ -485,13 +480,11 @@ class ExtractiveSummarizer(Transformer):
- transformer: it uses pretrained BERT and fine-tunes BERT with a transformer
summarization layer.
- RNN: it uses pretrained BERT and fine-tunes BERT with an LSTM summarization layer.
Defaults to "transformer".
Defaults to "transformer".
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
"""
super().__init__(
model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir
)
super().__init__(model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir)
if model_name not in self.list_supported_models():
raise ValueError(
"Model name {} is not supported by ExtractiveSummarizer. "
@ -522,6 +515,7 @@ class ExtractiveSummarizer(Transformer):
self,
train_dataset,
num_gpus=None,
gpu_ids=None,
batch_size=3000,
local_rank=-1,
max_steps=5e5,
@ -546,7 +540,10 @@ class ExtractiveSummarizer(Transformer):
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
batch_size (int, optional): Maximum number of tokens in each batch.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
batch_size (int, optional): Maximum number of tokens in each batch.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed training.
max_steps (int, optional): Maximum number of training steps. Defaults to 5e5.
@ -571,16 +568,7 @@ class ExtractiveSummarizer(Transformer):
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
"""
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)
def move_batch_to_device(batch, device):
return batch.to(device)
# if isinstance(self.model, nn.DataParallel):
# self.model.module.to(device)
# else:
self.model.to(device)
# init optimizer
optimizer = model_builder.build_optim(
optimization_method,
learning_rate,
@ -594,31 +582,34 @@ class ExtractiveSummarizer(Transformer):
)
# batch_size is the number of tokens in a batch
train_dataloader = get_dataloader(
train_dataset.get_stream(), is_labeled=True, batch_size=batch_size
train_dataloader = get_dataloader(train_dataset.get_stream(), is_labeled=True, batch_size=batch_size)
# compute the max number of training steps
max_steps = compute_training_steps(
train_dataloader, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps,
)
super().fine_tune(
train_dataloader=train_dataloader,
get_inputs=ExtSumProcessor.get_inputs,
move_batch_to_device=move_batch_to_device,
n_gpu=num_gpus,
num_train_epochs=-1,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
max_steps=max_steps,
optimizer=optimizer,
warmup_steps=warmup_steps,
max_grad_norm=max_grad_norm,
gradient_accumulation_steps=gradient_accumulation_steps,
optimizer=optimizer,
scheduler=None,
verbose=verbose,
seed=seed,
report_every=report_every,
clip_grad_norm=False,
max_grad_norm=max_grad_norm,
)
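compute_training_steps is imported from utils_nlp.common.pytorch_utils but its body is not part of this diff; a hedged sketch of its implied semantics, reconstructed from the inline logic it replaces (a positive max_steps wins, otherwise steps are derived from dataloader length, gradient accumulation, and epochs):

# Sketch only; the real helper lives in utils_nlp/common/pytorch_utils.py and may differ.
def compute_training_steps_sketch(dataloader, num_epochs=1, max_steps=-1, gradient_accumulation_steps=1):
    try:
        dataset_length = len(dataloader)
    except TypeError:  # streaming loaders may not define __len__
        dataset_length = -1
    if max_steps <= 0 and dataset_length != -1 and num_epochs > 0:
        max_steps = dataset_length // gradient_accumulation_steps * num_epochs
    if max_steps <= 0:
        raise Exception("Max steps cannot be determined for fine tuning!")
    return max_steps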
def predict(
self,
test_dataset,
num_gpus=1,
gpu_ids=None,
batch_size=16,
sentence_separator="<q>",
top_n=3,
@ -632,6 +623,9 @@ class ExtractiveSummarizer(Transformer):
Args:
test_dataset (Dataset): Dataset for which the summary to be predicted
num_gpus (int, optional): The number of GPUs used in prediction. Defaults to 1.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
batch_size (int, optional): The number of test examples in each batch. Defaults to 16.
sentence_separator (str, optional): String to be inserted between sentences in
the prediction. Defaults to '<q>'.
@ -678,10 +672,8 @@ class ExtractiveSummarizer(Transformer):
}
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(
test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn
)
sent_scores = self.predict_scores(test_dataloader, num_gpus=num_gpus)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn)
sent_scores = self.predict_scores(test_dataloader, num_gpus=num_gpus, gpu_ids=gpu_ids)
sent_scores_list = list(sent_scores)
scores_list = []
for i in sent_scores_list:
@ -699,15 +691,18 @@ class ExtractiveSummarizer(Transformer):
prediction.extend(temp_pred)
return prediction
def predict_scores(self, eval_dataloader, num_gpus=1, verbose=True):
def predict_scores(self, test_dataloader, num_gpus=1, gpu_ids=None, verbose=True):
"""
Scores a dataset using a fine-tuned model and a given dataloader.
Args:
eval_dataloader (Dataloader): Dataloader for the evaluation data.
test_dataloader (Dataloader): Dataloader for scoring the data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
Returns:
@ -716,23 +711,13 @@ class ExtractiveSummarizer(Transformer):
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)
def move_batch_to_device(batch, device):
batch["src"] = batch["src"].to(device)
batch["segs"] = batch["segs"].to(device)
batch["clss"] = batch["clss"].to(device)
batch["mask"] = batch["mask"].to(device)
batch["mask_cls"] = batch["mask_cls"].to(device)
if "labels" in batch:
batch["labels"] = batch["labels"].to(device)
return Bunch(batch)
preds = list(
super().predict(
eval_dataloader=eval_dataloader,
eval_dataloader=test_dataloader,
get_inputs=ExtSumProcessor.get_inputs,
n_gpu=num_gpus,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
verbose=verbose,
move_batch_to_device=move_batch_to_device,
)
)
return preds

215
utils_nlp/models/transformers/named_entity_recognition.py Normal file → Executable file
View file

@ -2,18 +2,16 @@
# Licensed under the MIT License.
import logging
from collections import Iterable
import numpy as np
import torch
import torch.nn as nn
from collections import Iterable
from torch.utils.data import TensorDataset
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForTokenClassification
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForTokenClassification
from utils_nlp.common.pytorch_utils import get_device
from utils_nlp.common.pytorch_utils import compute_training_steps
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
TC_MODEL_CLASS = {}
TC_MODEL_CLASS.update({k: BertForTokenClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
@ -42,27 +40,36 @@ class TokenClassificationProcessor:
)
@staticmethod
def get_inputs(batch, model_name, train_mode=True):
def get_inputs(batch, device, model_name, train_mode=True):
"""
Produce a dictionary object for model training or prediction.
Creates an input dictionary given a model name.
Args:
model_name (str): The pretained model name.
train_mode (bool, optional): Whether it's for model training. Set it to False if
it's for testing and it won't have the 'labels' data field.
Defaults to True, for model training.
batch (tuple): A tuple containing input ids, attention mask,
segment ids, and labels tensors.
device (torch.device): A PyTorch device.
model_name (str): Model name used to format the inputs.
train_mode (bool, optional): Training mode flag.
Defaults to True.
Returns:
dict: A dictionary object contains all needed information for training or testing.
dict: Dictionary containing input ids, segment ids, masks, and labels.
Labels are only returned when train_mode is True.
"""
batch = tuple(t.to(device) for t in batch)
if model_name.split("-")[0] in ["bert", "distilbert"]:
if train_mode:
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
else:
inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
if model_name not in list(TC_MODEL_CLASS):
raise ValueError("Model not supported: {}".format(model_name))
# distilbert doesn't support segment ids
if model_name.split("-")[0] not in ["distilbert"]:
inputs["token_type_ids"] = batch[2]
if train_mode:
return {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
return inputs
else:
return {"input_ids": batch[0], "attention_mask": batch[1]}
raise ValueError("Model not supported: {}".format(model_name))
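Net behavior of the rewritten method: the batch is moved to the device inside get_inputs, labels are attached only in train mode, and token_type_ids are included for BERT but skipped for DistilBERT. A toy sketch:

import torch

# Toy batch: (input_ids, attention_mask, token_type_ids, labels).
batch = (
    torch.tensor([[101, 7592, 102]]),
    torch.tensor([[1, 1, 1]]),
    torch.tensor([[0, 0, 0]]),
    torch.tensor([[0, 1, 0]]),
)
bert_inputs = TokenClassificationProcessor.get_inputs(batch, torch.device("cpu"), "bert-base-cased")
# -> keys: input_ids, attention_mask, labels, token_type_ids
distilbert_inputs = TokenClassificationProcessor.get_inputs(
    batch, torch.device("cpu"), "distilbert-base-uncased", train_mode=False
)
# -> keys: input_ids, attention_mask (no segment ids, no labels)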
@staticmethod
def create_label_map(label_lists, trailing_piece_tag="X"):
@ -89,9 +96,7 @@ class TokenClassificationProcessor:
label_map[trailing_piece_tag] = len(label_set)
return label_map
def preprocess_for_bert(
self, text, max_len=MAX_SEQ_LEN, labels=None, label_map=None, trailing_piece_tag="X"
):
def preprocess_for_bert(self, text, max_len=MAX_SEQ_LEN, labels=None, label_map=None, trailing_piece_tag="X"):
"""
Tokenize and preprocesses input word lists, involving the following steps
0. WordPiece tokenization.
@ -125,7 +130,7 @@ class TokenClassificationProcessor:
Returns:
TensorDataset: A TensorDataset containing the following four tensors.
1. input_ids_all: Tensor. Each sublist contains numerical values,
i.e. token ids, corresponding to the tokens in the input
i.e. token ids, corresponding to the tokens in the input
text data.
2. input_mask_all: Tensor. Each sublist contains the attention
mask of the input token id list, 1 for input tokens and 0 for
@ -146,9 +151,7 @@ class TokenClassificationProcessor:
return isinstance(obj, Iterable) and not isinstance(obj, str)
if max_len > MAX_SEQ_LEN:
logging.warning(
"Setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN)
)
logging.warning("Setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN))
max_len = MAX_SEQ_LEN
if not _is_iterable_but_not_string(text):
@ -181,9 +184,7 @@ class TokenClassificationProcessor:
for t, t_labels in zip(text, labels):
if len(t) != len(t_labels):
raise ValueError(
"The number of words is {0}, but the number of labels is {1}.".format(
len(t), len(t_labels)
)
"The number of words is {0}, but the number of labels is {1}.".format(len(t), len(t_labels))
)
new_labels = []
@ -197,11 +198,7 @@ class TokenClassificationProcessor:
new_tokens.append(sub_word)
if len(new_tokens) > max_len:
logging.warn(
"Text after tokenization with length {} has been truncated".format(
len(new_tokens)
)
)
logging.warn("Text after tokenization with length {} has been truncated".format(len(new_tokens)))
new_tokens = new_tokens[:max_len]
new_labels = new_labels[:max_len]
input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens)
@ -218,9 +215,7 @@ class TokenClassificationProcessor:
input_mask += padding
new_labels += label_padding
trailing_token_mask_all.append(
[True if label != trailing_piece_tag else False for label in new_labels]
)
trailing_token_mask_all.append([True if label != trailing_piece_tag else False for label in new_labels])
if label_map:
label_ids = [label_map[label] for label in new_labels]
@ -235,32 +230,17 @@ class TokenClassificationProcessor:
td = TensorDataset(
torch.tensor(input_ids_all, dtype=torch.long),
torch.tensor(input_mask_all, dtype=torch.long),
torch.tensor(trailing_token_mask_all, dtype=torch.bool),
torch.tensor(trailing_token_mask_all, dtype=torch.long),
torch.tensor(label_ids_all, dtype=torch.long),
)
else:
td = TensorDataset(
torch.tensor(input_ids_all, dtype=torch.long),
torch.tensor(input_mask_all, dtype=torch.long),
torch.tensor(trailing_token_mask_all, dtype=torch.bool),
torch.tensor(trailing_token_mask_all, dtype=torch.long),
)
return td
def create_dataloader_from_dataset(
self, dataset, shuffle=False, batch_size=32, num_gpus=None, distributed=False
):
if num_gpus is None:
num_gpus = torch.cuda.device_count()
batch_size = batch_size * max(1, num_gpus)
if distributed:
sampler = DistributedSampler(dataset)
else:
sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
return DataLoader(dataset, sampler=sampler, batch_size=batch_size)
class TokenClassifier(Transformer):
"""
@ -277,10 +257,7 @@ class TokenClassifier(Transformer):
def __init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."):
super().__init__(
model_class=TC_MODEL_CLASS,
model_name=model_name,
num_labels=num_labels,
cache_dir=cache_dir,
model_class=TC_MODEL_CLASS, model_name=model_name, num_labels=num_labels, cache_dir=cache_dir,
)
@staticmethod
@ -291,7 +268,10 @@ class TokenClassifier(Transformer):
self,
train_dataloader,
num_epochs=1,
max_steps=-1,
gradient_accumulation_steps=1,
num_gpus=None,
gpu_ids=None,
local_rank=-1,
weight_decay=0.0,
learning_rate=5e-5,
@ -301,73 +281,96 @@ class TokenClassifier(Transformer):
seed=None,
):
"""
Fit the TokenClassifier model using the given training dataset.
Fine-tunes a pre-trained token classification model.
Args:
train_dataloader (DataLoader): DataLoader instance for training.
num_epochs (int, optional): Number of training epochs.
Defaults to 1.
train_dataloader (Dataloader): A PyTorch DataLoader to be used for training.
num_epochs (int, optional): Number of training epochs. Defaults to 1.
max_steps (int, optional): Total number of training steps.
If set to a positive value, it overrides num_epochs.
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
Defaults to -1.
gradient_accumulation_steps (int, optional): Number of steps to accumulate
before performing a backward/update pass.
Defaults to 1.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
local_rank (int, optional): Whether need to do distributed training.
Defaults to -1, no distributed training.
weight_decay (float, optional): Weight decay rate.
Defaults to 0.
learning_rate (float, optional): The learning rate.
Defaults to 5e-5.
adam_espilon (float, optional): The 'eps' parameter for the 'AdamW' optimizer.
Defaults to 1e-8.
warmup_steps (int, optional): Number of warmup steps for 'WarmupLinearSchedule'.
Defaults to 0.
verbose (bool, optional): Verbose model.
Defaults to False.
seed (int, optional): The seed for the transformers.
Defaults to None, use the default seed.
be used. If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each parameter update.
Defaults to 0.0.
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
5e-5.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
to `learning rate`. Defaults to 0.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
"""
# init optimizer
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
# compute the max number of training steps
max_steps = compute_training_steps(
train_dataloader,
num_epochs=num_epochs,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
)
# init scheduler
scheduler = Transformer.get_default_scheduler(
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
)
# fine tune
super().fine_tune(
train_dataloader=train_dataloader,
get_inputs=TokenClassificationProcessor.get_inputs,
n_gpu=num_gpus,
num_train_epochs=num_epochs,
weight_decay=weight_decay,
learning_rate=learning_rate,
adam_epsilon=adam_epsilon,
warmup_steps=warmup_steps,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
optimizer=optimizer,
scheduler=scheduler,
local_rank=local_rank,
verbose=verbose,
seed=seed,
)
def predict(self, eval_dataloader, num_gpus=None, verbose=True):
def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True):
"""
Test on an evaluation dataset and get the token label predictions.
Scores a dataset using a fine-tuned model and a given dataloader.
Args:
eval_dataset (TensorDataset): A TensorDataset for evaluation.
test_dataloader (DataLoader): DataLoader for scoring the data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
verbose (bool, optional): Verbose model.
Defaults to False.
be used. If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
Returns:
ndarray: Numpy ndarray of raw predictions. The shape of the ndarray is
[number_of_examples, sequence_length, number_of_labels]. Each
value in the ndarray is not normalized. Post-process will be needed
to get the probability for each class label.
Returns:
ndarray: numpy array of raw prediction logits with shape
[number_of_examples, sequence_length, number_of_labels].
"""
preds = list(
super().predict(
eval_dataloader=eval_dataloader,
eval_dataloader=test_dataloader,
get_inputs=TokenClassificationProcessor.get_inputs,
n_gpu=num_gpus,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
verbose=verbose,
)
)
preds_np = np.concatenate(preds)
return preds_np
return np.concatenate(preds)
def get_predicted_token_labels(self, predictions, label_map, dataset):
"""
@ -376,21 +379,19 @@ class TokenClassifier(Transformer):
Args:
predictions (ndarray): A numpy ndarray produced from the `predict` function call.
The shape of the ndarray is [number_of_examples, sequence_length, number_of_labels].
label_map (dict): A dictionary object to map a label (str) to an ID (int).
label_map (dict): A dictionary object to map a label (str) to an ID (int).
dataset (TensorDataset): The TensorDataset for evaluation.
dataset (Dataset): The test Dataset instance.
Returns:
list: A list of lists. The size of the returned list is the number of testing samples.
Each sublist represents the predicted label for each token.
Each sublist represents the predicted label for each token.
"""
num_samples = len(dataset.tensors[0])
if num_samples != predictions.shape[0]:
raise ValueError(
"Predictions have {0} samples, but got {1} samples in dataset".format(
predictions.shape[0], num_samples
)
"Predictions have {0} samples, but got {1} samples in dataset".format(predictions.shape[0], num_samples)
)
label_id2str = {v: k for k, v in label_map.items()}
@ -409,7 +410,7 @@ class TokenClassifier(Transformer):
if attention_mask[sid] == 0:
break
if not trailing_mask[sid]:
if not bool(trailing_mask[sid]):
continue
label_id = seq_probs[sid].argmax()
@ -422,13 +423,13 @@ class TokenClassifier(Transformer):
Get the true testing label values.
Args:
label_map (dict): A dictionary object to map a label (str) to an ID (int).
label_map (dict): A dictionary object to map a label (str) to an ID (int).
dataset (TensorDataset): The TensorDataset for evaluation.
dataset (Dataset): The test Dataset instance.
Returns:
list: A list of lists. The size of the returned list is the number of testing samples.
Each sublist represents the predicted label for each token.
Each sublist represents the predicted label for each token.
"""
num_samples = len(dataset.tensors[0])
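The label-recovery loop above reduces to: argmax over the label dimension, stop at the first padded position (attention mask 0), and skip trailing word pieces. A schematic sketch under those assumptions, where `predictions`, `label_map`, and `dataset` are the objects described in the docstrings above:

# Schematic restatement of get_predicted_token_labels' inner loop.
label_id2str = {v: k for k, v in label_map.items()}
predicted_labels = []
for idx, seq_probs in enumerate(predictions):
    attention_mask = dataset.tensors[1][idx]
    trailing_mask = dataset.tensors[2][idx]
    seq_labels = []
    for sid in range(len(seq_probs)):
        if attention_mask[sid] == 0:  # padding starts here
            break
        if not bool(trailing_mask[sid]):  # trailing word piece
            continue
        seq_labels.append(label_id2str[seq_probs[sid].argmax()])
    predicted_labels.append(seq_labels)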

236
utils_nlp/models/transformers/question_answering.py Normal file → Executable file
View file

@ -17,38 +17,30 @@
# Modifications copyright © Microsoft Corporation
import os
import logging
from tqdm import tqdm
import collections
import json
import logging
import math
import os
import jsonlines
import torch
from torch.utils.data import TensorDataset, SequentialSampler, DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
from torch.utils.data import TensorDataset
from tqdm import tqdm
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForQuestionAnswering
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForQuestionAnswering
from transformers.modeling_xlnet import (
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
XLNetForQuestionAnswering,
)
from transformers.modeling_distilbert import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForQuestionAnswering,
)
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForQuestionAnswering
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForQuestionAnswering
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
from utils_nlp.common.pytorch_utils import get_device
from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
MODEL_CLASS = {}
MODEL_CLASS.update({k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
{k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update({k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
# cached files during preprocessing
# these are used in postprocessing to generate the final answer texts
@ -85,9 +77,7 @@ class QAProcessor:
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
"""
def __init__(
self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."
):
def __init__(self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."):
self.model_name = model_name
self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False
@ -116,13 +106,14 @@ class QAProcessor:
return self._model_type
@staticmethod
def get_inputs(batch, model_name, train_mode=True):
def get_inputs(batch, device, model_name, train_mode=True):
"""
Creates an input dictionary given a model name.
Args:
batch (tuple): A tuple containing input ids, attention mask,
segment ids, and labels tensors.
device (torch.device): A PyTorch device.
model_name (str): Model name used to format the inputs.
train_mode (bool, optional): Training mode flag.
Defaults to True.
@ -131,6 +122,7 @@ class QAProcessor:
dict: Dictionary containing input ids, segment ids, masks, and labels.
Labels are only returned when train_mode is True.
"""
batch = tuple(t.to(device) for t in batch)
model_type = model_name.split("-")[0]
inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
@ -191,6 +183,8 @@ class QAProcessor:
directory. These files are required during postprocessing to generate the final
answer texts from predicted answer start and answer end indices. Defaults to
"./cached_qa_features".
Returns:
Dataset: A PyTorch Dataset.
"""
if not os.path.exists(feature_cache_dir):
@ -223,9 +217,7 @@ class QAProcessor:
qa_examples.append(qa_example_cur)
qa_examples_json.append(
{"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens}
)
qa_examples_json.append({"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens})
features_cur = _create_qa_features(
qa_example_cur,
@ -271,28 +263,13 @@ class QAProcessor:
start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
qa_dataset = TensorDataset(
input_ids,
input_mask,
segment_ids,
start_positions,
end_positions,
cls_index,
p_mask,
input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask,
)
else:
unique_id_all = torch.tensor(unique_id_all, dtype=torch.long)
qa_dataset = TensorDataset(
input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all
)
qa_dataset = TensorDataset(input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all)
if num_gpus is not None:
batch_size = batch_size * max(1, num_gpus)
if distributed:
sampler = DistributedSampler(qa_dataset)
else:
sampler = RandomSampler(qa_dataset) if is_training else SequentialSampler(qa_dataset)
return DataLoader(qa_dataset, sampler=sampler, batch_size=batch_size)
return qa_dataset
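Since preprocess now returns the TensorDataset rather than a DataLoader, callers are expected to build the loader separately; a sketch using the shared helper (the exact call site is outside this diff, so this pairing is an assumption):

from utils_nlp.common.pytorch_utils import dataloader_from_dataset

# `qa_dataset` is assumed to be the TensorDataset returned by QAProcessor.preprocess.
train_dataloader = dataloader_from_dataset(qa_dataset, batch_size=32, num_gpus=None, shuffle=True)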
def postprocess(
self,
@ -420,14 +397,7 @@ class QAResult(QAResult_):
QAResultExtended_ = collections.namedtuple(
"QAResultExtended",
[
"unique_id",
"start_top_log_probs",
"start_top_index",
"end_top_log_probs",
"end_top_index",
"cls_logits",
],
["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits",],
)
@ -489,18 +459,16 @@ class AnswerExtractor(Transformer):
def fit(
self,
train_dataloader,
num_gpus=None,
num_epochs=1,
learning_rate=5e-5,
max_grad_norm=1.0,
max_steps=-1,
gradient_accumulation_steps=1,
warmup_steps=0,
weight_decay=0.0,
adam_epsilon=1e-8,
fp16=False,
fp16_opt_level="O1",
num_gpus=None,
gpu_ids=None,
local_rank=-1,
weight_decay=0.0,
learning_rate=5e-5,
adam_epsilon=1e-8,
warmup_steps=0,
verbose=True,
seed=None,
cache_model=True,
@ -509,31 +477,30 @@ class AnswerExtractor(Transformer):
Fine-tune pre-trained transformer models for question answering.
Args:
train_dataloader (Dataloader): Dataloader for the training data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
train_dataloader (Dataloader): A PyTorch DataLoader to be used for training.
num_epochs (int, optional): Number of training epochs. Defaults to 1.
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
5e-5.
max_grad_norm (float, optional): Maximum gradient norm for gradient clipping.
Defaults to 1.0.
max_steps (int, optional): Maximum number of training steps. If specified,
`num_epochs` will be ignored. Defaults to -1.
gradient_accumulation_steps (int, optional): Number of batches to accumulate
gradients on between each model parameter update. Defaults to 1.
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
to `learning rate`. Defaults to 0.
weight_decay (float, optional): Weight decay to apply after each parameter update.
Defaults to 0.0.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
fp16 (bool, optional): Whether to use 16-bit (mixed) precision (through NVIDIA apex)
instead of 32-bit. Defaults to False.
fp16_opt_level (str, optional): For fp16: Apex AMP optimization level selected in
['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html.
Defaults to "O1",
max_steps (int, optional): Total number of training steps.
If set to a positive value, it overrides num_epochs.
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
Defaults to -1.
gradient_accumulation_steps (int, optional): Number of steps to accumulate
before performing a backward/update pass.
Defaults to 1.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each parameter update.
Defaults to 0.0.
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
5e-5.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
to `learning rate`. Defaults to 0.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
cache_model (bool, optional): Whether to save the fine-tuned model. If True,
@ -542,39 +509,53 @@ class AnswerExtractor(Transformer):
"""
# init optimizer
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
# compute the max number of training steps
max_steps = compute_training_steps(
train_dataloader,
num_epochs=num_epochs,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
)
# init scheduler
scheduler = Transformer.get_default_scheduler(
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
)
# fine tune
super().fine_tune(
train_dataloader=train_dataloader,
get_inputs=QAProcessor.get_inputs,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
max_steps=max_steps,
num_train_epochs=num_epochs,
max_grad_norm=max_grad_norm,
gradient_accumulation_steps=gradient_accumulation_steps,
n_gpu=num_gpus,
weight_decay=weight_decay,
learning_rate=learning_rate,
adam_epsilon=adam_epsilon,
warmup_steps=warmup_steps,
fp16=fp16,
fp16_opt_level=fp16_opt_level,
optimizer=optimizer,
scheduler=scheduler,
local_rank=local_rank,
verbose=verbose,
seed=seed,
)
if cache_model:
self.save_model()
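The step-counting behavior documented above reduces to simple arithmetic. The sketch below re-implements it for intuition only; the real helper is utils_nlp.common.pytorch_utils.compute_training_steps, and this stand-in assumes exactly the semantics stated in the docstring:

def compute_training_steps_sketch(num_batches, num_epochs=1, max_steps=-1,
                                  gradient_accumulation_steps=1):
    # A positive max_steps overrides num_epochs entirely.
    if max_steps > 0:
        return max_steps
    # Otherwise, one optimizer step happens per gradient_accumulation_steps batches.
    return (num_batches // gradient_accumulation_steps) * num_epochs

assert compute_training_steps_sketch(1000, num_epochs=3) == 3000
assert compute_training_steps_sketch(1000, num_epochs=3, max_steps=500) == 500
assert compute_training_steps_sketch(1000, num_epochs=3, gradient_accumulation_steps=4) == 750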
def predict(self, test_dataloader, num_gpus=None, verbose=True):
def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True):
"""
Predicts answer start and end logits.
Args:
test_dataloader (QADataset): Dataloader for the testing data.
test_dataloader (DataLoader): DataLoader for scoring the data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
verbose (bool, optional): Whether to print out the predicting log. Defaults to True.
Returns:
@ -584,25 +565,16 @@ class AnswerExtractor(Transformer):
def _to_list(tensor):
return tensor.detach().cpu().tolist()
# get device
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)
if isinstance(self.model, torch.nn.DataParallel):
self.model = self.model.module
if num_gpus > 1:
self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus)))
self.model.to(device)
self.model.eval()
# move model
self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1)
all_results = []
for batch in tqdm(test_dataloader, desc="Evaluating", disable=not verbose):
batch = tuple(t.to(device) for t in batch)
with torch.no_grad():
inputs = QAProcessor.get_inputs(batch, self.model_name, train_mode=False)
inputs = QAProcessor.get_inputs(batch, device, self.model_name, train_mode=False)
outputs = self.model(**inputs)
unique_id_tensor = batch[5]
for i, u_id in enumerate(unique_id_tensor):
@ -617,9 +589,7 @@ class AnswerExtractor(Transformer):
)
else:
result = QAResult(
unique_id=u_id.item(),
start_logits=_to_list(outputs[0][i]),
end_logits=_to_list(outputs[1][i]),
unique_id=u_id.item(), start_logits=_to_list(outputs[0][i]), end_logits=_to_list(outputs[1][i]),
)
all_results.append(result)
torch.cuda.empty_cache()
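Each element that predict collects is a QAResult carrying per-token start and end logits; a greedy consumer could take the argmax of each, as in this toy sketch (the namedtuple stand-in and the values are illustrative, not part of the library):

from collections import namedtuple

QAResultSketch = namedtuple("QAResultSketch", ["unique_id", "start_logits", "end_logits"])
result = QAResultSketch(
    unique_id=1000001,
    start_logits=[0.1, 2.5, 0.3, 0.0],  # toy per-token scores
    end_logits=[0.0, 0.2, 3.1, 0.4],
)
start = max(range(len(result.start_logits)), key=result.start_logits.__getitem__)
end = max(range(len(result.end_logits)), key=result.end_logits.__getitem__)
print(start, end)  # -> 1 2: the greedy answer span covers tokens 1..2

The postprocessing functions below do the careful version of this: n-best collection, null-answer handling, and detokenization back to the original text.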
@ -783,9 +753,7 @@ def postprocess_bert_answer(
# Sort by the sum of the start and end logits in ascending order,
# so that the first element is the most probable answer
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True
)
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
seen_predictions = {}
nbest = []
@ -818,19 +786,11 @@ def postprocess_bert_answer(
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit
)
)
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if unanswerable_exists:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="", start_logit=null_start_logit, end_logit=null_end_logit
)
)
nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
# In very rare edge cases we could have only a single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
@ -874,9 +834,7 @@ def postprocess_bert_answer(
all_probs[example["qa_id"]] = nbest_json[0]["probability"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = (
score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
)
score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
scores_diff_json[example["qa_id"]] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example["qa_id"]] = ""
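The comparison above is the whole unanswerable-question rule; in isolation it is just this arithmetic (the numbers are made up for illustration):

score_null = 4.0                              # model's score for "no answer"
best_start_logit, best_end_logit = 1.5, 1.8   # best non-null span scores
null_score_diff_threshold = 0.0

score_diff = score_null - best_start_logit - best_end_logit
prediction = "" if score_diff > null_score_diff_threshold else "best non-null answer"
print(score_diff, repr(prediction))           # 0.7 '' -> the empty answer wins here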
@ -1042,9 +1000,7 @@ def postprocess_xlnet_answer(
)
)
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True
)
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
seen_predictions = {}
nbest = []
@ -1075,20 +1031,14 @@ def postprocess_xlnet_answer(
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = _get_final_text(
tok_text, orig_text, tokenizer.do_lower_case, verbose_logging
)
final_text = _get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit
)
)
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
@ -1235,9 +1185,7 @@ def _create_qa_example(qa_input, is_training):
actual_text = " ".join(d_tokens[start_position : (end_position + 1)])
cleaned_answer_text = " ".join(whitespace_tokenize(a_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning(
"Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text
)
logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
return
else:
start_position = -1
@ -1696,9 +1644,7 @@ def _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info(
"Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text
)
logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using

utils_nlp/models/transformers/sequence_classification.py Normal file → Executable file

@ -2,37 +2,25 @@
# Licensed under the MIT License.
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.modeling_bert import (
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
BertForSequenceClassification,
)
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForSequenceClassification
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForSequenceClassification
from transformers.modeling_distilbert import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForSequenceClassification,
)
from transformers.modeling_roberta import (
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
RobertaForSequenceClassification,
)
from transformers.modeling_xlnet import (
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
XLNetForSequenceClassification,
)
from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, RobertaForSequenceClassification
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForSequenceClassification
from utils_nlp.common.pytorch_utils import compute_training_steps
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet
MODEL_CLASS = {}
MODEL_CLASS.update({k: BertForSequenceClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
{k: RobertaForSequenceClassification for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update({k: RobertaForSequenceClassification for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: XLNetForSequenceClassification for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
{k: DistilBertForSequenceClassification for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update({k: DistilBertForSequenceClassification for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: AlbertForSequenceClassification for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
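With the mapping assembled this way, a single dict lookup dispatches any supported checkpoint name to its task head; for example (assuming a transformers 2.x install, where the *_PRETRAINED_MODEL_ARCHIVE_MAP dicts exist):

print(MODEL_CLASS["bert-base-uncased"].__name__)        # BertForSequenceClassification
print(MODEL_CLASS["distilbert-base-uncased"].__name__)  # DistilBertForSequenceClassification
print(MODEL_CLASS["albert-base-v2"].__name__)           # AlbertForSequenceClassification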
class Processor:
@ -56,13 +44,14 @@ class Processor:
)
@staticmethod
def get_inputs(batch, model_name, train_mode=True):
def get_inputs(batch, device, model_name, train_mode=True):
"""
Creates an input dictionary given a model name.
Args:
batch (tuple): A tuple containing input ids, attention mask,
segment ids, and labels tensors.
device (torch.device): A PyTorch device.
model_name (str): Model name used to format the inputs.
train_mode (bool, optional): Training mode flag.
Defaults to True.
@ -71,7 +60,8 @@ class Processor:
dict: Dictionary containing input ids, segment ids, masks, and labels.
Labels are only returned when train_mode is True.
"""
if model_name.split("-")[0] in ["bert", "xlnet", "roberta", "distilbert"]:
batch = tuple(t.to(device) for t in batch)
if model_name.split("-")[0] in ["bert", "xlnet", "roberta", "distilbert", "albert"]:
if train_mode:
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
else:
@ -103,11 +93,7 @@ class Processor:
print("setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN))
max_len = MAX_SEQ_LEN
# truncate and add CLS & SEP markers
tokens = (
[tokenizer.cls_token]
+ tokenizer.tokenize(text)[0 : max_len - 2]
+ [tokenizer.sep_token]
)
tokens = [tokenizer.cls_token] + tokenizer.tokenize(text)[0 : max_len - 2] + [tokenizer.sep_token]
# get input ids
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# pad sequence
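The truncate-and-mark pattern above is easy to reproduce standalone. A hedged sketch with a transformers 2.x tokenizer follows; the model choice, max_len, and sample text are illustrative:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
max_len = 8
text = "A fairly long sentence that will get truncated."
# truncate to max_len - 2 to leave room for the [CLS] and [SEP] markers
tokens = [tokenizer.cls_token] + tokenizer.tokenize(text)[: max_len - 2] + [tokenizer.sep_token]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = input_ids + [0] * (max_len - len(input_ids))  # pad with [PAD], id 0 in BERT vocab
attention_mask = [int(i != 0) for i in input_ids]         # attend to non-pad positions only
print(tokens)
print(input_ids)
print(attention_mask)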
@ -188,55 +174,13 @@ class Processor:
return input_ids, attention_mask, token_type_ids
def create_dataloader_from_df(
self,
df,
text_col,
label_col=None,
text2_col=None,
shuffle=False,
max_len=MAX_SEQ_LEN,
batch_size=32,
num_gpus=None,
distributed=False,
):
"""
Creates a PyTorch DataLoader from a Pandas DataFrame for sequence classification tasks.
Args:
df (pandas.DataFrame): Input Pandas DataFrame.
text_col (str/int): Text column name or index.
label_col (str/int, optional): Label column name or index. Defaults to None.
text2_col (str/int, optional): Second text column name or index for sequence-pair tasks.
Defaults to None.
shuffle (bool, optional): If set to True, the DataLoader will use a RandomSampler,
otherwise it will use a SequentialSampler.
Defaults to False.
max_len (int, optional): Maximum sequence length. Defaults to 512.
batch_size (int, optional): Batch size. Defaults to 32.
num_gpus (int, optional): Number of GPUs to use.
If None, all available GPUs will be used.
If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
distributed (bool, optional): If set to True, the DataLoader will use
a DistributedSampler.
Defaults to False.
Returns:
DataLoader: A PyTorch DataLoader object that can be used for training or scoring.
"""
def dataset_from_dataframe(self, df, text_col, label_col=None, text2_col=None, max_len=MAX_SEQ_LEN):
if text2_col is None:
ds = SCDataSet(
df,
text_col,
label_col,
transform=Processor.text_transform,
tokenizer=self.tokenizer,
max_len=max_len,
return SCDataSet(
df, text_col, label_col, transform=Processor.text_transform, tokenizer=self.tokenizer, max_len=max_len,
)
else:
ds = SPCDataSet(
return SPCDataSet(
df,
text_col,
text2_col,
@ -246,26 +190,11 @@ class Processor:
max_len=max_len,
)
if num_gpus is None:
num_gpus = torch.cuda.device_count()
batch_size = batch_size * max(1, num_gpus)
if distributed:
sampler = DistributedSampler(ds)
else:
sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds)
return DataLoader(ds, sampler=sampler, batch_size=batch_size)
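The removed create_dataloader_from_df is effectively split in two: dataset_from_dataframe here, plus the dataloader_from_dataset helper now in utils_nlp.common.pytorch_utils. A hedged sketch of the new call pattern (the DataFrame, the Processor constructor arguments, and the helper's keyword names are assumptions, not shown in this diff):

import pandas as pd
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.models.transformers.sequence_classification import Processor

df = pd.DataFrame({"text": ["good movie", "bad movie"], "label": [1, 0]})
processor = Processor(model_name="bert-base-uncased", cache_dir=".")  # args assumed
train_ds = processor.dataset_from_dataframe(df, text_col="text", label_col="label", max_len=64)
train_dataloader = dataloader_from_dataset(train_ds, batch_size=32, shuffle=True)  # kwargs assumed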
class SequenceClassifier(Transformer):
def __init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."):
super().__init__(
model_class=MODEL_CLASS,
model_name=model_name,
num_labels=num_labels,
cache_dir=cache_dir,
model_class=MODEL_CLASS, model_name=model_name, num_labels=num_labels, cache_dir=cache_dir,
)
@staticmethod
@ -276,7 +205,10 @@ class SequenceClassifier(Transformer):
self,
train_dataloader,
num_epochs=1,
max_steps=-1,
gradient_accumulation_steps=1,
num_gpus=None,
gpu_ids=None,
local_rank=-1,
weight_decay=0.0,
learning_rate=5e-5,
@ -289,11 +221,21 @@ class SequenceClassifier(Transformer):
Fine-tunes a pre-trained sequence classification model.
Args:
train_dataloader (Dataloader): Dataloader for the training data.
train_dataloader (Dataloader): A PyTorch DataLoader to be used for training.
num_epochs (int, optional): Number of training epochs. Defaults to 1.
max_steps (int, optional): Total number of training steps.
If set to a positive value, it overrides num_epochs.
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
Defaults to -1.
gradient_accumulation_steps (int, optional): Number of steps to accumulate
before performing a backward/update pass.
Defaults to 1.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each parameter update.
@ -307,28 +249,49 @@ class SequenceClassifier(Transformer):
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
"""
# init optimizer
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
# compute the max number of training steps
max_steps = compute_training_steps(
train_dataloader,
num_epochs=num_epochs,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
)
# init scheduler
scheduler = Transformer.get_default_scheduler(
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
)
# fine tune
super().fine_tune(
train_dataloader=train_dataloader,
get_inputs=Processor.get_inputs,
n_gpu=num_gpus,
num_train_epochs=num_epochs,
weight_decay=weight_decay,
learning_rate=learning_rate,
adam_epsilon=adam_epsilon,
warmup_steps=warmup_steps,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
optimizer=optimizer,
scheduler=scheduler,
local_rank=local_rank,
verbose=verbose,
seed=seed,
)
def predict(self, eval_dataloader, num_gpus=None, verbose=True):
def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True):
"""
Scores a dataset using a fine-tuned model and a given dataloader.
Args:
eval_dataloader (Dataloader): Dataloader for the evaluation data.
test_dataloader (DataLoader): DataLoader for scoring the data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
Returns:
@ -337,12 +300,12 @@ class SequenceClassifier(Transformer):
preds = list(
super().predict(
eval_dataloader=eval_dataloader,
eval_dataloader=test_dataloader,
get_inputs=Processor.get_inputs,
n_gpu=num_gpus,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
verbose=verbose,
)
)
preds = np.concatenate(preds)
# todo generator & probs
return np.argmax(preds, axis=1)
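Putting the updated signatures together, fine-tuning and scoring might look like the following, continuing the dataloader sketch above (train_dataloader and test_dataloader are assumed to have been built that way; nothing here beyond the signatures comes from this diff):

from utils_nlp.models.transformers.sequence_classification import SequenceClassifier

classifier = SequenceClassifier(model_name="bert-base-cased", num_labels=2, cache_dir=".")
classifier.fine_tune(
    train_dataloader=train_dataloader,  # from the Processor sketch above
    num_epochs=1,
    gradient_accumulation_steps=1,
    num_gpus=None,                      # use all available GPUs
    gpu_ids=None,                       # or pin specific devices, e.g. [0]
    seed=42,
)
preds = classifier.predict(test_dataloader=test_dataloader, num_gpus=None)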


@ -2,23 +2,20 @@
# Licensed under the MIT License.
"""Utilities for Xlnet Sequence Classification"""
import numpy as np
import os
from collections import namedtuple
import torch
import torch.nn as nn
from transformers import (
XLNetConfig,
XLNetForSequenceClassification,
AdamW,
WarmupLinearSchedule,
)
from tqdm import tqdm
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from utils_nlp.common.pytorch_utils import get_device, move_to_device
from utils_nlp.models.xlnet.common import Language
import mlflow
import mlflow.pytorch
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from tqdm import tqdm
from transformers import AdamW, WarmupLinearSchedule, XLNetConfig, XLNetForSequenceClassification
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
from utils_nlp.models.xlnet.common import Language
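The AdamW/WarmupLinearSchedule pair imported here follows the pytorch-transformers-era API (later transformers releases replaced WarmupLinearSchedule with get_linear_schedule_with_warmup). A minimal self-contained sketch of the training-step choreography this class uses, with a tiny linear layer standing in for the XLNet model:

import torch
from transformers import AdamW, WarmupLinearSchedule

model = torch.nn.Linear(4, 2)  # stand-in for the XLNet model
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=10, t_total=100)

for step in range(3):
    loss = model(torch.randn(8, 4)).sum()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # max_grad_norm
    optimizer.step()
    scheduler.step()   # linear warmup, then linear decay to 0
    optimizer.zero_grad()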
class XLNetSequenceClassifier:
@ -79,9 +76,7 @@ class XLNetSequenceClassifier:
self.max_grad_norm = max_grad_norm
# create classifier
self.config = XLNetConfig.from_pretrained(
self.language.value, num_labels=num_labels, cache_dir=cache_dir
)
self.config = XLNetConfig.from_pretrained(self.language.value, num_labels=num_labels, cache_dir=cache_dir)
self.model = XLNetForSequenceClassification(self.config)
def fit(
@ -114,7 +109,7 @@ class XLNetSequenceClassifier:
"""
device, num_gpus = get_device(self.num_gpus)
self.model = move_to_device(self.model, device, self.num_gpus)
self.model = move_model_to_device(self.model, device, self.num_gpus)
token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
@ -128,24 +123,17 @@ class XLNetSequenceClassifier:
token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long)
val_token_type_ids_tensor = torch.tensor(val_token_type_ids, dtype=torch.long)
train_dataset = TensorDataset(
token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor
)
train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor)
val_dataset = TensorDataset(
val_token_ids_tensor,
val_input_mask_tensor,
val_token_type_ids_tensor,
val_labels_tensor,
val_token_ids_tensor, val_input_mask_tensor, val_token_type_ids_tensor, val_labels_tensor,
)
else:
train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, labels_tensor)
val_dataset = TensorDataset(
val_token_ids_tensor, val_input_mask_tensor, val_labels_tensor
)
val_dataset = TensorDataset(val_token_ids_tensor, val_input_mask_tensor, val_labels_tensor)
# define optimizer and model parameters
param_optimizer = list(self.model.named_parameters())
@ -155,10 +143,7 @@ class XLNetSequenceClassifier:
"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
"weight_decay": self.weight_decay,
},
{
"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
{"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
val_sampler = RandomSampler(val_dataset)
@ -181,9 +166,7 @@ class XLNetSequenceClassifier:
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
train_dataset, sampler=train_sampler, batch_size=self.batch_size
)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=self.batch_size)
tr_loss = 0.0
logging_loss = 0.0
@ -191,18 +174,13 @@ class XLNetSequenceClassifier:
for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
if token_type_ids:
x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
t.to(device) for t in batch
)
x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(t.to(device) for t in batch)
else:
token_type_ids_batch = None
x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch)
outputs = self.model(
input_ids=x_batch,
token_type_ids=token_type_ids_batch,
attention_mask=mask_batch,
labels=y_batch,
input_ids=x_batch, token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=y_batch,
)
loss = outputs[0] # model outputs are always tuple in pytorch-transformers
@ -220,9 +198,7 @@ class XLNetSequenceClassifier:
if logging_steps > 0 and global_step % logging_steps == 0:
mlflow.log_metric("learning rate", scheduler.get_lr()[0], step=global_step)
mlflow.log_metric(
"training loss",
(tr_loss - logging_loss) / (logging_steps * self.batch_size),
step=global_step,
"training loss", (tr_loss - logging_loss) / (logging_steps * self.batch_size), step=global_step,
)
logging_loss = tr_loss
# model checkpointing
@ -245,9 +221,7 @@ class XLNetSequenceClassifier:
)
else:
token_type_ids_batch = None
val_x_batch, val_mask_batch, val_y_batch = tuple(
t.to(device) for t in val_batch
)
val_x_batch, val_mask_batch, val_y_batch = tuple(t.to(device) for t in val_batch)
val_outputs = self.model(
input_ids=val_x_batch,
token_type_ids=val_token_type_ids_batch,
@ -256,9 +230,7 @@ class XLNetSequenceClassifier:
)
vloss = val_outputs[0]
val_loss += vloss.sum().item()
mlflow.log_metric(
"validation loss", val_loss / len(val_dataset), step=global_step
)
mlflow.log_metric("validation loss", val_loss / len(val_dataset), step=global_step)
self.model.train()
if verbose:
@ -300,13 +272,7 @@ class XLNetSequenceClassifier:
torch.cuda.empty_cache()
def predict(
self,
token_ids,
input_mask,
token_type_ids=None,
num_gpus=None,
batch_size=8,
probabilities=False,
self, token_ids, input_mask, token_type_ids=None, num_gpus=None, batch_size=8, probabilities=False,
):
"""Scores the given dataset and returns the predicted classes.
@ -330,7 +296,7 @@ class XLNetSequenceClassifier:
"""
device, num_gpus = get_device(num_gpus)
self.model = move_to_device(self.model, device, num_gpus)
self.model = move_model_to_device(self.model, device, num_gpus)
self.model.eval()
preds = []
@ -342,16 +308,11 @@ class XLNetSequenceClassifier:
x_batch = torch.tensor(token_ids[start:end], dtype=torch.long, device=device)
mask_batch = torch.tensor(input_mask[start:end], dtype=torch.long, device=device)
token_type_ids_batch = torch.tensor(
token_type_ids[start:end], dtype=torch.long, device=device
)
token_type_ids_batch = torch.tensor(token_type_ids[start:end], dtype=torch.long, device=device)
with torch.no_grad():
pred_batch = self.model(
input_ids=x_batch,
token_type_ids=token_type_ids_batch,
attention_mask=mask_batch,
labels=None,
input_ids=x_batch, token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=None,
)
preds.append(pred_batch[0].cpu())
if i % batch_size == 0: