reverted changes to dataset

2020-06-23 14:45:52 +00:00 · 2020-06-23 14:45:52 +00:00 · 5c690efc9b
--- a/examples/named_entity_recognition/ner_wikigold_transformer.ipynb
+++ b/examples/named_entity_recognition/ner_wikigold_transformer.ipynb
@ -32,19 +32,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 61,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Ensure edits to libraries are loaded and plotting is shown in the notebook.\n",
-    "%reload_ext autoreload\n",
-    "%autoreload 2\n",
-    "%matplotlib inline"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@ -66,7 +54,7 @@
    "from utils_nlp.dataset.url_utils import maybe_download\n",
    "from utils_nlp.models.transformers.named_entity_recognition import (\n",
    "    TokenClassificationProcessor, TokenClassifier)\n",
-    "from utils_nlp.models.transformers.named_entity_recognition import supported_models as SUPPORTED_MODELS\n"
+    "from utils_nlp.models.transformers.named_entity_recognition import supported_models as SUPPORTED_MODELS"
   ]
  },
  {
@ -97,7 +85,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@ -107,7 +95,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 64,
+   "execution_count": 3,
   "metadata": {
    "tags": [
     "parameters"
@ -115,6 +103,12 @@
   },
   "outputs": [],
   "source": [
+    "# Wikigold dataset\n",
+    "DATA_URL = (\n",
+    "    \"https://raw.githubusercontent.com/juand-r/entity-recognition-datasets\"\n",
+    "    \"/master/data/wikigold/CONLL-format/data/wikigold.conll.txt\"\n",
+    ")\n",
+    "\n",
    "# fraction of the dataset used for testing\n",
    "TEST_DATA_FRACTION = 0.3\n",
    "\n",
@ -155,7 +149,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 65,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@ -249,7 +243,7 @@
       "[70 rows x 1 columns]"
      ]
     },
-     "execution_count": 65,
+     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -262,7 +256,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Get Training & Testing Dataset\n",
+    "## Get Training & Testing Dataset\n",
    "\n",
    "The dataset used in this notebook is the [wikigold dataset](https://www.aclweb.org/anthology/W09-3302). The wikigold dataset consists of 145 mannually labelled Wikipedia articles, including 1841 sentences and 40k tokens in total. The dataset can be directly downloaded from [here](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold). \n",
    "\n",
@ -271,9 +265,16 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 66,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 96.0/96.0 [00:00<00:00, 4.02kKB/s]"
+     ]
+    },
    {
     "name": "stdout",
     "output_type": "stream",
@ -281,6 +282,48 @@
      "Maximum sequence length is: 144\n"
     ]
    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# download data\n",
+    "file_name = DATA_URL.split(\"/\")[-1]  # a name for the downloaded file\n",
+    "maybe_download(DATA_URL, file_name, DATA_PATH)\n",
+    "data_file = os.path.join(DATA_PATH, file_name)\n",
+    "\n",
+    "# parse CoNLL file\n",
+    "sentence_list, labels_list = read_conll_file(data_file, sep=\" \")\n",
+    "\n",
+    "# sub-sample (optional)\n",
+    "random.seed(RANDOM_SEED)\n",
+    "sample_size = int(SAMPLE_RATIO * len(sentence_list))\n",
+    "sentence_list, labels_list = list(\n",
+    "    zip(*random.sample(list(zip(sentence_list, labels_list)), k=sample_size))\n",
+    ")\n",
+    "\n",
+    "# train-test split\n",
+    "train_sentence_list, test_sentence_list, train_labels_list, test_labels_list = train_test_split(\n",
+    "    sentence_list, labels_list, test_size=TEST_DATA_FRACTION, random_state=RANDOM_SEED\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The following is an example input sentence of the training set."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
    {
     "data": {
      "text/html": [
@ -309,53 +352,53 @@
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
-       "      <td>[The, words, were, set, to, a, melody, by, com...</td>\n",
-       "      <td>[O, O, O, O, O, O, O, O, O, I-PER, I-PER, O, O...</td>\n",
+       "      <td>[The, origin, of, Agotes, (, or, Cagots, ), is...</td>\n",
+       "      <td>[O, O, O, I-MISC, O, O, I-MISC, O, O, O, O]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
-       "      <td>[In, 1997, they, released, Fucked, By, Rock, -...</td>\n",
-       "      <td>[O, O, O, O, I-MISC, I-MISC, I-MISC, O, O, O, ...</td>\n",
+       "      <td>[-DOCSTART-]</td>\n",
+       "      <td>[O]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
-       "      <td>[\", The, more, I, worked, on, Movement, the, m...</td>\n",
-       "      <td>[O, O, O, O, O, O, I-ORG, O, O, O, O, O, O, O,...</td>\n",
+       "      <td>[It, provides, full, -, and, part-time, polyte...</td>\n",
+       "      <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
-       "      <td>[They, performed, their, first, gig, in, Galwa...</td>\n",
-       "      <td>[O, O, O, O, O, O, I-LOC, O, O, O, O]</td>\n",
+       "      <td>[Since, she, was, the, daughter, of, the, grea...</td>\n",
+       "      <td>[O, O, O, O, O, O, O, O, I-MISC, O, O, O, I-MI...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
-       "      <td>[The, next, day, ,, the, regiment, pursued, th...</td>\n",
-       "      <td>[O, O, O, O, O, O, O, O, I-ORG, O, O, O, O, O,...</td>\n",
+       "      <td>[The, goals, were, two, posts, ,, with, no, cr...</td>\n",
+       "      <td>[O, O, O, O, O, O, O, O, O, O]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
-       "      <td>[At, Elm, Coulee, Field, ,, the, Bakken, is, o...</td>\n",
-       "      <td>[O, I-LOC, I-LOC, I-LOC, O, O, I-LOC, O, O, O,...</td>\n",
+       "      <td>[At, one, point, ,, so, many, orders, had, bee...</td>\n",
+       "      <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
-       "      <td>[He, has, also, released, his, own, solo, albu...</td>\n",
-       "      <td>[O, O, O, O, O, O, O, O, O, I-MISC, I-MISC, O,...</td>\n",
+       "      <td>[Left, camp, in, July, 1972, ,, and, was, deal...</td>\n",
+       "      <td>[O, O, O, O, O, O, O, O, O, O, O, I-ORG, I-ORG...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
-       "      <td>[Carter, Anthony, Beauford, (, born, November,...</td>\n",
-       "      <td>[I-PER, I-PER, I-PER, O, O, O, O, O, O, O, I-L...</td>\n",
+       "      <td>[She, fled, again, to, Abra, ,, where, she, wa...</td>\n",
+       "      <td>[O, O, O, O, I-LOC, O, O, O, O, O, O]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
-       "      <td>[Thus, would, arise, the, confusion, between, ...</td>\n",
-       "      <td>[O, O, O, O, O, O, I-MISC, O, I-MISC, O]</td>\n",
+       "      <td>[As, the, younger, sibling, ,, Ben, was, const...</td>\n",
+       "      <td>[O, O, O, O, O, I-PER, O, O, O, O, O, O, O, O,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
-       "      <td>[-DOCSTART-]</td>\n",
-       "      <td>[O]</td>\n",
+       "      <td>[Milepost, 1, :, granite, masonry, arch, over,...</td>\n",
+       "      <td>[O, O, O, O, O, O, O, I-LOC, I-LOC, O]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
@ -363,43 +406,43 @@
      ],
      "text/plain": [
       "                                            sentence  \\\n",
-       "0  [The, words, were, set, to, a, melody, by, com...   \n",
-       "1  [In, 1997, they, released, Fucked, By, Rock, -...   \n",
-       "2  [\", The, more, I, worked, on, Movement, the, m...   \n",
-       "3  [They, performed, their, first, gig, in, Galwa...   \n",
-       "4  [The, next, day, ,, the, regiment, pursued, th...   \n",
-       "5  [At, Elm, Coulee, Field, ,, the, Bakken, is, o...   \n",
-       "6  [He, has, also, released, his, own, solo, albu...   \n",
-       "7  [Carter, Anthony, Beauford, (, born, November,...   \n",
-       "8  [Thus, would, arise, the, confusion, between, ...   \n",
-       "9                                       [-DOCSTART-]   \n",
+       "0  [The, origin, of, Agotes, (, or, Cagots, ), is...   \n",
+       "1                                       [-DOCSTART-]   \n",
+       "2  [It, provides, full, -, and, part-time, polyte...   \n",
+       "3  [Since, she, was, the, daughter, of, the, grea...   \n",
+       "4  [The, goals, were, two, posts, ,, with, no, cr...   \n",
+       "5  [At, one, point, ,, so, many, orders, had, bee...   \n",
+       "6  [Left, camp, in, July, 1972, ,, and, was, deal...   \n",
+       "7  [She, fled, again, to, Abra, ,, where, she, wa...   \n",
+       "8  [As, the, younger, sibling, ,, Ben, was, const...   \n",
+       "9  [Milepost, 1, :, granite, masonry, arch, over,...   \n",
       "\n",
       "                                              labels  \n",
-       "0  [O, O, O, O, O, O, O, O, O, I-PER, I-PER, O, O...  \n",
-       "1  [O, O, O, O, I-MISC, I-MISC, I-MISC, O, O, O, ...  \n",
-       "2  [O, O, O, O, O, O, I-ORG, O, O, O, O, O, O, O,...  \n",
-       "3              [O, O, O, O, O, O, I-LOC, O, O, O, O]  \n",
-       "4  [O, O, O, O, O, O, O, O, I-ORG, O, O, O, O, O,...  \n",
-       "5  [O, I-LOC, I-LOC, I-LOC, O, O, I-LOC, O, O, O,...  \n",
-       "6  [O, O, O, O, O, O, O, O, O, I-MISC, I-MISC, O,...  \n",
-       "7  [I-PER, I-PER, I-PER, O, O, O, O, O, O, O, I-L...  \n",
-       "8           [O, O, O, O, O, O, I-MISC, O, I-MISC, O]  \n",
-       "9                                                [O]  "
+       "0        [O, O, O, I-MISC, O, O, I-MISC, O, O, O, O]  \n",
+       "1                                                [O]  \n",
+       "2  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  \n",
+       "3  [O, O, O, O, O, O, O, O, I-MISC, O, O, O, I-MI...  \n",
+       "4                     [O, O, O, O, O, O, O, O, O, O]  \n",
+       "5  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  \n",
+       "6  [O, O, O, O, O, O, O, O, O, O, O, I-ORG, I-ORG...  \n",
+       "7              [O, O, O, O, I-LOC, O, O, O, O, O, O]  \n",
+       "8  [O, O, O, O, O, I-PER, O, O, O, O, O, O, O, O,...  \n",
+       "9             [O, O, O, O, O, O, O, I-LOC, I-LOC, O]  "
      ]
     },
-     "execution_count": 66,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "train_df, test_df = wikigold.load_train_test_dfs(test_fraction=TEST_DATA_FRACTION)\n",
-    "train_df.head(10)"
+    "# Show example sentences from input\n",
+    "pd.DataFrame({\"sentence\": sentence_list, \"labels\": labels_list}).head(10)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 67,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
@ -423,237 +466,93 @@
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
-       "      <th>text</th>\n",
-       "      <th>labels</th>\n",
+       "      <th>token</th>\n",
+       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
-       "      <td>The</td>\n",
+       "      <td>In</td>\n",
       "      <td>O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
-       "      <td>words</td>\n",
+       "      <td>1999</td>\n",
       "      <td>O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
-       "      <td>were</td>\n",
+       "      <td>,</td>\n",
       "      <td>O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
-       "      <td>set</td>\n",
+       "      <td>the</td>\n",
       "      <td>O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
-       "      <td>to</td>\n",
-       "      <td>O</td>\n",
+       "      <td>Caloi</td>\n",
+       "      <td>I-PER</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
-       "      <td>a</td>\n",
+       "      <td>family</td>\n",
       "      <td>O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
-       "      <td>melody</td>\n",
+       "      <td>sold</td>\n",
       "      <td>O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
-       "      <td>by</td>\n",
+       "      <td>the</td>\n",
       "      <td>O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
-       "      <td>composer</td>\n",
+       "      <td>majority</td>\n",
       "      <td>O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
-       "      <td>Alfredo</td>\n",
-       "      <td>I-PER</td>\n",
+       "      <td>of</td>\n",
+       "      <td>O</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
-       "      <td>Keil</td>\n",
-       "      <td>I-PER</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>11</th>\n",
-       "      <td>and</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12</th>\n",
-       "      <td>the</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>13</th>\n",
-       "      <td>song</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>14</th>\n",
-       "      <td>soon</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15</th>\n",
-       "      <td>became</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>16</th>\n",
-       "      <td>popular</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>17</th>\n",
-       "      <td>among</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>18</th>\n",
-       "      <td>people</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>19</th>\n",
-       "      <td>unhappy</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>20</th>\n",
-       "      <td>with</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>21</th>\n",
-       "      <td>what</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>22</th>\n",
-       "      <td>they</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23</th>\n",
-       "      <td>considered</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>24</th>\n",
-       "      <td>a</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>25</th>\n",
-       "      <td>submissive</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>26</th>\n",
-       "      <td>and</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>27</th>\n",
-       "      <td>humiliating</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>28</th>\n",
-       "      <td>attitude</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>29</th>\n",
-       "      <td>by</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>30</th>\n",
-       "      <td>the</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>31</th>\n",
-       "      <td>Portuguese</td>\n",
-       "      <td>I-MISC</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>32</th>\n",
-       "      <td>authorities</td>\n",
-       "      <td>O</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>33</th>\n",
-       "      <td>.</td>\n",
-       "      <td>O</td>\n",
+       "      <td>Caloi</td>\n",
+       "      <td>I-ORG</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
-       "           text  labels\n",
-       "0           The       O\n",
-       "1         words       O\n",
-       "2          were       O\n",
-       "3           set       O\n",
-       "4            to       O\n",
-       "5             a       O\n",
-       "6        melody       O\n",
-       "7            by       O\n",
-       "8      composer       O\n",
-       "9       Alfredo   I-PER\n",
-       "10         Keil   I-PER\n",
-       "11          and       O\n",
-       "12          the       O\n",
-       "13         song       O\n",
-       "14         soon       O\n",
-       "15       became       O\n",
-       "16      popular       O\n",
-       "17        among       O\n",
-       "18       people       O\n",
-       "19      unhappy       O\n",
-       "20         with       O\n",
-       "21         what       O\n",
-       "22         they       O\n",
-       "23   considered       O\n",
-       "24            a       O\n",
-       "25   submissive       O\n",
-       "26          and       O\n",
-       "27  humiliating       O\n",
-       "28     attitude       O\n",
-       "29           by       O\n",
-       "30          the       O\n",
-       "31   Portuguese  I-MISC\n",
-       "32  authorities       O\n",
-       "33            .       O"
+       "       token  label\n",
+       "0         In      O\n",
+       "1       1999      O\n",
+       "2          ,      O\n",
+       "3        the      O\n",
+       "4      Caloi  I-PER\n",
+       "5     family      O\n",
+       "6       sold      O\n",
+       "7        the      O\n",
+       "8   majority      O\n",
+       "9         of      O\n",
+       "10     Caloi  I-ORG"
      ]
     },
-     "execution_count": 67,
+     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "pd.DataFrame({\"text\": train_df.sentence[0], \"labels\": train_df.labels[0]})"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The following is an example input sentence of the training set."
+    "# Show example tokens from input\n",
+    "pd.DataFrame({\"token\": train_sentence_list[0], \"label\": train_labels_list[0]}).head(11)"
   ]
  },
  {
@ -673,27 +572,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 68,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 96.0/96.0 [00:00<00:00, 4.87kKB/s]\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Maximum sequence length is: 144\n"
-     ]
-    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6579121e24d1420f9fb51a607053fd91",
+       "model_id": "ea57217fe6394812af03defcdaffe4db",
       "version_major": 2,
       "version_minor": 0
      },
@ -714,7 +599,7 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "a7b1d7a70ab44472987d05cb20c3e2f5",
+       "model_id": "00884141779a4ddead34204d5ea01b41",
       "version_major": 2,
       "version_minor": 0
      },
@ -748,15 +633,33 @@
    }
   ],
   "source": [
-    "train_dataloader, test_dataloader, label_map, test_dataset = wikigold.load_dataset(\n",
-    "    local_path=DATA_PATH, \n",
-    "    test_fraction=TEST_DATA_FRACTION, \n",
-    "    random_seed=RANDOM_SEED,\n",
-    "    model_name=MODEL_NAME,\n",
-    "    to_lower = DO_LOWER_CASE,\n",
+    "processor = TokenClassificationProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE, cache_dir=CACHE_DIR)\n",
+    "\n",
+    "label_map = TokenClassificationProcessor.create_label_map(\n",
+    "    label_lists=labels_list, trailing_piece_tag=TRAILING_PIECE_TAG\n",
+    ")\n",
+    "\n",
+    "train_dataset = processor.preprocess(\n",
+    "    text=train_sentence_list,\n",
    "    max_len=MAX_SEQ_LENGTH,\n",
-    "    batch_size=BATCH_SIZE,\n",
-    "    num_gpus=NUM_GPUS)"
+    "    labels=train_labels_list,\n",
+    "    label_map=label_map,\n",
+    "    trailing_piece_tag=TRAILING_PIECE_TAG,\n",
+    ")\n",
+    "train_dataloader = dataloader_from_dataset(\n",
+    "    train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True, distributed=False\n",
+    ")\n",
+    "\n",
+    "test_dataset = processor.preprocess(\n",
+    "    text=test_sentence_list,\n",
+    "    max_len=MAX_SEQ_LENGTH,\n",
+    "    labels=test_labels_list,\n",
+    "    label_map=label_map,\n",
+    "    trailing_piece_tag=TRAILING_PIECE_TAG,\n",
+    ")\n",
+    "test_dataloader = dataloader_from_dataset(\n",
+    "    test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False, distributed=False\n",
+    ")\n"
   ]
  },
  {
@ -770,34 +673,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 69,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "5ec7123ccc5c40fd93be354f5b6dbed6",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       "HBox(children=(IntProgress(value=0, description='Downloading', max=411, style=ProgressStyle(description_width=…"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    },
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "c13b6612f69840e7abbc81b2c2cdcd56",
+       "model_id": "7cd3a9259b5c42638e8580f9fbae27db",
       "version_major": 2,
       "version_minor": 0
      },
@ -813,7 +695,7 @@
     "output_type": "stream",
     "text": [
      "\n",
-      "Training time : 0.058 hrs\n"
+      "Training time : 0.060 hrs\n"
     ]
    }
   ],
@ -854,14 +736,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 70,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "Scoring: 100%|██████████| 35/35 [00:06<00:00,  5.68it/s]"
+      "Scoring: 100%|██████████| 35/35 [00:06<00:00,  6.14it/s]"
     ]
    },
    {
@ -899,7 +781,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
@ -915,7 +797,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 72,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
@ -924,13 +806,13 @@
     "text": [
      "           precision    recall  f1-score   support\n",
      "\n",
-      "     MISC       0.65      0.74      0.69       186\n",
-      "      LOC       0.84      0.85      0.85       316\n",
-      "      ORG       0.76      0.82      0.79       270\n",
-      "      PER       0.92      0.92      0.92       261\n",
+      "      ORG       0.72      0.76      0.74       274\n",
+      "     MISC       0.67      0.73      0.70       221\n",
+      "      LOC       0.79      0.84      0.81       317\n",
+      "      PER       0.90      0.93      0.92       257\n",
      "\n",
-      "micro avg       0.77      0.84      0.81      1033\n",
-      "macro avg       0.80      0.84      0.82      1033\n",
+      "micro avg       0.76      0.82      0.79      1069\n",
+      "macro avg       0.77      0.82      0.79      1069\n",
      "\n"
     ]
    }
@ -960,7 +842,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
@ -968,7 +850,7 @@
     "output_type": "stream",
     "text": [
      "WARNING:root:Token lists with length > 512 will be truncated\n",
-      "Scoring: 100%|██████████| 1/1 [00:00<00:00, 26.47it/s]"
+      "Scoring: 100%|██████████| 1/1 [00:00<00:00, 25.31it/s]"
     ]
    },
    {
@ -1012,7 +894,6 @@
    "]\n",
    "sample_tokens = [x.split() for x in sample_text]\n",
    "\n",
-    "processor = TokenClassificationProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE)\n",
    "sample_dataset = processor.preprocess(\n",
    "    text=sample_tokens,\n",
    "    max_len=MAX_SEQ_LENGTH,\n",
@ -1048,13 +929,13 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 74,
+   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/scrapbook.scrap.json+json": {
-       "data": 0.8,
+       "data": 0.77,
       "encoder": "json",
       "name": "precision",
       "version": 1
@ -1072,7 +953,7 @@
    {
     "data": {
      "application/scrapbook.scrap.json+json": {
-       "data": 0.84,
+       "data": 0.82,
       "encoder": "json",
       "name": "recall",
       "version": 1
@ -1090,7 +971,7 @@
    {
     "data": {
      "application/scrapbook.scrap.json+json": {
-       "data": 0.82,
+       "data": 0.79,
       "encoder": "json",
       "name": "f1",
       "version": 1
--- a/utils_nlp/dataset/ner_utils.py
+++ b/utils_nlp/dataset/ner_utils.py
@ -4,7 +4,7 @@
 """Common helper functions for preprocessing Named Entity Recognition (NER) datasets."""


-def preprocess_conll(text, sep="\t "):
+def preprocess_conll(text, sep="\t"):
    """
    Converts data in CoNLL format to word and label lists.

@ -36,7 +36,7 @@ def preprocess_conll(text, sep="\t "):
        # split each sentence string into "word label" pairs
        s_split = s.split("\n")
        # split "word label" pairs
-        s_split_split = [t.split() for t in s_split]
+        s_split_split = [t.split(sep) for t in s_split]
        sentence_list.append([t[0] for t in s_split_split if len(t) > 1])
        labels_list.append([t[1] for t in s_split_split if len(t) > 1])
        if len(s_split_split) > max_seq_len: