This commit is contained in:
Sharat Chikkerur 2020-06-23 14:45:52 +00:00
Parent d3e5350931
Commit 5c690efc9b
2 changed files with 188 additions and 307 deletions

View file

@@ -32,19 +32,7 @@
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"# Ensure edits to libraries are loaded and plotting is shown in the notebook.\n",
"%reload_ext autoreload\n",
"%autoreload 2\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 62,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -66,7 +54,7 @@
"from utils_nlp.dataset.url_utils import maybe_download\n",
"from utils_nlp.models.transformers.named_entity_recognition import (\n",
" TokenClassificationProcessor, TokenClassifier)\n",
"from utils_nlp.models.transformers.named_entity_recognition import supported_models as SUPPORTED_MODELS\n"
"from utils_nlp.models.transformers.named_entity_recognition import supported_models as SUPPORTED_MODELS"
]
},
{
@@ -97,7 +85,7 @@
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -107,7 +95,7 @@
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 3,
"metadata": {
"tags": [
"parameters"
@@ -115,6 +103,12 @@
},
"outputs": [],
"source": [
"# Wikigold dataset\n",
"DATA_URL = (\n",
" \"https://raw.githubusercontent.com/juand-r/entity-recognition-datasets\"\n",
" \"/master/data/wikigold/CONLL-format/data/wikigold.conll.txt\"\n",
")\n",
"\n",
"# fraction of the dataset used for testing\n",
"TEST_DATA_FRACTION = 0.3\n",
"\n",
@@ -155,7 +149,7 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -249,7 +243,7 @@
"[70 rows x 1 columns]"
]
},
"execution_count": 65,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -262,7 +256,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Training & Testing Dataset\n",
"## Get Traning & Testing Dataset\n",
"\n",
"The dataset used in this notebook is the [wikigold dataset](https://www.aclweb.org/anthology/W09-3302). The wikigold dataset consists of 145 mannually labelled Wikipedia articles, including 1841 sentences and 40k tokens in total. The dataset can be directly downloaded from [here](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold). \n",
"\n",
@@ -271,9 +265,16 @@
},
{
"cell_type": "code",
"execution_count": 66,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 96.0/96.0 [00:00<00:00, 4.02kKB/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
@@ -281,6 +282,48 @@
"Maximum sequence length is: 144\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# download data\n",
"file_name = DATA_URL.split(\"/\")[-1] # a name for the downloaded file\n",
"maybe_download(DATA_URL, file_name, DATA_PATH)\n",
"data_file = os.path.join(DATA_PATH, file_name)\n",
"\n",
"# parse CoNll file\n",
"sentence_list, labels_list = read_conll_file(data_file, sep=\" \")\n",
"\n",
"# sub-sample (optional)\n",
"random.seed(RANDOM_SEED)\n",
"sample_size = int(SAMPLE_RATIO * len(sentence_list))\n",
"sentence_list, labels_list = list(\n",
" zip(*random.sample(list(zip(sentence_list, labels_list)), k=sample_size))\n",
")\n",
"\n",
"# train-test split\n",
"train_sentence_list, test_sentence_list, train_labels_list, test_labels_list = train_test_split(\n",
" sentence_list, labels_list, test_size=TEST_DATA_FRACTION, random_state=RANDOM_SEED\n",
")"
]
},
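The exact file layout is not visible in this diff, but `read_conll_file(data_file, sep=" ")` in the cell above implies the usual CoNLL shape: one space-separated `token label` pair per line, with blank lines between sentences. A minimal sketch under that assumption (the sample lines are hypothetical):

```python
# Hypothetical excerpt in the layout read_conll_file(sep=" ") expects.
sample = "Cagots I-MISC\nare O\n\n-DOCSTART- O\n"

sentences, labels = [], []
for block in sample.strip().split("\n\n"):  # a blank line ends a sentence
    pairs = [line.split(" ") for line in block.split("\n")]
    sentences.append([p[0] for p in pairs if len(p) > 1])
    labels.append([p[1] for p in pairs if len(p) > 1])

print(sentences)  # [['Cagots', 'are'], ['-DOCSTART-']]
print(labels)     # [['I-MISC', 'O'], ['O']]
```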
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following is an example input sentence of the training set."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
@@ -309,53 +352,53 @@
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[The, words, were, set, to, a, melody, by, com...</td>\n",
" <td>[O, O, O, O, O, O, O, O, O, I-PER, I-PER, O, O...</td>\n",
" <td>[The, origin, of, Agotes, (, or, Cagots, ), is...</td>\n",
" <td>[O, O, O, I-MISC, O, O, I-MISC, O, O, O, O]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[In, 1997, they, released, Fucked, By, Rock, -...</td>\n",
" <td>[O, O, O, O, I-MISC, I-MISC, I-MISC, O, O, O, ...</td>\n",
" <td>[-DOCSTART-]</td>\n",
" <td>[O]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[\", The, more, I, worked, on, Movement, the, m...</td>\n",
" <td>[O, O, O, O, O, O, I-ORG, O, O, O, O, O, O, O,...</td>\n",
" <td>[It, provides, full, -, and, part-time, polyte...</td>\n",
" <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[They, performed, their, first, gig, in, Galwa...</td>\n",
" <td>[O, O, O, O, O, O, I-LOC, O, O, O, O]</td>\n",
" <td>[Since, she, was, the, daughter, of, the, grea...</td>\n",
" <td>[O, O, O, O, O, O, O, O, I-MISC, O, O, O, I-MI...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[The, next, day, ,, the, regiment, pursued, th...</td>\n",
" <td>[O, O, O, O, O, O, O, O, I-ORG, O, O, O, O, O,...</td>\n",
" <td>[The, goals, were, two, posts, ,, with, no, cr...</td>\n",
" <td>[O, O, O, O, O, O, O, O, O, O]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>[At, Elm, Coulee, Field, ,, the, Bakken, is, o...</td>\n",
" <td>[O, I-LOC, I-LOC, I-LOC, O, O, I-LOC, O, O, O,...</td>\n",
" <td>[At, one, point, ,, so, many, orders, had, bee...</td>\n",
" <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>[He, has, also, released, his, own, solo, albu...</td>\n",
" <td>[O, O, O, O, O, O, O, O, O, I-MISC, I-MISC, O,...</td>\n",
" <td>[Left, camp, in, July, 1972, ,, and, was, deal...</td>\n",
" <td>[O, O, O, O, O, O, O, O, O, O, O, I-ORG, I-ORG...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>[Carter, Anthony, Beauford, (, born, November,...</td>\n",
" <td>[I-PER, I-PER, I-PER, O, O, O, O, O, O, O, I-L...</td>\n",
" <td>[She, fled, again, to, Abra, ,, where, she, wa...</td>\n",
" <td>[O, O, O, O, I-LOC, O, O, O, O, O, O]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>[Thus, would, arise, the, confusion, between, ...</td>\n",
" <td>[O, O, O, O, O, O, I-MISC, O, I-MISC, O]</td>\n",
" <td>[As, the, younger, sibling, ,, Ben, was, const...</td>\n",
" <td>[O, O, O, O, O, I-PER, O, O, O, O, O, O, O, O,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>[-DOCSTART-]</td>\n",
" <td>[O]</td>\n",
" <td>[Milepost, 1, :, granite, masonry, arch, over,...</td>\n",
" <td>[O, O, O, O, O, O, O, I-LOC, I-LOC, O]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@@ -363,43 +406,43 @@
],
"text/plain": [
" sentence \\\n",
"0 [The, words, were, set, to, a, melody, by, com... \n",
"1 [In, 1997, they, released, Fucked, By, Rock, -... \n",
"2 [\", The, more, I, worked, on, Movement, the, m... \n",
"3 [They, performed, their, first, gig, in, Galwa... \n",
"4 [The, next, day, ,, the, regiment, pursued, th... \n",
"5 [At, Elm, Coulee, Field, ,, the, Bakken, is, o... \n",
"6 [He, has, also, released, his, own, solo, albu... \n",
"7 [Carter, Anthony, Beauford, (, born, November,... \n",
"8 [Thus, would, arise, the, confusion, between, ... \n",
"9 [-DOCSTART-] \n",
"0 [The, origin, of, Agotes, (, or, Cagots, ), is... \n",
"1 [-DOCSTART-] \n",
"2 [It, provides, full, -, and, part-time, polyte... \n",
"3 [Since, she, was, the, daughter, of, the, grea... \n",
"4 [The, goals, were, two, posts, ,, with, no, cr... \n",
"5 [At, one, point, ,, so, many, orders, had, bee... \n",
"6 [Left, camp, in, July, 1972, ,, and, was, deal... \n",
"7 [She, fled, again, to, Abra, ,, where, she, wa... \n",
"8 [As, the, younger, sibling, ,, Ben, was, const... \n",
"9 [Milepost, 1, :, granite, masonry, arch, over,... \n",
"\n",
" labels \n",
"0 [O, O, O, O, O, O, O, O, O, I-PER, I-PER, O, O... \n",
"1 [O, O, O, O, I-MISC, I-MISC, I-MISC, O, O, O, ... \n",
"2 [O, O, O, O, O, O, I-ORG, O, O, O, O, O, O, O,... \n",
"3 [O, O, O, O, O, O, I-LOC, O, O, O, O] \n",
"4 [O, O, O, O, O, O, O, O, I-ORG, O, O, O, O, O,... \n",
"5 [O, I-LOC, I-LOC, I-LOC, O, O, I-LOC, O, O, O,... \n",
"6 [O, O, O, O, O, O, O, O, O, I-MISC, I-MISC, O,... \n",
"7 [I-PER, I-PER, I-PER, O, O, O, O, O, O, O, I-L... \n",
"8 [O, O, O, O, O, O, I-MISC, O, I-MISC, O] \n",
"9 [O] "
"0 [O, O, O, I-MISC, O, O, I-MISC, O, O, O, O] \n",
"1 [O] \n",
"2 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n",
"3 [O, O, O, O, O, O, O, O, I-MISC, O, O, O, I-MI... \n",
"4 [O, O, O, O, O, O, O, O, O, O] \n",
"5 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n",
"6 [O, O, O, O, O, O, O, O, O, O, O, I-ORG, I-ORG... \n",
"7 [O, O, O, O, I-LOC, O, O, O, O, O, O] \n",
"8 [O, O, O, O, O, I-PER, O, O, O, O, O, O, O, O,... \n",
"9 [O, O, O, O, O, O, O, I-LOC, I-LOC, O] "
]
},
"execution_count": 66,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df, test_df = wikigold.load_train_test_dfs(test_fraction=TEST_DATA_FRACTION)\n",
"train_df.head(10)"
"# Show example sentences from input\n",
"pd.DataFrame({\"sentence\": sentence_list, \"labels\": labels_list}).head(10)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -423,237 +466,93 @@
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>labels</th>\n",
" <th>token</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>The</td>\n",
" <td>In</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>words</td>\n",
" <td>1999</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>were</td>\n",
" <td>,</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>set</td>\n",
" <td>the</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>to</td>\n",
" <td>O</td>\n",
" <td>Caloi</td>\n",
" <td>I-PER</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>a</td>\n",
" <td>family</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>melody</td>\n",
" <td>sold</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>by</td>\n",
" <td>the</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>composer</td>\n",
" <td>majority</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Alfredo</td>\n",
" <td>I-PER</td>\n",
" <td>of</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Keil</td>\n",
" <td>I-PER</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>and</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>the</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>song</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>soon</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>became</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>popular</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>among</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>people</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>unhappy</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>with</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>what</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>they</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>considered</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>a</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>submissive</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>and</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>humiliating</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>attitude</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>by</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>the</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>Portuguese</td>\n",
" <td>I-MISC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>authorities</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>.</td>\n",
" <td>O</td>\n",
" <td>Caloi</td>\n",
" <td>I-ORG</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text labels\n",
"0 The O\n",
"1 words O\n",
"2 were O\n",
"3 set O\n",
"4 to O\n",
"5 a O\n",
"6 melody O\n",
"7 by O\n",
"8 composer O\n",
"9 Alfredo I-PER\n",
"10 Keil I-PER\n",
"11 and O\n",
"12 the O\n",
"13 song O\n",
"14 soon O\n",
"15 became O\n",
"16 popular O\n",
"17 among O\n",
"18 people O\n",
"19 unhappy O\n",
"20 with O\n",
"21 what O\n",
"22 they O\n",
"23 considered O\n",
"24 a O\n",
"25 submissive O\n",
"26 and O\n",
"27 humiliating O\n",
"28 attitude O\n",
"29 by O\n",
"30 the O\n",
"31 Portuguese I-MISC\n",
"32 authorities O\n",
"33 . O"
" token label\n",
"0 In O\n",
"1 1999 O\n",
"2 , O\n",
"3 the O\n",
"4 Caloi I-PER\n",
"5 family O\n",
"6 sold O\n",
"7 the O\n",
"8 majority O\n",
"9 of O\n",
"10 Caloi I-ORG"
]
},
"execution_count": 67,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame({\"text\": train_df.sentence[0], \"labels\": train_df.labels[0]})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following is an example input sentence of the training set."
"# Show example tokens from input\n",
"pd.DataFrame({\"token\": train_sentence_list[0], \"label\": train_labels_list[0]}).head(11)"
]
},
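As the sampled rows suggest, the wikigold labels use only I-*/O tags, with no B-* boundary markers. A quick way to confirm the tag inventory, reusing labels_list from the parsing cell above:

```python
from collections import Counter

# Tally every tag across the parsed label lists.
tag_counts = Counter(tag for sent in labels_list for tag in sent)
print(tag_counts.most_common())  # expected keys: O, I-PER, I-ORG, I-LOC, I-MISC
```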
{
@@ -673,27 +572,13 @@
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 96.0/96.0 [00:00<00:00, 4.87kKB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Maximum sequence length is: 144\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6579121e24d1420f9fb51a607053fd91",
"model_id": "ea57217fe6394812af03defcdaffe4db",
"version_major": 2,
"version_minor": 0
},
@@ -714,7 +599,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a7b1d7a70ab44472987d05cb20c3e2f5",
"model_id": "00884141779a4ddead34204d5ea01b41",
"version_major": 2,
"version_minor": 0
},
@@ -748,15 +633,33 @@
}
],
"source": [
"train_dataloader, test_dataloader, label_map, test_dataset = wikigold.load_dataset(\n",
" local_path=DATA_PATH, \n",
" test_fraction=TEST_DATA_FRACTION, \n",
" random_seed=RANDOM_SEED,\n",
" model_name=MODEL_NAME,\n",
" to_lower = DO_LOWER_CASE,\n",
"processor = TokenClassificationProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE, cache_dir=CACHE_DIR)\n",
"\n",
"label_map = TokenClassificationProcessor.create_label_map(\n",
" label_lists=labels_list, trailing_piece_tag=TRAILING_PIECE_TAG\n",
")\n",
"\n",
"train_dataset = processor.preprocess(\n",
" text=train_sentence_list,\n",
" max_len=MAX_SEQ_LENGTH,\n",
" batch_size=BATCH_SIZE,\n",
" num_gpus=NUM_GPUS)"
" labels=train_labels_list,\n",
" label_map=label_map,\n",
" trailing_piece_tag=TRAILING_PIECE_TAG,\n",
")\n",
"train_dataloader = dataloader_from_dataset(\n",
" train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True, distributed=False\n",
")\n",
"\n",
"test_dataset = processor.preprocess(\n",
" text=test_sentence_list,\n",
" max_len=MAX_SEQ_LENGTH,\n",
" labels=test_labels_list,\n",
" label_map=label_map,\n",
" trailing_piece_tag=TRAILING_PIECE_TAG,\n",
")\n",
"test_dataloader = dataloader_from_dataset(\n",
" test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False, distributed=False\n",
")\n"
]
},
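`TokenClassificationProcessor.preprocess` has to reconcile word-level labels with subword tokenization, and `trailing_piece_tag` is the filler label assigned to the non-first pieces of a split word. The actual logic lives in utils_nlp; the sketch below only illustrates the alignment idea with a Hugging Face tokenizer (the checkpoint name and the tag value "X" are assumptions for illustration):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
words, word_labels = ["Agotes", "live", "here"], ["I-MISC", "O", "O"]

tokens, token_labels = [], []
for word, label in zip(words, word_labels):
    pieces = tokenizer.tokenize(word)  # rare words split into several word pieces
    tokens.extend(pieces)
    # the first piece keeps the word's label; trailing pieces get the filler tag
    token_labels.extend([label] + ["X"] * (len(pieces) - 1))

print(list(zip(tokens, token_labels)))
```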
{
@@ -770,34 +673,13 @@
},
{
"cell_type": "code",
"execution_count": 69,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5ec7123ccc5c40fd93be354f5b6dbed6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, description='Downloading', max=411, style=ProgressStyle(description_width=…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c13b6612f69840e7abbc81b2c2cdcd56",
"model_id": "7cd3a9259b5c42638e8580f9fbae27db",
"version_major": 2,
"version_minor": 0
},
@@ -813,7 +695,7 @@
"output_type": "stream",
"text": [
"\n",
"Training time : 0.058 hrs\n"
"Training time : 0.060 hrs\n"
]
}
],
@@ -854,14 +736,14 @@
},
{
"cell_type": "code",
"execution_count": 70,
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Scoring: 100%|██████████| 35/35 [00:06<00:00, 5.68it/s]"
"Scoring: 100%|██████████| 35/35 [00:06<00:00, 6.14it/s]"
]
},
{
@@ -899,7 +781,7 @@
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -915,7 +797,7 @@
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -924,13 +806,13 @@
"text": [
" precision recall f1-score support\n",
"\n",
" MISC 0.65 0.74 0.69 186\n",
" LOC 0.84 0.85 0.85 316\n",
" ORG 0.76 0.82 0.79 270\n",
" PER 0.92 0.92 0.92 261\n",
" ORG 0.72 0.76 0.74 274\n",
" MISC 0.67 0.73 0.70 221\n",
" LOC 0.79 0.84 0.81 317\n",
" PER 0.90 0.93 0.92 257\n",
"\n",
"micro avg 0.77 0.84 0.81 1033\n",
"macro avg 0.80 0.84 0.82 1033\n",
"micro avg 0.76 0.82 0.79 1069\n",
"macro avg 0.77 0.82 0.79 1069\n",
"\n"
]
}
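A quick consistency check on the updated numbers: the macro average row here is consistent with an unweighted mean of the per-entity scores, e.g. for F1 (values copied from the report above):

```python
# ORG, MISC, LOC, PER f1-scores from the report above.
f1_per_class = [0.74, 0.70, 0.81, 0.92]
print(round(sum(f1_per_class) / len(f1_per_class), 2))  # 0.79, matching "macro avg"
```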
@@ -960,7 +842,7 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": 14,
"metadata": {},
"outputs": [
{
@ -968,7 +850,7 @@
"output_type": "stream",
"text": [
"WARNING:root:Token lists with length > 512 will be truncated\n",
"Scoring: 100%|██████████| 1/1 [00:00<00:00, 26.47it/s]"
"Scoring: 100%|██████████| 1/1 [00:00<00:00, 25.31it/s]"
]
},
{
@ -1012,7 +894,6 @@
"]\n",
"sample_tokens = [x.split() for x in sample_text]\n",
"\n",
"processor = TokenClassificationProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE)\n",
"sample_dataset = processor.preprocess(\n",
" text=sample_tokens,\n",
" max_len=MAX_SEQ_LENGTH,\n",
@ -1048,13 +929,13 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.8,
"data": 0.77,
"encoder": "json",
"name": "precision",
"version": 1
@ -1072,7 +953,7 @@
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.84,
"data": 0.82,
"encoder": "json",
"name": "recall",
"version": 1
@ -1090,7 +971,7 @@
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.82,
"data": 0.79,
"encoder": "json",
"name": "f1",
"version": 1

View file

@@ -4,7 +4,7 @@
"""Common helper functions for preprocessing Named Entity Recognition (NER) datasets."""
def preprocess_conll(text, sep="\t "):
def preprocess_conll(text, sep="\t"):
"""
Converts data in CoNLL format to word and label lists.
@@ -36,7 +36,7 @@ def preprocess_conll(text, sep="\t "):
# split each sentence string into "word label" pairs
s_split = s.split("\n")
# split "word label" pairs
s_split_split = [t.split() for t in s_split]
s_split_split = [t.split(sep) for t in s_split]
sentence_list.append([t[0] for t in s_split_split if len(t) > 1])
labels_list.append([t[1] for t in s_split_split if len(t) > 1])
if len(s_split_split) > max_seq_len:
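The second hunk is the substantive fix in this file: str.split() with no argument splits on any run of whitespace and silently ignored the caller's sep, which is also why the old "\t " default (a two-character separator) went unnoticed. With str.split(sep) the separator is honored. A hypothetical tab-separated line whose token contains a space shows the difference:

```python
line = "New York\tI-LOC"  # hypothetical tab-separated CoNLL line

print(line.split())       # ['New', 'York', 'I-LOC'] -- token mangled, label wrong
print(line.split("\t"))   # ['New York', 'I-LOC']    -- token and label preserved
```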