reverted changes to dataset
This commit is contained in:
Родитель
d3e5350931
Коммит
5c690efc9b
|
@ -32,19 +32,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 61,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Ensure edits to libraries are loaded and plotting is shown in the notebook.\n",
|
||||
"%reload_ext autoreload\n",
|
||||
"%autoreload 2\n",
|
||||
"%matplotlib inline"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 62,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -66,7 +54,7 @@
|
|||
"from utils_nlp.dataset.url_utils import maybe_download\n",
|
||||
"from utils_nlp.models.transformers.named_entity_recognition import (\n",
|
||||
" TokenClassificationProcessor, TokenClassifier)\n",
|
||||
"from utils_nlp.models.transformers.named_entity_recognition import supported_models as SUPPORTED_MODELS\n"
|
||||
"from utils_nlp.models.transformers.named_entity_recognition import supported_models as SUPPORTED_MODELS"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -97,7 +85,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 63,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -107,7 +95,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 64,
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
|
@ -115,6 +103,12 @@
|
|||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Wikigold dataset\n",
|
||||
"DATA_URL = (\n",
|
||||
" \"https://raw.githubusercontent.com/juand-r/entity-recognition-datasets\"\n",
|
||||
" \"/master/data/wikigold/CONLL-format/data/wikigold.conll.txt\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# fraction of the dataset used for testing\n",
|
||||
"TEST_DATA_FRACTION = 0.3\n",
|
||||
"\n",
|
||||
|
@ -155,7 +149,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 65,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -249,7 +243,7 @@
|
|||
"[70 rows x 1 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 65,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -262,7 +256,7 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Get Training & Testing Dataset\n",
|
||||
"## Get Training & Testing Dataset\n",
|
||||
"\n",
|
||||
"The dataset used in this notebook is the [wikigold dataset](https://www.aclweb.org/anthology/W09-3302). The wikigold dataset consists of 145 manually labelled Wikipedia articles, including 1841 sentences and 40k tokens in total. The dataset can be directly downloaded from [here](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold). \n",
|
||||
"\n",
|
||||
|
@ -271,9 +265,16 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 66,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 96.0/96.0 [00:00<00:00, 4.02kKB/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
|
@ -281,6 +282,48 @@
|
|||
"Maximum sequence length is: 144\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# download data\n",
|
||||
"file_name = DATA_URL.split(\"/\")[-1] # a name for the downloaded file\n",
|
||||
"maybe_download(DATA_URL, file_name, DATA_PATH)\n",
|
||||
"data_file = os.path.join(DATA_PATH, file_name)\n",
|
||||
"\n",
|
||||
"# parse CoNll file\n",
|
||||
"sentence_list, labels_list = read_conll_file(data_file, sep=\" \")\n",
|
||||
"\n",
|
||||
"# sub-sample (optional)\n",
|
||||
"random.seed(RANDOM_SEED)\n",
|
||||
"sample_size = int(SAMPLE_RATIO * len(sentence_list))\n",
|
||||
"sentence_list, labels_list = list(\n",
|
||||
" zip(*random.sample(list(zip(sentence_list, labels_list)), k=sample_size))\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# train-test split\n",
|
||||
"train_sentence_list, test_sentence_list, train_labels_list, test_labels_list = train_test_split(\n",
|
||||
" sentence_list, labels_list, test_size=TEST_DATA_FRACTION, random_state=RANDOM_SEED\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The following is an example input sentence of the training set."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
|
@ -309,53 +352,53 @@
|
|||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>[The, words, were, set, to, a, melody, by, com...</td>\n",
|
||||
" <td>[O, O, O, O, O, O, O, O, O, I-PER, I-PER, O, O...</td>\n",
|
||||
" <td>[The, origin, of, Agotes, (, or, Cagots, ), is...</td>\n",
|
||||
" <td>[O, O, O, I-MISC, O, O, I-MISC, O, O, O, O]</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>[In, 1997, they, released, Fucked, By, Rock, -...</td>\n",
|
||||
" <td>[O, O, O, O, I-MISC, I-MISC, I-MISC, O, O, O, ...</td>\n",
|
||||
" <td>[-DOCSTART-]</td>\n",
|
||||
" <td>[O]</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>[\", The, more, I, worked, on, Movement, the, m...</td>\n",
|
||||
" <td>[O, O, O, O, O, O, I-ORG, O, O, O, O, O, O, O,...</td>\n",
|
||||
" <td>[It, provides, full, -, and, part-time, polyte...</td>\n",
|
||||
" <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>[They, performed, their, first, gig, in, Galwa...</td>\n",
|
||||
" <td>[O, O, O, O, O, O, I-LOC, O, O, O, O]</td>\n",
|
||||
" <td>[Since, she, was, the, daughter, of, the, grea...</td>\n",
|
||||
" <td>[O, O, O, O, O, O, O, O, I-MISC, O, O, O, I-MI...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>[The, next, day, ,, the, regiment, pursued, th...</td>\n",
|
||||
" <td>[O, O, O, O, O, O, O, O, I-ORG, O, O, O, O, O,...</td>\n",
|
||||
" <td>[The, goals, were, two, posts, ,, with, no, cr...</td>\n",
|
||||
" <td>[O, O, O, O, O, O, O, O, O, O]</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>[At, Elm, Coulee, Field, ,, the, Bakken, is, o...</td>\n",
|
||||
" <td>[O, I-LOC, I-LOC, I-LOC, O, O, I-LOC, O, O, O,...</td>\n",
|
||||
" <td>[At, one, point, ,, so, many, orders, had, bee...</td>\n",
|
||||
" <td>[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>[He, has, also, released, his, own, solo, albu...</td>\n",
|
||||
" <td>[O, O, O, O, O, O, O, O, O, I-MISC, I-MISC, O,...</td>\n",
|
||||
" <td>[Left, camp, in, July, 1972, ,, and, was, deal...</td>\n",
|
||||
" <td>[O, O, O, O, O, O, O, O, O, O, O, I-ORG, I-ORG...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>7</th>\n",
|
||||
" <td>[Carter, Anthony, Beauford, (, born, November,...</td>\n",
|
||||
" <td>[I-PER, I-PER, I-PER, O, O, O, O, O, O, O, I-L...</td>\n",
|
||||
" <td>[She, fled, again, to, Abra, ,, where, she, wa...</td>\n",
|
||||
" <td>[O, O, O, O, I-LOC, O, O, O, O, O, O]</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8</th>\n",
|
||||
" <td>[Thus, would, arise, the, confusion, between, ...</td>\n",
|
||||
" <td>[O, O, O, O, O, O, I-MISC, O, I-MISC, O]</td>\n",
|
||||
" <td>[As, the, younger, sibling, ,, Ben, was, const...</td>\n",
|
||||
" <td>[O, O, O, O, O, I-PER, O, O, O, O, O, O, O, O,...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9</th>\n",
|
||||
" <td>[-DOCSTART-]</td>\n",
|
||||
" <td>[O]</td>\n",
|
||||
" <td>[Milepost, 1, :, granite, masonry, arch, over,...</td>\n",
|
||||
" <td>[O, O, O, O, O, O, O, I-LOC, I-LOC, O]</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
|
@ -363,43 +406,43 @@
|
|||
],
|
||||
"text/plain": [
|
||||
" sentence \\\n",
|
||||
"0 [The, words, were, set, to, a, melody, by, com... \n",
|
||||
"1 [In, 1997, they, released, Fucked, By, Rock, -... \n",
|
||||
"2 [\", The, more, I, worked, on, Movement, the, m... \n",
|
||||
"3 [They, performed, their, first, gig, in, Galwa... \n",
|
||||
"4 [The, next, day, ,, the, regiment, pursued, th... \n",
|
||||
"5 [At, Elm, Coulee, Field, ,, the, Bakken, is, o... \n",
|
||||
"6 [He, has, also, released, his, own, solo, albu... \n",
|
||||
"7 [Carter, Anthony, Beauford, (, born, November,... \n",
|
||||
"8 [Thus, would, arise, the, confusion, between, ... \n",
|
||||
"9 [-DOCSTART-] \n",
|
||||
"0 [The, origin, of, Agotes, (, or, Cagots, ), is... \n",
|
||||
"1 [-DOCSTART-] \n",
|
||||
"2 [It, provides, full, -, and, part-time, polyte... \n",
|
||||
"3 [Since, she, was, the, daughter, of, the, grea... \n",
|
||||
"4 [The, goals, were, two, posts, ,, with, no, cr... \n",
|
||||
"5 [At, one, point, ,, so, many, orders, had, bee... \n",
|
||||
"6 [Left, camp, in, July, 1972, ,, and, was, deal... \n",
|
||||
"7 [She, fled, again, to, Abra, ,, where, she, wa... \n",
|
||||
"8 [As, the, younger, sibling, ,, Ben, was, const... \n",
|
||||
"9 [Milepost, 1, :, granite, masonry, arch, over,... \n",
|
||||
"\n",
|
||||
" labels \n",
|
||||
"0 [O, O, O, O, O, O, O, O, O, I-PER, I-PER, O, O... \n",
|
||||
"1 [O, O, O, O, I-MISC, I-MISC, I-MISC, O, O, O, ... \n",
|
||||
"2 [O, O, O, O, O, O, I-ORG, O, O, O, O, O, O, O,... \n",
|
||||
"3 [O, O, O, O, O, O, I-LOC, O, O, O, O] \n",
|
||||
"4 [O, O, O, O, O, O, O, O, I-ORG, O, O, O, O, O,... \n",
|
||||
"5 [O, I-LOC, I-LOC, I-LOC, O, O, I-LOC, O, O, O,... \n",
|
||||
"6 [O, O, O, O, O, O, O, O, O, I-MISC, I-MISC, O,... \n",
|
||||
"7 [I-PER, I-PER, I-PER, O, O, O, O, O, O, O, I-L... \n",
|
||||
"8 [O, O, O, O, O, O, I-MISC, O, I-MISC, O] \n",
|
||||
"9 [O] "
|
||||
"0 [O, O, O, I-MISC, O, O, I-MISC, O, O, O, O] \n",
|
||||
"1 [O] \n",
|
||||
"2 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n",
|
||||
"3 [O, O, O, O, O, O, O, O, I-MISC, O, O, O, I-MI... \n",
|
||||
"4 [O, O, O, O, O, O, O, O, O, O] \n",
|
||||
"5 [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... \n",
|
||||
"6 [O, O, O, O, O, O, O, O, O, O, O, I-ORG, I-ORG... \n",
|
||||
"7 [O, O, O, O, I-LOC, O, O, O, O, O, O] \n",
|
||||
"8 [O, O, O, O, O, I-PER, O, O, O, O, O, O, O, O,... \n",
|
||||
"9 [O, O, O, O, O, O, O, I-LOC, I-LOC, O] "
|
||||
]
|
||||
},
|
||||
"execution_count": 66,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"train_df, test_df = wikigold.load_train_test_dfs(test_fraction=TEST_DATA_FRACTION)\n",
|
||||
"train_df.head(10)"
|
||||
"# Show example sentences from input\n",
|
||||
"pd.DataFrame({\"sentence\": sentence_list, \"labels\": labels_list}).head(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 67,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -423,237 +466,93 @@
|
|||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>text</th>\n",
|
||||
" <th>labels</th>\n",
|
||||
" <th>token</th>\n",
|
||||
" <th>label</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>The</td>\n",
|
||||
" <td>In</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>words</td>\n",
|
||||
" <td>1999</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>were</td>\n",
|
||||
" <td>,</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>set</td>\n",
|
||||
" <td>the</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>to</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" <td>Caloi</td>\n",
|
||||
" <td>I-PER</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>a</td>\n",
|
||||
" <td>family</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>melody</td>\n",
|
||||
" <td>sold</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>7</th>\n",
|
||||
" <td>by</td>\n",
|
||||
" <td>the</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8</th>\n",
|
||||
" <td>composer</td>\n",
|
||||
" <td>majority</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9</th>\n",
|
||||
" <td>Alfredo</td>\n",
|
||||
" <td>I-PER</td>\n",
|
||||
" <td>of</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10</th>\n",
|
||||
" <td>Keil</td>\n",
|
||||
" <td>I-PER</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>11</th>\n",
|
||||
" <td>and</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>12</th>\n",
|
||||
" <td>the</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>13</th>\n",
|
||||
" <td>song</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>14</th>\n",
|
||||
" <td>soon</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>15</th>\n",
|
||||
" <td>became</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>16</th>\n",
|
||||
" <td>popular</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>17</th>\n",
|
||||
" <td>among</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>18</th>\n",
|
||||
" <td>people</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>19</th>\n",
|
||||
" <td>unhappy</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>20</th>\n",
|
||||
" <td>with</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>21</th>\n",
|
||||
" <td>what</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>22</th>\n",
|
||||
" <td>they</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>23</th>\n",
|
||||
" <td>considered</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>24</th>\n",
|
||||
" <td>a</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25</th>\n",
|
||||
" <td>submissive</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>26</th>\n",
|
||||
" <td>and</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>27</th>\n",
|
||||
" <td>humiliating</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>28</th>\n",
|
||||
" <td>attitude</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>29</th>\n",
|
||||
" <td>by</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>30</th>\n",
|
||||
" <td>the</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>31</th>\n",
|
||||
" <td>Portuguese</td>\n",
|
||||
" <td>I-MISC</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>32</th>\n",
|
||||
" <td>authorities</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>33</th>\n",
|
||||
" <td>.</td>\n",
|
||||
" <td>O</td>\n",
|
||||
" <td>Caloi</td>\n",
|
||||
" <td>I-ORG</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" text labels\n",
|
||||
"0 The O\n",
|
||||
"1 words O\n",
|
||||
"2 were O\n",
|
||||
"3 set O\n",
|
||||
"4 to O\n",
|
||||
"5 a O\n",
|
||||
"6 melody O\n",
|
||||
"7 by O\n",
|
||||
"8 composer O\n",
|
||||
"9 Alfredo I-PER\n",
|
||||
"10 Keil I-PER\n",
|
||||
"11 and O\n",
|
||||
"12 the O\n",
|
||||
"13 song O\n",
|
||||
"14 soon O\n",
|
||||
"15 became O\n",
|
||||
"16 popular O\n",
|
||||
"17 among O\n",
|
||||
"18 people O\n",
|
||||
"19 unhappy O\n",
|
||||
"20 with O\n",
|
||||
"21 what O\n",
|
||||
"22 they O\n",
|
||||
"23 considered O\n",
|
||||
"24 a O\n",
|
||||
"25 submissive O\n",
|
||||
"26 and O\n",
|
||||
"27 humiliating O\n",
|
||||
"28 attitude O\n",
|
||||
"29 by O\n",
|
||||
"30 the O\n",
|
||||
"31 Portuguese I-MISC\n",
|
||||
"32 authorities O\n",
|
||||
"33 . O"
|
||||
" token label\n",
|
||||
"0 In O\n",
|
||||
"1 1999 O\n",
|
||||
"2 , O\n",
|
||||
"3 the O\n",
|
||||
"4 Caloi I-PER\n",
|
||||
"5 family O\n",
|
||||
"6 sold O\n",
|
||||
"7 the O\n",
|
||||
"8 majority O\n",
|
||||
"9 of O\n",
|
||||
"10 Caloi I-ORG"
|
||||
]
|
||||
},
|
||||
"execution_count": 67,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pd.DataFrame({\"text\": train_df.sentence[0], \"labels\": train_df.labels[0]})"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The following is an example input sentence of the training set."
|
||||
"# Show example tokens from input\n",
|
||||
"pd.DataFrame({\"token\": train_sentence_list[0], \"label\": train_labels_list[0]}).head(11)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -673,27 +572,13 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 68,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 96.0/96.0 [00:00<00:00, 4.87kKB/s]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Maximum sequence length is: 144\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "6579121e24d1420f9fb51a607053fd91",
|
||||
"model_id": "ea57217fe6394812af03defcdaffe4db",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
|
@ -714,7 +599,7 @@
|
|||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "a7b1d7a70ab44472987d05cb20c3e2f5",
|
||||
"model_id": "00884141779a4ddead34204d5ea01b41",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
|
@ -748,15 +633,33 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"train_dataloader, test_dataloader, label_map, test_dataset = wikigold.load_dataset(\n",
|
||||
" local_path=DATA_PATH, \n",
|
||||
" test_fraction=TEST_DATA_FRACTION, \n",
|
||||
" random_seed=RANDOM_SEED,\n",
|
||||
" model_name=MODEL_NAME,\n",
|
||||
" to_lower = DO_LOWER_CASE,\n",
|
||||
"processor = TokenClassificationProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE, cache_dir=CACHE_DIR)\n",
|
||||
"\n",
|
||||
"label_map = TokenClassificationProcessor.create_label_map(\n",
|
||||
" label_lists=labels_list, trailing_piece_tag=TRAILING_PIECE_TAG\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"train_dataset = processor.preprocess(\n",
|
||||
" text=train_sentence_list,\n",
|
||||
" max_len=MAX_SEQ_LENGTH,\n",
|
||||
" batch_size=BATCH_SIZE,\n",
|
||||
" num_gpus=NUM_GPUS)"
|
||||
" labels=train_labels_list,\n",
|
||||
" label_map=label_map,\n",
|
||||
" trailing_piece_tag=TRAILING_PIECE_TAG,\n",
|
||||
")\n",
|
||||
"train_dataloader = dataloader_from_dataset(\n",
|
||||
" train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True, distributed=False\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"test_dataset = processor.preprocess(\n",
|
||||
" text=test_sentence_list,\n",
|
||||
" max_len=MAX_SEQ_LENGTH,\n",
|
||||
" labels=test_labels_list,\n",
|
||||
" label_map=label_map,\n",
|
||||
" trailing_piece_tag=TRAILING_PIECE_TAG,\n",
|
||||
")\n",
|
||||
"test_dataloader = dataloader_from_dataset(\n",
|
||||
" test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False, distributed=False\n",
|
||||
")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -770,34 +673,13 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 69,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "5ec7123ccc5c40fd93be354f5b6dbed6",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"HBox(children=(IntProgress(value=0, description='Downloading', max=411, style=ProgressStyle(description_width=…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "c13b6612f69840e7abbc81b2c2cdcd56",
|
||||
"model_id": "7cd3a9259b5c42638e8580f9fbae27db",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
|
@ -813,7 +695,7 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Training time : 0.058 hrs\n"
|
||||
"Training time : 0.060 hrs\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -854,14 +736,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 70,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Scoring: 100%|██████████| 35/35 [00:06<00:00, 5.68it/s]"
|
||||
"Scoring: 100%|██████████| 35/35 [00:06<00:00, 6.14it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -899,7 +781,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 71,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -915,7 +797,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 72,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -924,13 +806,13 @@
|
|||
"text": [
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" MISC 0.65 0.74 0.69 186\n",
|
||||
" LOC 0.84 0.85 0.85 316\n",
|
||||
" ORG 0.76 0.82 0.79 270\n",
|
||||
" PER 0.92 0.92 0.92 261\n",
|
||||
" ORG 0.72 0.76 0.74 274\n",
|
||||
" MISC 0.67 0.73 0.70 221\n",
|
||||
" LOC 0.79 0.84 0.81 317\n",
|
||||
" PER 0.90 0.93 0.92 257\n",
|
||||
"\n",
|
||||
"micro avg 0.77 0.84 0.81 1033\n",
|
||||
"macro avg 0.80 0.84 0.82 1033\n",
|
||||
"micro avg 0.76 0.82 0.79 1069\n",
|
||||
"macro avg 0.77 0.82 0.79 1069\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
|
@ -960,7 +842,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 73,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -968,7 +850,7 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"WARNING:root:Token lists with length > 512 will be truncated\n",
|
||||
"Scoring: 100%|██████████| 1/1 [00:00<00:00, 26.47it/s]"
|
||||
"Scoring: 100%|██████████| 1/1 [00:00<00:00, 25.31it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -1012,7 +894,6 @@
|
|||
"]\n",
|
||||
"sample_tokens = [x.split() for x in sample_text]\n",
|
||||
"\n",
|
||||
"processor = TokenClassificationProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE)\n",
|
||||
"sample_dataset = processor.preprocess(\n",
|
||||
" text=sample_tokens,\n",
|
||||
" max_len=MAX_SEQ_LENGTH,\n",
|
||||
|
@ -1048,13 +929,13 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 74,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/scrapbook.scrap.json+json": {
|
||||
"data": 0.8,
|
||||
"data": 0.77,
|
||||
"encoder": "json",
|
||||
"name": "precision",
|
||||
"version": 1
|
||||
|
@ -1072,7 +953,7 @@
|
|||
{
|
||||
"data": {
|
||||
"application/scrapbook.scrap.json+json": {
|
||||
"data": 0.84,
|
||||
"data": 0.82,
|
||||
"encoder": "json",
|
||||
"name": "recall",
|
||||
"version": 1
|
||||
|
@ -1090,7 +971,7 @@
|
|||
{
|
||||
"data": {
|
||||
"application/scrapbook.scrap.json+json": {
|
||||
"data": 0.82,
|
||||
"data": 0.79,
|
||||
"encoder": "json",
|
||||
"name": "f1",
|
||||
"version": 1
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
"""Common helper functions for preprocessing Named Entity Recognition (NER) datasets."""
|
||||
|
||||
|
||||
def preprocess_conll(text, sep="\t "):
|
||||
def preprocess_conll(text, sep="\t"):
|
||||
"""
|
||||
Converts data in CoNLL format to word and label lists.
|
||||
|
||||
|
@ -36,7 +36,7 @@ def preprocess_conll(text, sep="\t "):
|
|||
# split each sentence string into "word label" pairs
|
||||
s_split = s.split("\n")
|
||||
# split "word label" pairs
|
||||
s_split_split = [t.split() for t in s_split]
|
||||
s_split_split = [t.split(sep) for t in s_split]
|
||||
sentence_list.append([t[0] for t in s_split_split if len(t) > 1])
|
||||
labels_list.append([t[1] for t in s_split_split if len(t) > 1])
|
||||
if len(s_split_split) > max_seq_len:
|
||||
|
|
Загрузка…
Ссылка в новой задаче