Merge pull request #529 from microsoft/bleik/optim-patch
bleik/common transformers utils update
This commit is contained in:
Коммит
7dcdc32399
|
@ -233,7 +233,7 @@
|
|||
"source": [
|
||||
"with Timer() as t:\n",
|
||||
" preds = model.predict(\n",
|
||||
" eval_dataloader=test_dataloader,\n",
|
||||
" test_dataloader=test_dataloader,\n",
|
||||
" num_gpus=None,\n",
|
||||
" verbose=True\n",
|
||||
" )\n",
|
||||
|
|
|
@ -32,6 +32,7 @@
|
|||
"from sklearn.preprocessing import LabelEncoder\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"from utils_nlp.common.timer import Timer\n",
|
||||
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
|
||||
"from utils_nlp.dataset.multinli import load_pandas_df\n",
|
||||
"from utils_nlp.models.transformers.sequence_classification import (\n",
|
||||
" Processor, SequenceClassifier)"
|
||||
|
@ -93,7 +94,7 @@
|
|||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 222k/222k [01:25<00:00, 2.60kKB/s] \n"
|
||||
"100%|██████████| 222k/222k [01:20<00:00, 2.74kKB/s] \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -196,7 +197,7 @@
|
|||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/media/bleik2/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
|
||||
"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
|
||||
" FutureWarning)\n"
|
||||
]
|
||||
}
|
||||
|
@ -232,11 +233,11 @@
|
|||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"telephone 1055\n",
|
||||
"slate 1003\n",
|
||||
"travel 961\n",
|
||||
"fiction 952\n",
|
||||
"government 938\n",
|
||||
"telephone 1043\n",
|
||||
"slate 989\n",
|
||||
"fiction 968\n",
|
||||
"travel 964\n",
|
||||
"government 945\n",
|
||||
"Name: genre, dtype: int64"
|
||||
]
|
||||
},
|
||||
|
@ -385,32 +386,108 @@
|
|||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>15</th>\n",
|
||||
" <td>roberta-base</td>\n",
|
||||
" <td>bert-base-japanese</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>16</th>\n",
|
||||
" <td>roberta-large</td>\n",
|
||||
" <td>bert-base-japanese-whole-word-masking</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>17</th>\n",
|
||||
" <td>roberta-large-mnli</td>\n",
|
||||
" <td>bert-base-japanese-char</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>18</th>\n",
|
||||
" <td>xlnet-base-cased</td>\n",
|
||||
" <td>bert-base-japanese-char-whole-word-masking</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>19</th>\n",
|
||||
" <td>xlnet-large-cased</td>\n",
|
||||
" <td>bert-base-finnish-cased-v1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>20</th>\n",
|
||||
" <td>distilbert-base-uncased</td>\n",
|
||||
" <td>bert-base-finnish-uncased-v1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>21</th>\n",
|
||||
" <td>roberta-base</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>22</th>\n",
|
||||
" <td>roberta-large</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>23</th>\n",
|
||||
" <td>roberta-large-mnli</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>24</th>\n",
|
||||
" <td>distilroberta-base</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25</th>\n",
|
||||
" <td>roberta-base-openai-detector</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>26</th>\n",
|
||||
" <td>roberta-large-openai-detector</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>27</th>\n",
|
||||
" <td>xlnet-base-cased</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>28</th>\n",
|
||||
" <td>xlnet-large-cased</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>29</th>\n",
|
||||
" <td>distilbert-base-uncased</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>30</th>\n",
|
||||
" <td>distilbert-base-uncased-distilled-squad</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>31</th>\n",
|
||||
" <td>distilbert-base-german-cased</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>32</th>\n",
|
||||
" <td>distilbert-base-multilingual-cased</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>33</th>\n",
|
||||
" <td>albert-base-v1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>34</th>\n",
|
||||
" <td>albert-large-v1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>35</th>\n",
|
||||
" <td>albert-xlarge-v1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>36</th>\n",
|
||||
" <td>albert-xxlarge-v1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>37</th>\n",
|
||||
" <td>albert-base-v2</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>38</th>\n",
|
||||
" <td>albert-large-v2</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>39</th>\n",
|
||||
" <td>albert-xlarge-v2</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>40</th>\n",
|
||||
" <td>albert-xxlarge-v2</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
|
@ -432,13 +509,32 @@
|
|||
"12 bert-base-cased-finetuned-mrpc\n",
|
||||
"13 bert-base-german-dbmdz-cased\n",
|
||||
"14 bert-base-german-dbmdz-uncased\n",
|
||||
"15 roberta-base\n",
|
||||
"16 roberta-large\n",
|
||||
"17 roberta-large-mnli\n",
|
||||
"18 xlnet-base-cased\n",
|
||||
"19 xlnet-large-cased\n",
|
||||
"20 distilbert-base-uncased\n",
|
||||
"21 distilbert-base-uncased-distilled-squad"
|
||||
"15 bert-base-japanese\n",
|
||||
"16 bert-base-japanese-whole-word-masking\n",
|
||||
"17 bert-base-japanese-char\n",
|
||||
"18 bert-base-japanese-char-whole-word-masking\n",
|
||||
"19 bert-base-finnish-cased-v1\n",
|
||||
"20 bert-base-finnish-uncased-v1\n",
|
||||
"21 roberta-base\n",
|
||||
"22 roberta-large\n",
|
||||
"23 roberta-large-mnli\n",
|
||||
"24 distilroberta-base\n",
|
||||
"25 roberta-base-openai-detector\n",
|
||||
"26 roberta-large-openai-detector\n",
|
||||
"27 xlnet-base-cased\n",
|
||||
"28 xlnet-large-cased\n",
|
||||
"29 distilbert-base-uncased\n",
|
||||
"30 distilbert-base-uncased-distilled-squad\n",
|
||||
"31 distilbert-base-german-cased\n",
|
||||
"32 distilbert-base-multilingual-cased\n",
|
||||
"33 albert-base-v1\n",
|
||||
"34 albert-large-v1\n",
|
||||
"35 albert-xlarge-v1\n",
|
||||
"36 albert-xxlarge-v1\n",
|
||||
"37 albert-base-v2\n",
|
||||
"38 albert-large-v2\n",
|
||||
"39 albert-xlarge-v2\n",
|
||||
"40 albert-xxlarge-v2"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
|
@ -492,18 +588,8 @@
|
|||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 231508/231508 [00:00<00:00, 15545441.79B/s]\n",
|
||||
"100%|██████████| 492/492 [00:00<00:00, 560455.61B/s]\n",
|
||||
"100%|██████████| 267967963/267967963 [00:04<00:00, 61255588.46B/s]\n",
|
||||
"/media/bleik2/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
|
||||
" warnings.warn('Was asked to gather along dimension 0, but all '\n",
|
||||
"100%|██████████| 898823/898823 [00:00<00:00, 23932308.55B/s]\n",
|
||||
"100%|██████████| 456318/456318 [00:00<00:00, 23321916.66B/s]\n",
|
||||
"100%|██████████| 473/473 [00:00<00:00, 477015.10B/s]\n",
|
||||
"100%|██████████| 501200538/501200538 [00:07<00:00, 64332558.45B/s]\n",
|
||||
"100%|██████████| 798011/798011 [00:00<00:00, 25002433.16B/s]\n",
|
||||
"100%|██████████| 641/641 [00:00<00:00, 695974.34B/s]\n",
|
||||
"100%|██████████| 467042463/467042463 [00:08<00:00, 55154509.21B/s]\n"
|
||||
"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
|
||||
" warnings.warn('Was asked to gather along dimension 0, but all '\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -518,11 +604,17 @@
|
|||
" to_lower=model_name.endswith(\"uncased\"),\n",
|
||||
" cache_dir=CACHE_DIR,\n",
|
||||
" )\n",
|
||||
" train_dataloader = processor.create_dataloader_from_df(\n",
|
||||
" df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n",
|
||||
" train_dataset = processor.dataset_from_dataframe(\n",
|
||||
" df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN\n",
|
||||
" )\n",
|
||||
" test_dataloader = processor.create_dataloader_from_df(\n",
|
||||
" df_test, TEXT_COL, LABEL_COL, max_len=MAX_LEN, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n",
|
||||
" train_dataloader = dataloader_from_dataset(\n",
|
||||
" train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n",
|
||||
" )\n",
|
||||
" test_dataset = processor.dataset_from_dataframe(\n",
|
||||
" df_test, TEXT_COL, LABEL_COL, max_len=MAX_LEN\n",
|
||||
" )\n",
|
||||
" test_dataloader = dataloader_from_dataset(\n",
|
||||
" test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # fine-tune\n",
|
||||
|
@ -531,17 +623,12 @@
|
|||
" )\n",
|
||||
" with Timer() as t:\n",
|
||||
" classifier.fit(\n",
|
||||
" train_dataloader,\n",
|
||||
" num_epochs=NUM_EPOCHS,\n",
|
||||
" num_gpus=NUM_GPUS,\n",
|
||||
" verbose=False,\n",
|
||||
" train_dataloader, num_epochs=NUM_EPOCHS, num_gpus=NUM_GPUS, verbose=False,\n",
|
||||
" )\n",
|
||||
" train_time = t.interval / 3600\n",
|
||||
"\n",
|
||||
" # predict\n",
|
||||
" preds = classifier.predict(\n",
|
||||
" test_dataloader, num_gpus=NUM_GPUS, verbose=False\n",
|
||||
" )\n",
|
||||
" preds = classifier.predict(test_dataloader, num_gpus=NUM_GPUS, verbose=False)\n",
|
||||
"\n",
|
||||
" # eval\n",
|
||||
" accuracy = accuracy_score(df_test[LABEL_COL], preds)\n",
|
||||
|
@ -600,21 +687,21 @@
|
|||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>accuracy</th>\n",
|
||||
" <td>0.895477</td>\n",
|
||||
" <td>0.879584</td>\n",
|
||||
" <td>0.894866</td>\n",
|
||||
" <td>0.889364</td>\n",
|
||||
" <td>0.885697</td>\n",
|
||||
" <td>0.886308</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>f1-score</th>\n",
|
||||
" <td>0.896656</td>\n",
|
||||
" <td>0.881218</td>\n",
|
||||
" <td>0.896108</td>\n",
|
||||
" <td>0.885225</td>\n",
|
||||
" <td>0.880926</td>\n",
|
||||
" <td>0.881819</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>time(hrs)</th>\n",
|
||||
" <td>0.021865</td>\n",
|
||||
" <td>0.035351</td>\n",
|
||||
" <td>0.046295</td>\n",
|
||||
" <td>0.023326</td>\n",
|
||||
" <td>0.044209</td>\n",
|
||||
" <td>0.052801</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
|
@ -622,9 +709,9 @@
|
|||
],
|
||||
"text/plain": [
|
||||
" distilbert-base-uncased roberta-base xlnet-base-cased\n",
|
||||
"accuracy 0.895477 0.879584 0.894866\n",
|
||||
"f1-score 0.896656 0.881218 0.896108\n",
|
||||
"time(hrs) 0.021865 0.035351 0.046295"
|
||||
"accuracy 0.889364 0.885697 0.886308\n",
|
||||
"f1-score 0.885225 0.880926 0.881819\n",
|
||||
"time(hrs) 0.023326 0.044209 0.052801"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
|
@ -645,7 +732,7 @@
|
|||
{
|
||||
"data": {
|
||||
"application/scrapbook.scrap.json+json": {
|
||||
"data": 0.8899755501222494,
|
||||
"data": 0.887123064384678,
|
||||
"encoder": "json",
|
||||
"name": "accuracy",
|
||||
"version": 1
|
||||
|
@ -663,7 +750,7 @@
|
|||
{
|
||||
"data": {
|
||||
"application/scrapbook.scrap.json+json": {
|
||||
"data": 0.8913273009038569,
|
||||
"data": 0.8826569624491233,
|
||||
"encoder": "json",
|
||||
"name": "f1",
|
||||
"version": 1
|
||||
|
@ -688,9 +775,9 @@
|
|||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "nlp_gpu",
|
||||
"display_name": "Python 3.6.8 64-bit ('nlp_gpu': conda)",
|
||||
"language": "python",
|
||||
"name": "nlp_gpu"
|
||||
"name": "python36864bitnlpgpucondaa579511bcea84c65877ff3dca4205921"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -69,7 +69,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"parameters"
|
||||
|
@ -183,32 +183,108 @@
|
|||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>15</th>\n",
|
||||
" <td>roberta-base</td>\n",
|
||||
" <td>bert-base-japanese</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>16</th>\n",
|
||||
" <td>roberta-large</td>\n",
|
||||
" <td>bert-base-japanese-whole-word-masking</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>17</th>\n",
|
||||
" <td>roberta-large-mnli</td>\n",
|
||||
" <td>bert-base-japanese-char</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>18</th>\n",
|
||||
" <td>xlnet-base-cased</td>\n",
|
||||
" <td>bert-base-japanese-char-whole-word-masking</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>19</th>\n",
|
||||
" <td>xlnet-large-cased</td>\n",
|
||||
" <td>bert-base-finnish-cased-v1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>20</th>\n",
|
||||
" <td>distilbert-base-uncased</td>\n",
|
||||
" <td>bert-base-finnish-uncased-v1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>21</th>\n",
|
||||
" <td>roberta-base</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>22</th>\n",
|
||||
" <td>roberta-large</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>23</th>\n",
|
||||
" <td>roberta-large-mnli</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>24</th>\n",
|
||||
" <td>distilroberta-base</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25</th>\n",
|
||||
" <td>roberta-base-openai-detector</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>26</th>\n",
|
||||
" <td>roberta-large-openai-detector</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>27</th>\n",
|
||||
" <td>xlnet-base-cased</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>28</th>\n",
|
||||
" <td>xlnet-large-cased</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>29</th>\n",
|
||||
" <td>distilbert-base-uncased</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>30</th>\n",
|
||||
" <td>distilbert-base-uncased-distilled-squad</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>31</th>\n",
|
||||
" <td>distilbert-base-german-cased</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>32</th>\n",
|
||||
" <td>distilbert-base-multilingual-cased</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>33</th>\n",
|
||||
" <td>albert-base-v1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>34</th>\n",
|
||||
" <td>albert-large-v1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>35</th>\n",
|
||||
" <td>albert-xlarge-v1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>36</th>\n",
|
||||
" <td>albert-xxlarge-v1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>37</th>\n",
|
||||
" <td>albert-base-v2</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>38</th>\n",
|
||||
" <td>albert-large-v2</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>39</th>\n",
|
||||
" <td>albert-xlarge-v2</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>40</th>\n",
|
||||
" <td>albert-xxlarge-v2</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
|
@ -230,13 +306,32 @@
|
|||
"12 bert-base-cased-finetuned-mrpc\n",
|
||||
"13 bert-base-german-dbmdz-cased\n",
|
||||
"14 bert-base-german-dbmdz-uncased\n",
|
||||
"15 roberta-base\n",
|
||||
"16 roberta-large\n",
|
||||
"17 roberta-large-mnli\n",
|
||||
"18 xlnet-base-cased\n",
|
||||
"19 xlnet-large-cased\n",
|
||||
"20 distilbert-base-uncased\n",
|
||||
"21 distilbert-base-uncased-distilled-squad"
|
||||
"15 bert-base-japanese\n",
|
||||
"16 bert-base-japanese-whole-word-masking\n",
|
||||
"17 bert-base-japanese-char\n",
|
||||
"18 bert-base-japanese-char-whole-word-masking\n",
|
||||
"19 bert-base-finnish-cased-v1\n",
|
||||
"20 bert-base-finnish-uncased-v1\n",
|
||||
"21 roberta-base\n",
|
||||
"22 roberta-large\n",
|
||||
"23 roberta-large-mnli\n",
|
||||
"24 distilroberta-base\n",
|
||||
"25 roberta-base-openai-detector\n",
|
||||
"26 roberta-large-openai-detector\n",
|
||||
"27 xlnet-base-cased\n",
|
||||
"28 xlnet-large-cased\n",
|
||||
"29 distilbert-base-uncased\n",
|
||||
"30 distilbert-base-uncased-distilled-squad\n",
|
||||
"31 distilbert-base-german-cased\n",
|
||||
"32 distilbert-base-multilingual-cased\n",
|
||||
"33 albert-base-v1\n",
|
||||
"34 albert-large-v1\n",
|
||||
"35 albert-xlarge-v1\n",
|
||||
"36 albert-xxlarge-v1\n",
|
||||
"37 albert-base-v2\n",
|
||||
"38 albert-large-v2\n",
|
||||
"39 albert-xlarge-v2\n",
|
||||
"40 albert-xxlarge-v2"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
|
@ -264,7 +359,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -281,7 +376,7 @@
|
|||
" 'num_train_epochs': 5,\n",
|
||||
" 'num_gpus': 2,\n",
|
||||
" 'batch_size': 16,\n",
|
||||
" 'verbose': True,\n",
|
||||
" 'verbose': False,\n",
|
||||
" 'load_dataset_func': None,\n",
|
||||
" 'get_labels_func': None\n",
|
||||
"}\n",
|
||||
|
@ -325,9 +420,19 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 80.1k/80.1k [00:02<00:00, 30.8kKB/s]\n",
|
||||
"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
|
||||
" FutureWarning)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"train_dataloader, test_dataloader, label_encoder, test_labels = CONFIG['load_dataset_func'](\n",
|
||||
" local_path=CONFIG['local_path'],\n",
|
||||
|
@ -354,11 +459,27 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
|
||||
" warnings.warn('Was asked to gather along dimension 0, but all '\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Training time : 0.190 hrs\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = SequenceClassifier(\n",
|
||||
" model_name=CONFIG['model_name'],\n",
|
||||
|
@ -390,9 +511,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Prediction time : 0.021 hrs\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"with Timer() as t:\n",
|
||||
" preds = model.predict(\n",
|
||||
|
@ -422,11 +551,11 @@
|
|||
"text": [
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" culture 0.89 0.89 0.89 843\n",
|
||||
" diverse 0.99 0.99 0.99 1738\n",
|
||||
" economy 0.96 0.96 0.96 661\n",
|
||||
" politics 0.94 0.94 0.94 530\n",
|
||||
" sports 0.87 0.87 0.87 580\n",
|
||||
" culture 0.93 0.94 0.93 548\n",
|
||||
" diverse 0.94 0.94 0.94 640\n",
|
||||
" economy 0.90 0.88 0.89 570\n",
|
||||
" politics 0.87 0.88 0.88 809\n",
|
||||
" sports 0.99 0.98 0.99 1785\n",
|
||||
"\n",
|
||||
" micro avg 0.94 0.94 0.94 4352\n",
|
||||
" macro avg 0.93 0.93 0.93 4352\n",
|
||||
|
@ -449,9 +578,64 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/scrapbook.scrap.json+json": {
|
||||
"data": 0.94,
|
||||
"encoder": "json",
|
||||
"name": "precision",
|
||||
"version": 1
|
||||
}
|
||||
},
|
||||
"metadata": {
|
||||
"scrapbook": {
|
||||
"data": true,
|
||||
"display": false,
|
||||
"name": "precision"
|
||||
}
|
||||
},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/scrapbook.scrap.json+json": {
|
||||
"data": 0.94,
|
||||
"encoder": "json",
|
||||
"name": "recall",
|
||||
"version": 1
|
||||
}
|
||||
},
|
||||
"metadata": {
|
||||
"scrapbook": {
|
||||
"data": true,
|
||||
"display": false,
|
||||
"name": "recall"
|
||||
}
|
||||
},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/scrapbook.scrap.json+json": {
|
||||
"data": 0.94,
|
||||
"encoder": "json",
|
||||
"name": "f1",
|
||||
"version": 1
|
||||
}
|
||||
},
|
||||
"metadata": {
|
||||
"scrapbook": {
|
||||
"data": true,
|
||||
"display": false,
|
||||
"name": "f1"
|
||||
}
|
||||
},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# for testing\n",
|
||||
"report_splits = report.split('\\n')[-2].split()\n",
|
||||
|
@ -463,11 +647,10 @@
|
|||
}
|
||||
],
|
||||
"metadata": {
|
||||
"celltoolbar": "Tags",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3.6.8 64-bit ('nlp_gpu': conda)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
"name": "python36864bitnlpgpucondaa579511bcea84c65877ff3dca4205921"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
|
|
@ -1,14 +1,10 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import json
|
||||
import shutil
|
||||
import pytest
|
||||
import papermill as pm
|
||||
import pytest
|
||||
import scrapbook as sb
|
||||
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
|
||||
|
||||
from tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK
|
||||
|
||||
ABS_TOL = 0.02
|
||||
|
||||
|
@ -31,13 +27,10 @@ def test_extractive_summarization_cnndm_transformers(notebooks, tmp):
|
|||
CACHE_DIR=tmp,
|
||||
BATCH_SIZE=3000,
|
||||
REPORT_EVERY=50,
|
||||
MAX_STEPS=1e3,
|
||||
MAX_STEPS=1000,
|
||||
WARMUP_STEPS=5e2,
|
||||
MODEL_NAME="distilbert-base-uncased",
|
||||
),
|
||||
)
|
||||
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
|
||||
print(result)
|
||||
assert pytest.approx(result["rouge_2_f_score"], 0.1, abs=ABS_TOL)
|
||||
|
||||
|
||||
|
|
|
@ -33,8 +33,8 @@ def test_tc_mnli_transformers(notebooks, tmp):
|
|||
),
|
||||
)
|
||||
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
|
||||
assert pytest.approx(result["accuracy"], 0.89, abs=ABS_TOL)
|
||||
assert pytest.approx(result["f1"], 0.89, abs=ABS_TOL)
|
||||
assert pytest.approx(result["accuracy"], 0.885, abs=ABS_TOL)
|
||||
assert pytest.approx(result["f1"], 0.885, abs=ABS_TOL)
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
|
|
|
@ -9,4 +9,3 @@ import torch
|
|||
@pytest.mark.gpu
|
||||
def test_machine_is_gpu_machine():
|
||||
assert torch.cuda.is_available() is True
|
||||
|
||||
|
|
|
@ -1,79 +0,0 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import pytest
|
||||
|
||||
from utils_nlp.models.bert.token_classification import (
|
||||
BERTTokenClassifier,
|
||||
postprocess_token_labels,
|
||||
)
|
||||
|
||||
|
||||
def test_token_classifier_num_labels():
|
||||
with pytest.raises(ValueError):
|
||||
BERTTokenClassifier(num_labels=1)
|
||||
|
||||
|
||||
def test_token_classifier_fit_predict(tmp_path, ner_test_data):
|
||||
token_classifier = BERTTokenClassifier(num_labels=6, cache_dir=tmp_path)
|
||||
|
||||
# test fit, no warmup
|
||||
token_classifier.fit(
|
||||
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
|
||||
input_mask=ner_test_data["INPUT_MASK"],
|
||||
labels=ner_test_data["INPUT_LABEL_IDS"],
|
||||
)
|
||||
|
||||
# test fit, with warmup
|
||||
token_classifier.fit(
|
||||
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
|
||||
input_mask=ner_test_data["INPUT_MASK"],
|
||||
labels=ner_test_data["INPUT_LABEL_IDS"],
|
||||
warmup_proportion=0.1,
|
||||
)
|
||||
# test predict, no labels
|
||||
token_classifier.predict(
|
||||
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
|
||||
input_mask=ner_test_data["INPUT_MASK"],
|
||||
)
|
||||
|
||||
# test predict, with labels
|
||||
token_classifier.predict(
|
||||
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
|
||||
input_mask=ner_test_data["INPUT_MASK"],
|
||||
labels=ner_test_data["INPUT_LABEL_IDS"],
|
||||
)
|
||||
|
||||
# test output probabilities
|
||||
predictions = token_classifier.predict(
|
||||
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
|
||||
input_mask=ner_test_data["INPUT_MASK"],
|
||||
labels=ner_test_data["INPUT_LABEL_IDS"],
|
||||
probabilities=True,
|
||||
)
|
||||
assert len(predictions.classes) == predictions.probabilities.shape[0]
|
||||
|
||||
|
||||
def test_postprocess_token_labels(ner_test_data):
|
||||
labels_no_padding = postprocess_token_labels(
|
||||
labels=ner_test_data["PREDICTED_LABELS"],
|
||||
input_mask=ner_test_data["INPUT_MASK"],
|
||||
label_map=ner_test_data["LABEL_MAP"],
|
||||
)
|
||||
|
||||
assert labels_no_padding == ner_test_data["EXPECTED_TOKENS_NO_PADDING"]
|
||||
|
||||
|
||||
def test_postprocess_token_labels_remove_trailing(ner_test_data):
|
||||
labels_no_padding_no_trailing = postprocess_token_labels(
|
||||
labels=ner_test_data["PREDICTED_LABELS"],
|
||||
input_mask=ner_test_data["INPUT_MASK"],
|
||||
label_map=ner_test_data["LABEL_MAP"],
|
||||
remove_trailing_word_pieces=True,
|
||||
trailing_token_mask=ner_test_data["TRAILING_TOKEN_MASK"],
|
||||
)
|
||||
|
||||
assert (
|
||||
labels_no_padding_no_trailing
|
||||
== ner_test_data["EXPECTED_TOKENS_NO_PADDING_NO_TRAILING"]
|
||||
)
|
|
@ -1,14 +1,15 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""PyTorch utils tests."""
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn.parallel.data_parallel import DataParallel
|
||||
from torch.nn.modules.container import Sequential
|
||||
from torch.nn.parallel.data_parallel import DataParallel
|
||||
|
||||
from utils_nlp.common.pytorch_utils import get_device, move_to_device
|
||||
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -55,49 +56,47 @@ def test_get_device_local_rank():
|
|||
|
||||
def test_move_to_device_cpu(model):
|
||||
# test when device.type="cpu"
|
||||
model_cpu = move_to_device(model, torch.device("cpu"))
|
||||
model_cpu = move_model_to_device(model, torch.device("cpu"))
|
||||
assert isinstance(model_cpu, nn.modules.container.Sequential)
|
||||
|
||||
|
||||
def test_move_to_device_cpu_parallelized(model):
|
||||
# test when input model is parallelized
|
||||
model_parallelized = nn.DataParallel(model)
|
||||
model_parallelized_output = move_to_device(model_parallelized, torch.device("cpu"))
|
||||
model_parallelized_output = move_model_to_device(model_parallelized, torch.device("cpu"))
|
||||
assert isinstance(model_parallelized_output, nn.modules.container.Sequential)
|
||||
|
||||
|
||||
def test_move_to_device_exception_not_torch_device(model):
|
||||
# test when device is not torch.device
|
||||
with pytest.raises(ValueError):
|
||||
move_to_device(model, "abc")
|
||||
move_model_to_device(model, "abc")
|
||||
|
||||
|
||||
def test_move_to_device_exception_wrong_type(model):
|
||||
# test when device.type is not "cuda" or "cpu"
|
||||
with pytest.raises(Exception):
|
||||
move_to_device(model, torch.device("opengl"))
|
||||
move_model_to_device(model, torch.device("opengl"))
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
torch.cuda.is_available(), reason="Skip if we are executing the cpu tests on a gpu machine"
|
||||
)
|
||||
@pytest.mark.skipif(torch.cuda.is_available(), reason="Skip if we are executing the cpu tests on a gpu machine")
|
||||
def test_move_to_device_exception_gpu_model_on_cpu_machine(model):
|
||||
# test when the model is moved to a gpu but it is a cpu machine
|
||||
with pytest.raises(Exception):
|
||||
move_to_device(model, torch.device("cuda"))
|
||||
move_model_to_device(model, torch.device("cuda"))
|
||||
|
||||
|
||||
@pytest.mark.gpu
|
||||
def test_move_to_device_exception_cuda_zero_gpus(model):
|
||||
# test when device.type is cuda, but num_gpus is 0
|
||||
with pytest.raises(ValueError):
|
||||
move_to_device(model, torch.device("cuda"), num_gpus=0)
|
||||
move_model_to_device(model, torch.device("cuda"), num_gpus=0)
|
||||
|
||||
|
||||
@pytest.mark.gpu
|
||||
def test_move_to_device_gpu(model):
|
||||
# test when device.type="cuda"
|
||||
model_cuda = move_to_device(model, torch.device("cuda"))
|
||||
model_cuda = move_model_to_device(model, torch.device("cuda"))
|
||||
num_cuda_devices = torch.cuda.device_count()
|
||||
|
||||
if num_cuda_devices > 1:
|
||||
|
@ -105,18 +104,16 @@ def test_move_to_device_gpu(model):
|
|||
else:
|
||||
assert isinstance(model_cuda, Sequential)
|
||||
|
||||
model_cuda_1_gpu = move_to_device(model, torch.device("cuda"), num_gpus=1)
|
||||
model_cuda_1_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=1)
|
||||
assert isinstance(model_cuda_1_gpu, Sequential)
|
||||
|
||||
model_cuda_1_more_gpu = move_to_device(
|
||||
model, torch.device("cuda"), num_gpus=num_cuda_devices + 1
|
||||
)
|
||||
model_cuda_1_more_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices + 1)
|
||||
if num_cuda_devices > 1:
|
||||
assert isinstance(model_cuda_1_more_gpu, DataParallel)
|
||||
else:
|
||||
assert isinstance(model_cuda_1_more_gpu, Sequential)
|
||||
|
||||
model_cuda_same_gpu = move_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices)
|
||||
model_cuda_same_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices)
|
||||
if num_cuda_devices > 1:
|
||||
assert isinstance(model_cuda_same_gpu, DataParallel)
|
||||
else:
|
||||
|
|
|
@ -1,14 +1,12 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import nltk
|
||||
|
||||
nltk.download("punkt")
|
||||
from nltk import tokenize
|
||||
import pytest
|
||||
import os
|
||||
import shutil
|
||||
|
||||
import nltk
|
||||
nltk.download("punkt")
|
||||
import pytest
|
||||
from nltk import tokenize
|
||||
|
||||
from utils_nlp.models.transformers.datasets import SummarizationDataset
|
||||
from utils_nlp.models.transformers.extractive_summarization import (
|
||||
|
@ -17,6 +15,9 @@ from utils_nlp.models.transformers.extractive_summarization import (
|
|||
ExtSumProcessor,
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
# @pytest.fixture()
|
||||
def source_data():
|
||||
return (
|
||||
|
@ -48,18 +49,10 @@ def data_to_file(tmp_module):
|
|||
f.write(target)
|
||||
f.close()
|
||||
train_dataset = SummarizationDataset(
|
||||
source_file,
|
||||
target_file,
|
||||
[tokenize.sent_tokenize],
|
||||
[tokenize.sent_tokenize],
|
||||
nltk.word_tokenize,
|
||||
source_file, target_file, [tokenize.sent_tokenize], [tokenize.sent_tokenize], nltk.word_tokenize,
|
||||
)
|
||||
test_dataset = SummarizationDataset(
|
||||
source_file,
|
||||
target_file,
|
||||
[tokenize.sent_tokenize],
|
||||
[tokenize.sent_tokenize],
|
||||
nltk.word_tokenize,
|
||||
source_file, target_file, [tokenize.sent_tokenize], [tokenize.sent_tokenize], nltk.word_tokenize,
|
||||
)
|
||||
|
||||
processor = ExtSumProcessor(
|
||||
|
@ -70,20 +63,12 @@ def data_to_file(tmp_module):
|
|||
min_nsents=0,
|
||||
min_src_ntokens=1,
|
||||
)
|
||||
ext_sum_train = processor.preprocess(
|
||||
train_dataset, train_dataset.get_target(), oracle_mode="greedy"
|
||||
)
|
||||
ext_sum_test = processor.preprocess(
|
||||
test_dataset, test_dataset.get_target(), oracle_mode="greedy"
|
||||
)
|
||||
ext_sum_train = processor.preprocess(train_dataset, train_dataset.get_target(), oracle_mode="greedy")
|
||||
ext_sum_test = processor.preprocess(test_dataset, test_dataset.get_target(), oracle_mode="greedy")
|
||||
|
||||
save_path = os.path.join(tmp_module, "processed")
|
||||
train_files = ExtSumProcessedData.save_data(
|
||||
ext_sum_train, is_test=False, save_path=save_path, chunk_size=2000
|
||||
)
|
||||
test_files = ExtSumProcessedData.save_data(
|
||||
ext_sum_test, is_test=True, save_path=save_path, chunk_size=2000
|
||||
)
|
||||
train_files = ExtSumProcessedData.save_data(ext_sum_train, is_test=False, save_path=save_path, chunk_size=2000)
|
||||
test_files = ExtSumProcessedData.save_data(ext_sum_test, is_test=True, save_path=save_path, chunk_size=2000)
|
||||
print(train_files)
|
||||
print(test_files)
|
||||
assert os.path.exists(train_files[0])
|
||||
|
@ -96,10 +81,10 @@ def test_bert_training(data_to_file, tmp_module):
|
|||
|
||||
CACHE_DIR = tmp_module
|
||||
ENCODER = "transformer"
|
||||
BATCH_SIZE = 200
|
||||
BATCH_SIZE = 128
|
||||
LEARNING_RATE = 2e-3
|
||||
REPORT_EVERY = 100
|
||||
MAX_STEPS = 5e2
|
||||
REPORT_EVERY = 50
|
||||
MAX_STEPS = 2e2
|
||||
WARMUP_STEPS = 1e2
|
||||
DATA_SAVED_PATH = data_to_file
|
||||
result_base_path = "./results"
|
||||
|
|
|
@ -1,18 +1,20 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import pytest
|
||||
import os
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
|
||||
from utils_nlp.models.transformers.datasets import QADataset
|
||||
from utils_nlp.models.transformers.question_answering import (
|
||||
QAProcessor,
|
||||
AnswerExtractor,
|
||||
CACHED_EXAMPLES_TEST_FILE,
|
||||
CACHED_FEATURES_TEST_FILE,
|
||||
AnswerExtractor,
|
||||
QAProcessor,
|
||||
)
|
||||
|
||||
import torch
|
||||
|
||||
NUM_GPUS = max(1, torch.cuda.device_count())
|
||||
BATCH_SIZE = 8
|
||||
|
||||
|
@ -109,9 +111,7 @@ def qa_test_data(qa_test_df, tmp_module):
|
|||
feature_cache_dir=tmp_module,
|
||||
)
|
||||
|
||||
qa_processor_distilbert = QAProcessor(
|
||||
model_name="distilbert-base-uncased", cache_dir=tmp_module
|
||||
)
|
||||
qa_processor_distilbert = QAProcessor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
|
||||
train_features_distilbert = qa_processor_distilbert.preprocess(
|
||||
train_dataset,
|
||||
batch_size=BATCH_SIZE,
|
||||
|
@ -153,15 +153,9 @@ def qa_test_data(qa_test_df, tmp_module):
|
|||
def test_QAProcessor(qa_test_data, tmp_module):
|
||||
for model_name in ["bert-base-cased", "xlnet-base-cased", "distilbert-base-uncased"]:
|
||||
qa_processor = QAProcessor(model_name=model_name, cache_dir=tmp_module)
|
||||
qa_processor.preprocess(
|
||||
qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module
|
||||
)
|
||||
qa_processor.preprocess(
|
||||
qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module
|
||||
)
|
||||
qa_processor.preprocess(
|
||||
qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module
|
||||
)
|
||||
qa_processor.preprocess(qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module)
|
||||
qa_processor.preprocess(qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module)
|
||||
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module)
|
||||
|
||||
# test unsupported model type
|
||||
with pytest.raises(ValueError):
|
||||
|
@ -169,51 +163,49 @@ def test_QAProcessor(qa_test_data, tmp_module):
|
|||
|
||||
# test training data has no ground truth exception
|
||||
with pytest.raises(Exception):
|
||||
qa_processor.preprocess(
|
||||
qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module
|
||||
)
|
||||
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module)
|
||||
|
||||
# test when answer start is a list, but answer text is not
|
||||
with pytest.raises(Exception):
|
||||
qa_processor.preprocess(
|
||||
qa_test_data["train_dataset_start_text_mismatch"],
|
||||
is_training=True,
|
||||
feature_cache_dir=tmp_module,
|
||||
qa_test_data["train_dataset_start_text_mismatch"], is_training=True, feature_cache_dir=tmp_module,
|
||||
)
|
||||
|
||||
# test when training data has multiple answers
|
||||
with pytest.raises(Exception):
|
||||
qa_processor.preprocess(
|
||||
qa_test_data["train_dataset_multi_answers"],
|
||||
is_training=True,
|
||||
feature_cache_dir=tmp_module,
|
||||
qa_test_data["train_dataset_multi_answers"], is_training=True, feature_cache_dir=tmp_module,
|
||||
)
|
||||
|
||||
|
||||
def test_AnswerExtractor(qa_test_data, tmp_module):
|
||||
# test bert
|
||||
# bert
|
||||
qa_extractor_bert = AnswerExtractor(cache_dir=tmp_module)
|
||||
qa_extractor_bert.fit(qa_test_data["train_features_bert"], cache_model=True)
|
||||
train_loader_bert = dataloader_from_dataset(qa_test_data["train_features_bert"])
|
||||
test_loader_bert = dataloader_from_dataset(qa_test_data["test_features_bert"], shuffle=False)
|
||||
qa_extractor_bert.fit(train_loader_bert, verbose=False, cache_model=True)
|
||||
|
||||
# test saving fine-tuned model
|
||||
model_output_dir = os.path.join(tmp_module, "fine_tuned")
|
||||
assert os.path.exists(os.path.join(model_output_dir, "pytorch_model.bin"))
|
||||
assert os.path.exists(os.path.join(model_output_dir, "config.json"))
|
||||
|
||||
qa_extractor_from_cache = AnswerExtractor(
|
||||
cache_dir=tmp_module, load_model_from_dir=model_output_dir
|
||||
)
|
||||
qa_extractor_from_cache.predict(qa_test_data["test_features_bert"])
|
||||
qa_extractor_from_cache = AnswerExtractor(cache_dir=tmp_module, load_model_from_dir=model_output_dir)
|
||||
qa_extractor_from_cache.predict(test_loader_bert, verbose=False)
|
||||
|
||||
# xlnet
|
||||
train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_xlnet"])
|
||||
test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_xlnet"], shuffle=False)
|
||||
qa_extractor_xlnet = AnswerExtractor(model_name="xlnet-base-cased", cache_dir=tmp_module)
|
||||
qa_extractor_xlnet.fit(qa_test_data["train_features_xlnet"], cache_model=False)
|
||||
qa_extractor_xlnet.predict(qa_test_data["test_features_xlnet"])
|
||||
qa_extractor_xlnet.fit(train_loader_xlnet, verbose=False, cache_model=False)
|
||||
qa_extractor_xlnet.predict(test_loader_xlnet, verbose=False)
|
||||
|
||||
qa_extractor_distilbert = AnswerExtractor(
|
||||
model_name="distilbert-base-uncased", cache_dir=tmp_module
|
||||
)
|
||||
qa_extractor_distilbert.fit(qa_test_data["train_features_distilbert"], cache_model=False)
|
||||
qa_extractor_distilbert.predict(qa_test_data["test_features_distilbert"])
|
||||
# distilbert
|
||||
train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_distilbert"])
|
||||
test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_distilbert"], shuffle=False)
|
||||
qa_extractor_distilbert = AnswerExtractor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
|
||||
qa_extractor_distilbert.fit(train_loader_xlnet, verbose=False, cache_model=False)
|
||||
qa_extractor_distilbert.predict(test_loader_xlnet, verbose=False)
|
||||
|
||||
|
||||
def test_postprocess_bert_answer(qa_test_data, tmp_module):
|
||||
|
@ -226,8 +218,9 @@ def test_postprocess_bert_answer(qa_test_data, tmp_module):
|
|||
doc_stride=32,
|
||||
feature_cache_dir=tmp_module,
|
||||
)
|
||||
test_loader = dataloader_from_dataset(test_features, shuffle=False)
|
||||
qa_extractor = AnswerExtractor(cache_dir=tmp_module)
|
||||
predictions = qa_extractor.predict(test_features)
|
||||
predictions = qa_extractor.predict(test_loader)
|
||||
|
||||
qa_processor.postprocess(
|
||||
results=predictions,
|
||||
|
@ -260,8 +253,9 @@ def test_postprocess_xlnet_answer(qa_test_data, tmp_module):
|
|||
doc_stride=32,
|
||||
feature_cache_dir=tmp_module,
|
||||
)
|
||||
test_loader = dataloader_from_dataset(test_features, shuffle=False)
|
||||
qa_extractor = AnswerExtractor(model_name="xlnet-base-cased", cache_dir=tmp_module)
|
||||
predictions = qa_extractor.predict(test_features)
|
||||
predictions = qa_extractor.predict(test_loader)
|
||||
|
||||
qa_processor.postprocess(
|
||||
results=predictions,
|
||||
|
|
|
@ -5,6 +5,7 @@ import pytest
|
|||
import pandas as pd
|
||||
|
||||
from utils_nlp.models.transformers.sequence_classification import SequenceClassifier, Processor
|
||||
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
|
@ -19,12 +20,11 @@ def test_classifier(data, tmpdir):
|
|||
num_labels = len(pd.unique(data[1]))
|
||||
model_name = "bert-base-uncased"
|
||||
processor = Processor(model_name=model_name, cache_dir=tmpdir)
|
||||
train_dataloader = processor.create_dataloader_from_df(
|
||||
df, "text", "label", batch_size=2, num_gpus=0
|
||||
)
|
||||
ds = processor.dataset_from_dataframe(df, "text", "label")
|
||||
dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=0, shuffle=True)
|
||||
classifier = SequenceClassifier(model_name=model_name, num_labels=num_labels, cache_dir=tmpdir)
|
||||
classifier.fit(train_dataloader=train_dataloader, num_epochs=1, num_gpus=0, verbose=False)
|
||||
preds = classifier.predict(train_dataloader, num_gpus=0, verbose=False)
|
||||
classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=0, verbose=False)
|
||||
preds = classifier.predict(dl, num_gpus=0, verbose=False)
|
||||
assert len(preds) == len(data[1])
|
||||
|
||||
|
||||
|
@ -35,17 +35,16 @@ def test_classifier_gpu_train_cpu_predict(data, tmpdir):
|
|||
num_labels = len(pd.unique(data[1]))
|
||||
model_name = "bert-base-uncased"
|
||||
processor = Processor(model_name=model_name, cache_dir=tmpdir)
|
||||
train_dataloader = processor.create_dataloader_from_df(
|
||||
df, "text", "label", batch_size=2, num_gpus=1
|
||||
)
|
||||
ds = processor.dataset_from_dataframe(df, "text", "label")
|
||||
dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=1, shuffle=True)
|
||||
classifier = SequenceClassifier(model_name=model_name, num_labels=num_labels, cache_dir=tmpdir)
|
||||
classifier.fit(train_dataloader=train_dataloader, num_epochs=1, num_gpus=1, verbose=False)
|
||||
classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=1, verbose=False)
|
||||
|
||||
assert next(classifier.model.parameters()).is_cuda is True
|
||||
# gpu prediction, no model move
|
||||
preds = classifier.predict(train_dataloader, num_gpus=1, verbose=False)
|
||||
preds = classifier.predict(dl, num_gpus=1, verbose=False)
|
||||
assert len(preds) == len(data[1])
|
||||
# cpu prediction, need model move
|
||||
assert next(classifier.model.parameters()).is_cuda is True
|
||||
preds = classifier.predict(train_dataloader, num_gpus=0, verbose=False)
|
||||
preds = classifier.predict(dl, num_gpus=0, verbose=False)
|
||||
assert next(classifier.model.parameters()).is_cuda is False
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import pytest
|
||||
|
||||
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
|
||||
from utils_nlp.models.transformers.named_entity_recognition import TokenClassificationProcessor, TokenClassifier
|
||||
|
||||
|
||||
@pytest.mark.cpu
|
||||
def test_token_classifier_fit_predict(tmpdir, ner_test_data):
|
||||
token_classifier = TokenClassifier(model_name="bert-base-uncased", num_labels=6, cache_dir=tmpdir)
|
||||
processor = TokenClassificationProcessor(model_name="bert-base-uncased", cache_dir=tmpdir)
|
||||
|
||||
# test fit, no warmup
|
||||
train_dataset = processor.preprocess_for_bert(
|
||||
text=ner_test_data["INPUT_TEXT"], labels=ner_test_data["INPUT_LABELS"], label_map=ner_test_data["LABEL_MAP"],
|
||||
)
|
||||
train_dataloader = dataloader_from_dataset(train_dataset)
|
||||
token_classifier.fit(train_dataloader)
|
||||
|
||||
# test predict, no labels
|
||||
_ = token_classifier.predict(train_dataloader, verbose=False)
|
|
@ -1,11 +1,11 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Common PyTorch utilities that facilitate building Pytorch models."""
|
||||
"""Common PyTorch utilities that facilitate building PyTorch models."""
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import warnings
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
|
||||
|
||||
def get_device(
|
||||
|
@ -17,11 +17,7 @@ def get_device(
|
|||
# init_method="file:///distributed",
|
||||
):
|
||||
if local_rank == -1:
|
||||
num_gpus = (
|
||||
min(num_gpus, torch.cuda.device_count())
|
||||
if num_gpus is not None
|
||||
else torch.cuda.device_count()
|
||||
)
|
||||
num_gpus = min(num_gpus, torch.cuda.device_count()) if num_gpus is not None else torch.cuda.device_count()
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and num_gpus > 0 else "cpu")
|
||||
else:
|
||||
torch.cuda.set_device(local_rank)
|
||||
|
@ -32,59 +28,109 @@ def get_device(
|
|||
return device, num_gpus
|
||||
|
||||
|
||||
def move_to_device(model, device, num_gpus=None):
|
||||
def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank=-1):
    """Moves a model to the specified device (cpu or gpu/s)
       and implements data parallelism when multiple gpus are specified.

    Args:
        model (Module): A PyTorch model.
        device (torch.device): A PyTorch device.
        num_gpus (int): The number of GPUs to be used.
            If set to None, all available GPUs will be used.
            Defaults to None.
        gpu_ids (list): List of GPU IDs to be used.
            If None, the first num_gpus GPUs will be used.
            If not None, overrides num_gpus.
            Defaults to None.
        local_rank (int): Local GPU ID within a node. Used in distributed environments.
            If not -1, num_gpus and gpu_ids are ignored.
            Defaults to -1.

    Returns:
        Module, DataParallel, DistributedDataParallel: A PyTorch Module or
            a DataParallel/DistributedDataParallel wrapper (when multiple gpus are used).

    Raises:
        ValueError: If device is not a torch.device, or if num_gpus is given
            and is smaller than 1 while device.type is "cuda".
        Exception: If device.type is "cuda" but no CUDA devices are available.
    """
    if not isinstance(device, torch.device):
        raise ValueError("device must be of type torch.device.")

    # unwrap model so that it is never wrapped twice
    if isinstance(model, torch.nn.DataParallel):
        model = model.module

    # wrap in DataParallel or DistributedDataParallel
    if local_rank != -1:
        # This is a plain function, not a method: operate on the local `model`
        # (the previous `self.model` raised NameError and dropped the wrapper).
        # The parameters are moved first because DistributedDataParallel with
        # device_ids expects the module to already live on that device.
        model = model.to(device)
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True,
        )
    elif device.type == "cuda":
        if num_gpus is not None and num_gpus < 1:
            raise ValueError("num_gpus must be at least 1 or None")
        num_cuda_devices = torch.cuda.device_count()
        if num_cuda_devices < 1:
            raise Exception("CUDA devices are not available.")
        if gpu_ids is None:
            # cap the request at the number of devices actually present
            num_gpus = num_cuda_devices if num_gpus is None else min(num_gpus, num_cuda_devices)
            gpu_ids = list(range(num_gpus))
        if len(gpu_ids) > 1:
            model = torch.nn.DataParallel(model, device_ids=gpu_ids)

    # move to device (a no-op for parameters that are already there)
    return model.to(device)
|
||||
|
||||
|
||||
def dataloader_from_dataset(ds, batch_size=32, num_gpus=None, shuffle=False, distributed=False):
|
||||
"""Creates a PyTorch DataLoader given a Dataset object.
|
||||
|
||||
Args:
|
||||
ds (torch.utils.data.Dataset): A PyTorch dataset.
|
||||
batch_size (int, optional): Batch size.
|
||||
If more than 1 gpu is used, this would be the batch size per gpu.
|
||||
Defaults to 32.
|
||||
num_gpus (int, optional): The number of GPUs to be used. Defaults to None.
|
||||
shuffle (bool, optional): If True, a RandomSampler is used. Defaults to False.
|
||||
distributed (bool, optional): If True, a DistributedSampler is used. Defaults to False.
|
||||
|
||||
Returns:
|
||||
Module, DataParallel: A PyTorch Module or
|
||||
a DataParallel wrapper (when multiple gpus are used).
|
||||
"""
|
||||
if isinstance(model, nn.DataParallel):
|
||||
model = model.module
|
||||
if num_gpus is None:
|
||||
num_gpus = torch.cuda.device_count()
|
||||
|
||||
if not isinstance(device, torch.device):
|
||||
raise ValueError("device must be of type torch.device.")
|
||||
|
||||
if device.type == "cuda":
|
||||
model.to(device) # inplace
|
||||
if num_gpus == 0:
|
||||
raise ValueError("num_gpus must be non-zero when device.type is 'cuda'")
|
||||
elif num_gpus == 1:
|
||||
return model
|
||||
else:
|
||||
# parallelize
|
||||
num_cuda_devices = torch.cuda.device_count()
|
||||
if num_cuda_devices < 1:
|
||||
raise Exception("CUDA devices are not available.")
|
||||
elif num_cuda_devices < 2:
|
||||
print("Warning: Only 1 CUDA device is available. Data parallelism is not possible.")
|
||||
return model
|
||||
else:
|
||||
if num_gpus is None:
|
||||
# use all available devices
|
||||
return nn.DataParallel(model, device_ids=None)
|
||||
elif num_gpus > num_cuda_devices:
|
||||
print(
|
||||
"Warning: Only {0} devices are available. "
|
||||
"Setting the number of gpus to {0}".format(num_cuda_devices)
|
||||
)
|
||||
return nn.DataParallel(model, device_ids=None)
|
||||
else:
|
||||
return nn.DataParallel(model, device_ids=list(range(num_gpus)))
|
||||
elif device.type == "cpu":
|
||||
if num_gpus != 0 and num_gpus is not None:
|
||||
warnings.warn("Device type is 'cpu'. num_gpus is ignored.")
|
||||
return model.to(device)
|
||||
batch_size = batch_size * max(1, num_gpus)
|
||||
|
||||
if distributed:
|
||||
sampler = DistributedSampler(ds)
|
||||
else:
|
||||
raise Exception(
|
||||
"Device type '{}' not supported. Currently, only cpu "
|
||||
"and cuda devices are supported.".format(device.type)
|
||||
)
|
||||
sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds)
|
||||
|
||||
return DataLoader(ds, sampler=sampler, batch_size=batch_size)
|
||||
|
||||
|
||||
def compute_training_steps(dataloader, num_epochs=1, max_steps=-1, gradient_accumulation_steps=1):
|
||||
"""Computes the max training steps given a dataloader.
|
||||
|
||||
Args:
|
||||
dataloader (DataLoader): A PyTorch DataLoader.
|
||||
num_epochs (int, optional): Number of training epochs. Defaults to 1.
|
||||
max_steps (int, optional): Total number of training steps.
|
||||
If set to a positive value, it overrides num_epochs.
|
||||
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
|
||||
Defaults to -1.
|
||||
gradient_accumulation_steps (int, optional): Number of steps to accumulate
|
||||
before performing a backward/update pass.
|
||||
Defaults to 1.
|
||||
|
||||
Returns:
|
||||
int: The max number of steps to be used in a training loop.
|
||||
"""
|
||||
try:
|
||||
dataset_length = len(dataloader)
|
||||
except Exception:
|
||||
dataset_length = -1
|
||||
if max_steps <= 0:
|
||||
if dataset_length != -1 and num_epochs > 0:
|
||||
max_steps = dataset_length // gradient_accumulation_steps * num_epochs
|
||||
if max_steps <= 0:
|
||||
raise Exception("Max steps cannot be determined.")
|
||||
return max_steps
|
||||
|
|
|
@ -7,24 +7,21 @@
|
|||
https://github.com/NirantK/hindi2vec/releases/tag/bbc-hindi-v0.1
|
||||
"""
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
import logging
|
||||
import numpy as np
|
||||
import os
|
||||
import tarfile
|
||||
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
|
||||
from utils_nlp.dataset.url_utils import maybe_download
|
||||
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
|
||||
from utils_nlp.models.transformers.sequence_classification import Processor
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
|
||||
URL = (
|
||||
"https://github.com/NirantK/hindi2vec/releases/"
|
||||
"download/bbc-hindi-v0.1/bbc-hindiv01.tar.gz"
|
||||
)
|
||||
URL = "https://github.com/NirantK/hindi2vec/releases/" "download/bbc-hindi-v0.1/bbc-hindiv01.tar.gz"
|
||||
|
||||
|
||||
def load_pandas_df(local_cache_path=TemporaryDirectory().name):
|
||||
|
@ -49,19 +46,9 @@ def load_pandas_df(local_cache_path=TemporaryDirectory().name):
|
|||
train_csv_file_path = os.path.join(local_cache_path, "hindi-train.csv")
|
||||
test_csv_file_path = os.path.join(local_cache_path, "hindi-test.csv")
|
||||
|
||||
train_df = pd.read_csv(
|
||||
train_csv_file_path,
|
||||
sep="\t",
|
||||
encoding='utf-8',
|
||||
header=None
|
||||
)
|
||||
train_df = pd.read_csv(train_csv_file_path, sep="\t", encoding="utf-8", header=None)
|
||||
|
||||
test_df = pd.read_csv(
|
||||
test_csv_file_path,
|
||||
sep="\t",
|
||||
encoding='utf-8',
|
||||
header=None
|
||||
)
|
||||
test_df = pd.read_csv(test_csv_file_path, sep="\t", encoding="utf-8", header=None)
|
||||
|
||||
train_df = train_df.fillna("")
|
||||
test_df = test_df.fillna("")
|
||||
|
@ -80,7 +67,7 @@ def load_tc_dataset(
|
|||
cache_dir=TemporaryDirectory().name,
|
||||
max_len=MAX_SEQ_LEN,
|
||||
batch_size=32,
|
||||
num_gpus=None
|
||||
num_gpus=None,
|
||||
):
|
||||
"""
|
||||
Load the BBC Hindi dataset and split into training and testing datasets.
|
||||
|
@ -105,7 +92,7 @@ def load_tc_dataset(
|
|||
cache_dir (str, optional): The default folder for saving cache files.
|
||||
Defaults to TemporaryDirectory().name.
|
||||
max_len (int, optional): Maximum length of the list of tokens. Lists longer
|
||||
than this are truncated and shorter ones are padded with "O"s.
|
||||
than this are truncated and shorter ones are padded with "O"s.
|
||||
Default value is BERT_MAX_LEN=512.
|
||||
batch_size (int, optional): The batch size for training and testing.
|
||||
Defaults to 32.
|
||||
|
@ -114,15 +101,15 @@ def load_tc_dataset(
|
|||
|
||||
Returns:
|
||||
tuple. The tuple contains four elements:
|
||||
train_dataload (DataLoader): a PyTorch DataLoader instance for training.
|
||||
train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
|
||||
|
||||
test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
|
||||
|
||||
test_dataload (DataLoader): a PyTorch DataLoader instance for testing.
|
||||
|
||||
label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values
|
||||
can be retrieved by calling the `inverse_transform` function.
|
||||
|
||||
|
||||
test_labels (Series): a Pandas Series of testing label (in label ID format). If
|
||||
the labels are in raw label values format, we will need to transform it to
|
||||
the labels are in raw label values format, we will need to transform it to
|
||||
label IDs by using the label_encoder.transform function.
|
||||
"""
|
||||
|
||||
|
@ -140,12 +127,8 @@ def load_tc_dataset(
|
|||
if test_fraction < 0 or test_fraction >= 1.0:
|
||||
logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction))
|
||||
test_fraction = 0.25
|
||||
|
||||
train_df, test_df = train_test_split(
|
||||
all_df,
|
||||
train_size=(1.0 - test_fraction),
|
||||
random_state=random_seed
|
||||
)
|
||||
|
||||
train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed)
|
||||
|
||||
if train_sample_ratio > 1.0:
|
||||
train_sample_ratio = 1.0
|
||||
|
@ -153,7 +136,7 @@ def load_tc_dataset(
|
|||
elif train_sample_ratio < 0:
|
||||
logging.error("Invalid training sample ration: {}".format(train_sample_ratio))
|
||||
raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio))
|
||||
|
||||
|
||||
if test_sample_ratio > 1.0:
|
||||
test_sample_ratio = 1.0
|
||||
logging.warning("Setting the testing sample ratio to 1.0")
|
||||
|
@ -171,42 +154,24 @@ def load_tc_dataset(
|
|||
test_labels = label_encoder.transform(test_df[label_col])
|
||||
test_df[label_col] = test_labels
|
||||
|
||||
processor = Processor(
|
||||
model_name=model_name,
|
||||
to_lower=to_lower,
|
||||
cache_dir=cache_dir
|
||||
)
|
||||
processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
|
||||
|
||||
train_dataloader = processor.create_dataloader_from_df(
|
||||
df=train_df,
|
||||
text_col=text_col,
|
||||
label_col=label_col,
|
||||
max_len=max_len,
|
||||
text2_col=None,
|
||||
batch_size=batch_size,
|
||||
num_gpus=num_gpus,
|
||||
shuffle=True,
|
||||
distributed=False
|
||||
train_dataset = processor.dataset_from_dataframe(
|
||||
df=train_df, text_col=text_col, label_col=label_col, max_len=max_len,
|
||||
)
|
||||
train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True)
|
||||
|
||||
test_dataloader = processor.create_dataloader_from_df(
|
||||
df=test_df,
|
||||
text_col=text_col,
|
||||
label_col=label_col,
|
||||
max_len=max_len,
|
||||
text2_col=None,
|
||||
batch_size=batch_size,
|
||||
num_gpus=num_gpus,
|
||||
shuffle=False,
|
||||
distributed=False
|
||||
test_dataset = processor.dataset_from_dataframe(
|
||||
df=test_df, text_col=text_col, label_col=label_col, max_len=max_len,
|
||||
)
|
||||
test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False)
|
||||
|
||||
return (train_dataloader, test_dataloader, label_encoder, test_labels)
|
||||
|
||||
|
||||
def get_label_values(label_encoder, label_ids):
|
||||
"""
|
||||
Get the label values from label IDs.
|
||||
Get the label values from label IDs.
|
||||
|
||||
Args:
|
||||
label_encoder (LabelEncoder): a fitted sklearn LabelEncoder instance
|
||||
|
|
|
@ -8,18 +8,18 @@ paper link: ("https://www.mendeley.com/catalogue/
|
|||
arabic-text-classification-using-deep-learning-technics/")
|
||||
"""
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
import logging
|
||||
import numpy as np
|
||||
|
||||
import os
|
||||
from tempfile import TemporaryDirectory
|
||||
from utils_nlp.dataset.url_utils import extract_zip, maybe_download
|
||||
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
|
||||
from utils_nlp.models.transformers.sequence_classification import Processor
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
|
||||
from utils_nlp.dataset.url_utils import extract_zip, maybe_download
|
||||
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
|
||||
from utils_nlp.models.transformers.sequence_classification import Processor
|
||||
|
||||
URL = (
|
||||
"https://data.mendeley.com/datasets/v524p5dhpj/2"
|
||||
|
@ -58,7 +58,7 @@ def load_tc_dataset(
|
|||
cache_dir=TemporaryDirectory().name,
|
||||
max_len=MAX_SEQ_LEN,
|
||||
batch_size=32,
|
||||
num_gpus=None
|
||||
num_gpus=None,
|
||||
):
|
||||
"""
|
||||
Load the DAC dataset and split into training and testing datasets.
|
||||
|
@ -92,9 +92,9 @@ def load_tc_dataset(
|
|||
|
||||
Returns:
|
||||
tuple. The tuple contains four elements:
|
||||
train_dataload (DataLoader): a PyTorch DataLoader instance for training.
|
||||
train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
|
||||
|
||||
test_dataload (DataLoader): a PyTorch DataLoader instance for testing.
|
||||
test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
|
||||
|
||||
label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values
|
||||
can be retrieved by calling the `inverse_transform` function.
|
||||
|
@ -104,11 +104,8 @@ def load_tc_dataset(
|
|||
label IDs by using the label_encoder.transform function.
|
||||
"""
|
||||
|
||||
# download and load the original dataset
|
||||
all_df = load_pandas_df(
|
||||
local_cache_path=local_path,
|
||||
num_rows=None
|
||||
)
|
||||
# download and load the original dataset
|
||||
all_df = load_pandas_df(local_cache_path=local_path, num_rows=None)
|
||||
|
||||
# set the text and label columns
|
||||
text_col = all_df.columns[0]
|
||||
|
@ -123,12 +120,8 @@ def load_tc_dataset(
|
|||
if test_fraction < 0 or test_fraction >= 1.0:
|
||||
logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction))
|
||||
test_fraction = 0.25
|
||||
|
||||
train_df, test_df = train_test_split(
|
||||
all_df,
|
||||
train_size=(1.0 - test_fraction),
|
||||
random_state=random_seed
|
||||
)
|
||||
|
||||
train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed)
|
||||
|
||||
if train_sample_ratio > 1.0:
|
||||
train_sample_ratio = 1.0
|
||||
|
@ -136,7 +129,7 @@ def load_tc_dataset(
|
|||
elif train_sample_ratio < 0:
|
||||
logging.error("Invalid training sample ration: {}".format(train_sample_ratio))
|
||||
raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio))
|
||||
|
||||
|
||||
if test_sample_ratio > 1.0:
|
||||
test_sample_ratio = 1.0
|
||||
logging.warning("Setting the testing sample ratio to 1.0")
|
||||
|
@ -149,35 +142,17 @@ def load_tc_dataset(
|
|||
if test_sample_ratio < 1.0:
|
||||
test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)
|
||||
|
||||
processor = Processor(
|
||||
model_name=model_name,
|
||||
to_lower=to_lower,
|
||||
cache_dir=cache_dir
|
||||
)
|
||||
processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
|
||||
|
||||
train_dataloader = processor.create_dataloader_from_df(
|
||||
df=train_df,
|
||||
text_col=text_col,
|
||||
label_col=label_col,
|
||||
max_len=max_len,
|
||||
text2_col=None,
|
||||
batch_size=batch_size,
|
||||
num_gpus=num_gpus,
|
||||
shuffle=True,
|
||||
distributed=False
|
||||
train_dataset = processor.dataset_from_dataframe(
|
||||
df=train_df, text_col=text_col, label_col=label_col, max_len=max_len,
|
||||
)
|
||||
train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True)
|
||||
|
||||
test_dataloader = processor.create_dataloader_from_df(
|
||||
df=test_df,
|
||||
text_col=text_col,
|
||||
label_col=label_col,
|
||||
max_len=max_len,
|
||||
text2_col=None,
|
||||
batch_size=batch_size,
|
||||
num_gpus=num_gpus,
|
||||
shuffle=False,
|
||||
distributed=False
|
||||
test_dataset = processor.dataset_from_dataframe(
|
||||
df=test_df, text_col=text_col, label_col=label_col, max_len=max_len,
|
||||
)
|
||||
test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False)
|
||||
|
||||
# the DAC dataset already converted the labels to label ID format
|
||||
test_labels = test_df[label_col]
|
||||
|
|
|
@ -7,18 +7,19 @@
|
|||
https://www.nyu.edu/projects/bowman/multinli/
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
import pandas as pd
|
||||
import logging
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
from tempfile import TemporaryDirectory
|
||||
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
|
||||
from utils_nlp.dataset.data_loaders import DaskJSONLoader
|
||||
from utils_nlp.dataset.url_utils import extract_zip, maybe_download
|
||||
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
|
||||
from utils_nlp.models.transformers.sequence_classification import Processor
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
URL = "http://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip"
|
||||
DATA_FILES = {
|
||||
|
@ -63,9 +64,7 @@ def load_pandas_df(local_cache_path=".", file_split="train"):
|
|||
return pd.read_json(os.path.join(local_cache_path, DATA_FILES[file_split]), lines=True)
|
||||
|
||||
|
||||
def get_generator(
|
||||
local_cache_path=".", file_split="train", block_size=10e6, batch_size=10e6, num_batches=None
|
||||
):
|
||||
def get_generator(local_cache_path=".", file_split="train", block_size=10e6, batch_size=10e6, num_batches=None):
|
||||
""" Returns an extracted dataset as a random batch generator that
|
||||
yields pandas dataframes.
|
||||
Args:
|
||||
|
@ -85,9 +84,7 @@ def get_generator(
|
|||
except Exception as e:
|
||||
raise e
|
||||
|
||||
loader = DaskJSONLoader(
|
||||
os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size
|
||||
)
|
||||
loader = DaskJSONLoader(os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size)
|
||||
|
||||
return loader.get_sequential_batches(batch_size=int(batch_size), num_batches=num_batches)
|
||||
|
||||
|
@ -103,7 +100,7 @@ def load_tc_dataset(
|
|||
cache_dir=TemporaryDirectory().name,
|
||||
max_len=MAX_SEQ_LEN,
|
||||
batch_size=32,
|
||||
num_gpus=None
|
||||
num_gpus=None,
|
||||
):
|
||||
"""
|
||||
Load the multinli dataset and split into training and testing datasets.
|
||||
|
@ -137,9 +134,9 @@ def load_tc_dataset(
|
|||
|
||||
Returns:
|
||||
tuple. The tuple contains four elements:
|
||||
train_dataload (DataLoader): a PyTorch DataLoader instance for training.
|
||||
train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
|
||||
|
||||
test_dataload (DataLoader): a PyTorch DataLoader instance for testing.
|
||||
test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
|
||||
|
||||
label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values
|
||||
can be retrieved by calling the `inverse_transform` function.
|
||||
|
@ -150,10 +147,7 @@ def load_tc_dataset(
|
|||
"""
|
||||
|
||||
# download and load the original dataset
|
||||
all_df = load_pandas_df(
|
||||
local_cache_path=local_path,
|
||||
file_split="train"
|
||||
)
|
||||
all_df = load_pandas_df(local_cache_path=local_path, file_split="train")
|
||||
|
||||
# select the examples corresponding to one of the entailment labels (neutral
|
||||
# in this case) to avoid duplicate rows, as the sentences are not unique,
|
||||
|
@ -169,12 +163,8 @@ def load_tc_dataset(
|
|||
if test_fraction < 0 or test_fraction >= 1.0:
|
||||
logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction))
|
||||
test_fraction = 0.25
|
||||
|
||||
train_df, test_df = train_test_split(
|
||||
all_df,
|
||||
train_size=(1.0 - test_fraction),
|
||||
random_state=random_seed
|
||||
)
|
||||
|
||||
train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed)
|
||||
|
||||
if train_sample_ratio > 1.0:
|
||||
train_sample_ratio = 1.0
|
||||
|
@ -182,7 +172,7 @@ def load_tc_dataset(
|
|||
elif train_sample_ratio < 0:
|
||||
logging.error("Invalid training sample ration: {}".format(train_sample_ratio))
|
||||
raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio))
|
||||
|
||||
|
||||
if test_sample_ratio > 1.0:
|
||||
test_sample_ratio = 1.0
|
||||
logging.warning("Setting the testing sample ratio to 1.0")
|
||||
|
@ -200,35 +190,17 @@ def load_tc_dataset(
|
|||
test_labels = label_encoder.transform(test_df[label_col])
|
||||
test_df[label_col] = test_labels
|
||||
|
||||
processor = Processor(
|
||||
model_name=model_name,
|
||||
to_lower=to_lower,
|
||||
cache_dir=cache_dir
|
||||
)
|
||||
processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
|
||||
|
||||
train_dataloader = processor.create_dataloader_from_df(
|
||||
df=train_df,
|
||||
text_col=text_col,
|
||||
label_col=label_col,
|
||||
max_len=max_len,
|
||||
text2_col=None,
|
||||
batch_size=batch_size,
|
||||
num_gpus=num_gpus,
|
||||
shuffle=True,
|
||||
distributed=False
|
||||
train_dataset = processor.dataset_from_dataframe(
|
||||
df=train_df, text_col=text_col, label_col=label_col, max_len=max_len,
|
||||
)
|
||||
train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True)
|
||||
|
||||
test_dataloader = processor.create_dataloader_from_df(
|
||||
df=test_df,
|
||||
text_col=text_col,
|
||||
label_col=label_col,
|
||||
max_len=max_len,
|
||||
text2_col=None,
|
||||
batch_size=batch_size,
|
||||
num_gpus=num_gpus,
|
||||
shuffle=False,
|
||||
distributed=False
|
||||
test_dataset = processor.dataset_from_dataframe(
|
||||
df=test_df, text_col=text_col, label_col=label_col, max_len=max_len,
|
||||
)
|
||||
test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False)
|
||||
|
||||
return (train_dataloader, test_dataloader, label_encoder, test_labels)
|
||||
|
||||
|
|
|
@ -7,18 +7,19 @@
|
|||
https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold/CONLL-format/data
|
||||
"""
|
||||
|
||||
import random
|
||||
import os
|
||||
import pandas as pd
|
||||
import logging
|
||||
|
||||
import os
|
||||
import random
|
||||
from tempfile import TemporaryDirectory
|
||||
from utils_nlp.dataset.url_utils import maybe_download
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
|
||||
from utils_nlp.dataset.ner_utils import preprocess_conll
|
||||
from utils_nlp.dataset.url_utils import maybe_download
|
||||
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
|
||||
from utils_nlp.models.transformers.named_entity_recognition import TokenClassificationProcessor
|
||||
|
||||
|
||||
URL = (
|
||||
"https://raw.githubusercontent.com/juand-r/entity-recognition-datasets"
|
||||
"/master/data/wikigold/CONLL-format/data/wikigold.conll.txt"
|
||||
|
@ -91,7 +92,7 @@ def load_dataset(
|
|||
max_len=MAX_SEQ_LEN,
|
||||
trailing_piece_tag="X",
|
||||
batch_size=32,
|
||||
num_gpus=None
|
||||
num_gpus=None,
|
||||
):
|
||||
"""
|
||||
Load the wikigold dataset and split into training and testing datasets.
|
||||
|
@ -116,7 +117,7 @@ def load_dataset(
|
|||
cache_dir (str, optional): The default folder for saving cache files.
|
||||
Defaults to './temp'.
|
||||
max_len (int, optional): Maximum length of the list of tokens. Lists longer
|
||||
than this are truncated and shorter ones are padded with "O"s.
|
||||
than this are truncated and shorter ones are padded with "O"s.
|
||||
Default value is MAX_SEQ_LEN.
|
||||
trailing_piece_tag (str, optional): Tag used to label trailing word pieces.
|
||||
For example, "criticize" is broken into "critic" and "##ize", "critic"
|
||||
|
@ -129,16 +130,12 @@ def load_dataset(
|
|||
|
||||
Returns:
|
||||
tuple. The tuple contains four elements.
|
||||
train_dataload (DataLoader): a PyTorch DataLoader instance for training.
|
||||
|
||||
test_dataload (DataLoader): a PyTorch DataLoader instance for testing.
|
||||
|
||||
label_map (dict): A dictionary object to map a label (str) to an ID (int).
|
||||
|
||||
train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
|
||||
test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
|
||||
label_map (dict): A dictionary object to map a label (str) to an ID (int).
|
||||
test_dataset (TensorDataset): A TensorDataset containing the following four tensors.
|
||||
1. input_ids_all: Tensor. Each sublist contains numerical values,
|
||||
i.e. token ids, corresponding to the tokens in the input
|
||||
text data.
|
||||
i.e. token ids, corresponding to the tokens in the input text data.
|
||||
2. input_mask_all: Tensor. Each sublist contains the attention
|
||||
mask of the input token id list, 1 for input tokens and 0 for
|
||||
padded tokens, so that padded tokens are not attended to.
|
||||
|
@ -155,9 +152,7 @@ def load_dataset(
|
|||
"""
|
||||
|
||||
train_df, test_df = load_train_test_dfs(
|
||||
local_cache_path=local_path,
|
||||
test_fraction=test_fraction,
|
||||
random_seed=random_seed
|
||||
local_cache_path=local_path, test_fraction=test_fraction, random_seed=random_seed
|
||||
)
|
||||
|
||||
if train_sample_ratio > 1.0:
|
||||
|
@ -166,7 +161,7 @@ def load_dataset(
|
|||
elif train_sample_ratio < 0:
|
||||
logging.error("Invalid training sample ration: {}".format(train_sample_ratio))
|
||||
raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio))
|
||||
|
||||
|
||||
if test_sample_ratio > 1.0:
|
||||
test_sample_ratio = 1.0
|
||||
logging.warning("Setting the testing sample ratio to 1.0")
|
||||
|
@ -179,47 +174,34 @@ def load_dataset(
|
|||
if test_sample_ratio < 1.0:
|
||||
test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)
|
||||
|
||||
processor = TokenClassificationProcessor(
|
||||
model_name=model_name,
|
||||
to_lower=to_lower,
|
||||
cache_dir=cache_dir
|
||||
)
|
||||
processor = TokenClassificationProcessor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
|
||||
|
||||
label_map = TokenClassificationProcessor.create_label_map(
|
||||
label_lists=train_df['labels'],
|
||||
trailing_piece_tag=trailing_piece_tag
|
||||
label_lists=train_df["labels"], trailing_piece_tag=trailing_piece_tag
|
||||
)
|
||||
|
||||
train_dataset = processor.preprocess_for_bert(
|
||||
text=train_df['sentence'],
|
||||
text=train_df["sentence"],
|
||||
max_len=max_len,
|
||||
labels=train_df['labels'],
|
||||
labels=train_df["labels"],
|
||||
label_map=label_map,
|
||||
trailing_piece_tag=trailing_piece_tag
|
||||
trailing_piece_tag=trailing_piece_tag,
|
||||
)
|
||||
|
||||
test_dataset = processor.preprocess_for_bert(
|
||||
text=test_df['sentence'],
|
||||
text=test_df["sentence"],
|
||||
max_len=max_len,
|
||||
labels=test_df['labels'],
|
||||
labels=test_df["labels"],
|
||||
label_map=label_map,
|
||||
trailing_piece_tag=trailing_piece_tag
|
||||
trailing_piece_tag=trailing_piece_tag,
|
||||
)
|
||||
|
||||
train_dataloader = processor.create_dataloader_from_dataset(
|
||||
train_dataset,
|
||||
shuffle=True,
|
||||
batch_size=batch_size,
|
||||
num_gpus=num_gpus,
|
||||
distributed=False
|
||||
train_dataloader = dataloader_from_dataset(
|
||||
train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True, distributed=False
|
||||
)
|
||||
|
||||
test_dataloader = processor.create_dataloader_from_dataset(
|
||||
test_dataset,
|
||||
shuffle=False,
|
||||
batch_size=batch_size,
|
||||
num_gpus=num_gpus,
|
||||
distributed=False
|
||||
test_dataloader = dataloader_from_dataset(
|
||||
test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False, distributed=False
|
||||
)
|
||||
|
||||
return (train_dataloader, test_dataloader, label_map, test_dataset)
|
||||
|
|
|
@ -3,22 +3,24 @@
|
|||
|
||||
import os
|
||||
from random import random, seed
|
||||
|
||||
from bertsum.others.utils import test_rouge
|
||||
|
||||
|
||||
def get_rouge(predictions, targets, temp_dir):
|
||||
def get_rouge(predictions, targets, temp_dir, random_seed=42):
|
||||
"""
|
||||
Compute the ROUGE metric for the predictions against the references.
|
||||
|
||||
Args:
|
||||
predictions (list of strings): Predictions to be compared.
|
||||
targets (list of strings): References.
|
||||
temp_dir (str): Path where temporary folders are created to host the files
|
||||
generated by ROUGE applicatoin.
|
||||
temp_dir (str): Path where temporary folders are created to host the files
|
||||
generated by ROUGE application.
|
||||
random_seed (int, optional): Random seed. Defaults to 42.
|
||||
|
||||
Return:
|
||||
dictionary: rouge metric
|
||||
|
||||
|
||||
"""
|
||||
|
||||
def _write_list_to_file(list_items, filename):
|
||||
|
@ -27,7 +29,7 @@ def get_rouge(predictions, targets, temp_dir):
|
|||
for item in list_items:
|
||||
filehandle.write("%s\n" % item)
|
||||
|
||||
seed(42)
|
||||
seed(random_seed)
|
||||
random_number = random()
|
||||
os.makedirs(temp_dir, exist_ok=True)
|
||||
candidate_path = os.path.join(temp_dir, "candidate" + str(random_number))
|
||||
|
|
|
@ -13,7 +13,7 @@ from pytorch_pretrained_bert.optimization import BertAdam
|
|||
from tqdm import tqdm
|
||||
|
||||
from utils_nlp.models.bert.common import Language
|
||||
from utils_nlp.common.pytorch_utils import get_device, move_to_device
|
||||
from utils_nlp.common.pytorch_utils import get_device
|
||||
|
||||
from cached_property import cached_property
|
||||
|
||||
|
@ -91,7 +91,7 @@ class BERTSequenceClassifier:
|
|||
|
||||
device, num_gpus = get_device(num_gpus)
|
||||
|
||||
self.model = move_to_device(self.model, device, num_gpus)
|
||||
self.model = move_model_to_device(self.model, device, num_gpus)
|
||||
|
||||
token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
|
||||
input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
|
||||
|
@ -211,7 +211,7 @@ class BERTSequenceClassifier:
|
|||
(classes, probabilities) if probabilities is True.
|
||||
"""
|
||||
device, num_gpus = get_device(num_gpus)
|
||||
self.model = move_to_device(self.model, device, num_gpus)
|
||||
self.model = move_model_to_device(self.model, device, num_gpus)
|
||||
|
||||
# score
|
||||
self.model.eval()
|
||||
|
|
|
@ -14,7 +14,7 @@ from pytorch_pretrained_bert.modeling import BertForSequenceClassification
|
|||
from pytorch_pretrained_bert.optimization import BertAdam
|
||||
from tqdm import tqdm
|
||||
|
||||
from utils_nlp.common.pytorch_utils import get_device, move_to_device
|
||||
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
|
||||
from utils_nlp.models.bert.common import Language
|
||||
|
||||
try:
|
||||
|
@ -192,7 +192,7 @@ class BERTSequenceClassifier:
|
|||
|
||||
device, num_gpus = get_device(num_gpus)
|
||||
|
||||
self.model = move_to_device(self.model, device, num_gpus)
|
||||
self.model = move_model_to_device(self.model, device, num_gpus)
|
||||
|
||||
if bert_optimizer is None:
|
||||
bert_optimizer = self.create_optimizer(
|
||||
|
@ -277,7 +277,7 @@ class BERTSequenceClassifier:
|
|||
a dictionary with classes, target labels, probabilities) if probabilities is True.
|
||||
"""
|
||||
device, num_gpus = get_device(num_gpus)
|
||||
self.model = move_to_device(self.model, device, num_gpus)
|
||||
self.model = move_model_to_device(self.model, device, num_gpus)
|
||||
|
||||
# score
|
||||
self.model.eval()
|
||||
|
|
|
@ -4,19 +4,17 @@
|
|||
# This script reuses code from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples
|
||||
# /extract_features.py, with necessary modifications.
|
||||
|
||||
from pytorch_pretrained_bert.modeling import BertModel
|
||||
|
||||
from utils_nlp.common.pytorch_utils import get_device, move_to_device
|
||||
from enum import Enum
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import os
|
||||
import torch
|
||||
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
|
||||
|
||||
from utils_nlp.models.bert.common import Language, Tokenizer
|
||||
from cached_property import cached_property
|
||||
from pytorch_pretrained_bert.modeling import BertModel
|
||||
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
|
||||
|
||||
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
|
||||
from utils_nlp.models.bert.common import Language, Tokenizer
|
||||
|
||||
|
||||
class PoolingStrategy(str, Enum):
|
||||
|
@ -43,27 +41,21 @@ class BERTSentenceEncoder:
|
|||
pooling_strategy=PoolingStrategy.MEAN,
|
||||
):
|
||||
"""Initialize the encoder's underlying model and tokenizer
|
||||
|
||||
|
||||
Args:
|
||||
bert_model: BERT model to use for encoding. Defaults to pretrained BertModel.
|
||||
tokenizer: Tokenizer to use for preprocessing. Defaults to pretrained BERT tokenizer.
|
||||
language: The pretrained model's language. Defaults to Language.ENGLISH.
|
||||
num_gpus: The number of gpus to use. Defaults to None, which forces all available GPUs to be used.
|
||||
num_gpus: The number of gpus to use. Defaults to None, which forces all available GPUs to be used.
|
||||
cache_dir: Location of BERT's cache directory. Defaults to "."
|
||||
to_lower: True to lowercase before tokenization. Defaults to False.
|
||||
max_len: Maximum number of tokens.
|
||||
layer_index: The layer from which to extract features.
|
||||
layer_index: The layer from which to extract features.
|
||||
Defaults to the last layer; can also be a list of integers for experimentation.
|
||||
pooling_strategy: Pooling strategy to aggregate token embeddings into sentence embedding.
|
||||
"""
|
||||
self.model = (
|
||||
bert_model.model.bert
|
||||
if bert_model
|
||||
else BertModel.from_pretrained(language, cache_dir=cache_dir)
|
||||
)
|
||||
self.tokenizer = (
|
||||
tokenizer if tokenizer else Tokenizer(language, to_lower=to_lower, cache_dir=cache_dir)
|
||||
)
|
||||
self.model = bert_model.model.bert if bert_model else BertModel.from_pretrained(language, cache_dir=cache_dir)
|
||||
self.tokenizer = tokenizer if tokenizer else Tokenizer(language, to_lower=to_lower, cache_dir=cache_dir)
|
||||
self.num_gpus = num_gpus
|
||||
self.max_len = max_len
|
||||
self.layer_index = layer_index
|
||||
|
@ -98,16 +90,17 @@ class BERTSentenceEncoder:
|
|||
|
||||
def get_hidden_states(self, text, batch_size=32):
|
||||
"""Extract the hidden states from the pretrained model
|
||||
|
||||
|
||||
Args:
|
||||
text: List of documents to extract features from.
|
||||
batch_size: Batch size, defaults to 32.
|
||||
|
||||
|
||||
Returns:
|
||||
pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float]).
|
||||
pd.DataFrame with columns:
|
||||
text_index (int), token (str), layer_index (int), values (list[float]).
|
||||
"""
|
||||
device, num_gpus = get_device(self.num_gpus)
|
||||
self.model = move_to_device(self.model, device, self.num_gpus)
|
||||
self.model = move_model_to_device(self.model, device, self.num_gpus)
|
||||
|
||||
self.model.eval()
|
||||
|
||||
|
@ -122,9 +115,7 @@ class BERTSentenceEncoder:
|
|||
input_type_ids = torch.arange(input_ids.size(0), dtype=torch.long, device=device)
|
||||
|
||||
eval_data = TensorDataset(input_ids, input_mask, input_type_ids)
|
||||
eval_dataloader = DataLoader(
|
||||
eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size
|
||||
)
|
||||
eval_dataloader = DataLoader(eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size)
|
||||
|
||||
hidden_states = {"text_index": [], "token": [], "layer_index": [], "values": []}
|
||||
for (input_ids_tensor, input_mask_tensor, example_indices_tensor) in eval_dataloader:
|
||||
|
@ -142,9 +133,7 @@ class BERTSentenceEncoder:
|
|||
hidden_states["text_index"].append(example_index.item())
|
||||
hidden_states["token"].append(token)
|
||||
hidden_states["layer_index"].append(layer_index)
|
||||
hidden_states["values"].append(
|
||||
[round(x.item(), 6) for x in layer_output[i]]
|
||||
)
|
||||
hidden_states["values"].append([round(x.item(), 6) for x in layer_output[i]])
|
||||
|
||||
# empty cache
|
||||
del [input_ids_tensor, input_mask_tensor, example_indices_tensor]
|
||||
|
@ -158,7 +147,7 @@ class BERTSentenceEncoder:
|
|||
|
||||
def pool(self, df):
|
||||
"""Pooling to aggregate token-wise embeddings to sentence embeddings
|
||||
|
||||
|
||||
Args:
|
||||
df: pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float])
|
||||
|
||||
|
@ -167,31 +156,16 @@ class BERTSentenceEncoder:
|
|||
"""
|
||||
|
||||
def max_pool(x):
|
||||
values = np.array(
|
||||
[
|
||||
np.reshape(np.array(x.values[i]), self.embedding_dim)
|
||||
for i in range(x.values.shape[0])
|
||||
]
|
||||
)
|
||||
values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])])
|
||||
m, _ = torch.max(torch.tensor(values, dtype=torch.float), 0)
|
||||
return m.numpy()
|
||||
|
||||
def mean_pool(x):
|
||||
values = np.array(
|
||||
[
|
||||
np.reshape(np.array(x.values[i]), self.embedding_dim)
|
||||
for i in range(x.values.shape[0])
|
||||
]
|
||||
)
|
||||
values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])])
|
||||
return torch.mean(torch.tensor(values, dtype=torch.float), 0).numpy()
|
||||
|
||||
def cls_pool(x):
|
||||
values = np.array(
|
||||
[
|
||||
np.reshape(np.array(x.values[i]), self.embedding_dim)
|
||||
for i in range(x.values.shape[0])
|
||||
]
|
||||
)
|
||||
values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])])
|
||||
return values[0]
|
||||
|
||||
try:
|
||||
|
@ -206,15 +180,11 @@ class BERTSentenceEncoder:
|
|||
except ValueError as ve:
|
||||
print(ve)
|
||||
|
||||
return (
|
||||
df.groupby(["text_index", "layer_index"])["values"]
|
||||
.apply(lambda x: pool_func(x))
|
||||
.reset_index()
|
||||
)
|
||||
return df.groupby(["text_index", "layer_index"])["values"].apply(lambda x: pool_func(x)).reset_index()
|
||||
|
||||
def encode(self, text, batch_size=32, as_numpy=False):
|
||||
"""Computes sentence encodings
|
||||
|
||||
"""Computes sentence encodings
|
||||
|
||||
Args:
|
||||
text: List of documents to encode.
|
||||
batch_size: Batch size, defaults to 32.
|
||||
|
|
|
@ -16,7 +16,7 @@ from pytorch_pretrained_bert.optimization import BertAdam
|
|||
from tqdm import tqdm, trange
|
||||
|
||||
from utils_nlp.models.bert.common import Language, create_data_loader
|
||||
from utils_nlp.common.pytorch_utils import get_device, move_to_device
|
||||
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
|
||||
|
||||
from cached_property import cached_property
|
||||
|
||||
|
@ -144,7 +144,7 @@ class BERTTokenClassifier:
|
|||
|
||||
device, num_gpus = get_device(num_gpus)
|
||||
|
||||
self.model = move_to_device(self.model, device, num_gpus)
|
||||
self.model = move_model_to_device(self.model, device, num_gpus)
|
||||
|
||||
if num_gpus is None:
|
||||
num_gpus_used = torch.cuda.device_count()
|
||||
|
@ -228,7 +228,7 @@ class BERTTokenClassifier:
|
|||
)
|
||||
device, num_gpus = get_device(num_gpus)
|
||||
|
||||
self.model = move_to_device(self.model, device, num_gpus)
|
||||
self.model = move_model_to_device(self.model, device, num_gpus)
|
||||
|
||||
self.model.eval()
|
||||
eval_loss = 0
|
||||
|
|
|
@ -4,17 +4,16 @@
|
|||
# This script reuses some code from
|
||||
# https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py
|
||||
|
||||
from itertools import cycle
|
||||
import logging
|
||||
import numpy as np
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
import torch
|
||||
from tqdm import tqdm, trange
|
||||
from itertools import cycle
|
||||
|
||||
from transformers import AdamW
|
||||
from transformers import get_linear_schedule_with_warmup
|
||||
import numpy as np
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
from transformers import AdamW, get_linear_schedule_with_warmup
|
||||
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
|
@ -23,7 +22,8 @@ from transformers.tokenization_bert import BertTokenizer
|
|||
from transformers.tokenization_distilbert import DistilBertTokenizer
|
||||
from transformers.tokenization_roberta import RobertaTokenizer
|
||||
from transformers.tokenization_xlnet import XLNetTokenizer
|
||||
from utils_nlp.common.pytorch_utils import get_device
|
||||
|
||||
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
|
||||
|
||||
TOKENIZER_CLASS = {}
|
||||
TOKENIZER_CLASS.update({k: BertTokenizer for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
|
@ -38,12 +38,7 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
class Transformer:
|
||||
def __init__(
|
||||
self,
|
||||
model_class,
|
||||
model_name="bert-base-cased",
|
||||
num_labels=2,
|
||||
cache_dir=".",
|
||||
load_model_from_dir=None,
|
||||
self, model_class, model_name="bert-base-cased", num_labels=2, cache_dir=".", load_model_from_dir=None,
|
||||
):
|
||||
|
||||
if model_name not in self.list_supported_models():
|
||||
|
@ -82,22 +77,40 @@ class Transformer:
|
|||
if cuda and torch.cuda.is_available():
|
||||
torch.cuda.manual_seed_all(seed)
|
||||
|
||||
@staticmethod
|
||||
def get_default_optimizer(model, weight_decay, learning_rate, adam_epsilon):
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": weight_decay,
|
||||
},
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
|
||||
return optimizer
|
||||
|
||||
@staticmethod
|
||||
def get_default_scheduler(optimizer, warmup_steps, num_training_steps):
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps
|
||||
)
|
||||
return scheduler
|
||||
|
||||
def fine_tune(
|
||||
self,
|
||||
train_dataloader,
|
||||
get_inputs,
|
||||
num_gpus=None,
|
||||
gpu_ids=None,
|
||||
max_steps=-1,
|
||||
num_train_epochs=1,
|
||||
max_grad_norm=1.0,
|
||||
gradient_accumulation_steps=1,
|
||||
n_gpu=1,
|
||||
move_batch_to_device=None,
|
||||
optimizer=None,
|
||||
scheduler=None,
|
||||
weight_decay=0.0,
|
||||
learning_rate=5e-5,
|
||||
adam_epsilon=1e-8,
|
||||
warmup_steps=0,
|
||||
fp16=False,
|
||||
fp16_opt_level="O1",
|
||||
local_rank=-1,
|
||||
|
@ -107,51 +120,12 @@ class Transformer:
|
|||
clip_grad_norm=True,
|
||||
):
|
||||
|
||||
device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1)
|
||||
# get device
|
||||
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)
|
||||
|
||||
if seed is not None:
|
||||
Transformer.set_seed(seed, num_gpus > 0)
|
||||
|
||||
try:
|
||||
dataset_length = len(train_dataloader)
|
||||
except:
|
||||
dataset_length = -1
|
||||
|
||||
if max_steps <= 0:
|
||||
if dataset_length != -1 and num_train_epochs > 0:
|
||||
max_steps = dataset_length // gradient_accumulation_steps * num_train_epochs
|
||||
|
||||
if max_steps <= 0:
|
||||
raise Exception("Max steps cannot be determined for fine tuning!")
|
||||
|
||||
if optimizer is None:
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [
|
||||
p
|
||||
for n, p in self.model.named_parameters()
|
||||
if not any(nd in n for nd in no_decay)
|
||||
],
|
||||
"weight_decay": weight_decay,
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
p
|
||||
for n, p in self.model.named_parameters()
|
||||
if any(nd in n for nd in no_decay)
|
||||
],
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
|
||||
|
||||
|
||||
if scheduler is None:
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps
|
||||
)
|
||||
|
||||
if fp16:
|
||||
try:
|
||||
from apex import amp
|
||||
|
@ -159,46 +133,22 @@ class Transformer:
|
|||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex")
|
||||
self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=fp16_opt_level)
|
||||
|
||||
if local_rank != -1:
|
||||
self.model = torch.nn.parallel.DistributedDataParallel(
|
||||
self.model,
|
||||
device_ids=[local_rank],
|
||||
output_device=local_rank,
|
||||
find_unused_parameters=True,
|
||||
)
|
||||
else:
|
||||
if isinstance(self.model, torch.nn.DataParallel):
|
||||
self.model = self.model.module
|
||||
|
||||
if num_gpus > 1:
|
||||
self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus)))
|
||||
|
||||
self.model.to(device)
|
||||
self.model.train()
|
||||
# move model
|
||||
self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank)
|
||||
|
||||
# init training
|
||||
global_step = 0
|
||||
tr_loss = 0.0
|
||||
self.model.zero_grad()
|
||||
|
||||
|
||||
if move_batch_to_device is None:
|
||||
def move_batch_to_device(batch, device):
|
||||
return tuple(t.to(device) for t in batch)
|
||||
|
||||
start = time.time()
|
||||
accum_loss = 0
|
||||
|
||||
self.model.train()
|
||||
self.model.zero_grad()
|
||||
|
||||
while global_step < max_steps:
|
||||
epoch_iterator = tqdm(
|
||||
train_dataloader,
|
||||
desc="Iteration",
|
||||
disable=local_rank not in [-1, 0] or not verbose
|
||||
)
|
||||
# train
|
||||
start = time.time()
|
||||
while global_step < max_steps:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=local_rank not in [-1, 0] or not verbose)
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
batch = move_batch_to_device(batch, device)
|
||||
inputs = get_inputs(batch, self.model_name)
|
||||
inputs = get_inputs(batch, device, self.model_name)
|
||||
outputs = self.model(**inputs)
|
||||
loss = outputs[0]
|
||||
|
||||
|
@ -210,28 +160,26 @@ class Transformer:
|
|||
if fp16:
|
||||
with amp.scale_loss(loss, optimizer) as scaled_loss:
|
||||
scaled_loss.backward()
|
||||
if clip_grad_norm:
|
||||
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
|
||||
else:
|
||||
loss.backward()
|
||||
if clip_grad_norm:
|
||||
torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)
|
||||
|
||||
tr_loss += loss.item()
|
||||
|
||||
accum_loss += loss.item()
|
||||
|
||||
if (step + 1) % gradient_accumulation_steps == 0:
|
||||
global_step += 1
|
||||
|
||||
if clip_grad_norm:
|
||||
if fp16:
|
||||
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
|
||||
else:
|
||||
torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)
|
||||
|
||||
if global_step % report_every == 0 and verbose:
|
||||
# tqdm.write("Loss:{:.6f}".format(loss))
|
||||
end = time.time()
|
||||
print(
|
||||
"loss: {0:.6f}, time: {1:f}, number of examples in current step: {2:.0f}, step {3:.0f} out of total {4:.0f}".format(
|
||||
accum_loss / report_every,
|
||||
end - start,
|
||||
len(batch),
|
||||
global_step,
|
||||
max_steps,
|
||||
"loss:{0:.6f}, time:{1:f}, examples:{2:.0f}, step:{3:.0f}/{4:.0f}".format(
|
||||
accum_loss / report_every, end - start, len(batch), global_step, max_steps,
|
||||
)
|
||||
)
|
||||
accum_loss = 0
|
||||
|
@ -246,31 +194,20 @@ class Transformer:
|
|||
epoch_iterator.close()
|
||||
break
|
||||
|
||||
# empty cache
|
||||
torch.cuda.empty_cache()
|
||||
return global_step, tr_loss / global_step
|
||||
|
||||
|
||||
def predict(self, eval_dataloader, get_inputs, n_gpu=1, verbose=True, move_batch_to_device=None):
|
||||
device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1)
|
||||
|
||||
if isinstance(self.model, torch.nn.DataParallel):
|
||||
self.model = self.model.module
|
||||
def predict(self, eval_dataloader, get_inputs, num_gpus, gpu_ids, verbose=True):
|
||||
# get device
|
||||
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)
|
||||
|
||||
if num_gpus > 1:
|
||||
self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus)))
|
||||
# move model
|
||||
self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1)
|
||||
|
||||
self.model.to(device)
|
||||
# predict
|
||||
self.model.eval()
|
||||
|
||||
if move_batch_to_device is None:
|
||||
def move_batch_to_device(batch, device):
|
||||
return tuple(t.to(device) for t in batch)
|
||||
|
||||
for batch in tqdm(eval_dataloader, desc="Evaluating", disable=not verbose):
|
||||
batch = move_batch_to_device(batch, device) #tuple(t.to(device) for t in batch)
|
||||
for batch in tqdm(eval_dataloader, desc="Scoring", disable=not verbose):
|
||||
with torch.no_grad():
|
||||
inputs = get_inputs(batch, self.model_name, train_mode=False)
|
||||
inputs = get_inputs(batch, device, self.model_name, train_mode=False)
|
||||
outputs = self.model(**inputs)
|
||||
logits = outputs[0]
|
||||
yield logits.detach().cpu().numpy()
|
||||
|
|
|
@ -5,24 +5,22 @@
|
|||
|
||||
import itertools
|
||||
import logging
|
||||
import numpy as np
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.data import Dataset, IterableDataset
|
||||
from torch.utils.data import DataLoader, SequentialSampler
|
||||
from torch.utils.data import DataLoader, Dataset, IterableDataset, SequentialSampler
|
||||
|
||||
# from torch.utils.data.distributed import DistributedSampler
|
||||
from transformers import DistilBertModel, BertModel
|
||||
from transformers import BertModel, DistilBertModel
|
||||
|
||||
from bertsum.models import model_builder, data_loader
|
||||
from bertsum.models import data_loader, model_builder
|
||||
from bertsum.models.data_loader import Batch
|
||||
from bertsum.models.model_builder import Summarizer
|
||||
|
||||
from utils_nlp.common.pytorch_utils import get_device
|
||||
from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
|
||||
from utils_nlp.common.pytorch_utils import compute_training_steps, get_device
|
||||
from utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection
|
||||
from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
|
||||
|
||||
MODEL_CLASS = {"bert-base-uncased": BertModel, "distilbert-base-uncased": DistilBertModel}
|
||||
|
||||
|
@ -42,8 +40,8 @@ def get_dataloader(data_iter, shuffle=True, is_labeled=False, batch_size=3000):
|
|||
|
||||
Args:
|
||||
data_iter (generator): data generator.
|
||||
shuffle (bool): whether the data is shuffled
|
||||
is_labeled (bool): it specifies whether the data objects are labeled data.
|
||||
shuffle (bool): whether the data is shuffled.
|
||||
is_labeled (bool): specifies whether the data objects are labeled data.
|
||||
batch_size (int): number of tokens per batch.
|
||||
|
||||
Returns:
|
||||
|
@ -79,9 +77,7 @@ class ExtSumProcessedIterableDataset(IterableDataset):
|
|||
if self.is_shuffle:
|
||||
return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(self.file_list)))
|
||||
else:
|
||||
return itertools.chain.from_iterable(
|
||||
map(get_dataset, itertools.cycle(random.shuffle(self.file_list)))
|
||||
)
|
||||
return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(random.shuffle(self.file_list))))
|
||||
|
||||
def __iter__(self):
|
||||
return self.get_stream()
|
||||
|
@ -114,9 +110,7 @@ class ExtSumProcessedDataset(Dataset):
|
|||
return self.data[idx]
|
||||
|
||||
|
||||
def get_pred(
|
||||
example, sent_scores, cal_lead=False, sentence_separator="<q>", block_trigram=True, top_n=3
|
||||
):
|
||||
def get_pred(example, sent_scores, cal_lead=False, sentence_separator="<q>", block_trigram=True, top_n=3):
|
||||
"""
|
||||
Get the summarization prediction for the paragraph example based on the scores
|
||||
returned by the transformer summarization model.
|
||||
|
@ -229,9 +223,7 @@ class ExtSumProcessedData:
|
|||
def _get_files(self, root):
|
||||
train_files = []
|
||||
test_files = []
|
||||
files = [
|
||||
os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
|
||||
]
|
||||
files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))]
|
||||
for fname in files:
|
||||
if fname.find("train") != -1:
|
||||
train_files.append(fname)
|
||||
|
@ -324,7 +316,7 @@ class ExtSumProcessor:
|
|||
self._model_name = value
|
||||
|
||||
@staticmethod
|
||||
def get_inputs(batch, model_name, train_mode=True):
|
||||
def get_inputs(batch, device, model_name, train_mode=True):
|
||||
"""
|
||||
Creates an input dictionary given a model name.
|
||||
|
||||
|
@ -332,6 +324,7 @@ class ExtSumProcessor:
|
|||
batch (object): A Batch containing input ids, segment ids, sentence class ids,
|
||||
masks for the input ids, masks for sentence class ids and source text.
|
||||
If train_mode is True, it also contains the labels and target text.
|
||||
device (torch.device): A PyTorch device.
|
||||
model_name (str): Model name used to format the inputs.
|
||||
train_mode (bool, optional): Training mode flag.
|
||||
Defaults to True.
|
||||
|
@ -344,6 +337,7 @@ class ExtSumProcessor:
|
|||
|
||||
if model_name.split("-")[0] in ["bert", "distilbert"]:
|
||||
if train_mode:
|
||||
batch = batch.to(device)
|
||||
# labels must be the last
|
||||
return {
|
||||
"x": batch.src,
|
||||
|
@ -354,12 +348,13 @@ class ExtSumProcessor:
|
|||
"labels": batch.labels,
|
||||
}
|
||||
else:
|
||||
batch = Bunch(batch)
|
||||
return {
|
||||
"x": batch.src,
|
||||
"segs": batch.segs,
|
||||
"clss": batch.clss,
|
||||
"mask": batch.mask,
|
||||
"mask_cls": batch.mask_cls,
|
||||
"x": batch.src.to(device),
|
||||
"segs": batch.segs.to(device),
|
||||
"clss": batch.clss.to(device),
|
||||
"mask": batch.mask.to(device),
|
||||
"mask_cls": batch.mask_cls.to(device),
|
||||
}
|
||||
else:
|
||||
raise ValueError("Model not supported: {}".format(model_name))
|
||||
|
@ -476,7 +471,7 @@ class ExtractiveSummarizer(Transformer):
|
|||
Args:
|
||||
model_name (str, optional): Transformer model name used in preprocessing.
|
||||
check MODEL_CLASS for supported models. Defaults to "distilbert-base-uncased".
|
||||
encoder (str, optional): Encoder algorithm used by summarization layer.
|
||||
encoder (str, optional): Encoder algorithm used by summarization layer.
|
||||
There are four options:
|
||||
- baseline: it uses a smaller transformer model to replace the bert model
|
||||
and with transformer summarization layer.
|
||||
|
@ -485,13 +480,11 @@ class ExtractiveSummarizer(Transformer):
|
|||
- transformer: it uses pretrained BERT and fine-tune BERT with transformer
|
||||
summarization layer.
|
||||
- RNN: it uses pretrained BERT and fine-tune BERT with LSTM summarization layer.
|
||||
Defaults to "transformer".
|
||||
Defaults to "transformer".
|
||||
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
|
||||
"""
|
||||
|
||||
super().__init__(
|
||||
model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir
|
||||
)
|
||||
super().__init__(model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir)
|
||||
if model_name not in self.list_supported_models():
|
||||
raise ValueError(
|
||||
"Model name {} is not supported by ExtractiveSummarizer. "
|
||||
|
@ -522,6 +515,7 @@ class ExtractiveSummarizer(Transformer):
|
|||
self,
|
||||
train_dataset,
|
||||
num_gpus=None,
|
||||
gpu_ids=None,
|
||||
batch_size=3000,
|
||||
local_rank=-1,
|
||||
max_steps=5e5,
|
||||
|
@ -546,7 +540,10 @@ class ExtractiveSummarizer(Transformer):
|
|||
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
|
||||
be used. If set to 0 or GPUs are not available, CPU device will
|
||||
be used. Defaults to None.
|
||||
batch_size (int, optional): Maximum number of tokens in each batch.
|
||||
gpu_ids (list): List of GPU IDs to be used.
|
||||
If set to None, the first num_gpus GPUs will be used.
|
||||
Defaults to None.
|
||||
batch_size (int, optional): Maximum number of tokens in each batch.
|
||||
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
|
||||
-1, which means non-distributed training.
|
||||
max_steps (int, optional): Maximum number of training steps. Defaults to 5e5.
|
||||
|
@ -571,16 +568,7 @@ class ExtractiveSummarizer(Transformer):
|
|||
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
|
||||
"""
|
||||
|
||||
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)
|
||||
|
||||
def move_batch_to_device(batch, device):
|
||||
return batch.to(device)
|
||||
|
||||
# if isinstance(self.model, nn.DataParallel):
|
||||
# self.model.module.to(device)
|
||||
# else:
|
||||
self.model.to(device)
|
||||
|
||||
# init optimizer
|
||||
optimizer = model_builder.build_optim(
|
||||
optimization_method,
|
||||
learning_rate,
|
||||
|
@ -594,31 +582,34 @@ class ExtractiveSummarizer(Transformer):
|
|||
)
|
||||
|
||||
# batch_size is the number of tokens in a batch
|
||||
train_dataloader = get_dataloader(
|
||||
train_dataset.get_stream(), is_labeled=True, batch_size=batch_size
|
||||
train_dataloader = get_dataloader(train_dataset.get_stream(), is_labeled=True, batch_size=batch_size)
|
||||
|
||||
# compute the max number of training steps
|
||||
max_steps = compute_training_steps(
|
||||
train_dataloader, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
super().fine_tune(
|
||||
train_dataloader=train_dataloader,
|
||||
get_inputs=ExtSumProcessor.get_inputs,
|
||||
move_batch_to_device=move_batch_to_device,
|
||||
n_gpu=num_gpus,
|
||||
num_train_epochs=-1,
|
||||
num_gpus=num_gpus,
|
||||
gpu_ids=gpu_ids,
|
||||
max_steps=max_steps,
|
||||
optimizer=optimizer,
|
||||
warmup_steps=warmup_steps,
|
||||
max_grad_norm=max_grad_norm,
|
||||
gradient_accumulation_steps=gradient_accumulation_steps,
|
||||
optimizer=optimizer,
|
||||
scheduler=None,
|
||||
verbose=verbose,
|
||||
seed=seed,
|
||||
report_every=report_every,
|
||||
clip_grad_norm=False,
|
||||
max_grad_norm=max_grad_norm,
|
||||
)
|
||||
|
||||
def predict(
|
||||
self,
|
||||
test_dataset,
|
||||
num_gpus=1,
|
||||
gpu_ids=None,
|
||||
batch_size=16,
|
||||
sentence_separator="<q>",
|
||||
top_n=3,
|
||||
|
@ -632,6 +623,9 @@ class ExtractiveSummarizer(Transformer):
|
|||
Args:
|
||||
test_dataset (Dataset): Dataset for which the summary is to be predicted.
|
||||
num_gpus (int, optional): The number of GPUs used in prediction. Defaults to 1.
|
||||
gpu_ids (list): List of GPU IDs to be used.
|
||||
If set to None, the first num_gpus GPUs will be used.
|
||||
Defaults to None.
|
||||
batch_size (int, optional): The number of test examples in each batch. Defaults to 16.
|
||||
sentence_separator (str, optional): String to be inserted between sentences in
|
||||
the prediction. Defaults to '<q>'.
|
||||
|
@ -678,10 +672,8 @@ class ExtractiveSummarizer(Transformer):
|
|||
}
|
||||
|
||||
test_sampler = SequentialSampler(test_dataset)
|
||||
test_dataloader = DataLoader(
|
||||
test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn
|
||||
)
|
||||
sent_scores = self.predict_scores(test_dataloader, num_gpus=num_gpus)
|
||||
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn)
|
||||
sent_scores = self.predict_scores(test_dataloader, num_gpus=num_gpus, gpu_ids=gpu_ids)
|
||||
sent_scores_list = list(sent_scores)
|
||||
scores_list = []
|
||||
for i in sent_scores_list:
|
||||
|
@ -699,15 +691,18 @@ class ExtractiveSummarizer(Transformer):
|
|||
prediction.extend(temp_pred)
|
||||
return prediction
|
||||
|
||||
def predict_scores(self, eval_dataloader, num_gpus=1, verbose=True):
|
||||
def predict_scores(self, test_dataloader, num_gpus=1, gpu_ids=None, verbose=True):
|
||||
"""
|
||||
Scores a dataset using a fine-tuned model and a given dataloader.
|
||||
|
||||
Args:
|
||||
eval_dataloader (Dataloader): Dataloader for the evaluation data.
|
||||
test_dataloader (Dataloader): Dataloader for scoring the data.
|
||||
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
|
||||
be used. If set to 0 or GPUs are not available, CPU device will be used.
|
||||
Defaults to None.
|
||||
gpu_ids (list): List of GPU IDs to be used.
|
||||
If set to None, the first num_gpus GPUs will be used.
|
||||
Defaults to None.
|
||||
verbose (bool, optional): Whether to print out the training log. Defaults to True.
|
||||
|
||||
Returns
|
||||
|
@ -716,23 +711,13 @@ class ExtractiveSummarizer(Transformer):
|
|||
|
||||
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)
|
||||
|
||||
def move_batch_to_device(batch, device):
|
||||
batch["src"] = batch["src"].to(device)
|
||||
batch["segs"] = batch["segs"].to(device)
|
||||
batch["clss"] = batch["clss"].to(device)
|
||||
batch["mask"] = batch["mask"].to(device)
|
||||
batch["mask_cls"] = batch["mask_cls"].to(device)
|
||||
if "labels" in batch:
|
||||
batch["labels"] = batch["labels"].to(device)
|
||||
return Bunch(batch)
|
||||
|
||||
preds = list(
|
||||
super().predict(
|
||||
eval_dataloader=eval_dataloader,
|
||||
eval_dataloader=test_dataloader,
|
||||
get_inputs=ExtSumProcessor.get_inputs,
|
||||
n_gpu=num_gpus,
|
||||
num_gpus=num_gpus,
|
||||
gpu_ids=gpu_ids,
|
||||
verbose=verbose,
|
||||
move_batch_to_device=move_batch_to_device,
|
||||
)
|
||||
)
|
||||
return preds
|
||||
|
|
|
@ -2,18 +2,16 @@
|
|||
# Licensed under the MIT License.
|
||||
|
||||
import logging
|
||||
from collections import Iterable
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from collections import Iterable
|
||||
from torch.utils.data import TensorDataset
|
||||
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForTokenClassification
|
||||
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForTokenClassification
|
||||
from utils_nlp.common.pytorch_utils import get_device
|
||||
|
||||
from utils_nlp.common.pytorch_utils import compute_training_steps
|
||||
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
|
||||
TC_MODEL_CLASS = {}
|
||||
TC_MODEL_CLASS.update({k: BertForTokenClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
|
@ -42,27 +40,36 @@ class TokenClassificationProcessor:
|
|||
)
|
||||
|
||||
@staticmethod
|
||||
def get_inputs(batch, model_name, train_mode=True):
|
||||
def get_inputs(batch, device, model_name, train_mode=True):
|
||||
"""
|
||||
Produce a dictionary object for model training or prediction.
|
||||
Creates an input dictionary given a model name.
|
||||
|
||||
Args:
|
||||
model_name (str): The pretained model name.
|
||||
train_mode (bool, optional): Whether it's for model training. Set it to False if
|
||||
it's for testing and it won't have the 'labels' data field.
|
||||
Defaults to True, for model training.
|
||||
batch (tuple): A tuple containing input ids, attention mask,
|
||||
segment ids, and labels tensors.
|
||||
device (torch.device): A PyTorch device.
|
||||
model_name (str): Model name used to format the inputs.
|
||||
train_mode (bool, optional): Training mode flag.
|
||||
Defaults to True.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary object contains all needed information for training or testing.
|
||||
dict: Dictionary containing input ids, segment ids, masks, and labels.
|
||||
Labels are only returned when train_mode is True.
|
||||
"""
|
||||
batch = tuple(t.to(device) for t in batch)
|
||||
if model_name.split("-")[0] in ["bert", "distilbert"]:
|
||||
if train_mode:
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
else:
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
|
||||
|
||||
if model_name not in list(TC_MODEL_CLASS):
|
||||
raise ValueError("Model not supported: {}".format(model_name))
|
||||
# distilbert doesn't support segment ids
|
||||
if model_name.split("-")[0] not in ["distilbert"]:
|
||||
inputs["token_type_ids"] = batch[2]
|
||||
|
||||
if train_mode:
|
||||
return {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
return inputs
|
||||
else:
|
||||
return {"input_ids": batch[0], "attention_mask": batch[1]}
|
||||
raise ValueError("Model not supported: {}".format(model_name))
|
||||
|
||||
@staticmethod
|
||||
def create_label_map(label_lists, trailing_piece_tag="X"):
|
||||
|
@ -89,9 +96,7 @@ class TokenClassificationProcessor:
|
|||
label_map[trailing_piece_tag] = len(label_set)
|
||||
return label_map
|
||||
|
||||
def preprocess_for_bert(
|
||||
self, text, max_len=MAX_SEQ_LEN, labels=None, label_map=None, trailing_piece_tag="X"
|
||||
):
|
||||
def preprocess_for_bert(self, text, max_len=MAX_SEQ_LEN, labels=None, label_map=None, trailing_piece_tag="X"):
|
||||
"""
|
||||
Tokenize and preprocesses input word lists, involving the following steps
|
||||
0. WordPiece tokenization.
|
||||
|
@ -125,7 +130,7 @@ class TokenClassificationProcessor:
|
|||
Returns:
|
||||
TensorDataset: A TensorDataset containing the following four tensors.
|
||||
1. input_ids_all: Tensor. Each sublist contains numerical values,
|
||||
i.e. token ids, corresponding to the tokens in the input
|
||||
i.e. token ids, corresponding to the tokens in the input
|
||||
text data.
|
||||
2. input_mask_all: Tensor. Each sublist contains the attention
|
||||
mask of the input token id list, 1 for input tokens and 0 for
|
||||
|
@ -146,9 +151,7 @@ class TokenClassificationProcessor:
|
|||
return isinstance(obj, Iterable) and not isinstance(obj, str)
|
||||
|
||||
if max_len > MAX_SEQ_LEN:
|
||||
logging.warning(
|
||||
"Setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN)
|
||||
)
|
||||
logging.warning("Setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN))
|
||||
max_len = MAX_SEQ_LEN
|
||||
|
||||
if not _is_iterable_but_not_string(text):
|
||||
|
@ -181,9 +184,7 @@ class TokenClassificationProcessor:
|
|||
for t, t_labels in zip(text, labels):
|
||||
if len(t) != len(t_labels):
|
||||
raise ValueError(
|
||||
"The number of words is {0}, but the number of labels is {1}.".format(
|
||||
len(t), len(t_labels)
|
||||
)
|
||||
"The number of words is {0}, but the number of labels is {1}.".format(len(t), len(t_labels))
|
||||
)
|
||||
|
||||
new_labels = []
|
||||
|
@ -197,11 +198,7 @@ class TokenClassificationProcessor:
|
|||
new_tokens.append(sub_word)
|
||||
|
||||
if len(new_tokens) > max_len:
|
||||
logging.warn(
|
||||
"Text after tokenization with length {} has been truncated".format(
|
||||
len(new_tokens)
|
||||
)
|
||||
)
|
||||
logging.warn("Text after tokenization with length {} has been truncated".format(len(new_tokens)))
|
||||
new_tokens = new_tokens[:max_len]
|
||||
new_labels = new_labels[:max_len]
|
||||
input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens)
|
||||
|
@ -218,9 +215,7 @@ class TokenClassificationProcessor:
|
|||
input_mask += padding
|
||||
new_labels += label_padding
|
||||
|
||||
trailing_token_mask_all.append(
|
||||
[True if label != trailing_piece_tag else False for label in new_labels]
|
||||
)
|
||||
trailing_token_mask_all.append([True if label != trailing_piece_tag else False for label in new_labels])
|
||||
|
||||
if label_map:
|
||||
label_ids = [label_map[label] for label in new_labels]
|
||||
|
@ -235,32 +230,17 @@ class TokenClassificationProcessor:
|
|||
td = TensorDataset(
|
||||
torch.tensor(input_ids_all, dtype=torch.long),
|
||||
torch.tensor(input_mask_all, dtype=torch.long),
|
||||
torch.tensor(trailing_token_mask_all, dtype=torch.bool),
|
||||
torch.tensor(trailing_token_mask_all, dtype=torch.long),
|
||||
torch.tensor(label_ids_all, dtype=torch.long),
|
||||
)
|
||||
else:
|
||||
td = TensorDataset(
|
||||
torch.tensor(input_ids_all, dtype=torch.long),
|
||||
torch.tensor(input_mask_all, dtype=torch.long),
|
||||
torch.tensor(trailing_token_mask_all, dtype=torch.bool),
|
||||
torch.tensor(trailing_token_mask_all, dtype=torch.long),
|
||||
)
|
||||
return td
|
||||
|
||||
def create_dataloader_from_dataset(
|
||||
self, dataset, shuffle=False, batch_size=32, num_gpus=None, distributed=False
|
||||
):
|
||||
if num_gpus is None:
|
||||
num_gpus = torch.cuda.device_count()
|
||||
|
||||
batch_size = batch_size * max(1, num_gpus)
|
||||
|
||||
if distributed:
|
||||
sampler = DistributedSampler(dataset)
|
||||
else:
|
||||
sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
|
||||
|
||||
return DataLoader(dataset, sampler=sampler, batch_size=batch_size)
|
||||
|
||||
|
||||
class TokenClassifier(Transformer):
|
||||
"""
|
||||
|
@ -277,10 +257,7 @@ class TokenClassifier(Transformer):
|
|||
|
||||
def __init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."):
|
||||
super().__init__(
|
||||
model_class=TC_MODEL_CLASS,
|
||||
model_name=model_name,
|
||||
num_labels=num_labels,
|
||||
cache_dir=cache_dir,
|
||||
model_class=TC_MODEL_CLASS, model_name=model_name, num_labels=num_labels, cache_dir=cache_dir,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
|
@ -291,7 +268,10 @@ class TokenClassifier(Transformer):
|
|||
self,
|
||||
train_dataloader,
|
||||
num_epochs=1,
|
||||
max_steps=-1,
|
||||
gradient_accumulation_steps=1,
|
||||
num_gpus=None,
|
||||
gpu_ids=None,
|
||||
local_rank=-1,
|
||||
weight_decay=0.0,
|
||||
learning_rate=5e-5,
|
||||
|
@ -301,73 +281,96 @@ class TokenClassifier(Transformer):
|
|||
seed=None,
|
||||
):
|
||||
"""
|
||||
Fit the TokenClassifier model using the given training dataset.
|
||||
Fine-tunes a pre-trained token classification model.
|
||||
|
||||
Args:
|
||||
train_dataloader (DataLoader): DataLoader instance for training.
|
||||
num_epochs (int, optional): Number of training epochs.
|
||||
Defaults to 1.
|
||||
train_dataloader (Dataloader): A PyTorch DataLoader to be used for training.
|
||||
num_epochs (int, optional): Number of training epochs. Defaults to 1.
|
||||
max_steps (int, optional): Total number of training steps.
|
||||
If set to a positive value, it overrides num_epochs.
|
||||
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
|
||||
Defaults to -1.
|
||||
gradient_accumulation_steps (int, optional): Number of steps to accumulate
|
||||
before performing a backward/update pass.
|
||||
Defaults to 1.
|
||||
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
|
||||
be used. If set to 0 or GPUs are not available, CPU device will
|
||||
be used. Defaults to None.
|
||||
local_rank (int, optional): Whether need to do distributed training.
|
||||
Defaults to -1, no distributed training.
|
||||
weight_decay (float, optional): Weight decay rate.
|
||||
Defaults to 0.
|
||||
learning_rate (float, optional): The learning rate.
|
||||
Defaults to 5e-5.
|
||||
adam_espilon (float, optional): The 'eps' parameter for the 'AdamW' optimizer.
|
||||
Defaults to 1e-8.
|
||||
warmup_steps (int, optional): Number of warmup steps for 'WarmupLinearSchedule'.
|
||||
Defaults to 0.
|
||||
verbose (bool, optional): Verbose model.
|
||||
Defaults to False.
|
||||
seed (int, optional): The seed for the transformers.
|
||||
Defaults to None, use the default seed.
|
||||
be used. If set to 0 or GPUs are not available, CPU device will be used.
|
||||
Defaults to None.
|
||||
gpu_ids (list): List of GPU IDs to be used.
|
||||
If set to None, the first num_gpus GPUs will be used.
|
||||
Defaults to None.
|
||||
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
|
||||
-1, which means non-distributed training.
|
||||
weight_decay (float, optional): Weight decay to apply after each parameter update.
|
||||
Defaults to 0.0.
|
||||
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
|
||||
5e-5.
|
||||
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
|
||||
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
|
||||
to `learning rate`. Defaults to 0.
|
||||
verbose (bool, optional): Whether to print out the training log. Defaults to True.
|
||||
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
|
||||
"""
|
||||
|
||||
# init optimizer
|
||||
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
|
||||
|
||||
# compute the max number of training steps
|
||||
max_steps = compute_training_steps(
|
||||
train_dataloader,
|
||||
num_epochs=num_epochs,
|
||||
max_steps=max_steps,
|
||||
gradient_accumulation_steps=gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# init scheduler
|
||||
scheduler = Transformer.get_default_scheduler(
|
||||
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
|
||||
)
|
||||
|
||||
# fine tune
|
||||
super().fine_tune(
|
||||
train_dataloader=train_dataloader,
|
||||
get_inputs=TokenClassificationProcessor.get_inputs,
|
||||
n_gpu=num_gpus,
|
||||
num_train_epochs=num_epochs,
|
||||
weight_decay=weight_decay,
|
||||
learning_rate=learning_rate,
|
||||
adam_epsilon=adam_epsilon,
|
||||
warmup_steps=warmup_steps,
|
||||
num_gpus=num_gpus,
|
||||
gpu_ids=gpu_ids,
|
||||
max_steps=max_steps,
|
||||
gradient_accumulation_steps=gradient_accumulation_steps,
|
||||
optimizer=optimizer,
|
||||
scheduler=scheduler,
|
||||
local_rank=local_rank,
|
||||
verbose=verbose,
|
||||
seed=seed,
|
||||
)
|
||||
|
||||
def predict(self, eval_dataloader, num_gpus=None, verbose=True):
|
||||
def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True):
|
||||
"""
|
||||
Test on an evaluation dataset and get the token label predictions.
|
||||
Scores a dataset using a fine-tuned model and a given dataloader.
|
||||
|
||||
Args:
|
||||
eval_dataset (TensorDataset): A TensorDataset for evaluation.
|
||||
test_dataloader (DataLoader): DataLoader for scoring the data.
|
||||
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
|
||||
be used. If set to 0 or GPUs are not available, CPU device will
|
||||
be used. Defaults to None.
|
||||
verbose (bool, optional): Verbose model.
|
||||
Defaults to False.
|
||||
be used. If set to 0 or GPUs are not available, CPU device will be used.
|
||||
Defaults to None.
|
||||
gpu_ids (list): List of GPU IDs to be used.
|
||||
If set to None, the first num_gpus GPUs will be used.
|
||||
Defaults to None.
|
||||
verbose (bool, optional): Whether to print out the training log. Defaults to True.
|
||||
|
||||
Returns:
|
||||
ndarray: Numpy ndarray of raw predictions. The shape of the ndarray is
|
||||
[number_of_examples, sequence_length, number_of_labels]. Each
|
||||
value in the ndarray is not normalized. Post-process will be needed
|
||||
to get the probability for each class label.
|
||||
Returns
|
||||
1darray: numpy array of predicted label indices.
|
||||
"""
|
||||
|
||||
preds = list(
|
||||
super().predict(
|
||||
eval_dataloader=eval_dataloader,
|
||||
eval_dataloader=test_dataloader,
|
||||
get_inputs=TokenClassificationProcessor.get_inputs,
|
||||
n_gpu=num_gpus,
|
||||
num_gpus=num_gpus,
|
||||
gpu_ids=gpu_ids,
|
||||
verbose=verbose,
|
||||
)
|
||||
)
|
||||
preds_np = np.concatenate(preds)
|
||||
return preds_np
|
||||
return np.concatenate(preds)
|
||||
|
||||
def get_predicted_token_labels(self, predictions, label_map, dataset):
|
||||
"""
|
||||
|
@ -376,21 +379,19 @@ class TokenClassifier(Transformer):
|
|||
Args:
|
||||
predictions (ndarray): A numpy ndarray produced from the `predict` function call.
|
||||
The shape of the ndarray is [number_of_examples, sequence_length, number_of_labels].
|
||||
label_map (dict): A dictionary object to map a label (str) to an ID (int).
|
||||
label_map (dict): A dictionary object to map a label (str) to an ID (int).
|
||||
dataset (TensorDataset): The TensorDataset for evaluation.
|
||||
dataset (Dataset): The test Dataset instance.
|
||||
|
||||
Returns:
|
||||
list: A list of lists. The size of the returned list is the number of testing samples.
|
||||
Each sublist represents the predicted label for each token.
|
||||
Each sublist represents the predicted label for each token.
|
||||
"""
|
||||
|
||||
num_samples = len(dataset.tensors[0])
|
||||
if num_samples != predictions.shape[0]:
|
||||
raise ValueError(
|
||||
"Predictions have {0} samples, but got {1} samples in dataset".format(
|
||||
predictions.shape[0], num_samples
|
||||
)
|
||||
"Predictions have {0} samples, but got {1} samples in dataset".format(predictions.shape[0], num_samples)
|
||||
)
|
||||
|
||||
label_id2str = {v: k for k, v in label_map.items()}
|
||||
|
@ -409,7 +410,7 @@ class TokenClassifier(Transformer):
|
|||
if attention_mask[sid] == 0:
|
||||
break
|
||||
|
||||
if not trailing_mask[sid]:
|
||||
if not bool(trailing_mask[sid]):
|
||||
continue
|
||||
|
||||
label_id = seq_probs[sid].argmax()
|
||||
|
@ -422,13 +423,13 @@ class TokenClassifier(Transformer):
|
|||
Get the true testing label values.
|
||||
|
||||
Args:
|
||||
label_map (dict): A dictionary object to map a label (str) to an ID (int).
|
||||
label_map (dict): A dictionary object to map a label (str) to an ID (int).
|
||||
dataset (TensorDataset): The TensorDataset for evaluation.
|
||||
dataset (Dataset): The test Dataset instance.
|
||||
|
||||
Returns:
|
||||
list: A list of lists. The size of the returned list is the number of testing samples.
|
||||
Each sublist represents the predicted label for each token.
|
||||
Each sublist represents the predicted label for each token.
|
||||
"""
|
||||
|
||||
num_samples = len(dataset.tensors[0])
|
||||
|
|
|
@ -17,38 +17,30 @@
|
|||
# Modifications copyright © Microsoft Corporation
|
||||
|
||||
|
||||
import os
|
||||
import logging
|
||||
from tqdm import tqdm
|
||||
import collections
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
|
||||
import jsonlines
|
||||
|
||||
import torch
|
||||
from torch.utils.data import TensorDataset, SequentialSampler, DataLoader, RandomSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
|
||||
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
|
||||
from torch.utils.data import TensorDataset
|
||||
from tqdm import tqdm
|
||||
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForQuestionAnswering
|
||||
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForQuestionAnswering
|
||||
from transformers.modeling_xlnet import (
|
||||
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
XLNetForQuestionAnswering,
|
||||
)
|
||||
from transformers.modeling_distilbert import (
|
||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
DistilBertForQuestionAnswering,
|
||||
)
|
||||
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForQuestionAnswering
|
||||
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForQuestionAnswering
|
||||
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
|
||||
|
||||
from utils_nlp.common.pytorch_utils import get_device
|
||||
from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device
|
||||
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
|
||||
|
||||
MODEL_CLASS = {}
|
||||
MODEL_CLASS.update({k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
MODEL_CLASS.update({k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
MODEL_CLASS.update(
|
||||
{k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
|
||||
)
|
||||
MODEL_CLASS.update({k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
MODEL_CLASS.update({k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
|
||||
# cached files during preprocessing
|
||||
# these are used in postprocessing to generate the final answer texts
|
||||
|
@ -85,9 +77,7 @@ class QAProcessor:
|
|||
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."
|
||||
):
|
||||
def __init__(self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."):
|
||||
self.model_name = model_name
|
||||
self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
|
||||
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False
|
||||
|
@ -116,13 +106,14 @@ class QAProcessor:
|
|||
return self._model_type
|
||||
|
||||
@staticmethod
|
||||
def get_inputs(batch, model_name, train_mode=True):
|
||||
def get_inputs(batch, device, model_name, train_mode=True):
|
||||
"""
|
||||
Creates an input dictionary given a model name.
|
||||
|
||||
Args:
|
||||
batch (tuple): A tuple containing input ids, attention mask,
|
||||
segment ids, and labels tensors.
|
||||
device (torch.device): A PyTorch device.
|
||||
model_name (str): Model name used to format the inputs.
|
||||
train_mode (bool, optional): Training mode flag.
|
||||
Defaults to True.
|
||||
|
@ -131,6 +122,7 @@ class QAProcessor:
|
|||
dict: Dictionary containing input ids, segment ids, masks, and labels.
|
||||
Labels are only returned when train_mode is True.
|
||||
"""
|
||||
batch = tuple(t.to(device) for t in batch)
|
||||
model_type = model_name.split("-")[0]
|
||||
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
|
||||
|
@ -191,6 +183,8 @@ class QAProcessor:
|
|||
directory. These files are required during postprocessing to generate the final
|
||||
answer texts from predicted answer start and answer end indices. Defaults to
|
||||
"./cached_qa_features".
|
||||
Returns:
|
||||
DataSet: A Pytorch DataSet.
|
||||
"""
|
||||
|
||||
if not os.path.exists(feature_cache_dir):
|
||||
|
@ -223,9 +217,7 @@ class QAProcessor:
|
|||
|
||||
qa_examples.append(qa_example_cur)
|
||||
|
||||
qa_examples_json.append(
|
||||
{"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens}
|
||||
)
|
||||
qa_examples_json.append({"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens})
|
||||
|
||||
features_cur = _create_qa_features(
|
||||
qa_example_cur,
|
||||
|
@ -271,28 +263,13 @@ class QAProcessor:
|
|||
start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
|
||||
end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
|
||||
qa_dataset = TensorDataset(
|
||||
input_ids,
|
||||
input_mask,
|
||||
segment_ids,
|
||||
start_positions,
|
||||
end_positions,
|
||||
cls_index,
|
||||
p_mask,
|
||||
input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask,
|
||||
)
|
||||
else:
|
||||
unique_id_all = torch.tensor(unique_id_all, dtype=torch.long)
|
||||
qa_dataset = TensorDataset(
|
||||
input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all
|
||||
)
|
||||
qa_dataset = TensorDataset(input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all)
|
||||
|
||||
if num_gpus is not None:
|
||||
batch_size = batch_size * max(1, num_gpus)
|
||||
if distributed:
|
||||
sampler = DistributedSampler(qa_dataset)
|
||||
else:
|
||||
sampler = RandomSampler(qa_dataset) if is_training else SequentialSampler(qa_dataset)
|
||||
|
||||
return DataLoader(qa_dataset, sampler=sampler, batch_size=batch_size)
|
||||
return qa_dataset
|
||||
|
||||
def postprocess(
|
||||
self,
|
||||
|
@ -420,14 +397,7 @@ class QAResult(QAResult_):
|
|||
|
||||
QAResultExtended_ = collections.namedtuple(
|
||||
"QAResultExtended",
|
||||
[
|
||||
"unique_id",
|
||||
"start_top_log_probs",
|
||||
"start_top_index",
|
||||
"end_top_log_probs",
|
||||
"end_top_index",
|
||||
"cls_logits",
|
||||
],
|
||||
["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits",],
|
||||
)
|
||||
|
||||
|
||||
|
@ -489,18 +459,16 @@ class AnswerExtractor(Transformer):
|
|||
def fit(
|
||||
self,
|
||||
train_dataloader,
|
||||
num_gpus=None,
|
||||
num_epochs=1,
|
||||
learning_rate=5e-5,
|
||||
max_grad_norm=1.0,
|
||||
max_steps=-1,
|
||||
gradient_accumulation_steps=1,
|
||||
warmup_steps=0,
|
||||
weight_decay=0.0,
|
||||
adam_epsilon=1e-8,
|
||||
fp16=False,
|
||||
fp16_opt_level="O1",
|
||||
num_gpus=None,
|
||||
gpu_ids=None,
|
||||
local_rank=-1,
|
||||
weight_decay=0.0,
|
||||
learning_rate=5e-5,
|
||||
adam_epsilon=1e-8,
|
||||
warmup_steps=0,
|
||||
verbose=True,
|
||||
seed=None,
|
||||
cache_model=True,
|
||||
|
@ -509,31 +477,30 @@ class AnswerExtractor(Transformer):
|
|||
Fine-tune pre-trained transformer models for question answering.
|
||||
|
||||
Args:
|
||||
train_dataloader (Dataloader): Dataloader for the training data.
|
||||
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
|
||||
be used. If set to 0 or GPUs are not available, CPU device will
|
||||
be used. Defaults to None.
|
||||
train_dataloader (Dataloader): A PyTorch DataLoader to be used for training.
|
||||
num_epochs (int, optional): Number of training epochs. Defaults to 1.
|
||||
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
|
||||
5e-5.
|
||||
max_grad_norm (float, optional): Maximum gradient norm for gradient clipping.
|
||||
Defaults to 1.0.
|
||||
max_steps (int, optional): Maximum number of training steps. If specified,
|
||||
`num_epochs` will be ignored. Defaults to -1.
|
||||
gradient_accumulation_steps (int, optional): Number of batches to accumulate
|
||||
gradients on between each model parameter update. Defaults to 1.
|
||||
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
|
||||
to `learning rate`. Defaults to 0.
|
||||
weight_decay (float, optional): Weight decay to apply after each parameter update.
|
||||
Defaults to 0.0.
|
||||
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
|
||||
fp16 (bool, optional): Whether to use 16-bit (mixed) precision (through NVIDIA apex)
|
||||
instead of 32-bit. Defaults to False.
|
||||
fp16_opt_level (str, optional): For fp16: Apex AMP optimization level selected in
|
||||
['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html.
|
||||
Defaults to "O1",
|
||||
max_steps (int, optional): Total number of training steps.
|
||||
If set to a positive value, it overrides num_epochs.
|
||||
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
|
||||
Defaults to -1.
|
||||
gradient_accumulation_steps (int, optional): Number of steps to accumulate
|
||||
before performing a backward/update pass.
|
||||
Defaults to 1.
|
||||
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
|
||||
be used. If set to 0 or GPUs are not available, CPU device will be used.
|
||||
Defaults to None.
|
||||
gpu_ids (list): List of GPU IDs to be used.
|
||||
If set to None, the first num_gpus GPUs will be used.
|
||||
Defaults to None.
|
||||
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
|
||||
-1, which means non-distributed training.
|
||||
weight_decay (float, optional): Weight decay to apply after each parameter update.
|
||||
Defaults to 0.0.
|
||||
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
|
||||
5e-5.
|
||||
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
|
||||
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
|
||||
to `learning rate`. Defaults to 0.
|
||||
verbose (bool, optional): Whether to print out the training log. Defaults to True.
|
||||
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
|
||||
cache_model (bool, optional): Whether to save the fine-tuned model. If True,
|
||||
|
@ -542,39 +509,53 @@ class AnswerExtractor(Transformer):
|
|||
|
||||
"""
|
||||
|
||||
# init optimizer
|
||||
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
|
||||
|
||||
# compute the max number of training steps
|
||||
max_steps = compute_training_steps(
|
||||
train_dataloader,
|
||||
num_epochs=num_epochs,
|
||||
max_steps=max_steps,
|
||||
gradient_accumulation_steps=gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# init scheduler
|
||||
scheduler = Transformer.get_default_scheduler(
|
||||
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
|
||||
)
|
||||
|
||||
# fine tune
|
||||
super().fine_tune(
|
||||
train_dataloader=train_dataloader,
|
||||
get_inputs=QAProcessor.get_inputs,
|
||||
num_gpus=num_gpus,
|
||||
gpu_ids=gpu_ids,
|
||||
max_steps=max_steps,
|
||||
num_train_epochs=num_epochs,
|
||||
max_grad_norm=max_grad_norm,
|
||||
gradient_accumulation_steps=gradient_accumulation_steps,
|
||||
n_gpu=num_gpus,
|
||||
weight_decay=weight_decay,
|
||||
learning_rate=learning_rate,
|
||||
adam_epsilon=adam_epsilon,
|
||||
warmup_steps=warmup_steps,
|
||||
fp16=fp16,
|
||||
fp16_opt_level=fp16_opt_level,
|
||||
optimizer=optimizer,
|
||||
scheduler=scheduler,
|
||||
local_rank=local_rank,
|
||||
verbose=verbose,
|
||||
seed=seed,
|
||||
)
|
||||
|
||||
if cache_model:
|
||||
self.save_model()
|
||||
|
||||
def predict(self, test_dataloader, num_gpus=None, verbose=True):
|
||||
def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True):
|
||||
|
||||
"""
|
||||
Predicts answer start and end logits.
|
||||
|
||||
Args:
|
||||
test_dataloader (QADataset): Dataloader for the testing data.
|
||||
test_dataloader (DataLoader): DataLoader for scoring the data.
|
||||
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
|
||||
be used. If set to 0 or GPUs are not available, CPU device will
|
||||
be used. Defaults to None.
|
||||
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
|
||||
-1, which means non-distributed.
|
||||
gpu_ids (list): List of GPU IDs to be used.
|
||||
If set to None, the first num_gpus GPUs will be used.
|
||||
Defaults to None.
|
||||
verbose (bool, optional): Whether to print out the predicting log. Defaults to True.
|
||||
|
||||
Returns:
|
||||
|
@ -584,25 +565,16 @@ class AnswerExtractor(Transformer):
|
|||
def _to_list(tensor):
|
||||
return tensor.detach().cpu().tolist()
|
||||
|
||||
# get device
|
||||
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)
|
||||
|
||||
if isinstance(self.model, torch.nn.DataParallel):
|
||||
self.model = self.model.module
|
||||
|
||||
if num_gpus > 1:
|
||||
self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus)))
|
||||
|
||||
self.model.to(device)
|
||||
self.model.eval()
|
||||
# move model
|
||||
self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1)
|
||||
|
||||
all_results = []
|
||||
for batch in tqdm(test_dataloader, desc="Evaluating", disable=not verbose):
|
||||
batch = tuple(t.to(device) for t in batch)
|
||||
with torch.no_grad():
|
||||
inputs = QAProcessor.get_inputs(batch, self.model_name, train_mode=False)
|
||||
|
||||
inputs = QAProcessor.get_inputs(batch, device, self.model_name, train_mode=False)
|
||||
outputs = self.model(**inputs)
|
||||
|
||||
unique_id_tensor = batch[5]
|
||||
|
||||
for i, u_id in enumerate(unique_id_tensor):
|
||||
|
@ -617,9 +589,7 @@ class AnswerExtractor(Transformer):
|
|||
)
|
||||
else:
|
||||
result = QAResult(
|
||||
unique_id=u_id.item(),
|
||||
start_logits=_to_list(outputs[0][i]),
|
||||
end_logits=_to_list(outputs[1][i]),
|
||||
unique_id=u_id.item(), start_logits=_to_list(outputs[0][i]), end_logits=_to_list(outputs[1][i]),
|
||||
)
|
||||
all_results.append(result)
|
||||
torch.cuda.empty_cache()
|
||||
|
@ -783,9 +753,7 @@ def postprocess_bert_answer(
|
|||
|
||||
# Sort by the sum of the start and end logits in descending order,
|
||||
# so that the first element is the most probable answer
|
||||
prelim_predictions = sorted(
|
||||
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True
|
||||
)
|
||||
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
|
||||
|
||||
seen_predictions = {}
|
||||
nbest = []
|
||||
|
@ -818,19 +786,11 @@ def postprocess_bert_answer(
|
|||
final_text = ""
|
||||
seen_predictions[final_text] = True
|
||||
|
||||
nbest.append(
|
||||
_NbestPrediction(
|
||||
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit
|
||||
)
|
||||
)
|
||||
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
|
||||
# if we didn't include the empty option in the n-best, include it
|
||||
if unanswerable_exists:
|
||||
if "" not in seen_predictions:
|
||||
nbest.append(
|
||||
_NbestPrediction(
|
||||
text="", start_logit=null_start_logit, end_logit=null_end_logit
|
||||
)
|
||||
)
|
||||
nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
|
||||
|
||||
# In very rare edge cases we could only have single null prediction.
|
||||
# So we just create a nonce prediction in this case to avoid failure.
|
||||
|
@ -874,9 +834,7 @@ def postprocess_bert_answer(
|
|||
all_probs[example["qa_id"]] = nbest_json[0]["probability"]
|
||||
else:
|
||||
# predict "" iff the null score - the score of best non-null > threshold
|
||||
score_diff = (
|
||||
score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
|
||||
)
|
||||
score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
|
||||
scores_diff_json[example["qa_id"]] = score_diff
|
||||
if score_diff > null_score_diff_threshold:
|
||||
all_predictions[example["qa_id"]] = ""
|
||||
|
@ -1042,9 +1000,7 @@ def postprocess_xlnet_answer(
|
|||
)
|
||||
)
|
||||
|
||||
prelim_predictions = sorted(
|
||||
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True
|
||||
)
|
||||
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
|
||||
|
||||
seen_predictions = {}
|
||||
nbest = []
|
||||
|
@ -1075,20 +1031,14 @@ def postprocess_xlnet_answer(
|
|||
tok_text = " ".join(tok_text.split())
|
||||
orig_text = " ".join(orig_tokens)
|
||||
|
||||
final_text = _get_final_text(
|
||||
tok_text, orig_text, tokenizer.do_lower_case, verbose_logging
|
||||
)
|
||||
final_text = _get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging)
|
||||
|
||||
if final_text in seen_predictions:
|
||||
continue
|
||||
|
||||
seen_predictions[final_text] = True
|
||||
|
||||
nbest.append(
|
||||
_NbestPrediction(
|
||||
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit
|
||||
)
|
||||
)
|
||||
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
|
||||
|
||||
# In very rare edge cases we could have no valid predictions. So we
|
||||
# just create a nonce prediction in this case to avoid failure.
|
||||
|
@ -1235,9 +1185,7 @@ def _create_qa_example(qa_input, is_training):
|
|||
actual_text = " ".join(d_tokens[start_position : (end_position + 1)])
|
||||
cleaned_answer_text = " ".join(whitespace_tokenize(a_text))
|
||||
if actual_text.find(cleaned_answer_text) == -1:
|
||||
logger.warning(
|
||||
"Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text
|
||||
)
|
||||
logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
|
||||
return
|
||||
else:
|
||||
start_position = -1
|
||||
|
@ -1696,9 +1644,7 @@ def _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
|
|||
|
||||
if len(orig_ns_text) != len(tok_ns_text):
|
||||
if verbose_logging:
|
||||
logger.info(
|
||||
"Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text
|
||||
)
|
||||
logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
|
||||
return orig_text
|
||||
|
||||
# We then project the characters in `pred_text` back to `orig_text` using
|
||||
|
|
|
@ -2,37 +2,25 @@
|
|||
# Licensed under the MIT License.
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from transformers.modeling_bert import (
|
||||
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
BertForSequenceClassification,
|
||||
)
|
||||
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForSequenceClassification
|
||||
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForSequenceClassification
|
||||
from transformers.modeling_distilbert import (
|
||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
DistilBertForSequenceClassification,
|
||||
)
|
||||
from transformers.modeling_roberta import (
|
||||
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
RobertaForSequenceClassification,
|
||||
)
|
||||
from transformers.modeling_xlnet import (
|
||||
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
XLNetForSequenceClassification,
|
||||
)
|
||||
from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, RobertaForSequenceClassification
|
||||
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForSequenceClassification
|
||||
|
||||
from utils_nlp.common.pytorch_utils import compute_training_steps
|
||||
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
|
||||
from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet
|
||||
|
||||
MODEL_CLASS = {}
|
||||
MODEL_CLASS.update({k: BertForSequenceClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
MODEL_CLASS.update(
|
||||
{k: RobertaForSequenceClassification for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP}
|
||||
)
|
||||
MODEL_CLASS.update({k: RobertaForSequenceClassification for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
MODEL_CLASS.update({k: XLNetForSequenceClassification for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
MODEL_CLASS.update(
|
||||
{k: DistilBertForSequenceClassification for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
|
||||
)
|
||||
MODEL_CLASS.update({k: DistilBertForSequenceClassification for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
MODEL_CLASS.update({k: AlbertForSequenceClassification for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
|
||||
|
||||
class Processor:
|
||||
|
@ -56,13 +44,14 @@ class Processor:
|
|||
)
|
||||
|
||||
@staticmethod
|
||||
def get_inputs(batch, model_name, train_mode=True):
|
||||
def get_inputs(batch, device, model_name, train_mode=True):
|
||||
"""
|
||||
Creates an input dictionary given a model name.
|
||||
|
||||
Args:
|
||||
batch (tuple): A tuple containing input ids, attention mask,
|
||||
segment ids, and labels tensors.
|
||||
device (torch.device): A PyTorch device.
|
||||
model_name (str): Model name used to format the inputs.
|
||||
train_mode (bool, optional): Training mode flag.
|
||||
Defaults to True.
|
||||
|
@ -71,7 +60,8 @@ class Processor:
|
|||
dict: Dictionary containing input ids, segment ids, masks, and labels.
|
||||
Labels are only returned when train_mode is True.
|
||||
"""
|
||||
if model_name.split("-")[0] in ["bert", "xlnet", "roberta", "distilbert"]:
|
||||
batch = tuple(t.to(device) for t in batch)
|
||||
if model_name.split("-")[0] in ["bert", "xlnet", "roberta", "distilbert", "albert"]:
|
||||
if train_mode:
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
else:
|
||||
|
@ -103,11 +93,7 @@ class Processor:
|
|||
print("setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN))
|
||||
max_len = MAX_SEQ_LEN
|
||||
# truncate and add CLS & SEP markers
|
||||
tokens = (
|
||||
[tokenizer.cls_token]
|
||||
+ tokenizer.tokenize(text)[0 : max_len - 2]
|
||||
+ [tokenizer.sep_token]
|
||||
)
|
||||
tokens = [tokenizer.cls_token] + tokenizer.tokenize(text)[0 : max_len - 2] + [tokenizer.sep_token]
|
||||
# get input ids
|
||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
# pad sequence
|
||||
|
@ -188,55 +174,13 @@ class Processor:
|
|||
|
||||
return input_ids, attention_mask, token_type_ids
|
||||
|
||||
def create_dataloader_from_df(
|
||||
self,
|
||||
df,
|
||||
text_col,
|
||||
label_col=None,
|
||||
text2_col=None,
|
||||
shuffle=False,
|
||||
max_len=MAX_SEQ_LEN,
|
||||
batch_size=32,
|
||||
num_gpus=None,
|
||||
distributed=False,
|
||||
):
|
||||
"""
|
||||
Creates a PyTorch DataLoader from a Pandas DataFrame for sequence classification tasks.
|
||||
|
||||
Args:
|
||||
df (pandas.DataFrame): Input Pandas DataFrame.
|
||||
text_col (str/int): Text column name or index.
|
||||
label_col (str/int, optional): Label column name or index. Defualts to None.
|
||||
text2_col (str/int, optional): Second text column name or index for sequence-pair tasks.
|
||||
Defualts to None.
|
||||
shuffle (bool, optional): If set to True, the DataLoader will use a RandomSampler,
|
||||
otherwise it will use a SequentialSampler.
|
||||
Defaults to False.
|
||||
max_len (int, optional): Maximum sequence length. Defaults to 512.
|
||||
batch_size (int, optional): Batch size. Defaults to 32.
|
||||
num_gpus (int, optional): Number of GPUs to use.
|
||||
If None, all available GPUs will be used.
|
||||
If set to 0 or GPUs are not available, CPU device will be used.
|
||||
Defaults to None.
|
||||
distributed (bool, optional): If set to True, the DataLoader will use
|
||||
a DistributedSampler.
|
||||
Defaults to False.
|
||||
|
||||
Returns:
|
||||
DataLoader: A PyTorch DataLoader object that can be used for training or scoring.
|
||||
"""
|
||||
|
||||
def dataset_from_dataframe(self, df, text_col, label_col=None, text2_col=None, max_len=MAX_SEQ_LEN):
|
||||
if text2_col is None:
|
||||
ds = SCDataSet(
|
||||
df,
|
||||
text_col,
|
||||
label_col,
|
||||
transform=Processor.text_transform,
|
||||
tokenizer=self.tokenizer,
|
||||
max_len=max_len,
|
||||
return SCDataSet(
|
||||
df, text_col, label_col, transform=Processor.text_transform, tokenizer=self.tokenizer, max_len=max_len,
|
||||
)
|
||||
else:
|
||||
ds = SPCDataSet(
|
||||
return SPCDataSet(
|
||||
df,
|
||||
text_col,
|
||||
text2_col,
|
||||
|
@ -246,26 +190,11 @@ class Processor:
|
|||
max_len=max_len,
|
||||
)
|
||||
|
||||
if num_gpus is None:
|
||||
num_gpus = torch.cuda.device_count()
|
||||
|
||||
batch_size = batch_size * max(1, num_gpus)
|
||||
|
||||
if distributed:
|
||||
sampler = DistributedSampler(ds)
|
||||
else:
|
||||
sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds)
|
||||
|
||||
return DataLoader(ds, sampler=sampler, batch_size=batch_size)
|
||||
|
||||
|
||||
class SequenceClassifier(Transformer):
|
||||
def __init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."):
|
||||
super().__init__(
|
||||
model_class=MODEL_CLASS,
|
||||
model_name=model_name,
|
||||
num_labels=num_labels,
|
||||
cache_dir=cache_dir,
|
||||
model_class=MODEL_CLASS, model_name=model_name, num_labels=num_labels, cache_dir=cache_dir,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
|
@ -276,7 +205,10 @@ class SequenceClassifier(Transformer):
|
|||
self,
|
||||
train_dataloader,
|
||||
num_epochs=1,
|
||||
max_steps=-1,
|
||||
gradient_accumulation_steps=1,
|
||||
num_gpus=None,
|
||||
gpu_ids=None,
|
||||
local_rank=-1,
|
||||
weight_decay=0.0,
|
||||
learning_rate=5e-5,
|
||||
|
@ -289,11 +221,21 @@ class SequenceClassifier(Transformer):
|
|||
Fine-tunes a pre-trained sequence classification model.
|
||||
|
||||
Args:
|
||||
train_dataloader (Dataloader): Dataloader for the training data.
|
||||
train_dataloader (Dataloader): A PyTorch DataLoader to be used for training.
|
||||
num_epochs (int, optional): Number of training epochs. Defaults to 1.
|
||||
max_steps (int, optional): Total number of training steps.
|
||||
If set to a positive value, it overrides num_epochs.
|
||||
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
|
||||
Defaults to -1.
|
||||
gradient_accumulation_steps (int, optional): Number of steps to accumulate
|
||||
before performing a backward/update pass.
|
||||
Defaults to 1.
|
||||
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
|
||||
be used. If set to 0 or GPUs are not available, CPU device will be used.
|
||||
Defaults to None.
|
||||
gpu_ids (list): List of GPU IDs to be used.
|
||||
If set to None, the first num_gpus GPUs will be used.
|
||||
Defaults to None.
|
||||
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
|
||||
-1, which means non-distributed training.
|
||||
weight_decay (float, optional): Weight decay to apply after each parameter update.
|
||||
|
@ -307,28 +249,49 @@ class SequenceClassifier(Transformer):
|
|||
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
|
||||
"""
|
||||
|
||||
# init optimizer
|
||||
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
|
||||
|
||||
# compute the max number of training steps
|
||||
max_steps = compute_training_steps(
|
||||
train_dataloader,
|
||||
num_epochs=num_epochs,
|
||||
max_steps=max_steps,
|
||||
gradient_accumulation_steps=gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# init scheduler
|
||||
scheduler = Transformer.get_default_scheduler(
|
||||
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
|
||||
)
|
||||
|
||||
# fine tune
|
||||
super().fine_tune(
|
||||
train_dataloader=train_dataloader,
|
||||
get_inputs=Processor.get_inputs,
|
||||
n_gpu=num_gpus,
|
||||
num_train_epochs=num_epochs,
|
||||
weight_decay=weight_decay,
|
||||
learning_rate=learning_rate,
|
||||
adam_epsilon=adam_epsilon,
|
||||
warmup_steps=warmup_steps,
|
||||
num_gpus=num_gpus,
|
||||
gpu_ids=gpu_ids,
|
||||
max_steps=max_steps,
|
||||
gradient_accumulation_steps=gradient_accumulation_steps,
|
||||
optimizer=optimizer,
|
||||
scheduler=scheduler,
|
||||
local_rank=local_rank,
|
||||
verbose=verbose,
|
||||
seed=seed,
|
||||
)
|
||||
|
||||
def predict(self, eval_dataloader, num_gpus=None, verbose=True):
|
||||
def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True):
|
||||
"""
|
||||
Scores a dataset using a fine-tuned model and a given dataloader.
|
||||
|
||||
Args:
|
||||
eval_dataloader (Dataloader): Dataloader for the evaluation data.
|
||||
test_dataloader (DataLoader): DataLoader for scoring the data.
|
||||
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
|
||||
be used. If set to 0 or GPUs are not available, CPU device will be used.
|
||||
Defaults to None.
|
||||
gpu_ids (list): List of GPU IDs to be used.
|
||||
If set to None, the first num_gpus GPUs will be used.
|
||||
Defaults to None.
|
||||
verbose (bool, optional): Whether to print out the training log. Defaults to True.
|
||||
|
||||
Returns
|
||||
|
@ -337,12 +300,12 @@ class SequenceClassifier(Transformer):
|
|||
|
||||
preds = list(
|
||||
super().predict(
|
||||
eval_dataloader=eval_dataloader,
|
||||
eval_dataloader=test_dataloader,
|
||||
get_inputs=Processor.get_inputs,
|
||||
n_gpu=num_gpus,
|
||||
num_gpus=num_gpus,
|
||||
gpu_ids=gpu_ids,
|
||||
verbose=verbose,
|
||||
)
|
||||
)
|
||||
preds = np.concatenate(preds)
|
||||
# todo generator & probs
|
||||
return np.argmax(preds, axis=1)
|
||||
|
|
|
@ -2,23 +2,20 @@
|
|||
# Licensed under the MIT License.
|
||||
|
||||
"""Utilities for Xlnet Sequence Classification"""
|
||||
import numpy as np
|
||||
import os
|
||||
from collections import namedtuple
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import (
|
||||
XLNetConfig,
|
||||
XLNetForSequenceClassification,
|
||||
AdamW,
|
||||
WarmupLinearSchedule,
|
||||
)
|
||||
from tqdm import tqdm
|
||||
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
|
||||
from utils_nlp.common.pytorch_utils import get_device, move_to_device
|
||||
from utils_nlp.models.xlnet.common import Language
|
||||
|
||||
import mlflow
|
||||
import mlflow.pytorch
|
||||
import os
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
|
||||
from tqdm import tqdm
|
||||
from transformers import AdamW, WarmupLinearSchedule, XLNetConfig, XLNetForSequenceClassification
|
||||
|
||||
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
|
||||
from utils_nlp.models.xlnet.common import Language
|
||||
|
||||
|
||||
class XLNetSequenceClassifier:
|
||||
|
@ -79,9 +76,7 @@ class XLNetSequenceClassifier:
|
|||
self.max_grad_norm = max_grad_norm
|
||||
|
||||
# create classifier
|
||||
self.config = XLNetConfig.from_pretrained(
|
||||
self.language.value, num_labels=num_labels, cache_dir=cache_dir
|
||||
)
|
||||
self.config = XLNetConfig.from_pretrained(self.language.value, num_labels=num_labels, cache_dir=cache_dir)
|
||||
self.model = XLNetForSequenceClassification(self.config)
|
||||
|
||||
def fit(
|
||||
|
@ -114,7 +109,7 @@ class XLNetSequenceClassifier:
|
|||
"""
|
||||
|
||||
device, num_gpus = get_device(self.num_gpus)
|
||||
self.model = move_to_device(self.model, device, self.num_gpus)
|
||||
self.model = move_model_to_device(self.model, device, self.num_gpus)
|
||||
|
||||
token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
|
||||
input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
|
||||
|
@ -128,24 +123,17 @@ class XLNetSequenceClassifier:
|
|||
token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long)
|
||||
val_token_type_ids_tensor = torch.tensor(val_token_type_ids, dtype=torch.long)
|
||||
|
||||
train_dataset = TensorDataset(
|
||||
token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor
|
||||
)
|
||||
train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor)
|
||||
|
||||
val_dataset = TensorDataset(
|
||||
val_token_ids_tensor,
|
||||
val_input_mask_tensor,
|
||||
val_token_type_ids_tensor,
|
||||
val_labels_tensor,
|
||||
val_token_ids_tensor, val_input_mask_tensor, val_token_type_ids_tensor, val_labels_tensor,
|
||||
)
|
||||
|
||||
else:
|
||||
|
||||
train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, labels_tensor)
|
||||
|
||||
val_dataset = TensorDataset(
|
||||
val_token_ids_tensor, val_input_mask_tensor, val_labels_tensor
|
||||
)
|
||||
val_dataset = TensorDataset(val_token_ids_tensor, val_input_mask_tensor, val_labels_tensor)
|
||||
|
||||
# define optimizer and model parameters
|
||||
param_optimizer = list(self.model.named_parameters())
|
||||
|
@ -155,10 +143,7 @@ class XLNetSequenceClassifier:
|
|||
"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": self.weight_decay,
|
||||
},
|
||||
{
|
||||
"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
{"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
|
||||
val_sampler = RandomSampler(val_dataset)
|
||||
|
@ -181,9 +166,7 @@ class XLNetSequenceClassifier:
|
|||
|
||||
train_sampler = RandomSampler(train_dataset)
|
||||
|
||||
train_dataloader = DataLoader(
|
||||
train_dataset, sampler=train_sampler, batch_size=self.batch_size
|
||||
)
|
||||
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=self.batch_size)
|
||||
|
||||
tr_loss = 0.0
|
||||
logging_loss = 0.0
|
||||
|
@ -191,18 +174,13 @@ class XLNetSequenceClassifier:
|
|||
|
||||
for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
|
||||
if token_type_ids:
|
||||
x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
|
||||
t.to(device) for t in batch
|
||||
)
|
||||
x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(t.to(device) for t in batch)
|
||||
else:
|
||||
token_type_ids_batch = None
|
||||
x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch)
|
||||
|
||||
outputs = self.model(
|
||||
input_ids=x_batch,
|
||||
token_type_ids=token_type_ids_batch,
|
||||
attention_mask=mask_batch,
|
||||
labels=y_batch,
|
||||
input_ids=x_batch, token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=y_batch,
|
||||
)
|
||||
|
||||
loss = outputs[0] # model outputs are always tuple in pytorch-transformers
|
||||
|
@ -220,9 +198,7 @@ class XLNetSequenceClassifier:
|
|||
if logging_steps > 0 and global_step % logging_steps == 0:
|
||||
mlflow.log_metric("learning rate", scheduler.get_lr()[0], step=global_step)
|
||||
mlflow.log_metric(
|
||||
"training loss",
|
||||
(tr_loss - logging_loss) / (logging_steps * self.batch_size),
|
||||
step=global_step,
|
||||
"training loss", (tr_loss - logging_loss) / (logging_steps * self.batch_size), step=global_step,
|
||||
)
|
||||
logging_loss = tr_loss
|
||||
# model checkpointing
|
||||
|
@ -245,9 +221,7 @@ class XLNetSequenceClassifier:
|
|||
)
|
||||
else:
|
||||
token_type_ids_batch = None
|
||||
val_x_batch, val_mask_batch, val_y_batch = tuple(
|
||||
t.to(device) for t in val_batch
|
||||
)
|
||||
val_x_batch, val_mask_batch, val_y_batch = tuple(t.to(device) for t in val_batch)
|
||||
val_outputs = self.model(
|
||||
input_ids=val_x_batch,
|
||||
token_type_ids=val_token_type_ids_batch,
|
||||
|
@ -256,9 +230,7 @@ class XLNetSequenceClassifier:
|
|||
)
|
||||
vloss = val_outputs[0]
|
||||
val_loss += vloss.sum().item()
|
||||
mlflow.log_metric(
|
||||
"validation loss", val_loss / len(val_dataset), step=global_step
|
||||
)
|
||||
mlflow.log_metric("validation loss", val_loss / len(val_dataset), step=global_step)
|
||||
self.model.train()
|
||||
|
||||
if verbose:
|
||||
|
@ -300,13 +272,7 @@ class XLNetSequenceClassifier:
|
|||
torch.cuda.empty_cache()
|
||||
|
||||
def predict(
|
||||
self,
|
||||
token_ids,
|
||||
input_mask,
|
||||
token_type_ids=None,
|
||||
num_gpus=None,
|
||||
batch_size=8,
|
||||
probabilities=False,
|
||||
self, token_ids, input_mask, token_type_ids=None, num_gpus=None, batch_size=8, probabilities=False,
|
||||
):
|
||||
"""Scores the given dataset and returns the predicted classes.
|
||||
|
||||
|
@ -330,7 +296,7 @@ class XLNetSequenceClassifier:
|
|||
"""
|
||||
|
||||
device, num_gpus = get_device(num_gpus)
|
||||
self.model = move_to_device(self.model, device, num_gpus)
|
||||
self.model = move_model_to_device(self.model, device, num_gpus)
|
||||
|
||||
self.model.eval()
|
||||
preds = []
|
||||
|
@ -342,16 +308,11 @@ class XLNetSequenceClassifier:
|
|||
x_batch = torch.tensor(token_ids[start:end], dtype=torch.long, device=device)
|
||||
mask_batch = torch.tensor(input_mask[start:end], dtype=torch.long, device=device)
|
||||
|
||||
token_type_ids_batch = torch.tensor(
|
||||
token_type_ids[start:end], dtype=torch.long, device=device
|
||||
)
|
||||
token_type_ids_batch = torch.tensor(token_type_ids[start:end], dtype=torch.long, device=device)
|
||||
|
||||
with torch.no_grad():
|
||||
pred_batch = self.model(
|
||||
input_ids=x_batch,
|
||||
token_type_ids=token_type_ids_batch,
|
||||
attention_mask=mask_batch,
|
||||
labels=None,
|
||||
input_ids=x_batch, token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=None,
|
||||
)
|
||||
preds.append(pred_batch[0].cpu())
|
||||
if i % batch_size == 0:
|
||||
|
|
Загрузка…
Ссылка в новой задаче