Merge pull request #529 from microsoft/bleik/optim-patch

bleik/common transformers utils update
Said Bleik 2020-01-24 20:06:37 -05:00 committed by GitHub
Parents abeb88acb2 6b35c4917a
Commit 7dcdc32399
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
28 changed files with 1110 additions and 1224 deletions

View file

@ -233,7 +233,7 @@
"source": [
"with Timer() as t:\n",
" preds = model.predict(\n",
" eval_dataloader=test_dataloader,\n",
" test_dataloader=test_dataloader,\n",
" num_gpus=None,\n",
" verbose=True\n",
" )\n",

View file

@ -32,6 +32,7 @@
"from sklearn.preprocessing import LabelEncoder\n",
"from tqdm import tqdm\n",
"from utils_nlp.common.timer import Timer\n",
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
"from utils_nlp.dataset.multinli import load_pandas_df\n",
"from utils_nlp.models.transformers.sequence_classification import (\n",
" Processor, SequenceClassifier)"
@ -93,7 +94,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 222k/222k [01:25<00:00, 2.60kKB/s] \n"
"100%|██████████| 222k/222k [01:20<00:00, 2.74kKB/s] \n"
]
}
],
@ -196,7 +197,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/media/bleik2/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
" FutureWarning)\n"
]
}
@ -232,11 +233,11 @@
{
"data": {
"text/plain": [
"telephone 1055\n",
"slate 1003\n",
"travel 961\n",
"fiction 952\n",
"government 938\n",
"telephone 1043\n",
"slate 989\n",
"fiction 968\n",
"travel 964\n",
"government 945\n",
"Name: genre, dtype: int64"
]
},
@ -385,32 +386,108 @@
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>roberta-base</td>\n",
" <td>bert-base-japanese</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>roberta-large</td>\n",
" <td>bert-base-japanese-whole-word-masking</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>roberta-large-mnli</td>\n",
" <td>bert-base-japanese-char</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>xlnet-base-cased</td>\n",
" <td>bert-base-japanese-char-whole-word-masking</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>xlnet-large-cased</td>\n",
" <td>bert-base-finnish-cased-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>distilbert-base-uncased</td>\n",
" <td>bert-base-finnish-uncased-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>roberta-base</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>roberta-large</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>roberta-large-mnli</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>distilroberta-base</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>roberta-base-openai-detector</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>roberta-large-openai-detector</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>xlnet-base-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>xlnet-large-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>distilbert-base-uncased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>distilbert-base-uncased-distilled-squad</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>distilbert-base-german-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>distilbert-base-multilingual-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>albert-base-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>albert-large-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>albert-xlarge-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>albert-xxlarge-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>albert-base-v2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>albert-large-v2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>albert-xlarge-v2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>albert-xxlarge-v2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
@ -432,13 +509,32 @@
"12 bert-base-cased-finetuned-mrpc\n",
"13 bert-base-german-dbmdz-cased\n",
"14 bert-base-german-dbmdz-uncased\n",
"15 roberta-base\n",
"16 roberta-large\n",
"17 roberta-large-mnli\n",
"18 xlnet-base-cased\n",
"19 xlnet-large-cased\n",
"20 distilbert-base-uncased\n",
"21 distilbert-base-uncased-distilled-squad"
"15 bert-base-japanese\n",
"16 bert-base-japanese-whole-word-masking\n",
"17 bert-base-japanese-char\n",
"18 bert-base-japanese-char-whole-word-masking\n",
"19 bert-base-finnish-cased-v1\n",
"20 bert-base-finnish-uncased-v1\n",
"21 roberta-base\n",
"22 roberta-large\n",
"23 roberta-large-mnli\n",
"24 distilroberta-base\n",
"25 roberta-base-openai-detector\n",
"26 roberta-large-openai-detector\n",
"27 xlnet-base-cased\n",
"28 xlnet-large-cased\n",
"29 distilbert-base-uncased\n",
"30 distilbert-base-uncased-distilled-squad\n",
"31 distilbert-base-german-cased\n",
"32 distilbert-base-multilingual-cased\n",
"33 albert-base-v1\n",
"34 albert-large-v1\n",
"35 albert-xlarge-v1\n",
"36 albert-xxlarge-v1\n",
"37 albert-base-v2\n",
"38 albert-large-v2\n",
"39 albert-xlarge-v2\n",
"40 albert-xxlarge-v2"
]
},
"execution_count": 10,
@ -492,18 +588,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 231508/231508 [00:00<00:00, 15545441.79B/s]\n",
"100%|██████████| 492/492 [00:00<00:00, 560455.61B/s]\n",
"100%|██████████| 267967963/267967963 [00:04<00:00, 61255588.46B/s]\n",
"/media/bleik2/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
" warnings.warn('Was asked to gather along dimension 0, but all '\n",
"100%|██████████| 898823/898823 [00:00<00:00, 23932308.55B/s]\n",
"100%|██████████| 456318/456318 [00:00<00:00, 23321916.66B/s]\n",
"100%|██████████| 473/473 [00:00<00:00, 477015.10B/s]\n",
"100%|██████████| 501200538/501200538 [00:07<00:00, 64332558.45B/s]\n",
"100%|██████████| 798011/798011 [00:00<00:00, 25002433.16B/s]\n",
"100%|██████████| 641/641 [00:00<00:00, 695974.34B/s]\n",
"100%|██████████| 467042463/467042463 [00:08<00:00, 55154509.21B/s]\n"
"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
" warnings.warn('Was asked to gather along dimension 0, but all '\n"
]
}
],
@ -518,11 +604,17 @@
" to_lower=model_name.endswith(\"uncased\"),\n",
" cache_dir=CACHE_DIR,\n",
" )\n",
" train_dataloader = processor.create_dataloader_from_df(\n",
" df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n",
" train_dataset = processor.dataset_from_dataframe(\n",
" df_train, TEXT_COL, LABEL_COL, max_len=MAX_LEN\n",
" )\n",
" test_dataloader = processor.create_dataloader_from_df(\n",
" df_test, TEXT_COL, LABEL_COL, max_len=MAX_LEN, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n",
" train_dataloader = dataloader_from_dataset(\n",
" train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n",
" )\n",
" test_dataset = processor.dataset_from_dataframe(\n",
" df_test, TEXT_COL, LABEL_COL, max_len=MAX_LEN\n",
" )\n",
" test_dataloader = dataloader_from_dataset(\n",
" test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n",
" )\n",
"\n",
" # fine-tune\n",
@ -531,17 +623,12 @@
" )\n",
" with Timer() as t:\n",
" classifier.fit(\n",
" train_dataloader,\n",
" num_epochs=NUM_EPOCHS,\n",
" num_gpus=NUM_GPUS,\n",
" verbose=False,\n",
" train_dataloader, num_epochs=NUM_EPOCHS, num_gpus=NUM_GPUS, verbose=False,\n",
" )\n",
" train_time = t.interval / 3600\n",
"\n",
" # predict\n",
" preds = classifier.predict(\n",
" test_dataloader, num_gpus=NUM_GPUS, verbose=False\n",
" )\n",
" preds = classifier.predict(test_dataloader, num_gpus=NUM_GPUS, verbose=False)\n",
"\n",
" # eval\n",
" accuracy = accuracy_score(df_test[LABEL_COL], preds)\n",
@ -600,21 +687,21 @@
" <tbody>\n",
" <tr>\n",
" <th>accuracy</th>\n",
" <td>0.895477</td>\n",
" <td>0.879584</td>\n",
" <td>0.894866</td>\n",
" <td>0.889364</td>\n",
" <td>0.885697</td>\n",
" <td>0.886308</td>\n",
" </tr>\n",
" <tr>\n",
" <th>f1-score</th>\n",
" <td>0.896656</td>\n",
" <td>0.881218</td>\n",
" <td>0.896108</td>\n",
" <td>0.885225</td>\n",
" <td>0.880926</td>\n",
" <td>0.881819</td>\n",
" </tr>\n",
" <tr>\n",
" <th>time(hrs)</th>\n",
" <td>0.021865</td>\n",
" <td>0.035351</td>\n",
" <td>0.046295</td>\n",
" <td>0.023326</td>\n",
" <td>0.044209</td>\n",
" <td>0.052801</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
@ -622,9 +709,9 @@
],
"text/plain": [
" distilbert-base-uncased roberta-base xlnet-base-cased\n",
"accuracy 0.895477 0.879584 0.894866\n",
"f1-score 0.896656 0.881218 0.896108\n",
"time(hrs) 0.021865 0.035351 0.046295"
"accuracy 0.889364 0.885697 0.886308\n",
"f1-score 0.885225 0.880926 0.881819\n",
"time(hrs) 0.023326 0.044209 0.052801"
]
},
"execution_count": 13,
@ -645,7 +732,7 @@
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.8899755501222494,
"data": 0.887123064384678,
"encoder": "json",
"name": "accuracy",
"version": 1
@ -663,7 +750,7 @@
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.8913273009038569,
"data": 0.8826569624491233,
"encoder": "json",
"name": "f1",
"version": 1
@ -688,9 +775,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "nlp_gpu",
"display_name": "Python 3.6.8 64-bit ('nlp_gpu': conda)",
"language": "python",
"name": "nlp_gpu"
"name": "python36864bitnlpgpucondaa579511bcea84c65877ff3dca4205921"
},
"language_info": {
"codemirror_mode": {

View file

@ -13,7 +13,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -69,7 +69,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {
"tags": [
"parameters"
@ -183,32 +183,108 @@
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>roberta-base</td>\n",
" <td>bert-base-japanese</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>roberta-large</td>\n",
" <td>bert-base-japanese-whole-word-masking</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>roberta-large-mnli</td>\n",
" <td>bert-base-japanese-char</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>xlnet-base-cased</td>\n",
" <td>bert-base-japanese-char-whole-word-masking</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>xlnet-large-cased</td>\n",
" <td>bert-base-finnish-cased-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>distilbert-base-uncased</td>\n",
" <td>bert-base-finnish-uncased-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>roberta-base</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>roberta-large</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>roberta-large-mnli</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>distilroberta-base</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>roberta-base-openai-detector</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>roberta-large-openai-detector</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>xlnet-base-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>xlnet-large-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>distilbert-base-uncased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>distilbert-base-uncased-distilled-squad</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>distilbert-base-german-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>distilbert-base-multilingual-cased</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>albert-base-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>albert-large-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>albert-xlarge-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>albert-xxlarge-v1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>albert-base-v2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>albert-large-v2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>albert-xlarge-v2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>albert-xxlarge-v2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
@ -230,13 +306,32 @@
"12 bert-base-cased-finetuned-mrpc\n",
"13 bert-base-german-dbmdz-cased\n",
"14 bert-base-german-dbmdz-uncased\n",
"15 roberta-base\n",
"16 roberta-large\n",
"17 roberta-large-mnli\n",
"18 xlnet-base-cased\n",
"19 xlnet-large-cased\n",
"20 distilbert-base-uncased\n",
"21 distilbert-base-uncased-distilled-squad"
"15 bert-base-japanese\n",
"16 bert-base-japanese-whole-word-masking\n",
"17 bert-base-japanese-char\n",
"18 bert-base-japanese-char-whole-word-masking\n",
"19 bert-base-finnish-cased-v1\n",
"20 bert-base-finnish-uncased-v1\n",
"21 roberta-base\n",
"22 roberta-large\n",
"23 roberta-large-mnli\n",
"24 distilroberta-base\n",
"25 roberta-base-openai-detector\n",
"26 roberta-large-openai-detector\n",
"27 xlnet-base-cased\n",
"28 xlnet-large-cased\n",
"29 distilbert-base-uncased\n",
"30 distilbert-base-uncased-distilled-squad\n",
"31 distilbert-base-german-cased\n",
"32 distilbert-base-multilingual-cased\n",
"33 albert-base-v1\n",
"34 albert-large-v1\n",
"35 albert-xlarge-v1\n",
"36 albert-xxlarge-v1\n",
"37 albert-base-v2\n",
"38 albert-large-v2\n",
"39 albert-xlarge-v2\n",
"40 albert-xxlarge-v2"
]
},
"execution_count": 3,
@ -264,7 +359,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -281,7 +376,7 @@
" 'num_train_epochs': 5,\n",
" 'num_gpus': 2,\n",
" 'batch_size': 16,\n",
" 'verbose': True,\n",
" 'verbose': False,\n",
" 'load_dataset_func': None,\n",
" 'get_labels_func': None\n",
"}\n",
@ -325,9 +420,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 80.1k/80.1k [00:02<00:00, 30.8kKB/s]\n",
"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/sklearn/model_selection/_split.py:2179: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
" FutureWarning)\n"
]
}
],
"source": [
"train_dataloader, test_dataloader, label_encoder, test_labels = CONFIG['load_dataset_func'](\n",
" local_path=CONFIG['local_path'],\n",
@ -354,11 +459,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/media/bleik2/backup/.conda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
" warnings.warn('Was asked to gather along dimension 0, but all '\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training time : 0.190 hrs\n"
]
}
],
"source": [
"model = SequenceClassifier(\n",
" model_name=CONFIG['model_name'],\n",
@ -390,9 +511,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prediction time : 0.021 hrs\n"
]
}
],
"source": [
"with Timer() as t:\n",
" preds = model.predict(\n",
@ -422,11 +551,11 @@
"text": [
" precision recall f1-score support\n",
"\n",
" culture 0.89 0.89 0.89 843\n",
" diverse 0.99 0.99 0.99 1738\n",
" economy 0.96 0.96 0.96 661\n",
" politics 0.94 0.94 0.94 530\n",
" sports 0.87 0.87 0.87 580\n",
" culture 0.93 0.94 0.93 548\n",
" diverse 0.94 0.94 0.94 640\n",
" economy 0.90 0.88 0.89 570\n",
" politics 0.87 0.88 0.88 809\n",
" sports 0.99 0.98 0.99 1785\n",
"\n",
" micro avg 0.94 0.94 0.94 4352\n",
" macro avg 0.93 0.93 0.93 4352\n",
@ -449,9 +578,64 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.94,
"encoder": "json",
"name": "precision",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "precision"
}
},
"output_type": "display_data"
},
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.94,
"encoder": "json",
"name": "recall",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "recall"
}
},
"output_type": "display_data"
},
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.94,
"encoder": "json",
"name": "f1",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "f1"
}
},
"output_type": "display_data"
}
],
"source": [
"# for testing\n",
"report_splits = report.split('\\n')[-2].split()\n",
@ -463,11 +647,10 @@
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.6.8 64-bit ('nlp_gpu': conda)",
"language": "python",
"name": "python3"
"name": "python36864bitnlpgpucondaa579511bcea84c65877ff3dca4205921"
},
"language_info": {
"codemirror_mode": {

View file

@ -1,14 +1,10 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import json
import shutil
import pytest
import papermill as pm
import pytest
import scrapbook as sb
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
from tests.notebooks_common import KERNEL_NAME, OUTPUT_NOTEBOOK
ABS_TOL = 0.02
@ -31,13 +27,10 @@ def test_extractive_summarization_cnndm_transformers(notebooks, tmp):
CACHE_DIR=tmp,
BATCH_SIZE=3000,
REPORT_EVERY=50,
MAX_STEPS=1e3,
MAX_STEPS=1000,
WARMUP_STEPS=5e2,
MODEL_NAME="distilbert-base-uncased",
),
)
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
print(result)
assert pytest.approx(result["rouge_2_f_score"], 0.1, abs=ABS_TOL)

View file

@ -33,8 +33,8 @@ def test_tc_mnli_transformers(notebooks, tmp):
),
)
result = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict
assert pytest.approx(result["accuracy"], 0.89, abs=ABS_TOL)
assert pytest.approx(result["f1"], 0.89, abs=ABS_TOL)
assert pytest.approx(result["accuracy"], 0.885, abs=ABS_TOL)
assert pytest.approx(result["f1"], 0.885, abs=ABS_TOL)
@pytest.mark.integration

View file

@ -9,4 +9,3 @@ import torch
@pytest.mark.gpu
def test_machine_is_gpu_machine():
assert torch.cuda.is_available() is True

View file

@ -1,79 +0,0 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import pytest
from utils_nlp.models.bert.token_classification import (
BERTTokenClassifier,
postprocess_token_labels,
)
def test_token_classifier_num_labels():
with pytest.raises(ValueError):
BERTTokenClassifier(num_labels=1)
def test_token_classifier_fit_predict(tmp_path, ner_test_data):
token_classifier = BERTTokenClassifier(num_labels=6, cache_dir=tmp_path)
# test fit, no warmup
token_classifier.fit(
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
input_mask=ner_test_data["INPUT_MASK"],
labels=ner_test_data["INPUT_LABEL_IDS"],
)
# test fit, with warmup
token_classifier.fit(
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
input_mask=ner_test_data["INPUT_MASK"],
labels=ner_test_data["INPUT_LABEL_IDS"],
warmup_proportion=0.1,
)
# test predict, no labels
token_classifier.predict(
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
input_mask=ner_test_data["INPUT_MASK"],
)
# test predict, with labels
token_classifier.predict(
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
input_mask=ner_test_data["INPUT_MASK"],
labels=ner_test_data["INPUT_LABEL_IDS"],
)
# test output probabilities
predictions = token_classifier.predict(
token_ids=ner_test_data["INPUT_TOKEN_IDS"],
input_mask=ner_test_data["INPUT_MASK"],
labels=ner_test_data["INPUT_LABEL_IDS"],
probabilities=True,
)
assert len(predictions.classes) == predictions.probabilities.shape[0]
def test_postprocess_token_labels(ner_test_data):
labels_no_padding = postprocess_token_labels(
labels=ner_test_data["PREDICTED_LABELS"],
input_mask=ner_test_data["INPUT_MASK"],
label_map=ner_test_data["LABEL_MAP"],
)
assert labels_no_padding == ner_test_data["EXPECTED_TOKENS_NO_PADDING"]
def test_postprocess_token_labels_remove_trailing(ner_test_data):
labels_no_padding_no_trailing = postprocess_token_labels(
labels=ner_test_data["PREDICTED_LABELS"],
input_mask=ner_test_data["INPUT_MASK"],
label_map=ner_test_data["LABEL_MAP"],
remove_trailing_word_pieces=True,
trailing_token_mask=ner_test_data["TRAILING_TOKEN_MASK"],
)
assert (
labels_no_padding_no_trailing
== ner_test_data["EXPECTED_TOKENS_NO_PADDING_NO_TRAILING"]
)

View file

@ -1,14 +1,15 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""PyTorch utils tests."""
import pytest
import torch
import torch.nn as nn
from torch.nn.parallel.data_parallel import DataParallel
from torch.nn.modules.container import Sequential
from torch.nn.parallel.data_parallel import DataParallel
from utils_nlp.common.pytorch_utils import get_device, move_to_device
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
@pytest.fixture
@ -55,49 +56,47 @@ def test_get_device_local_rank():
def test_move_to_device_cpu(model):
# test when device.type="cpu"
model_cpu = move_to_device(model, torch.device("cpu"))
model_cpu = move_model_to_device(model, torch.device("cpu"))
assert isinstance(model_cpu, nn.modules.container.Sequential)
def test_move_to_device_cpu_parallelized(model):
# test when input model is parallelized
model_parallelized = nn.DataParallel(model)
model_parallelized_output = move_to_device(model_parallelized, torch.device("cpu"))
model_parallelized_output = move_model_to_device(model_parallelized, torch.device("cpu"))
assert isinstance(model_parallelized_output, nn.modules.container.Sequential)
def test_move_to_device_exception_not_torch_device(model):
# test when device is not torch.device
with pytest.raises(ValueError):
move_to_device(model, "abc")
move_model_to_device(model, "abc")
def test_move_to_device_exception_wrong_type(model):
# test when device.type is not "cuda" or "cpu"
with pytest.raises(Exception):
move_to_device(model, torch.device("opengl"))
move_model_to_device(model, torch.device("opengl"))
@pytest.mark.skipif(
torch.cuda.is_available(), reason="Skip if we are executing the cpu tests on a gpu machine"
)
@pytest.mark.skipif(torch.cuda.is_available(), reason="Skip if we are executing the cpu tests on a gpu machine")
def test_move_to_device_exception_gpu_model_on_cpu_machine(model):
# test when the model is moved to a gpu but it is a cpu machine
with pytest.raises(Exception):
move_to_device(model, torch.device("cuda"))
move_model_to_device(model, torch.device("cuda"))
@pytest.mark.gpu
def test_move_to_device_exception_cuda_zero_gpus(model):
# test when device.type is cuda, but num_gpus is 0
with pytest.raises(ValueError):
move_to_device(model, torch.device("cuda"), num_gpus=0)
move_model_to_device(model, torch.device("cuda"), num_gpus=0)
@pytest.mark.gpu
def test_move_to_device_gpu(model):
# test when device.type="cuda"
model_cuda = move_to_device(model, torch.device("cuda"))
model_cuda = move_model_to_device(model, torch.device("cuda"))
num_cuda_devices = torch.cuda.device_count()
if num_cuda_devices > 1:
@ -105,18 +104,16 @@ def test_move_to_device_gpu(model):
else:
assert isinstance(model_cuda, Sequential)
model_cuda_1_gpu = move_to_device(model, torch.device("cuda"), num_gpus=1)
model_cuda_1_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=1)
assert isinstance(model_cuda_1_gpu, Sequential)
model_cuda_1_more_gpu = move_to_device(
model, torch.device("cuda"), num_gpus=num_cuda_devices + 1
)
model_cuda_1_more_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices + 1)
if num_cuda_devices > 1:
assert isinstance(model_cuda_1_more_gpu, DataParallel)
else:
assert isinstance(model_cuda_1_more_gpu, Sequential)
model_cuda_same_gpu = move_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices)
model_cuda_same_gpu = move_model_to_device(model, torch.device("cuda"), num_gpus=num_cuda_devices)
if num_cuda_devices > 1:
assert isinstance(model_cuda_same_gpu, DataParallel)
else:

View file

@ -1,14 +1,12 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import nltk
nltk.download("punkt")
from nltk import tokenize
import pytest
import os
import shutil
import nltk
nltk.download("punkt")
import pytest
from nltk import tokenize
from utils_nlp.models.transformers.datasets import SummarizationDataset
from utils_nlp.models.transformers.extractive_summarization import (
@ -17,6 +15,9 @@ from utils_nlp.models.transformers.extractive_summarization import (
ExtSumProcessor,
)
# @pytest.fixture()
def source_data():
return (
@ -48,18 +49,10 @@ def data_to_file(tmp_module):
f.write(target)
f.close()
train_dataset = SummarizationDataset(
source_file,
target_file,
[tokenize.sent_tokenize],
[tokenize.sent_tokenize],
nltk.word_tokenize,
source_file, target_file, [tokenize.sent_tokenize], [tokenize.sent_tokenize], nltk.word_tokenize,
)
test_dataset = SummarizationDataset(
source_file,
target_file,
[tokenize.sent_tokenize],
[tokenize.sent_tokenize],
nltk.word_tokenize,
source_file, target_file, [tokenize.sent_tokenize], [tokenize.sent_tokenize], nltk.word_tokenize,
)
processor = ExtSumProcessor(
@ -70,20 +63,12 @@ def data_to_file(tmp_module):
min_nsents=0,
min_src_ntokens=1,
)
ext_sum_train = processor.preprocess(
train_dataset, train_dataset.get_target(), oracle_mode="greedy"
)
ext_sum_test = processor.preprocess(
test_dataset, test_dataset.get_target(), oracle_mode="greedy"
)
ext_sum_train = processor.preprocess(train_dataset, train_dataset.get_target(), oracle_mode="greedy")
ext_sum_test = processor.preprocess(test_dataset, test_dataset.get_target(), oracle_mode="greedy")
save_path = os.path.join(tmp_module, "processed")
train_files = ExtSumProcessedData.save_data(
ext_sum_train, is_test=False, save_path=save_path, chunk_size=2000
)
test_files = ExtSumProcessedData.save_data(
ext_sum_test, is_test=True, save_path=save_path, chunk_size=2000
)
train_files = ExtSumProcessedData.save_data(ext_sum_train, is_test=False, save_path=save_path, chunk_size=2000)
test_files = ExtSumProcessedData.save_data(ext_sum_test, is_test=True, save_path=save_path, chunk_size=2000)
print(train_files)
print(test_files)
assert os.path.exists(train_files[0])
@ -96,10 +81,10 @@ def test_bert_training(data_to_file, tmp_module):
CACHE_DIR = tmp_module
ENCODER = "transformer"
BATCH_SIZE = 200
BATCH_SIZE = 128
LEARNING_RATE = 2e-3
REPORT_EVERY = 100
MAX_STEPS = 5e2
REPORT_EVERY = 50
MAX_STEPS = 2e2
WARMUP_STEPS = 1e2
DATA_SAVED_PATH = data_to_file
result_base_path = "./results"

View file

@ -1,18 +1,20 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import pytest
import os
import pytest
import torch
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.models.transformers.datasets import QADataset
from utils_nlp.models.transformers.question_answering import (
QAProcessor,
AnswerExtractor,
CACHED_EXAMPLES_TEST_FILE,
CACHED_FEATURES_TEST_FILE,
AnswerExtractor,
QAProcessor,
)
import torch
NUM_GPUS = max(1, torch.cuda.device_count())
BATCH_SIZE = 8
@ -109,9 +111,7 @@ def qa_test_data(qa_test_df, tmp_module):
feature_cache_dir=tmp_module,
)
qa_processor_distilbert = QAProcessor(
model_name="distilbert-base-uncased", cache_dir=tmp_module
)
qa_processor_distilbert = QAProcessor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
train_features_distilbert = qa_processor_distilbert.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
@ -153,15 +153,9 @@ def qa_test_data(qa_test_df, tmp_module):
def test_QAProcessor(qa_test_data, tmp_module):
for model_name in ["bert-base-cased", "xlnet-base-cased", "distilbert-base-uncased"]:
qa_processor = QAProcessor(model_name=model_name, cache_dir=tmp_module)
qa_processor.preprocess(
qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module
)
qa_processor.preprocess(
qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module
)
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module
)
qa_processor.preprocess(qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module)
# test unsupported model type
with pytest.raises(ValueError):
@ -169,51 +163,49 @@ def test_QAProcessor(qa_test_data, tmp_module):
# test training data has no ground truth exception
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module
)
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module)
# test when answer start is a list, but answer text is not
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["train_dataset_start_text_mismatch"],
is_training=True,
feature_cache_dir=tmp_module,
qa_test_data["train_dataset_start_text_mismatch"], is_training=True, feature_cache_dir=tmp_module,
)
# test when training data has multiple answers
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["train_dataset_multi_answers"],
is_training=True,
feature_cache_dir=tmp_module,
qa_test_data["train_dataset_multi_answers"], is_training=True, feature_cache_dir=tmp_module,
)
def test_AnswerExtractor(qa_test_data, tmp_module):
# test bert
# bert
qa_extractor_bert = AnswerExtractor(cache_dir=tmp_module)
qa_extractor_bert.fit(qa_test_data["train_features_bert"], cache_model=True)
train_loader_bert = dataloader_from_dataset(qa_test_data["train_features_bert"])
test_loader_bert = dataloader_from_dataset(qa_test_data["test_features_bert"], shuffle=False)
qa_extractor_bert.fit(train_loader_bert, verbose=False, cache_model=True)
# test saving fine-tuned model
model_output_dir = os.path.join(tmp_module, "fine_tuned")
assert os.path.exists(os.path.join(model_output_dir, "pytorch_model.bin"))
assert os.path.exists(os.path.join(model_output_dir, "config.json"))
qa_extractor_from_cache = AnswerExtractor(
cache_dir=tmp_module, load_model_from_dir=model_output_dir
)
qa_extractor_from_cache.predict(qa_test_data["test_features_bert"])
qa_extractor_from_cache = AnswerExtractor(cache_dir=tmp_module, load_model_from_dir=model_output_dir)
qa_extractor_from_cache.predict(test_loader_bert, verbose=False)
# xlnet
train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_xlnet"])
test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_xlnet"], shuffle=False)
qa_extractor_xlnet = AnswerExtractor(model_name="xlnet-base-cased", cache_dir=tmp_module)
qa_extractor_xlnet.fit(qa_test_data["train_features_xlnet"], cache_model=False)
qa_extractor_xlnet.predict(qa_test_data["test_features_xlnet"])
qa_extractor_xlnet.fit(train_loader_xlnet, verbose=False, cache_model=False)
qa_extractor_xlnet.predict(test_loader_xlnet, verbose=False)
qa_extractor_distilbert = AnswerExtractor(
model_name="distilbert-base-uncased", cache_dir=tmp_module
)
qa_extractor_distilbert.fit(qa_test_data["train_features_distilbert"], cache_model=False)
qa_extractor_distilbert.predict(qa_test_data["test_features_distilbert"])
# distilbert
train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_distilbert"])
test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_distilbert"], shuffle=False)
qa_extractor_distilbert = AnswerExtractor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
qa_extractor_distilbert.fit(train_loader_xlnet, verbose=False, cache_model=False)
qa_extractor_distilbert.predict(test_loader_xlnet, verbose=False)
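The rewritten test shows that AnswerExtractor.fit and AnswerExtractor.predict now consume DataLoaders built with dataloader_from_dataset rather than raw feature datasets. A condensed sketch of the new call pattern, reusing names from the test above (cache_dir stands in for a real directory):

train_loader = dataloader_from_dataset(train_features)
test_loader = dataloader_from_dataset(test_features, shuffle=False)
extractor = AnswerExtractor(cache_dir=cache_dir)
extractor.fit(train_loader, verbose=False, cache_model=False)
predictions = extractor.predict(test_loader, verbose=False)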
def test_postprocess_bert_answer(qa_test_data, tmp_module):
@ -226,8 +218,9 @@ def test_postprocess_bert_answer(qa_test_data, tmp_module):
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_loader = dataloader_from_dataset(test_features, shuffle=False)
qa_extractor = AnswerExtractor(cache_dir=tmp_module)
predictions = qa_extractor.predict(test_features)
predictions = qa_extractor.predict(test_loader)
qa_processor.postprocess(
results=predictions,
@ -260,8 +253,9 @@ def test_postprocess_xlnet_answer(qa_test_data, tmp_module):
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_loader = dataloader_from_dataset(test_features, shuffle=False)
qa_extractor = AnswerExtractor(model_name="xlnet-base-cased", cache_dir=tmp_module)
predictions = qa_extractor.predict(test_features)
predictions = qa_extractor.predict(test_loader)
qa_processor.postprocess(
results=predictions,

tests/unit/test_transformers_sequence_classification.py (21 changes) Normal file → Executable file
View file

@ -5,6 +5,7 @@ import pytest
import pandas as pd
from utils_nlp.models.transformers.sequence_classification import SequenceClassifier, Processor
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
@pytest.fixture()
@ -19,12 +20,11 @@ def test_classifier(data, tmpdir):
num_labels = len(pd.unique(data[1]))
model_name = "bert-base-uncased"
processor = Processor(model_name=model_name, cache_dir=tmpdir)
train_dataloader = processor.create_dataloader_from_df(
df, "text", "label", batch_size=2, num_gpus=0
)
ds = processor.dataset_from_dataframe(df, "text", "label")
dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=0, shuffle=True)
classifier = SequenceClassifier(model_name=model_name, num_labels=num_labels, cache_dir=tmpdir)
classifier.fit(train_dataloader=train_dataloader, num_epochs=1, num_gpus=0, verbose=False)
preds = classifier.predict(train_dataloader, num_gpus=0, verbose=False)
classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=0, verbose=False)
preds = classifier.predict(dl, num_gpus=0, verbose=False)
assert len(preds) == len(data[1])
@ -35,17 +35,16 @@ def test_classifier_gpu_train_cpu_predict(data, tmpdir):
num_labels = len(pd.unique(data[1]))
model_name = "bert-base-uncased"
processor = Processor(model_name=model_name, cache_dir=tmpdir)
train_dataloader = processor.create_dataloader_from_df(
df, "text", "label", batch_size=2, num_gpus=1
)
ds = processor.dataset_from_dataframe(df, "text", "label")
dl = dataloader_from_dataset(ds, batch_size=2, num_gpus=1, shuffle=True)
classifier = SequenceClassifier(model_name=model_name, num_labels=num_labels, cache_dir=tmpdir)
classifier.fit(train_dataloader=train_dataloader, num_epochs=1, num_gpus=1, verbose=False)
classifier.fit(train_dataloader=dl, num_epochs=1, num_gpus=1, verbose=False)
assert next(classifier.model.parameters()).is_cuda is True
# gpu prediction, no model move
preds = classifier.predict(train_dataloader, num_gpus=1, verbose=False)
preds = classifier.predict(dl, num_gpus=1, verbose=False)
assert len(preds) == len(data[1])
# cpu prediction, need model move
assert next(classifier.model.parameters()).is_cuda is True
preds = classifier.predict(train_dataloader, num_gpus=0, verbose=False)
preds = classifier.predict(dl, num_gpus=0, verbose=False)
assert next(classifier.model.parameters()).is_cuda is False

View file

@ -0,0 +1,23 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import pytest
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.models.transformers.named_entity_recognition import TokenClassificationProcessor, TokenClassifier
@pytest.mark.cpu
def test_token_classifier_fit_predict(tmpdir, ner_test_data):
token_classifier = TokenClassifier(model_name="bert-base-uncased", num_labels=6, cache_dir=tmpdir)
processor = TokenClassificationProcessor(model_name="bert-base-uncased", cache_dir=tmpdir)
# test fit, no warmup
train_dataset = processor.preprocess_for_bert(
text=ner_test_data["INPUT_TEXT"], labels=ner_test_data["INPUT_LABELS"], label_map=ner_test_data["LABEL_MAP"],
)
train_dataloader = dataloader_from_dataset(train_dataset)
token_classifier.fit(train_dataloader)
# test predict, no labels
_ = token_classifier.predict(train_dataloader, verbose=False)

View file

@ -1,11 +1,11 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""Common PyTorch utilities that facilitate building Pytorch models."""
"""Common PyTorch utilities that facilitate building PyTorch models."""
import torch
import torch.nn as nn
import warnings
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
def get_device(
@ -17,11 +17,7 @@ def get_device(
# init_method="file:///distributed",
):
if local_rank == -1:
num_gpus = (
min(num_gpus, torch.cuda.device_count())
if num_gpus is not None
else torch.cuda.device_count()
)
num_gpus = min(num_gpus, torch.cuda.device_count()) if num_gpus is not None else torch.cuda.device_count()
device = torch.device("cuda" if torch.cuda.is_available() and num_gpus > 0 else "cpu")
else:
torch.cuda.set_device(local_rank)
@ -32,59 +28,109 @@ def get_device(
return device, num_gpus
def move_to_device(model, device, num_gpus=None):
def move_model_to_device(model, device, num_gpus=None, gpu_ids=None, local_rank=-1):
"""Moves a model to the specified device (cpu or gpu/s)
and implements data parallelism when multiple gpus are specified.
Args:
model (Module): A PyTorch model
device (torch.device): A PyTorch device
num_gpus (int): The number of GPUs to be used. Defaults to None,
all gpus are used.
model (Module): A PyTorch model.
device (torch.device): A PyTorch device.
num_gpus (int): The number of GPUs to be used.
If set to None, all available GPUs will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If None, the first num_gpus GPUs will be used.
If not None, overrides num_gpus.
Defaults to None.
local_rank (int): Local GPU ID within a node. Used in distributed environments.
If not -1, num_gpus and gpu_ids are ignored.
Defaults to -1.
Returns:
Module, DataParallel, DistributedDataParallel: A PyTorch Module or
a DataParallel/DistributedDataParallel wrapper (when multiple gpus are used).
"""
if not isinstance(device, torch.device):
raise ValueError("device must be of type torch.device.")
# unwrap model
if isinstance(model, torch.nn.DataParallel):
model = model.module
# wrap in DataParallel or DistributedDataParallel
if local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True,
)
else:
if device.type == "cuda":
if num_gpus is not None:
if num_gpus < 1:
raise ValueError("num_gpus must be at least 1 or None")
num_cuda_devices = torch.cuda.device_count()
if num_cuda_devices < 1:
raise Exception("CUDA devices are not available.")
if gpu_ids is None:
num_gpus = num_cuda_devices if num_gpus is None else min(num_gpus, num_cuda_devices)
gpu_ids = list(range(num_gpus))
if len(gpu_ids) > 1:
model = torch.nn.DataParallel(model, device_ids=gpu_ids)
# move to device
return model.to(device)
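A brief usage sketch of the new signature; the toy model below is illustrative:

import torch
import torch.nn as nn
from utils_nlp.common.pytorch_utils import move_model_to_device

model = nn.Linear(10, 2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# on a multi-GPU machine this returns a DataParallel wrapper over the first
# two GPUs; on CPU it is a plain .to(device) move and num_gpus is ignored
model = move_model_to_device(model, device, num_gpus=2)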
def dataloader_from_dataset(ds, batch_size=32, num_gpus=None, shuffle=False, distributed=False):
"""Creates a PyTorch DataLoader given a Dataset object.
Args:
ds (torch.utils.data.Dataset): A PyTorch dataset.
batch_size (int, optional): Batch size.
If more than 1 gpu is used, this would be the batch size per gpu.
Defaults to 32.
num_gpus (int, optional): The number of GPUs to be used. Defaults to None.
shuffle (bool, optional): If True, a RandomSampler is used. Defaults to False.
distributed (bool, optional): If True, a DistributedSampler is used. Defaults to False.
Returns:
Module, DataParallel: A PyTorch Module or
a DataParallel wrapper (when multiple gpus are used).
"""
if isinstance(model, nn.DataParallel):
model = model.module
if num_gpus is None:
num_gpus = torch.cuda.device_count()
if not isinstance(device, torch.device):
raise ValueError("device must be of type torch.device.")
if device.type == "cuda":
model.to(device) # inplace
if num_gpus == 0:
raise ValueError("num_gpus must be non-zero when device.type is 'cuda'")
elif num_gpus == 1:
return model
else:
# parallelize
num_cuda_devices = torch.cuda.device_count()
if num_cuda_devices < 1:
raise Exception("CUDA devices are not available.")
elif num_cuda_devices < 2:
print("Warning: Only 1 CUDA device is available. Data parallelism is not possible.")
return model
else:
if num_gpus is None:
# use all available devices
return nn.DataParallel(model, device_ids=None)
elif num_gpus > num_cuda_devices:
print(
"Warning: Only {0} devices are available. "
"Setting the number of gpus to {0}".format(num_cuda_devices)
)
return nn.DataParallel(model, device_ids=None)
else:
return nn.DataParallel(model, device_ids=list(range(num_gpus)))
elif device.type == "cpu":
if num_gpus != 0 and num_gpus is not None:
warnings.warn("Device type is 'cpu'. num_gpus is ignored.")
return model.to(device)
batch_size = batch_size * max(1, num_gpus)
if distributed:
sampler = DistributedSampler(ds)
else:
raise Exception(
"Device type '{}' not supported. Currently, only cpu "
"and cuda devices are supported.".format(device.type)
)
sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds)
return DataLoader(ds, sampler=sampler, batch_size=batch_size)
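A short sketch of the helper with a toy TensorDataset (shapes are illustrative):

import torch
from torch.utils.data import TensorDataset
from utils_nlp.common.pytorch_utils import dataloader_from_dataset

ds = TensorDataset(torch.randn(64, 8), torch.randint(0, 2, (64,)))
# batch_size is per GPU: with num_gpus=2 each yielded batch holds 16 * 2 = 32 rows
dl = dataloader_from_dataset(ds, batch_size=16, num_gpus=2, shuffle=True)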
def compute_training_steps(dataloader, num_epochs=1, max_steps=-1, gradient_accumulation_steps=1):
"""Computes the max training steps given a dataloader.
Args:
dataloader (Dataloader): A PyTorch DataLoader.
num_epochs (int, optional): Number of training epochs. Defaults to 1.
max_steps (int, optional): Total number of training steps.
If set to a positive value, it overrides num_epochs.
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
Defaults to -1.
gradient_accumulation_steps (int, optional): Number of steps to accumulate
before performing a backward/update pass.
Defaults to 1.
Returns:
int: The max number of steps to be used in a training loop.
"""
try:
dataset_length = len(dataloader)
except Exception:
dataset_length = -1
if max_steps <= 0:
if dataset_length != -1 and num_epochs > 0:
max_steps = dataset_length // gradient_accumulation_steps * num_epochs
if max_steps <= 0:
raise Exception("Max steps cannot be determined.")
return max_steps
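A worked example of the arithmetic, assuming a dataloader train_loader that yields 500 batches:

# 500 batches, 2 epochs, gradients accumulated over 4 batches:
# max_steps = 500 // 4 * 2 = 250 optimizer steps
max_steps = compute_training_steps(train_loader, num_epochs=2, gradient_accumulation_steps=4)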

View file

@ -7,24 +7,21 @@
https://github.com/NirantK/hindi2vec/releases/tag/bbc-hindi-v0.1
"""
import os
import pandas as pd
import logging
import numpy as np
import os
import tarfile
from tempfile import TemporaryDirectory
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.dataset.url_utils import maybe_download
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
from utils_nlp.models.transformers.sequence_classification import Processor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
URL = (
"https://github.com/NirantK/hindi2vec/releases/"
"download/bbc-hindi-v0.1/bbc-hindiv01.tar.gz"
)
URL = "https://github.com/NirantK/hindi2vec/releases/" "download/bbc-hindi-v0.1/bbc-hindiv01.tar.gz"
def load_pandas_df(local_cache_path=TemporaryDirectory().name):
@ -49,19 +46,9 @@ def load_pandas_df(local_cache_path=TemporaryDirectory().name):
train_csv_file_path = os.path.join(local_cache_path, "hindi-train.csv")
test_csv_file_path = os.path.join(local_cache_path, "hindi-test.csv")
train_df = pd.read_csv(
train_csv_file_path,
sep="\t",
encoding='utf-8',
header=None
)
train_df = pd.read_csv(train_csv_file_path, sep="\t", encoding="utf-8", header=None)
test_df = pd.read_csv(
test_csv_file_path,
sep="\t",
encoding='utf-8',
header=None
)
test_df = pd.read_csv(test_csv_file_path, sep="\t", encoding="utf-8", header=None)
train_df = train_df.fillna("")
test_df = test_df.fillna("")
@ -80,7 +67,7 @@ def load_tc_dataset(
cache_dir=TemporaryDirectory().name,
max_len=MAX_SEQ_LEN,
batch_size=32,
num_gpus=None
num_gpus=None,
):
"""
Load the BBC Hindi dataset and split into training and testing datasets.
@ -105,7 +92,7 @@ def load_tc_dataset(
cache_dir (str, optional): The default folder for saving cache files.
Defaults to TemporaryDirectory().name.
max_len (int, optional): Maximum length of the list of tokens. Lists longer
than this are truncated and shorter ones are padded with "O"s.
Default value is BERT_MAX_LEN=512.
batch_size (int, optional): The batch size for training and testing.
Defaults to 32.
@ -114,15 +101,15 @@ def load_tc_dataset(
Returns:
tuple. The tuple contains four elements:
train_dataload (DataLoader): a PyTorch DataLoader instance for training.
train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
test_dataload (DataLoader): a PyTorch DataLoader instance for testing.
label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values
can be retrieved by calling the `inverse_transform` function.
test_labels (Series): a Pandas Series of testing label (in label ID format). If
the labels are in raw label values format, we will need to transform it to
label IDs by using the label_encoder.transform function.
"""
@ -140,12 +127,8 @@ def load_tc_dataset(
if test_fraction < 0 or test_fraction >= 1.0:
logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction))
test_fraction = 0.25
train_df, test_df = train_test_split(
all_df,
train_size=(1.0 - test_fraction),
random_state=random_seed
)
train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed)
if train_sample_ratio > 1.0:
train_sample_ratio = 1.0
@ -153,7 +136,7 @@ def load_tc_dataset(
elif train_sample_ratio < 0:
logging.error("Invalid training sample ration: {}".format(train_sample_ratio))
raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio))
if test_sample_ratio > 1.0:
test_sample_ratio = 1.0
logging.warning("Setting the testing sample ratio to 1.0")
@ -171,42 +154,24 @@ def load_tc_dataset(
test_labels = label_encoder.transform(test_df[label_col])
test_df[label_col] = test_labels
processor = Processor(
model_name=model_name,
to_lower=to_lower,
cache_dir=cache_dir
)
processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
train_dataloader = processor.create_dataloader_from_df(
df=train_df,
text_col=text_col,
label_col=label_col,
max_len=max_len,
text2_col=None,
batch_size=batch_size,
num_gpus=num_gpus,
shuffle=True,
distributed=False
train_dataset = processor.dataset_from_dataframe(
df=train_df, text_col=text_col, label_col=label_col, max_len=max_len,
)
train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True)
test_dataloader = processor.create_dataloader_from_df(
df=test_df,
text_col=text_col,
label_col=label_col,
max_len=max_len,
text2_col=None,
batch_size=batch_size,
num_gpus=num_gpus,
shuffle=False,
distributed=False
test_dataset = processor.dataset_from_dataframe(
df=test_df, text_col=text_col, label_col=label_col, max_len=max_len,
)
test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False)
return (train_dataloader, test_dataloader, label_encoder, test_labels)
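A hedged usage sketch of the loader, passing only parameters that appear in the hunks above:

train_dl, test_dl, label_encoder, test_labels = load_tc_dataset(batch_size=32, num_gpus=1)
# label_encoder.inverse_transform recovers the raw label values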
def get_label_values(label_encoder, label_ids):
"""
Get the label values from label IDs.
Args:
label_encoder (LabelEncoder): a fitted sklearn LabelEncoder instance

View file

@ -8,18 +8,18 @@ paper link: ("https://www.mendeley.com/catalogue/
arabic-text-classification-using-deep-learning-technics/")
"""
import os
import pandas as pd
import logging
import numpy as np
import os
from tempfile import TemporaryDirectory
from utils_nlp.dataset.url_utils import extract_zip, maybe_download
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
from utils_nlp.models.transformers.sequence_classification import Processor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.dataset.url_utils import extract_zip, maybe_download
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
from utils_nlp.models.transformers.sequence_classification import Processor
URL = (
"https://data.mendeley.com/datasets/v524p5dhpj/2"
@ -58,7 +58,7 @@ def load_tc_dataset(
cache_dir=TemporaryDirectory().name,
max_len=MAX_SEQ_LEN,
batch_size=32,
num_gpus=None
num_gpus=None,
):
"""
Load the DAC dataset and split into training and testing datasets.
@ -92,9 +92,9 @@ def load_tc_dataset(
Returns:
tuple. The tuple contains four elements:
train_dataload (DataLoader): a PyTorch DataLoader instance for training.
train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
test_dataload (DataLoader): a PyTorch DataLoader instance for testing.
test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values
can be retrieved by calling the `inverse_transform` function.
@ -104,11 +104,8 @@ def load_tc_dataset(
label IDs by using the label_encoder.transform function.
"""
# download and load the original dataset
all_df = load_pandas_df(
local_cache_path=local_path,
num_rows=None
)
# download and load the original dataset
all_df = load_pandas_df(local_cache_path=local_path, num_rows=None)
# set the text and label columns
text_col = all_df.columns[0]
@ -123,12 +120,8 @@ def load_tc_dataset(
if test_fraction < 0 or test_fraction >= 1.0:
logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction))
test_fraction = 0.25
train_df, test_df = train_test_split(
all_df,
train_size=(1.0 - test_fraction),
random_state=random_seed
)
train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed)
if train_sample_ratio > 1.0:
train_sample_ratio = 1.0
@ -136,7 +129,7 @@ def load_tc_dataset(
elif train_sample_ratio < 0:
logging.error("Invalid training sample ration: {}".format(train_sample_ratio))
raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio))
if test_sample_ratio > 1.0:
test_sample_ratio = 1.0
logging.warning("Setting the testing sample ratio to 1.0")
@ -149,35 +142,17 @@ def load_tc_dataset(
if test_sample_ratio < 1.0:
test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)
processor = Processor(
model_name=model_name,
to_lower=to_lower,
cache_dir=cache_dir
)
processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
train_dataloader = processor.create_dataloader_from_df(
df=train_df,
text_col=text_col,
label_col=label_col,
max_len=max_len,
text2_col=None,
batch_size=batch_size,
num_gpus=num_gpus,
shuffle=True,
distributed=False
train_dataset = processor.dataset_from_dataframe(
df=train_df, text_col=text_col, label_col=label_col, max_len=max_len,
)
train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True)
test_dataloader = processor.create_dataloader_from_df(
df=test_df,
text_col=text_col,
label_col=label_col,
max_len=max_len,
text2_col=None,
batch_size=batch_size,
num_gpus=num_gpus,
shuffle=False,
distributed=False
test_dataset = processor.dataset_from_dataframe(
df=test_df, text_col=text_col, label_col=label_col, max_len=max_len,
)
test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False)
# the DAC dataset already converted the labels to label ID format
test_labels = test_df[label_col]

View file

@ -7,18 +7,19 @@
https://www.nyu.edu/projects/bowman/multinli/
"""
import logging
import os
from tempfile import TemporaryDirectory
import pandas as pd
import logging
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tempfile import TemporaryDirectory
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.dataset.data_loaders import DaskJSONLoader
from utils_nlp.dataset.url_utils import extract_zip, maybe_download
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
from utils_nlp.models.transformers.sequence_classification import Processor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
URL = "http://www.nyu.edu/projects/bowman/multinli/multinli_1.0.zip"
DATA_FILES = {
@ -63,9 +64,7 @@ def load_pandas_df(local_cache_path=".", file_split="train"):
return pd.read_json(os.path.join(local_cache_path, DATA_FILES[file_split]), lines=True)
def get_generator(
local_cache_path=".", file_split="train", block_size=10e6, batch_size=10e6, num_batches=None
):
def get_generator(local_cache_path=".", file_split="train", block_size=10e6, batch_size=10e6, num_batches=None):
""" Returns an extracted dataset as a random batch generator that
yields pandas dataframes.
Args:
@ -85,9 +84,7 @@ def get_generator(
except Exception as e:
raise e
loader = DaskJSONLoader(
os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size
)
loader = DaskJSONLoader(os.path.join(local_cache_path, DATA_FILES[file_split]), block_size=block_size)
return loader.get_sequential_batches(batch_size=int(batch_size), num_batches=num_batches)
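A hedged sketch of the generator (sizes are illustrative; per the docstring it yields pandas dataframes):

for batch_df in get_generator(local_cache_path=".", file_split="train", batch_size=10000, num_batches=2):
    print(batch_df.shape)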
@ -103,7 +100,7 @@ def load_tc_dataset(
cache_dir=TemporaryDirectory().name,
max_len=MAX_SEQ_LEN,
batch_size=32,
num_gpus=None
num_gpus=None,
):
"""
Load the multinli dataset and split into training and testing datasets.
@ -137,9 +134,9 @@ def load_tc_dataset(
Returns:
tuple. The tuple contains four elements:
train_dataload (DataLoader): a PyTorch DataLoader instance for training.
train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
test_dataload (DataLoader): a PyTorch DataLoader instance for testing.
test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
label_encoder (LabelEncoder): a sklearn LabelEncoder instance. The label values
can be retrieved by calling the `inverse_transform` function.
@ -150,10 +147,7 @@ def load_tc_dataset(
"""
# download and load the original dataset
all_df = load_pandas_df(
local_cache_path=local_path,
file_split="train"
)
all_df = load_pandas_df(local_cache_path=local_path, file_split="train")
# select the examples corresponding to one of the entailment labels (neutral
# in this case) to avoid duplicate rows, as the sentences are not unique,
@ -169,12 +163,8 @@ def load_tc_dataset(
if test_fraction < 0 or test_fraction >= 1.0:
logging.warning("Invalid test fraction value: {}, changed to 0.25".format(test_fraction))
test_fraction = 0.25
train_df, test_df = train_test_split(
all_df,
train_size=(1.0 - test_fraction),
random_state=random_seed
)
train_df, test_df = train_test_split(all_df, train_size=(1.0 - test_fraction), random_state=random_seed)
if train_sample_ratio > 1.0:
train_sample_ratio = 1.0
@ -182,7 +172,7 @@ def load_tc_dataset(
elif train_sample_ratio < 0:
logging.error("Invalid training sample ration: {}".format(train_sample_ratio))
raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio))
if test_sample_ratio > 1.0:
test_sample_ratio = 1.0
logging.warning("Setting the testing sample ratio to 1.0")
@ -200,35 +190,17 @@ def load_tc_dataset(
test_labels = label_encoder.transform(test_df[label_col])
test_df[label_col] = test_labels
processor = Processor(
model_name=model_name,
to_lower=to_lower,
cache_dir=cache_dir
)
processor = Processor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
train_dataloader = processor.create_dataloader_from_df(
df=train_df,
text_col=text_col,
label_col=label_col,
max_len=max_len,
text2_col=None,
batch_size=batch_size,
num_gpus=num_gpus,
shuffle=True,
distributed=False
train_dataset = processor.dataset_from_dataframe(
df=train_df, text_col=text_col, label_col=label_col, max_len=max_len,
)
train_dataloader = dataloader_from_dataset(train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True)
test_dataloader = processor.create_dataloader_from_df(
df=test_df,
text_col=text_col,
label_col=label_col,
max_len=max_len,
text2_col=None,
batch_size=batch_size,
num_gpus=num_gpus,
shuffle=False,
distributed=False
test_dataset = processor.dataset_from_dataframe(
df=test_df, text_col=text_col, label_col=label_col, max_len=max_len,
)
test_dataloader = dataloader_from_dataset(test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False)
return (train_dataloader, test_dataloader, label_encoder, test_labels)
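The net effect of this hunk is that dataloader construction moves out of the Processor and into the shared dataloader_from_dataset helper. A minimal sketch of the new two-step flow, assuming `df` is an existing pandas DataFrame with a "text" column and an already-encoded "label" column:

from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.models.transformers.sequence_classification import Processor

# `df` is assumed to exist: a DataFrame with "text" and encoded "label" columns.
processor = Processor(model_name="bert-base-cased", to_lower=False, cache_dir=".")
# Step 1: build a dataset from the dataframe.
train_dataset = processor.dataset_from_dataframe(df=df, text_col="text", label_col="label", max_len=128)
# Step 2: wrap it in a dataloader with the shared helper.
train_dataloader = dataloader_from_dataset(train_dataset, batch_size=32, num_gpus=None, shuffle=True)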

View file

@ -7,18 +7,19 @@
https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold/CONLL-format/data
"""
import random
import os
import pandas as pd
import logging
import os
import random
from tempfile import TemporaryDirectory
from utils_nlp.dataset.url_utils import maybe_download
import pandas as pd
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.dataset.ner_utils import preprocess_conll
from utils_nlp.dataset.url_utils import maybe_download
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
from utils_nlp.models.transformers.named_entity_recognition import TokenClassificationProcessor
URL = (
"https://raw.githubusercontent.com/juand-r/entity-recognition-datasets"
"/master/data/wikigold/CONLL-format/data/wikigold.conll.txt"
@ -91,7 +92,7 @@ def load_dataset(
max_len=MAX_SEQ_LEN,
trailing_piece_tag="X",
batch_size=32,
num_gpus=None
num_gpus=None,
):
"""
Load the wikigold dataset and split into training and testing datasets.
@ -116,7 +117,7 @@ def load_dataset(
cache_dir (str, optional): The default folder for saving cache files.
Defaults to './temp'.
max_len (int, optional): Maximum length of the list of tokens. Lists longer
than this are truncated and shorter ones are padded with "O"s.
than this are truncated and shorter ones are padded with "O"s.
Default value is BERT_MAX_LEN=512.
trailing_piece_tag (str, optional): Tag used to label trailing word pieces.
For example, "criticize" is broken into "critic" and "##ize", "critic"
@ -129,16 +130,12 @@ def load_dataset(
Returns:
tuple. The tuple contains four elements.
train_dataload (DataLoader): a PyTorch DataLoader instance for training.
test_dataload (DataLoader): a PyTorch DataLoader instance for testing.
label_map (dict): A dictionary object to map a label (str) to an ID (int).
train_dataloader (DataLoader): a PyTorch DataLoader instance for training.
test_dataloader (DataLoader): a PyTorch DataLoader instance for testing.
label_map (dict): A dictionary object to map a label (str) to an ID (int).
test_dataset (TensorDataset): A TensorDataset containing the following four tensors.
1. input_ids_all: Tensor. Each sublist contains numerical values,
i.e. token ids, corresponding to the tokens in the input
text data.
i.e. token ids, corresponding to the tokens in the input text data.
2. input_mask_all: Tensor. Each sublist contains the attention
mask of the input token id list, 1 for input tokens and 0 for
padded tokens, so that padded tokens are not attended to.
@ -155,9 +152,7 @@ def load_dataset(
"""
train_df, test_df = load_train_test_dfs(
local_cache_path=local_path,
test_fraction=test_fraction,
random_seed=random_seed
local_cache_path=local_path, test_fraction=test_fraction, random_seed=random_seed
)
if train_sample_ratio > 1.0:
@ -166,7 +161,7 @@ def load_dataset(
elif train_sample_ratio < 0:
logging.error("Invalid training sample ration: {}".format(train_sample_ratio))
raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio))
if test_sample_ratio > 1.0:
test_sample_ratio = 1.0
logging.warning("Setting the testing sample ratio to 1.0")
@ -179,47 +174,34 @@ def load_dataset(
if test_sample_ratio < 1.0:
test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)
processor = TokenClassificationProcessor(
model_name=model_name,
to_lower=to_lower,
cache_dir=cache_dir
)
processor = TokenClassificationProcessor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
label_map = TokenClassificationProcessor.create_label_map(
label_lists=train_df['labels'],
trailing_piece_tag=trailing_piece_tag
label_lists=train_df["labels"], trailing_piece_tag=trailing_piece_tag
)
train_dataset = processor.preprocess_for_bert(
text=train_df['sentence'],
text=train_df["sentence"],
max_len=max_len,
labels=train_df['labels'],
labels=train_df["labels"],
label_map=label_map,
trailing_piece_tag=trailing_piece_tag
trailing_piece_tag=trailing_piece_tag,
)
test_dataset = processor.preprocess_for_bert(
text=test_df['sentence'],
text=test_df["sentence"],
max_len=max_len,
labels=test_df['labels'],
labels=test_df["labels"],
label_map=label_map,
trailing_piece_tag=trailing_piece_tag
trailing_piece_tag=trailing_piece_tag,
)
train_dataloader = processor.create_dataloader_from_dataset(
train_dataset,
shuffle=True,
batch_size=batch_size,
num_gpus=num_gpus,
distributed=False
train_dataloader = dataloader_from_dataset(
train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True, distributed=False
)
test_dataloader = processor.create_dataloader_from_dataset(
test_dataset,
shuffle=False,
batch_size=batch_size,
num_gpus=num_gpus,
distributed=False
test_dataloader = dataloader_from_dataset(
test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False, distributed=False
)
return (train_dataloader, test_dataloader, label_map, test_dataset)
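For downstream use, the four-element return value unpacks as in this sketch; the keyword values are illustrative and match the parameter names shown in the hunks above (defaults not visible in this diff may differ):

# Illustrative call of the refactored wikigold loader.
train_dataloader, test_dataloader, label_map, test_dataset = load_dataset(
    model_name="bert-base-cased",
    max_len=128,
    trailing_piece_tag="X",
    batch_size=32,
    num_gpus=None,
)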

View file

@ -3,22 +3,24 @@
import os
from random import random, seed
from bertsum.others.utils import test_rouge
def get_rouge(predictions, targets, temp_dir):
def get_rouge(predictions, targets, temp_dir, random_seed=42):
"""
function to get the rouge metric for the prediction and the reference.
Args:
predictions (list of strings): Predictions to be compared.
targets (list of strings): References.
temp_dir (str): Path where temporary folders are created to host the files
generated by ROUGE applicatoin.
temp_dir (str): Path where temporary folders are created to host the files
generated by ROUGE application.
random_seed (int, optional): Random seed. Defaults to 42.
Return:
dictionary: rouge metric
"""
def _write_list_to_file(list_items, filename):
@ -27,7 +29,7 @@ def get_rouge(predictions, targets, temp_dir):
for item in list_items:
filehandle.write("%s\n" % item)
seed(42)
seed(random_seed)
random_number = random()
os.makedirs(temp_dir, exist_ok=True)
candidate_path = os.path.join(temp_dir, "candidate" + str(random_number))
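A usage sketch of the updated signature; the prediction and reference strings are placeholders, and the new random_seed argument replaces the previously hard-coded seed(42):

# Placeholder inputs for illustration only.
predictions = ["the cat sat on the mat"]
targets = ["a cat was sitting on the mat"]
scores = get_rouge(predictions, targets, temp_dir="./rouge_tmp", random_seed=42)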

View file

@ -13,7 +13,7 @@ from pytorch_pretrained_bert.optimization import BertAdam
from tqdm import tqdm
from utils_nlp.models.bert.common import Language
from utils_nlp.common.pytorch_utils import get_device, move_to_device
from utils_nlp.common.pytorch_utils import get_device
from cached_property import cached_property
@ -91,7 +91,7 @@ class BERTSequenceClassifier:
device, num_gpus = get_device(num_gpus)
self.model = move_to_device(self.model, device, num_gpus)
self.model = move_model_to_device(self.model, device, num_gpus)
token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
@ -211,7 +211,7 @@ class BERTSequenceClassifier:
(classes, probabilities) if probabilities is True.
"""
device, num_gpus = get_device(num_gpus)
self.model = move_to_device(self.model, device, num_gpus)
self.model = move_model_to_device(self.model, device, num_gpus)
# score
self.model.eval()

View file

@ -14,7 +14,7 @@ from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from tqdm import tqdm
from utils_nlp.common.pytorch_utils import get_device, move_to_device
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
from utils_nlp.models.bert.common import Language
try:
@ -192,7 +192,7 @@ class BERTSequenceClassifier:
device, num_gpus = get_device(num_gpus)
self.model = move_to_device(self.model, device, num_gpus)
self.model = move_model_to_device(self.model, device, num_gpus)
if bert_optimizer is None:
bert_optimizer = self.create_optimizer(
@ -277,7 +277,7 @@ class BERTSequenceClassifier:
a dictionary with classes, target labels, probabilities) if probabilities is True.
"""
device, num_gpus = get_device(num_gpus)
self.model = move_to_device(self.model, device, num_gpus)
self.model = move_model_to_device(self.model, device, num_gpus)
# score
self.model.eval()
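Each of these call sites follows the same renamed pattern; a sketch, assuming `model` is an existing torch.nn.Module and that move_model_to_device keeps the (model, device, num_gpus) positional interface used above:

from utils_nlp.common.pytorch_utils import get_device, move_model_to_device

# Resolve the device, move the model (wrapping in DataParallel if needed), then score.
device, num_gpus = get_device(num_gpus=None)
model = move_model_to_device(model, device, num_gpus)
model.eval()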

View file

@ -4,19 +4,17 @@
# This script reuses code from https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples
# /extract_features.py, with necessary modifications.
from pytorch_pretrained_bert.modeling import BertModel
from utils_nlp.common.pytorch_utils import get_device, move_to_device
from enum import Enum
import numpy as np
import pandas as pd
import os
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from utils_nlp.models.bert.common import Language, Tokenizer
from cached_property import cached_property
from pytorch_pretrained_bert.modeling import BertModel
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
from utils_nlp.models.bert.common import Language, Tokenizer
class PoolingStrategy(str, Enum):
@ -43,27 +41,21 @@ class BERTSentenceEncoder:
pooling_strategy=PoolingStrategy.MEAN,
):
"""Initialize the encoder's underlying model and tokenizer
Args:
bert_model: BERT model to use for encoding. Defaults to pretrained BertModel.
tokenizer: Tokenizer to use for preprocessing. Defaults to pretrained BERT tokenizer.
language: The pretrained model's language. Defaults to Language.ENGLISH.
num_gpus: The number of gpus to use. Defaults to None, which forces all available GPUs to be used.
num_gpus: The number of gpus to use. Defaults to None, which forces all available GPUs to be used.
cache_dir: Location of BERT's cache directory. Defaults to "."
to_lower: True to lowercase before tokenization. Defaults to False.
max_len: Maximum number of tokens.
layer_index: The layer from which to extract features.
layer_index: The layer from which to extract features.
Defaults to the last layer; can also be a list of integers for experimentation.
pooling_strategy: Pooling strategy to aggregate token embeddings into sentence embedding.
"""
self.model = (
bert_model.model.bert
if bert_model
else BertModel.from_pretrained(language, cache_dir=cache_dir)
)
self.tokenizer = (
tokenizer if tokenizer else Tokenizer(language, to_lower=to_lower, cache_dir=cache_dir)
)
self.model = bert_model.model.bert if bert_model else BertModel.from_pretrained(language, cache_dir=cache_dir)
self.tokenizer = tokenizer if tokenizer else Tokenizer(language, to_lower=to_lower, cache_dir=cache_dir)
self.num_gpus = num_gpus
self.max_len = max_len
self.layer_index = layer_index
@ -98,16 +90,17 @@ class BERTSentenceEncoder:
def get_hidden_states(self, text, batch_size=32):
"""Extract the hidden states from the pretrained model
Args:
text: List of documents to extract features from.
batch_size: Batch size, defaults to 32.
Returns:
pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float]).
pd.DataFrame with columns:
text_index (int), token (str), layer_index (int), values (list[float]).
"""
device, num_gpus = get_device(self.num_gpus)
self.model = move_to_device(self.model, device, self.num_gpus)
self.model = move_model_to_device(self.model, device, self.num_gpus)
self.model.eval()
@ -122,9 +115,7 @@ class BERTSentenceEncoder:
input_type_ids = torch.arange(input_ids.size(0), dtype=torch.long, device=device)
eval_data = TensorDataset(input_ids, input_mask, input_type_ids)
eval_dataloader = DataLoader(
eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size
)
eval_dataloader = DataLoader(eval_data, sampler=SequentialSampler(eval_data), batch_size=batch_size)
hidden_states = {"text_index": [], "token": [], "layer_index": [], "values": []}
for (input_ids_tensor, input_mask_tensor, example_indices_tensor) in eval_dataloader:
@ -142,9 +133,7 @@ class BERTSentenceEncoder:
hidden_states["text_index"].append(example_index.item())
hidden_states["token"].append(token)
hidden_states["layer_index"].append(layer_index)
hidden_states["values"].append(
[round(x.item(), 6) for x in layer_output[i]]
)
hidden_states["values"].append([round(x.item(), 6) for x in layer_output[i]])
# empty cache
del [input_ids_tensor, input_mask_tensor, example_indices_tensor]
@ -158,7 +147,7 @@ class BERTSentenceEncoder:
def pool(self, df):
"""Pooling to aggregate token-wise embeddings to sentence embeddings
Args:
df: pd.DataFrame with columns text_index (int), token (str), layer_index (int), values (list[float])
@ -167,31 +156,16 @@ class BERTSentenceEncoder:
"""
def max_pool(x):
values = np.array(
[
np.reshape(np.array(x.values[i]), self.embedding_dim)
for i in range(x.values.shape[0])
]
)
values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])])
m, _ = torch.max(torch.tensor(values, dtype=torch.float), 0)
return m.numpy()
def mean_pool(x):
values = np.array(
[
np.reshape(np.array(x.values[i]), self.embedding_dim)
for i in range(x.values.shape[0])
]
)
values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])])
return torch.mean(torch.tensor(values, dtype=torch.float), 0).numpy()
def cls_pool(x):
values = np.array(
[
np.reshape(np.array(x.values[i]), self.embedding_dim)
for i in range(x.values.shape[0])
]
)
values = np.array([np.reshape(np.array(x.values[i]), self.embedding_dim) for i in range(x.values.shape[0])])
return values[0]
try:
@ -206,15 +180,11 @@ class BERTSentenceEncoder:
except ValueError as ve:
print(ve)
return (
df.groupby(["text_index", "layer_index"])["values"]
.apply(lambda x: pool_func(x))
.reset_index()
)
return df.groupby(["text_index", "layer_index"])["values"].apply(lambda x: pool_func(x)).reset_index()
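The three pooling closures above differ only in the aggregation applied to the stacked token vectors; a toy, self-contained sketch of the mean-pooling case (dimensions chosen for illustration, not the encoder's real output):

import numpy as np
import torch

# Four token embeddings of dimension 3 -> one sentence embedding of dimension 3.
token_values = np.array(
    [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9], [1.0, 1.1, 1.2]]
)
sentence_embedding = torch.mean(torch.tensor(token_values, dtype=torch.float), 0).numpy()
# -> array([0.55, 0.65, 0.75], dtype=float32)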
def encode(self, text, batch_size=32, as_numpy=False):
"""Computes sentence encodings
"""Computes sentence encodings
Args:
text: List of documents to encode.
batch_size: Batch size, defaults to 32.

View file

@ -16,7 +16,7 @@ from pytorch_pretrained_bert.optimization import BertAdam
from tqdm import tqdm, trange
from utils_nlp.models.bert.common import Language, create_data_loader
from utils_nlp.common.pytorch_utils import get_device, move_to_device
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
from cached_property import cached_property
@ -144,7 +144,7 @@ class BERTTokenClassifier:
device, num_gpus = get_device(num_gpus)
self.model = move_to_device(self.model, device, num_gpus)
self.model = move_model_to_device(self.model, device, num_gpus)
if num_gpus is None:
num_gpus_used = torch.cuda.device_count()
@ -228,7 +228,7 @@ class BERTTokenClassifier:
)
device, num_gpus = get_device(num_gpus)
self.model = move_to_device(self.model, device, num_gpus)
self.model = move_model_to_device(self.model, device, num_gpus)
self.model.eval()
eval_loss = 0

187
utils_nlp/models/transformers/common.py Normal file → Executable file
View file

@ -4,17 +4,16 @@
# This script reuses some code from
# https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py
from itertools import cycle
import logging
import numpy as np
import os
import random
import time
import torch
from tqdm import tqdm, trange
from itertools import cycle
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import numpy as np
import torch
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
@ -23,7 +22,8 @@ from transformers.tokenization_bert import BertTokenizer
from transformers.tokenization_distilbert import DistilBertTokenizer
from transformers.tokenization_roberta import RobertaTokenizer
from transformers.tokenization_xlnet import XLNetTokenizer
from utils_nlp.common.pytorch_utils import get_device
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
TOKENIZER_CLASS = {}
TOKENIZER_CLASS.update({k: BertTokenizer for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
@ -38,12 +38,7 @@ logger = logging.getLogger(__name__)
class Transformer:
def __init__(
self,
model_class,
model_name="bert-base-cased",
num_labels=2,
cache_dir=".",
load_model_from_dir=None,
self, model_class, model_name="bert-base-cased", num_labels=2, cache_dir=".", load_model_from_dir=None,
):
if model_name not in self.list_supported_models():
@ -82,22 +77,40 @@ class Transformer:
if cuda and torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
@staticmethod
def get_default_optimizer(model, weight_decay, learning_rate, adam_epsilon):
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
"weight_decay": weight_decay,
},
{
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
return optimizer
@staticmethod
def get_default_scheduler(optimizer, warmup_steps, num_training_steps):
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps
)
return scheduler
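These two new static helpers lift the grouped-parameter AdamW setup and the linear warmup schedule out of fine_tune, so subclasses can build them up front; a usage sketch with the defaults fine_tune previously used (`model` is assumed defined, and 1000 is an illustrative step count):

# Build the default AdamW optimizer and linear warmup scheduler explicitly.
optimizer = Transformer.get_default_optimizer(model, weight_decay=0.0, learning_rate=5e-5, adam_epsilon=1e-8)
scheduler = Transformer.get_default_scheduler(optimizer, warmup_steps=0, num_training_steps=1000)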
def fine_tune(
self,
train_dataloader,
get_inputs,
num_gpus=None,
gpu_ids=None,
max_steps=-1,
num_train_epochs=1,
max_grad_norm=1.0,
gradient_accumulation_steps=1,
n_gpu=1,
move_batch_to_device=None,
optimizer=None,
scheduler=None,
weight_decay=0.0,
learning_rate=5e-5,
adam_epsilon=1e-8,
warmup_steps=0,
fp16=False,
fp16_opt_level="O1",
local_rank=-1,
@ -107,51 +120,12 @@ class Transformer:
clip_grad_norm=True,
):
device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1)
# get device
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)
if seed is not None:
Transformer.set_seed(seed, num_gpus > 0)
try:
dataset_length = len(train_dataloader)
except:
dataset_length = -1
if max_steps <= 0:
if dataset_length != -1 and num_train_epochs > 0:
max_steps = dataset_length // gradient_accumulation_steps * num_train_epochs
if max_steps <= 0:
raise Exception("Max steps cannot be determined for fine tuning!")
if optimizer is None:
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{
"params": [
p
for n, p in self.model.named_parameters()
if not any(nd in n for nd in no_decay)
],
"weight_decay": weight_decay,
},
{
"params": [
p
for n, p in self.model.named_parameters()
if any(nd in n for nd in no_decay)
],
"weight_decay": 0.0,
},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
if scheduler is None:
scheduler = get_linear_schedule_with_warmup(
optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps
)
if fp16:
try:
from apex import amp
@ -159,46 +133,22 @@ class Transformer:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex")
self.model, optimizer = amp.initialize(self.model, optimizer, opt_level=fp16_opt_level)
if local_rank != -1:
self.model = torch.nn.parallel.DistributedDataParallel(
self.model,
device_ids=[local_rank],
output_device=local_rank,
find_unused_parameters=True,
)
else:
if isinstance(self.model, torch.nn.DataParallel):
self.model = self.model.module
if num_gpus > 1:
self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus)))
self.model.to(device)
self.model.train()
# move model
self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank)
# init training
global_step = 0
tr_loss = 0.0
self.model.zero_grad()
if move_batch_to_device is None:
def move_batch_to_device(batch, device):
return tuple(t.to(device) for t in batch)
start = time.time()
accum_loss = 0
self.model.train()
self.model.zero_grad()
while global_step < max_steps:
epoch_iterator = tqdm(
train_dataloader,
desc="Iteration",
disable=local_rank not in [-1, 0] or not verbose
)
# train
start = time.time()
while global_step < max_steps:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=local_rank not in [-1, 0] or not verbose)
for step, batch in enumerate(epoch_iterator):
batch = move_batch_to_device(batch, device)
inputs = get_inputs(batch, self.model_name)
inputs = get_inputs(batch, device, self.model_name)
outputs = self.model(**inputs)
loss = outputs[0]
@ -210,28 +160,26 @@ class Transformer:
if fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
if clip_grad_norm:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
else:
loss.backward()
if clip_grad_norm:
torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)
tr_loss += loss.item()
accum_loss += loss.item()
if (step + 1) % gradient_accumulation_steps == 0:
global_step += 1
if clip_grad_norm:
if fp16:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)
if global_step % report_every == 0 and verbose:
# tqdm.write("Loss:{:.6f}".format(loss))
end = time.time()
print(
"loss: {0:.6f}, time: {1:f}, number of examples in current step: {2:.0f}, step {3:.0f} out of total {4:.0f}".format(
accum_loss / report_every,
end - start,
len(batch),
global_step,
max_steps,
"loss:{0:.6f}, time:{1:f}, examples:{2:.0f}, step:{3:.0f}/{4:.0f}".format(
accum_loss / report_every, end - start, len(batch), global_step, max_steps,
)
)
accum_loss = 0
@ -246,31 +194,20 @@ class Transformer:
epoch_iterator.close()
break
# empty cache
torch.cuda.empty_cache()
return global_step, tr_loss / global_step
def predict(self, eval_dataloader, get_inputs, n_gpu=1, verbose=True, move_batch_to_device=None):
device, num_gpus = get_device(num_gpus=n_gpu, local_rank=-1)
if isinstance(self.model, torch.nn.DataParallel):
self.model = self.model.module
def predict(self, eval_dataloader, get_inputs, num_gpus, gpu_ids, verbose=True):
# get device
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)
if num_gpus > 1:
self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus)))
# move model
self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1)
self.model.to(device)
# predict
self.model.eval()
if move_batch_to_device is None:
def move_batch_to_device(batch, device):
return tuple(t.to(device) for t in batch)
for batch in tqdm(eval_dataloader, desc="Evaluating", disable=not verbose):
batch = move_batch_to_device(batch, device) #tuple(t.to(device) for t in batch)
for batch in tqdm(eval_dataloader, desc="Scoring", disable=not verbose):
with torch.no_grad():
inputs = get_inputs(batch, self.model_name, train_mode=False)
inputs = get_inputs(batch, device, self.model_name, train_mode=False)
outputs = self.model(**inputs)
logits = outputs[0]
yield logits.detach().cpu().numpy()
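predict now yields one numpy logits array per batch; callers materialize the generator and concatenate, as in this sketch (`classifier`, `test_dataloader`, and `MyProcessor` are placeholders for a fitted Transformer subclass, its scoring dataloader, and a processor exposing the new get_inputs(batch, device, model_name, ...) staticmethod):

import numpy as np

# Collect per-batch logits from the generator into a single array.
preds = classifier.predict(
    eval_dataloader=test_dataloader,
    get_inputs=MyProcessor.get_inputs,
    num_gpus=1,
    gpu_ids=None,
    verbose=True,
)
all_logits = np.concatenate(list(preds))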

View file

@ -5,24 +5,22 @@
import itertools
import logging
import numpy as np
import os
import random
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, IterableDataset
from torch.utils.data import DataLoader, SequentialSampler
from torch.utils.data import DataLoader, Dataset, IterableDataset, SequentialSampler
# from torch.utils.data.distributed import DistributedSampler
from transformers import DistilBertModel, BertModel
from transformers import BertModel, DistilBertModel
from bertsum.models import model_builder, data_loader
from bertsum.models import data_loader, model_builder
from bertsum.models.data_loader import Batch
from bertsum.models.model_builder import Summarizer
from utils_nlp.common.pytorch_utils import get_device
from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
from utils_nlp.common.pytorch_utils import compute_training_steps, get_device
from utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection
from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
MODEL_CLASS = {"bert-base-uncased": BertModel, "distilbert-base-uncased": DistilBertModel}
@ -42,8 +40,8 @@ def get_dataloader(data_iter, shuffle=True, is_labeled=False, batch_size=3000):
Args:
data_iter (generator): data generator.
shuffle (bool): whether the data is shuffled
is_labeled (bool): it specifies whether the data objects are labeled data.
shuffle (bool): whether the data is shuffled.
is_labeled (bool): specifies whether the data objects are labeled data.
batch_size (int): number of tokens per batch.
Returns:
@ -79,9 +77,7 @@ class ExtSumProcessedIterableDataset(IterableDataset):
if self.is_shuffle:
return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(self.file_list)))
else:
return itertools.chain.from_iterable(
map(get_dataset, itertools.cycle(random.shuffle(self.file_list)))
)
random.shuffle(self.file_list)  # shuffle in place; random.shuffle returns None
return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(self.file_list)))
def __iter__(self):
return self.get_stream()
@ -114,9 +110,7 @@ class ExtSumProcessedDataset(Dataset):
return self.data[idx]
def get_pred(
example, sent_scores, cal_lead=False, sentence_separator="<q>", block_trigram=True, top_n=3
):
def get_pred(example, sent_scores, cal_lead=False, sentence_separator="<q>", block_trigram=True, top_n=3):
"""
Get the summarization prediction for the paragraph example based on the scores
returned by the transformer summarization model.
@ -229,9 +223,7 @@ class ExtSumProcessedData:
def _get_files(self, root):
train_files = []
test_files = []
files = [
os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
]
files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))]
for fname in files:
if fname.find("train") != -1:
train_files.append(fname)
@ -324,7 +316,7 @@ class ExtSumProcessor:
self._model_name = value
@staticmethod
def get_inputs(batch, model_name, train_mode=True):
def get_inputs(batch, device, model_name, train_mode=True):
"""
Creates an input dictionary given a model name.
@ -332,6 +324,7 @@ class ExtSumProcessor:
batch (object): A Batch containing input ids, segment ids, sentence class ids,
masks for the input ids, masks for sentence class ids and source text.
If train_model is True, it also contains the labels and target text.
device (torch.device): A PyTorch device.
model_name (str): Model name used to format the inputs.
train_mode (bool, optional): Training mode flag.
Defaults to True.
@ -344,6 +337,7 @@ class ExtSumProcessor:
if model_name.split("-")[0] in ["bert", "distilbert"]:
if train_mode:
batch = batch.to(device)
# labels must be the last
return {
"x": batch.src,
@ -354,12 +348,13 @@ class ExtSumProcessor:
"labels": batch.labels,
}
else:
batch = Bunch(batch)
return {
"x": batch.src,
"segs": batch.segs,
"clss": batch.clss,
"mask": batch.mask,
"mask_cls": batch.mask_cls,
"x": batch.src.to(device),
"segs": batch.segs.to(device),
"clss": batch.clss.to(device),
"mask": batch.mask.to(device),
"mask_cls": batch.mask_cls.to(device),
}
else:
raise ValueError("Model not supported: {}".format(model_name))
@ -476,7 +471,7 @@ class ExtractiveSummarizer(Transformer):
Args:
model_name (str, optional): Transformer model name used in preprocessing.
check MODEL_CLASS for supported models. Defaults to "distilbert-base-uncased".
encoder (str, optional): Encoder algorithm used by summarization layer.
encoder (str, optional): Encoder algorithm used by summarization layer.
There are four options:
- baseline: it uses a smaller transformer model to replace the bert model,
with a transformer summarization layer.
@ -485,13 +480,11 @@ class ExtractiveSummarizer(Transformer):
- transformer: it uses pretrained BERT and fine-tunes BERT with a transformer
summarization layer.
- RNN: it uses pretrained BERT and fine-tunes BERT with an LSTM summarization layer.
Defaults to "transformer".
Defaults to "transformer".
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
"""
super().__init__(
model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir
)
super().__init__(model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir)
if model_name not in self.list_supported_models():
raise ValueError(
"Model name {} is not supported by ExtractiveSummarizer. "
@ -522,6 +515,7 @@ class ExtractiveSummarizer(Transformer):
self,
train_dataset,
num_gpus=None,
gpu_ids=None,
batch_size=3000,
local_rank=-1,
max_steps=5e5,
@ -546,7 +540,10 @@ class ExtractiveSummarizer(Transformer):
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
batch_size (int, optional): Maximum number of tokens in each batch.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
batch_size (int, optional): Maximum number of tokens in each batch.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed training.
max_steps (int, optional): Maximum number of training steps. Defaults to 5e5.
@ -571,16 +568,7 @@ class ExtractiveSummarizer(Transformer):
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
"""
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)
def move_batch_to_device(batch, device):
return batch.to(device)
# if isinstance(self.model, nn.DataParallel):
# self.model.module.to(device)
# else:
self.model.to(device)
# init optimizer
optimizer = model_builder.build_optim(
optimization_method,
learning_rate,
@ -594,31 +582,34 @@ class ExtractiveSummarizer(Transformer):
)
# batch_size is the number of tokens in a batch
train_dataloader = get_dataloader(
train_dataset.get_stream(), is_labeled=True, batch_size=batch_size
train_dataloader = get_dataloader(train_dataset.get_stream(), is_labeled=True, batch_size=batch_size)
# compute the max number of training steps
max_steps = compute_training_steps(
train_dataloader, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps,
)
super().fine_tune(
train_dataloader=train_dataloader,
get_inputs=ExtSumProcessor.get_inputs,
move_batch_to_device=move_batch_to_device,
n_gpu=num_gpus,
num_train_epochs=-1,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
max_steps=max_steps,
optimizer=optimizer,
warmup_steps=warmup_steps,
max_grad_norm=max_grad_norm,
gradient_accumulation_steps=gradient_accumulation_steps,
optimizer=optimizer,
scheduler=None,
verbose=verbose,
seed=seed,
report_every=report_every,
clip_grad_norm=False,
max_grad_norm=max_grad_norm,
)
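compute_training_steps is imported from utils_nlp.common.pytorch_utils but its body is not part of this diff; a hedged sketch of its implied semantics, reconstructed from the inline logic it replaces (a positive max_steps wins, otherwise steps are derived from dataloader length, gradient accumulation, and epochs):

# Sketch only; the real helper lives in utils_nlp/common/pytorch_utils.py and may differ.
def compute_training_steps_sketch(dataloader, num_epochs=1, max_steps=-1, gradient_accumulation_steps=1):
    try:
        dataset_length = len(dataloader)
    except TypeError:  # streaming loaders may not define __len__
        dataset_length = -1
    if max_steps <= 0 and dataset_length != -1 and num_epochs > 0:
        max_steps = dataset_length // gradient_accumulation_steps * num_epochs
    if max_steps <= 0:
        raise Exception("Max steps cannot be determined for fine tuning!")
    return max_steps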
def predict(
self,
test_dataset,
num_gpus=1,
gpu_ids=None,
batch_size=16,
sentence_separator="<q>",
top_n=3,
@ -632,6 +623,9 @@ class ExtractiveSummarizer(Transformer):
Args:
test_dataset (Dataset): Dataset for which the summary to be predicted
num_gpus (int, optional): The number of GPUs used in prediction. Defaults to 1.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
batch_size (int, optional): The number of test examples in each batch. Defaults to 16.
sentence_separator (str, optional): String to be inserted between sentences in
the prediction. Defaults to '<q>'.
@ -678,10 +672,8 @@ class ExtractiveSummarizer(Transformer):
}
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(
test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn
)
sent_scores = self.predict_scores(test_dataloader, num_gpus=num_gpus)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn)
sent_scores = self.predict_scores(test_dataloader, num_gpus=num_gpus, gpu_ids=gpu_ids)
sent_scores_list = list(sent_scores)
scores_list = []
for i in sent_scores_list:
@ -699,15 +691,18 @@ class ExtractiveSummarizer(Transformer):
prediction.extend(temp_pred)
return prediction
def predict_scores(self, eval_dataloader, num_gpus=1, verbose=True):
def predict_scores(self, test_dataloader, num_gpus=1, gpu_ids=None, verbose=True):
"""
Scores a dataset using a fine-tuned model and a given dataloader.
Args:
eval_dataloader (Dataloader): Dataloader for the evaluation data.
test_dataloader (Dataloader): Dataloader for scoring the data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
Returns:
@ -716,23 +711,13 @@ class ExtractiveSummarizer(Transformer):
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)
def move_batch_to_device(batch, device):
batch["src"] = batch["src"].to(device)
batch["segs"] = batch["segs"].to(device)
batch["clss"] = batch["clss"].to(device)
batch["mask"] = batch["mask"].to(device)
batch["mask_cls"] = batch["mask_cls"].to(device)
if "labels" in batch:
batch["labels"] = batch["labels"].to(device)
return Bunch(batch)
preds = list(
super().predict(
eval_dataloader=eval_dataloader,
eval_dataloader=test_dataloader,
get_inputs=ExtSumProcessor.get_inputs,
n_gpu=num_gpus,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
verbose=verbose,
move_batch_to_device=move_batch_to_device,
)
)
return preds

215
utils_nlp/models/transformers/named_entity_recognition.py Normal file → Executable file
View file

@ -2,18 +2,16 @@
# Licensed under the MIT License.
import logging
from collections import Iterable
import numpy as np
import torch
import torch.nn as nn
from collections import Iterable
from torch.utils.data import TensorDataset
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForTokenClassification
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForTokenClassification
from utils_nlp.common.pytorch_utils import get_device
from utils_nlp.common.pytorch_utils import compute_training_steps
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
TC_MODEL_CLASS = {}
TC_MODEL_CLASS.update({k: BertForTokenClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
@ -42,27 +40,36 @@ class TokenClassificationProcessor:
)
@staticmethod
def get_inputs(batch, model_name, train_mode=True):
def get_inputs(batch, device, model_name, train_mode=True):
"""
Produce a dictionary object for model training or prediction.
Creates an input dictionary given a model name.
Args:
model_name (str): The pretained model name.
train_mode (bool, optional): Whether it's for model training. Set it to False if
it's for testing and it won't have the 'labels' data field.
Defaults to True, for model training.
batch (tuple): A tuple containing input ids, attention mask,
segment ids, and labels tensors.
device (torch.device): A PyTorch device.
model_name (str): Model name used to format the inputs.
train_mode (bool, optional): Training mode flag.
Defaults to True.
Returns:
dict: A dictionary object contains all needed information for training or testing.
dict: Dictionary containing input ids, segment ids, masks, and labels.
Labels are only returned when train_mode is True.
"""
batch = tuple(t.to(device) for t in batch)
if model_name.split("-")[0] in ["bert", "distilbert"]:
if train_mode:
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
else:
inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
if model_name not in list(TC_MODEL_CLASS):
raise ValueError("Model not supported: {}".format(model_name))
# distilbert doesn't support segment ids
if model_name.split("-")[0] not in ["distilbert"]:
inputs["token_type_ids"] = batch[2]
if train_mode:
return {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
return inputs
else:
return {"input_ids": batch[0], "attention_mask": batch[1]}
raise ValueError("Model not supported: {}".format(model_name))
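Net behavior of the rewritten method: the batch is moved to the device inside get_inputs, labels are attached only in train mode, and token_type_ids are included for BERT but skipped for DistilBERT. A toy sketch:

import torch

# Toy batch: (input_ids, attention_mask, token_type_ids, labels).
batch = (
    torch.tensor([[101, 7592, 102]]),
    torch.tensor([[1, 1, 1]]),
    torch.tensor([[0, 0, 0]]),
    torch.tensor([[0, 1, 0]]),
)
bert_inputs = TokenClassificationProcessor.get_inputs(batch, torch.device("cpu"), "bert-base-cased")
# -> keys: input_ids, attention_mask, labels, token_type_ids
distilbert_inputs = TokenClassificationProcessor.get_inputs(
    batch, torch.device("cpu"), "distilbert-base-uncased", train_mode=False
)
# -> keys: input_ids, attention_mask (no segment ids, no labels)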
@staticmethod
def create_label_map(label_lists, trailing_piece_tag="X"):
@ -89,9 +96,7 @@ class TokenClassificationProcessor:
label_map[trailing_piece_tag] = len(label_set)
return label_map
def preprocess_for_bert(
self, text, max_len=MAX_SEQ_LEN, labels=None, label_map=None, trailing_piece_tag="X"
):
def preprocess_for_bert(self, text, max_len=MAX_SEQ_LEN, labels=None, label_map=None, trailing_piece_tag="X"):
"""
Tokenize and preprocesses input word lists, involving the following steps
0. WordPiece tokenization.
@ -125,7 +130,7 @@ class TokenClassificationProcessor:
Returns:
TensorDataset: A TensorDataset containing the following four tensors.
1. input_ids_all: Tensor. Each sublist contains numerical values,
i.e. token ids, corresponding to the tokens in the input
i.e. token ids, corresponding to the tokens in the input
text data.
2. input_mask_all: Tensor. Each sublist contains the attention
mask of the input token id list, 1 for input tokens and 0 for
@ -146,9 +151,7 @@ class TokenClassificationProcessor:
return isinstance(obj, Iterable) and not isinstance(obj, str)
if max_len > MAX_SEQ_LEN:
logging.warning(
"Setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN)
)
logging.warning("Setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN))
max_len = MAX_SEQ_LEN
if not _is_iterable_but_not_string(text):
@ -181,9 +184,7 @@ class TokenClassificationProcessor:
for t, t_labels in zip(text, labels):
if len(t) != len(t_labels):
raise ValueError(
"The number of words is {0}, but the number of labels is {1}.".format(
len(t), len(t_labels)
)
"The number of words is {0}, but the number of labels is {1}.".format(len(t), len(t_labels))
)
new_labels = []
@ -197,11 +198,7 @@ class TokenClassificationProcessor:
new_tokens.append(sub_word)
if len(new_tokens) > max_len:
logging.warn(
"Text after tokenization with length {} has been truncated".format(
len(new_tokens)
)
)
logging.warn("Text after tokenization with length {} has been truncated".format(len(new_tokens)))
new_tokens = new_tokens[:max_len]
new_labels = new_labels[:max_len]
input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens)
@ -218,9 +215,7 @@ class TokenClassificationProcessor:
input_mask += padding
new_labels += label_padding
trailing_token_mask_all.append(
[True if label != trailing_piece_tag else False for label in new_labels]
)
trailing_token_mask_all.append([True if label != trailing_piece_tag else False for label in new_labels])
if label_map:
label_ids = [label_map[label] for label in new_labels]
@ -235,32 +230,17 @@ class TokenClassificationProcessor:
td = TensorDataset(
torch.tensor(input_ids_all, dtype=torch.long),
torch.tensor(input_mask_all, dtype=torch.long),
torch.tensor(trailing_token_mask_all, dtype=torch.bool),
torch.tensor(trailing_token_mask_all, dtype=torch.long),
torch.tensor(label_ids_all, dtype=torch.long),
)
else:
td = TensorDataset(
torch.tensor(input_ids_all, dtype=torch.long),
torch.tensor(input_mask_all, dtype=torch.long),
torch.tensor(trailing_token_mask_all, dtype=torch.bool),
torch.tensor(trailing_token_mask_all, dtype=torch.long),
)
return td
def create_dataloader_from_dataset(
self, dataset, shuffle=False, batch_size=32, num_gpus=None, distributed=False
):
if num_gpus is None:
num_gpus = torch.cuda.device_count()
batch_size = batch_size * max(1, num_gpus)
if distributed:
sampler = DistributedSampler(dataset)
else:
sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
return DataLoader(dataset, sampler=sampler, batch_size=batch_size)
class TokenClassifier(Transformer):
"""
@ -277,10 +257,7 @@ class TokenClassifier(Transformer):
def __init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."):
super().__init__(
model_class=TC_MODEL_CLASS,
model_name=model_name,
num_labels=num_labels,
cache_dir=cache_dir,
model_class=TC_MODEL_CLASS, model_name=model_name, num_labels=num_labels, cache_dir=cache_dir,
)
@staticmethod
@ -291,7 +268,10 @@ class TokenClassifier(Transformer):
self,
train_dataloader,
num_epochs=1,
max_steps=-1,
gradient_accumulation_steps=1,
num_gpus=None,
gpu_ids=None,
local_rank=-1,
weight_decay=0.0,
learning_rate=5e-5,
@ -301,73 +281,96 @@ class TokenClassifier(Transformer):
seed=None,
):
"""
Fit the TokenClassifier model using the given training dataset.
Fine-tunes a pre-trained token classification model.
Args:
train_dataloader (DataLoader): DataLoader instance for training.
num_epochs (int, optional): Number of training epochs.
Defaults to 1.
train_dataloader (Dataloader): A PyTorch DataLoader to be used for training.
num_epochs (int, optional): Number of training epochs. Defaults to 1.
max_steps (int, optional): Total number of training steps.
If set to a positive value, it overrides num_epochs.
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
Defaults to -1.
gradient_accumulation_steps (int, optional): Number of steps to accumulate
before performing a backward/update pass.
Defaults to 1.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
local_rank (int, optional): Whether need to do distributed training.
Defaults to -1, no distributed training.
weight_decay (float, optional): Weight decay rate.
Defaults to 0.
learning_rate (float, optional): The learning rate.
Defaults to 5e-5.
adam_espilon (float, optional): The 'eps' parameter for the 'AdamW' optimizer.
Defaults to 1e-8.
warmup_steps (int, optional): Number of warmup steps for 'WarmupLinearSchedule'.
Defaults to 0.
verbose (bool, optional): Verbose model.
Defaults to False.
seed (int, optional): The seed for the transformers.
Defaults to None, use the default seed.
be used. If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each parameter update.
Defaults to 0.0.
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
5e-5.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
to `learning rate`. Defaults to 0.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
"""
# init optimizer
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
# compute the max number of training steps
max_steps = compute_training_steps(
train_dataloader,
num_epochs=num_epochs,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
)
# init scheduler
scheduler = Transformer.get_default_scheduler(
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
)
# fine tune
super().fine_tune(
train_dataloader=train_dataloader,
get_inputs=TokenClassificationProcessor.get_inputs,
n_gpu=num_gpus,
num_train_epochs=num_epochs,
weight_decay=weight_decay,
learning_rate=learning_rate,
adam_epsilon=adam_epsilon,
warmup_steps=warmup_steps,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
optimizer=optimizer,
scheduler=scheduler,
local_rank=local_rank,
verbose=verbose,
seed=seed,
)
def predict(self, eval_dataloader, num_gpus=None, verbose=True):
def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True):
"""
Test on an evaluation dataset and get the token label predictions.
Scores a dataset using a fine-tuned model and a given dataloader.
Args:
eval_dataset (TensorDataset): A TensorDataset for evaluation.
test_dataloader (DataLoader): DataLoader for scoring the data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
verbose (bool, optional): Verbose model.
Defaults to False.
be used. If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
Returns:
ndarray: Numpy ndarray of raw predictions. The shape of the ndarray is
[number_of_examples, sequence_length, number_of_labels]. Each
value in the ndarray is not normalized. Post-process will be needed
to get the probability for each class label.
Returns:
ndarray: numpy array of raw prediction logits with shape
[number_of_examples, sequence_length, number_of_labels].
"""
preds = list(
super().predict(
eval_dataloader=eval_dataloader,
eval_dataloader=test_dataloader,
get_inputs=TokenClassificationProcessor.get_inputs,
n_gpu=num_gpus,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
verbose=verbose,
)
)
preds_np = np.concatenate(preds)
return preds_np
return np.concatenate(preds)
def get_predicted_token_labels(self, predictions, label_map, dataset):
"""
@ -376,21 +379,19 @@ class TokenClassifier(Transformer):
Args:
predictions (ndarray): A numpy ndarray produced from the `predict` function call.
The shape of the ndarray is [number_of_examples, sequence_length, number_of_labels].
label_map (dict): A dictionary object to map a label (str) to an ID (int).
label_map (dict): A dictionary object to map a label (str) to an ID (int).
dataset (TensorDataset): The TensorDataset for evaluation.
dataset (Dataset): The test Dataset instance.
Returns:
list: A list of lists. The size of the returned list is the number of testing samples.
Each sublist represents the predicted label for each token.
Each sublist represents the predicted label for each token.
"""
num_samples = len(dataset.tensors[0])
if num_samples != predictions.shape[0]:
raise ValueError(
"Predictions have {0} samples, but got {1} samples in dataset".format(
predictions.shape[0], num_samples
)
"Predictions have {0} samples, but got {1} samples in dataset".format(predictions.shape[0], num_samples)
)
label_id2str = {v: k for k, v in label_map.items()}
@ -409,7 +410,7 @@ class TokenClassifier(Transformer):
if attention_mask[sid] == 0:
break
if not trailing_mask[sid]:
if not bool(trailing_mask[sid]):
continue
label_id = seq_probs[sid].argmax()
@ -422,13 +423,13 @@ class TokenClassifier(Transformer):
Get the true testing label values.
Args:
label_map (dict): A dictionary object to map a label (str) to an ID (int).
label_map (dict): A dictionary object to map a label (str) to an ID (int).
dataset (TensorDataset): The TensorDataset for evaluation.
dataset (Dataset): The test Dataset instance.
Returns:
list: A list of lists. The size of the returned list is the number of testing samples.
Each sublist represents the predicted label for each token.
Each sublist represents the predicted label for each token.
"""
num_samples = len(dataset.tensors[0])
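The label-recovery loop above reduces to: argmax over the label dimension, stop at the first padded position (attention mask 0), and skip trailing word pieces. A schematic sketch under those assumptions, where `predictions`, `label_map`, and `dataset` are the objects described in the docstrings above:

# Schematic restatement of get_predicted_token_labels' inner loop.
label_id2str = {v: k for k, v in label_map.items()}
predicted_labels = []
for idx, seq_probs in enumerate(predictions):
    attention_mask = dataset.tensors[1][idx]
    trailing_mask = dataset.tensors[2][idx]
    seq_labels = []
    for sid in range(len(seq_probs)):
        if attention_mask[sid] == 0:  # padding starts here
            break
        if not bool(trailing_mask[sid]):  # trailing word piece
            continue
        seq_labels.append(label_id2str[seq_probs[sid].argmax()])
    predicted_labels.append(seq_labels)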

236
utils_nlp/models/transformers/question_answering.py Normal file → Executable file
View file

@ -17,38 +17,30 @@
# Modifications copyright © Microsoft Corporation
import os
import logging
from tqdm import tqdm
import collections
import json
import logging
import math
import os
import jsonlines
import torch
from torch.utils.data import TensorDataset, SequentialSampler, DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
from torch.utils.data import TensorDataset
from tqdm import tqdm
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForQuestionAnswering
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForQuestionAnswering
from transformers.modeling_xlnet import (
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
XLNetForQuestionAnswering,
)
from transformers.modeling_distilbert import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForQuestionAnswering,
)
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForQuestionAnswering
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForQuestionAnswering
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
from utils_nlp.common.pytorch_utils import get_device
from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
MODEL_CLASS = {}
MODEL_CLASS.update({k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
{k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update({k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
# cached files during preprocessing
# these are used in postprocessing to generate the final answer texts
@ -85,9 +77,7 @@ class QAProcessor:
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
"""
def __init__(
self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."
):
def __init__(self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."):
self.model_name = model_name
self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False
@ -116,13 +106,14 @@ class QAProcessor:
return self._model_type
@staticmethod
def get_inputs(batch, model_name, train_mode=True):
def get_inputs(batch, device, model_name, train_mode=True):
"""
Creates an input dictionary given a model name.
Args:
batch (tuple): A tuple containing input ids, attention mask,
segment ids, and labels tensors.
device (torch.device): A PyTorch device.
model_name (str): Model name used to format the inputs.
train_mode (bool, optional): Training mode flag.
Defaults to True.
@ -131,6 +122,7 @@ class QAProcessor:
dict: Dictionary containing input ids, segment ids, masks, and labels.
Labels are only returned when train_mode is True.
"""
batch = tuple(t.to(device) for t in batch)
model_type = model_name.split("-")[0]
inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
@ -191,6 +183,8 @@ class QAProcessor:
directory. These files are required during postprocessing to generate the final
answer texts from predicted answer start and answer end indices. Defaults to
"./cached_qa_features".
Returns:
Dataset: A PyTorch Dataset.
"""
if not os.path.exists(feature_cache_dir):
@ -223,9 +217,7 @@ class QAProcessor:
qa_examples.append(qa_example_cur)
qa_examples_json.append(
{"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens}
)
qa_examples_json.append({"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens})
features_cur = _create_qa_features(
qa_example_cur,
@ -271,28 +263,13 @@ class QAProcessor:
start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
qa_dataset = TensorDataset(
input_ids,
input_mask,
segment_ids,
start_positions,
end_positions,
cls_index,
p_mask,
input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask,
)
else:
unique_id_all = torch.tensor(unique_id_all, dtype=torch.long)
qa_dataset = TensorDataset(
input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all
)
qa_dataset = TensorDataset(input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all)
if num_gpus is not None:
batch_size = batch_size * max(1, num_gpus)
if distributed:
sampler = DistributedSampler(qa_dataset)
else:
sampler = RandomSampler(qa_dataset) if is_training else SequentialSampler(qa_dataset)
return DataLoader(qa_dataset, sampler=sampler, batch_size=batch_size)
return qa_dataset
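Since preprocess now returns the TensorDataset rather than a DataLoader, callers are expected to build the loader separately; a sketch using the shared helper (the exact call site is outside this diff, so this pairing is an assumption):

from utils_nlp.common.pytorch_utils import dataloader_from_dataset

# `qa_dataset` is assumed to be the TensorDataset returned by QAProcessor.preprocess.
train_dataloader = dataloader_from_dataset(qa_dataset, batch_size=32, num_gpus=None, shuffle=True)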
def postprocess(
self,
@ -420,14 +397,7 @@ class QAResult(QAResult_):
QAResultExtended_ = collections.namedtuple(
"QAResultExtended",
[
"unique_id",
"start_top_log_probs",
"start_top_index",
"end_top_log_probs",
"end_top_index",
"cls_logits",
],
["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits",],
)
@ -489,18 +459,16 @@ class AnswerExtractor(Transformer):
def fit(
self,
train_dataloader,
num_gpus=None,
num_epochs=1,
learning_rate=5e-5,
max_grad_norm=1.0,
max_steps=-1,
gradient_accumulation_steps=1,
warmup_steps=0,
weight_decay=0.0,
adam_epsilon=1e-8,
fp16=False,
fp16_opt_level="O1",
num_gpus=None,
gpu_ids=None,
local_rank=-1,
weight_decay=0.0,
learning_rate=5e-5,
adam_epsilon=1e-8,
warmup_steps=0,
verbose=True,
seed=None,
cache_model=True,
@ -509,31 +477,30 @@ class AnswerExtractor(Transformer):
Fine-tune pre-trained transformer models for question answering.
Args:
train_dataloader (Dataloader): Dataloader for the training data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
train_dataloader (Dataloader): A PyTorch DataLoader to be used for training.
num_epochs (int, optional): Number of training epochs. Defaults to 1.
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
5e-5.
max_grad_norm (float, optional): Maximum gradient norm for gradient clipping.
Defaults to 1.0.
max_steps (int, optional): Maximum number of training steps. If specified,
`num_epochs` will be ignored. Defaults to -1.
gradient_accumulation_steps (int, optional): Number of batches to accumulate
gradients on between each model parameter update. Defaults to 1.
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
to `learning rate`. Defaults to 0.
weight_decay (float, optional): Weight decay to apply after each parameter update.
Defaults to 0.0.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
fp16 (bool, optional): Whether to use 16-bit (mixed) precision (through NVIDIA apex)
instead of 32-bit. Defaults to False.
fp16_opt_level (str, optional): For fp16: Apex AMP optimization level selected in
['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html.
Defaults to "O1",
max_steps (int, optional): Total number of training steps.
If set to a positive value, it overrides num_epochs.
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
Defaults to -1.
gradient_accumulation_steps (int, optional): Number of steps to accumulate
before performing a backward/update pass.
Defaults to 1.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each parameter update.
Defaults to 0.0.
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
5e-5.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
to `learning rate`. Defaults to 0.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
cache_model (bool, optional): Whether to save the fine-tuned model. If True,
@ -542,39 +509,53 @@ class AnswerExtractor(Transformer):
"""
# init optimizer
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
# compute the max number of training steps
max_steps = compute_training_steps(
train_dataloader,
num_epochs=num_epochs,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
)
# init scheduler
scheduler = Transformer.get_default_scheduler(
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
)
# fine tune
super().fine_tune(
train_dataloader=train_dataloader,
get_inputs=QAProcessor.get_inputs,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
max_steps=max_steps,
num_train_epochs=num_epochs,
max_grad_norm=max_grad_norm,
gradient_accumulation_steps=gradient_accumulation_steps,
n_gpu=num_gpus,
weight_decay=weight_decay,
learning_rate=learning_rate,
adam_epsilon=adam_epsilon,
warmup_steps=warmup_steps,
fp16=fp16,
fp16_opt_level=fp16_opt_level,
optimizer=optimizer,
scheduler=scheduler,
local_rank=local_rank,
verbose=verbose,
seed=seed,
)
if cache_model:
self.save_model()
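The step-counting behavior documented above reduces to simple arithmetic. The sketch below re-implements it for intuition only; the real helper is utils_nlp.common.pytorch_utils.compute_training_steps, and this stand-in assumes exactly the semantics stated in the docstring:

def compute_training_steps_sketch(num_batches, num_epochs=1, max_steps=-1,
                                  gradient_accumulation_steps=1):
    # A positive max_steps overrides num_epochs entirely.
    if max_steps > 0:
        return max_steps
    # Otherwise, one optimizer step happens per gradient_accumulation_steps batches.
    return (num_batches // gradient_accumulation_steps) * num_epochs

assert compute_training_steps_sketch(1000, num_epochs=3) == 3000
assert compute_training_steps_sketch(1000, num_epochs=3, max_steps=500) == 500
assert compute_training_steps_sketch(1000, num_epochs=3, gradient_accumulation_steps=4) == 750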
def predict(self, test_dataloader, num_gpus=None, verbose=True):
def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True):
"""
Predicts answer start and end logits.
Args:
test_dataloader (QADataset): Dataloader for the testing data.
test_dataloader (DataLoader): DataLoader for scoring the data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
verbose (bool, optional): Whether to print out the predicting log. Defaults to True.
Returns:
@ -584,25 +565,16 @@ class AnswerExtractor(Transformer):
def _to_list(tensor):
return tensor.detach().cpu().tolist()
# get device
device, num_gpus = get_device(num_gpus=num_gpus, local_rank=-1)
if isinstance(self.model, torch.nn.DataParallel):
self.model = self.model.module
if num_gpus > 1:
self.model = torch.nn.DataParallel(self.model, device_ids=list(range(num_gpus)))
self.model.to(device)
self.model.eval()
# move model
self.model = move_model_to_device(self.model, device, num_gpus, gpu_ids, local_rank=-1)
all_results = []
for batch in tqdm(test_dataloader, desc="Evaluating", disable=not verbose):
batch = tuple(t.to(device) for t in batch)
with torch.no_grad():
inputs = QAProcessor.get_inputs(batch, self.model_name, train_mode=False)
inputs = QAProcessor.get_inputs(batch, device, self.model_name, train_mode=False)
outputs = self.model(**inputs)
unique_id_tensor = batch[5]
for i, u_id in enumerate(unique_id_tensor):
@ -617,9 +589,7 @@ class AnswerExtractor(Transformer):
)
else:
result = QAResult(
unique_id=u_id.item(),
start_logits=_to_list(outputs[0][i]),
end_logits=_to_list(outputs[1][i]),
unique_id=u_id.item(), start_logits=_to_list(outputs[0][i]), end_logits=_to_list(outputs[1][i]),
)
all_results.append(result)
torch.cuda.empty_cache()
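Each element that predict collects is a QAResult carrying per-token start and end logits; a greedy consumer could take the argmax of each, as in this toy sketch (the namedtuple stand-in and the values are illustrative, not part of the library):

from collections import namedtuple

QAResultSketch = namedtuple("QAResultSketch", ["unique_id", "start_logits", "end_logits"])
result = QAResultSketch(
    unique_id=1000001,
    start_logits=[0.1, 2.5, 0.3, 0.0],  # toy per-token scores
    end_logits=[0.0, 0.2, 3.1, 0.4],
)
start = max(range(len(result.start_logits)), key=result.start_logits.__getitem__)
end = max(range(len(result.end_logits)), key=result.end_logits.__getitem__)
print(start, end)  # -> 1 2: the greedy answer span covers tokens 1..2

The postprocessing functions below do the careful version of this: n-best collection, null-answer handling, and detokenization back to the original text.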
@ -783,9 +753,7 @@ def postprocess_bert_answer(
# Sort by the sum of the start and end logits in ascending order,
# so that the first element is the most probable answer
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True
)
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
seen_predictions = {}
nbest = []
@ -818,19 +786,11 @@ def postprocess_bert_answer(
final_text = ""
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit
)
)
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
# if we didn't include the empty option in the n-best, include it
if unanswerable_exists:
if "" not in seen_predictions:
nbest.append(
_NbestPrediction(
text="", start_logit=null_start_logit, end_logit=null_end_logit
)
)
nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
# In very rare edge cases we could have only a single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
@ -874,9 +834,7 @@ def postprocess_bert_answer(
all_probs[example["qa_id"]] = nbest_json[0]["probability"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = (
score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
)
score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
scores_diff_json[example["qa_id"]] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example["qa_id"]] = ""
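The comparison above is the whole unanswerable-question rule; in isolation it is just this arithmetic (the numbers are made up for illustration):

score_null = 4.0                              # model's score for "no answer"
best_start_logit, best_end_logit = 1.5, 1.8   # best non-null span scores
null_score_diff_threshold = 0.0

score_diff = score_null - best_start_logit - best_end_logit
prediction = "" if score_diff > null_score_diff_threshold else "best non-null answer"
print(score_diff, repr(prediction))           # 0.7 '' -> the empty answer wins here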
@ -1042,9 +1000,7 @@ def postprocess_xlnet_answer(
)
)
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True
)
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
seen_predictions = {}
nbest = []
@ -1075,20 +1031,14 @@ def postprocess_xlnet_answer(
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = _get_final_text(
tok_text, orig_text, tokenizer.do_lower_case, verbose_logging
)
final_text = _get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit
)
)
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
@ -1235,9 +1185,7 @@ def _create_qa_example(qa_input, is_training):
actual_text = " ".join(d_tokens[start_position : (end_position + 1)])
cleaned_answer_text = " ".join(whitespace_tokenize(a_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning(
"Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text
)
logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
return
else:
start_position = -1
@ -1696,9 +1644,7 @@ def _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info(
"Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text
)
logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using

utils_nlp/models/transformers/sequence_classification.py Normal file → Executable file

@ -2,37 +2,25 @@
# Licensed under the MIT License.
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers.modeling_bert import (
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
BertForSequenceClassification,
)
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForSequenceClassification
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForSequenceClassification
from transformers.modeling_distilbert import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForSequenceClassification,
)
from transformers.modeling_roberta import (
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
RobertaForSequenceClassification,
)
from transformers.modeling_xlnet import (
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
XLNetForSequenceClassification,
)
from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, RobertaForSequenceClassification
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForSequenceClassification
from utils_nlp.common.pytorch_utils import compute_training_steps
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet
MODEL_CLASS = {}
MODEL_CLASS.update({k: BertForSequenceClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
{k: RobertaForSequenceClassification for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update({k: RobertaForSequenceClassification for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: XLNetForSequenceClassification for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
{k: DistilBertForSequenceClassification for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update({k: DistilBertForSequenceClassification for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: AlbertForSequenceClassification for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
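With the mapping assembled this way, a single dict lookup dispatches any supported checkpoint name to its task head; for example (assuming a transformers 2.x install, where the *_PRETRAINED_MODEL_ARCHIVE_MAP dicts exist):

print(MODEL_CLASS["bert-base-uncased"].__name__)        # BertForSequenceClassification
print(MODEL_CLASS["distilbert-base-uncased"].__name__)  # DistilBertForSequenceClassification
print(MODEL_CLASS["albert-base-v2"].__name__)           # AlbertForSequenceClassification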
class Processor:
@ -56,13 +44,14 @@ class Processor:
)
@staticmethod
def get_inputs(batch, model_name, train_mode=True):
def get_inputs(batch, device, model_name, train_mode=True):
"""
Creates an input dictionary given a model name.
Args:
batch (tuple): A tuple containing input ids, attention mask,
segment ids, and labels tensors.
device (torch.device): A PyTorch device.
model_name (str): Model name used to format the inputs.
train_mode (bool, optional): Training mode flag.
Defaults to True.
@ -71,7 +60,8 @@ class Processor:
dict: Dictionary containing input ids, segment ids, masks, and labels.
Labels are only returned when train_mode is True.
"""
if model_name.split("-")[0] in ["bert", "xlnet", "roberta", "distilbert"]:
batch = tuple(t.to(device) for t in batch)
if model_name.split("-")[0] in ["bert", "xlnet", "roberta", "distilbert", "albert"]:
if train_mode:
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
else:
@ -103,11 +93,7 @@ class Processor:
print("setting max_len to max allowed sequence length: {}".format(MAX_SEQ_LEN))
max_len = MAX_SEQ_LEN
# truncate and add CLS & SEP markers
tokens = (
[tokenizer.cls_token]
+ tokenizer.tokenize(text)[0 : max_len - 2]
+ [tokenizer.sep_token]
)
tokens = [tokenizer.cls_token] + tokenizer.tokenize(text)[0 : max_len - 2] + [tokenizer.sep_token]
# get input ids
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# pad sequence
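The truncate-and-mark pattern above is easy to reproduce standalone. A hedged sketch with a transformers 2.x tokenizer follows; the model choice, max_len, and sample text are illustrative:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
max_len = 8
text = "A fairly long sentence that will get truncated."
# truncate to max_len - 2 to leave room for the [CLS] and [SEP] markers
tokens = [tokenizer.cls_token] + tokenizer.tokenize(text)[: max_len - 2] + [tokenizer.sep_token]
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = input_ids + [0] * (max_len - len(input_ids))  # pad with [PAD], id 0 in BERT vocab
attention_mask = [int(i != 0) for i in input_ids]         # attend to non-pad positions only
print(tokens)
print(input_ids)
print(attention_mask)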
@ -188,55 +174,13 @@ class Processor:
return input_ids, attention_mask, token_type_ids
def create_dataloader_from_df(
self,
df,
text_col,
label_col=None,
text2_col=None,
shuffle=False,
max_len=MAX_SEQ_LEN,
batch_size=32,
num_gpus=None,
distributed=False,
):
"""
Creates a PyTorch DataLoader from a Pandas DataFrame for sequence classification tasks.
Args:
df (pandas.DataFrame): Input Pandas DataFrame.
text_col (str/int): Text column name or index.
label_col (str/int, optional): Label column name or index. Defaults to None.
text2_col (str/int, optional): Second text column name or index for sequence-pair tasks.
Defaults to None.
shuffle (bool, optional): If set to True, the DataLoader will use a RandomSampler,
otherwise it will use a SequentialSampler.
Defaults to False.
max_len (int, optional): Maximum sequence length. Defaults to 512.
batch_size (int, optional): Batch size. Defaults to 32.
num_gpus (int, optional): Number of GPUs to use.
If None, all available GPUs will be used.
If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
distributed (bool, optional): If set to True, the DataLoader will use
a DistributedSampler.
Defaults to False.
Returns:
DataLoader: A PyTorch DataLoader object that can be used for training or scoring.
"""
def dataset_from_dataframe(self, df, text_col, label_col=None, text2_col=None, max_len=MAX_SEQ_LEN):
if text2_col is None:
ds = SCDataSet(
df,
text_col,
label_col,
transform=Processor.text_transform,
tokenizer=self.tokenizer,
max_len=max_len,
return SCDataSet(
df, text_col, label_col, transform=Processor.text_transform, tokenizer=self.tokenizer, max_len=max_len,
)
else:
ds = SPCDataSet(
return SPCDataSet(
df,
text_col,
text2_col,
@ -246,26 +190,11 @@ class Processor:
max_len=max_len,
)
if num_gpus is None:
num_gpus = torch.cuda.device_count()
batch_size = batch_size * max(1, num_gpus)
if distributed:
sampler = DistributedSampler(ds)
else:
sampler = RandomSampler(ds) if shuffle else SequentialSampler(ds)
return DataLoader(ds, sampler=sampler, batch_size=batch_size)
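The removed create_dataloader_from_df is effectively split in two: dataset_from_dataframe here, plus the dataloader_from_dataset helper now in utils_nlp.common.pytorch_utils. A hedged sketch of the new call pattern (the DataFrame, the Processor constructor arguments, and the helper's keyword names are assumptions, not shown in this diff):

import pandas as pd
from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.models.transformers.sequence_classification import Processor

df = pd.DataFrame({"text": ["good movie", "bad movie"], "label": [1, 0]})
processor = Processor(model_name="bert-base-uncased", cache_dir=".")  # args assumed
train_ds = processor.dataset_from_dataframe(df, text_col="text", label_col="label", max_len=64)
train_dataloader = dataloader_from_dataset(train_ds, batch_size=32, shuffle=True)  # kwargs assumed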
class SequenceClassifier(Transformer):
def __init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."):
super().__init__(
model_class=MODEL_CLASS,
model_name=model_name,
num_labels=num_labels,
cache_dir=cache_dir,
model_class=MODEL_CLASS, model_name=model_name, num_labels=num_labels, cache_dir=cache_dir,
)
@staticmethod
@ -276,7 +205,10 @@ class SequenceClassifier(Transformer):
self,
train_dataloader,
num_epochs=1,
max_steps=-1,
gradient_accumulation_steps=1,
num_gpus=None,
gpu_ids=None,
local_rank=-1,
weight_decay=0.0,
learning_rate=5e-5,
@ -289,11 +221,21 @@ class SequenceClassifier(Transformer):
Fine-tunes a pre-trained sequence classification model.
Args:
train_dataloader (Dataloader): Dataloader for the training data.
train_dataloader (Dataloader): A PyTorch DataLoader to be used for training.
num_epochs (int, optional): Number of training epochs. Defaults to 1.
max_steps (int, optional): Total number of training steps.
If set to a positive value, it overrides num_epochs.
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
Defaults to -1.
gradient_accumulation_steps (int, optional): Number of steps to accumulate
before performing a backward/update pass.
Defaults to 1.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each parameter update.
@ -307,28 +249,49 @@ class SequenceClassifier(Transformer):
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
"""
# init optimizer
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
# compute the max number of training steps
max_steps = compute_training_steps(
train_dataloader,
num_epochs=num_epochs,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
)
# init scheduler
scheduler = Transformer.get_default_scheduler(
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
)
# fine tune
super().fine_tune(
train_dataloader=train_dataloader,
get_inputs=Processor.get_inputs,
n_gpu=num_gpus,
num_train_epochs=num_epochs,
weight_decay=weight_decay,
learning_rate=learning_rate,
adam_epsilon=adam_epsilon,
warmup_steps=warmup_steps,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
optimizer=optimizer,
scheduler=scheduler,
local_rank=local_rank,
verbose=verbose,
seed=seed,
)
def predict(self, eval_dataloader, num_gpus=None, verbose=True):
def predict(self, test_dataloader, num_gpus=None, gpu_ids=None, verbose=True):
"""
Scores a dataset using a fine-tuned model and a given dataloader.
Args:
eval_dataloader (Dataloader): Dataloader for the evaluation data.
test_dataloader (DataLoader): DataLoader for scoring the data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
Returns:
@ -337,12 +300,12 @@ class SequenceClassifier(Transformer):
preds = list(
super().predict(
eval_dataloader=eval_dataloader,
eval_dataloader=test_dataloader,
get_inputs=Processor.get_inputs,
n_gpu=num_gpus,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
verbose=verbose,
)
)
preds = np.concatenate(preds)
# todo generator & probs
return np.argmax(preds, axis=1)
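Putting the updated signatures together, fine-tuning and scoring might look like the following, continuing the dataloader sketch above (train_dataloader and test_dataloader are assumed to have been built that way; nothing here beyond the signatures comes from this diff):

from utils_nlp.models.transformers.sequence_classification import SequenceClassifier

classifier = SequenceClassifier(model_name="bert-base-cased", num_labels=2, cache_dir=".")
classifier.fine_tune(
    train_dataloader=train_dataloader,  # from the Processor sketch above
    num_epochs=1,
    gradient_accumulation_steps=1,
    num_gpus=None,                      # use all available GPUs
    gpu_ids=None,                       # or pin specific devices, e.g. [0]
    seed=42,
)
preds = classifier.predict(test_dataloader=test_dataloader, num_gpus=None)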


@ -2,23 +2,20 @@
# Licensed under the MIT License.
"""Utilities for Xlnet Sequence Classification"""
import numpy as np
import os
from collections import namedtuple
import torch
import torch.nn as nn
from transformers import (
XLNetConfig,
XLNetForSequenceClassification,
AdamW,
WarmupLinearSchedule,
)
from tqdm import tqdm
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from utils_nlp.common.pytorch_utils import get_device, move_to_device
from utils_nlp.models.xlnet.common import Language
import mlflow
import mlflow.pytorch
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from tqdm import tqdm
from transformers import AdamW, WarmupLinearSchedule, XLNetConfig, XLNetForSequenceClassification
from utils_nlp.common.pytorch_utils import get_device, move_model_to_device
from utils_nlp.models.xlnet.common import Language
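The AdamW/WarmupLinearSchedule pair imported here follows the pytorch-transformers-era API (later transformers releases replaced WarmupLinearSchedule with get_linear_schedule_with_warmup). A minimal self-contained sketch of the training-step choreography this class uses, with a tiny linear layer standing in for the XLNet model:

import torch
from transformers import AdamW, WarmupLinearSchedule

model = torch.nn.Linear(4, 2)  # stand-in for the XLNet model
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=10, t_total=100)

for step in range(3):
    loss = model(torch.randn(8, 4)).sum()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # max_grad_norm
    optimizer.step()
    scheduler.step()   # linear warmup, then linear decay to 0
    optimizer.zero_grad()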
class XLNetSequenceClassifier:
@ -79,9 +76,7 @@ class XLNetSequenceClassifier:
self.max_grad_norm = max_grad_norm
# create classifier
self.config = XLNetConfig.from_pretrained(
self.language.value, num_labels=num_labels, cache_dir=cache_dir
)
self.config = XLNetConfig.from_pretrained(self.language.value, num_labels=num_labels, cache_dir=cache_dir)
self.model = XLNetForSequenceClassification(self.config)
def fit(
@ -114,7 +109,7 @@ class XLNetSequenceClassifier:
"""
device, num_gpus = get_device(self.num_gpus)
self.model = move_to_device(self.model, device, self.num_gpus)
self.model = move_model_to_device(self.model, device, self.num_gpus)
token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
@ -128,24 +123,17 @@ class XLNetSequenceClassifier:
token_type_ids_tensor = torch.tensor(token_type_ids, dtype=torch.long)
val_token_type_ids_tensor = torch.tensor(val_token_type_ids, dtype=torch.long)
train_dataset = TensorDataset(
token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor
)
train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, token_type_ids_tensor, labels_tensor)
val_dataset = TensorDataset(
val_token_ids_tensor,
val_input_mask_tensor,
val_token_type_ids_tensor,
val_labels_tensor,
val_token_ids_tensor, val_input_mask_tensor, val_token_type_ids_tensor, val_labels_tensor,
)
else:
train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor, labels_tensor)
val_dataset = TensorDataset(
val_token_ids_tensor, val_input_mask_tensor, val_labels_tensor
)
val_dataset = TensorDataset(val_token_ids_tensor, val_input_mask_tensor, val_labels_tensor)
# define optimizer and model parameters
param_optimizer = list(self.model.named_parameters())
@ -155,10 +143,7 @@ class XLNetSequenceClassifier:
"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
"weight_decay": self.weight_decay,
},
{
"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
"weight_decay": 0.0,
},
{"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
val_sampler = RandomSampler(val_dataset)
@ -181,9 +166,7 @@ class XLNetSequenceClassifier:
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
train_dataset, sampler=train_sampler, batch_size=self.batch_size
)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=self.batch_size)
tr_loss = 0.0
logging_loss = 0.0
@ -191,18 +174,13 @@ class XLNetSequenceClassifier:
for i, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
if token_type_ids:
x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
t.to(device) for t in batch
)
x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(t.to(device) for t in batch)
else:
token_type_ids_batch = None
x_batch, mask_batch, y_batch = tuple(t.to(device) for t in batch)
outputs = self.model(
input_ids=x_batch,
token_type_ids=token_type_ids_batch,
attention_mask=mask_batch,
labels=y_batch,
input_ids=x_batch, token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=y_batch,
)
loss = outputs[0] # model outputs are always tuple in pytorch-transformers
@ -220,9 +198,7 @@ class XLNetSequenceClassifier:
if logging_steps > 0 and global_step % logging_steps == 0:
mlflow.log_metric("learning rate", scheduler.get_lr()[0], step=global_step)
mlflow.log_metric(
"training loss",
(tr_loss - logging_loss) / (logging_steps * self.batch_size),
step=global_step,
"training loss", (tr_loss - logging_loss) / (logging_steps * self.batch_size), step=global_step,
)
logging_loss = tr_loss
# model checkpointing
@ -245,9 +221,7 @@ class XLNetSequenceClassifier:
)
else:
token_type_ids_batch = None
val_x_batch, val_mask_batch, val_y_batch = tuple(
t.to(device) for t in val_batch
)
val_x_batch, val_mask_batch, val_y_batch = tuple(t.to(device) for t in val_batch)
val_outputs = self.model(
input_ids=val_x_batch,
token_type_ids=val_token_type_ids_batch,
@ -256,9 +230,7 @@ class XLNetSequenceClassifier:
)
vloss = val_outputs[0]
val_loss += vloss.sum().item()
mlflow.log_metric(
"validation loss", val_loss / len(val_dataset), step=global_step
)
mlflow.log_metric("validation loss", val_loss / len(val_dataset), step=global_step)
self.model.train()
if verbose:
@ -300,13 +272,7 @@ class XLNetSequenceClassifier:
torch.cuda.empty_cache()
def predict(
self,
token_ids,
input_mask,
token_type_ids=None,
num_gpus=None,
batch_size=8,
probabilities=False,
self, token_ids, input_mask, token_type_ids=None, num_gpus=None, batch_size=8, probabilities=False,
):
"""Scores the given dataset and returns the predicted classes.
@ -330,7 +296,7 @@ class XLNetSequenceClassifier:
"""
device, num_gpus = get_device(num_gpus)
self.model = move_to_device(self.model, device, num_gpus)
self.model = move_model_to_device(self.model, device, num_gpus)
self.model.eval()
preds = []
@ -342,16 +308,11 @@ class XLNetSequenceClassifier:
x_batch = torch.tensor(token_ids[start:end], dtype=torch.long, device=device)
mask_batch = torch.tensor(input_mask[start:end], dtype=torch.long, device=device)
token_type_ids_batch = torch.tensor(
token_type_ids[start:end], dtype=torch.long, device=device
)
token_type_ids_batch = torch.tensor(token_type_ids[start:end], dtype=torch.long, device=device)
with torch.no_grad():
pred_batch = self.model(
input_ids=x_batch,
token_type_ids=token_type_ids_batch,
attention_mask=mask_batch,
labels=None,
input_ids=x_batch, token_type_ids=token_type_ids_batch, attention_mask=mask_batch, labels=None,
)
preds.append(pred_batch[0].cpu())
if i % batch_size == 0: