Merge branch 'hlu/update_entailment_notebook_to_use_transformers' of https://github.com/Microsoft/NLP into hlu/update_entailment_notebook_to_use_transformers
Commit 6301686f52
@@ -22,24 +22,16 @@
"source": [
"# Before You Start\n",
"\n",
"The running time shown in this notebook is running bert-large-cased on a Standard_NC24rs_v3 Azure Deep Learning Virtual Machine with 4 NVIDIA Tesla V100 GPUs. \n",
"It takes about 4 hours to fine-tune the `bert-large-cased` model on a Standard_NC24rs_v3 Azure Data Science Virtual Machine with 4 NVIDIA Tesla V100 GPUs. \n",
"> **Tip:** If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. \n",
"\n",
"The table below provides some reference running times on different machine configurations. \n",
"\n",
"|QUICK_RUN|Machine Configurations|Running time|\n",
"|:---------|:----------------------|:------------|\n",
"|True|4 **CPU**s, 14GB memory| ~ 15 minutes|\n",
"|True|1 NVIDIA Tesla K80 GPU, 12GB GPU memory| ~ 5 minutes|\n",
"|False|1 NVIDIA Tesla K80 GPU, 12GB GPU memory| ~ 10.5 hours|\n",
"|False|4 NVIDIA Tesla V100 GPUs, 64GB GPU memory| ~ 2.5 hours|\n",
"\n",
"If you run into a CUDA out-of-memory error, try reducing `BATCH_SIZE` and `MAX_SEQ_LENGTH`, but note that model performance will be compromised. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
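The QUICK_RUN switch described above usually just gates the data fraction and the epoch count. A minimal sketch of such a guard; the names mirror the notebook's conventions, but the values are illustrative assumptions, not the notebook's actual settings:

    # Hypothetical QUICK_RUN guard; fractions and epoch counts are assumptions.
    QUICK_RUN = False

    TRAIN_DATA_USED_FRACTION = 1.0
    NUM_EPOCHS = 2
    if QUICK_RUN:
        TRAIN_DATA_USED_FRACTION = 0.001  # tiny slice of MNLI
        NUM_EPOCHS = 1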
@@ -56,31 +48,24 @@
"To classify a sentence pair, we concatenate the tokens in both sentences and separate the sentences with the special [SEP] token. A [CLS] token is prepended to the token list and used as the aggregate sequence representation for the classification task. The NLI task essentially becomes a sequence classification task. For example, the figure below shows how [BERT](https://arxiv.org/abs/1810.04805) classifies sentence pairs. \n",
"<img src=\"https://nlpbp.blob.core.windows.net/images/bert_two_sentence.PNG\">\n",
"\n",
"We compare the training time and performance of three models: bert-base-cased, bert-large-cased, and xlnet-large-cased. The model used can be set in the **Configurations** section. "
"We compare the training time and performance of bert-large-cased and xlnet-large-cased. The model used can be set in the **Configurations** section. "
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"I1110 19:13:59.935610 140117887072000 file_utils.py:39] PyTorch version 1.2.0 available.\n",
"I1110 19:13:59.978967 140117887072000 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .\n"
]
}
],
"outputs": [],
"source": [
"import sys, os\n",
"nlp_path = os.path.abspath('../../')\n",
"if nlp_path not in sys.path:\n",
"    sys.path.insert(0, nlp_path)\n",
"    \n",
"\n",
"import scrapbook as sb\n",
"\n",
"from tempfile import TemporaryDirectory\n",
"\n",
"import numpy as np\n",
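The [CLS]/[SEP] packing described in the markdown cell above can be reproduced directly with the transformers tokenizer. A small sketch, using an MNLI sentence pair from the dataset preview shown later in the notebook (the checkpoint choice is for illustration only):

    from transformers import BertTokenizer

    # Encode a premise/hypothesis pair as: [CLS] premise [SEP] hypothesis [SEP]
    tokenizer = BertTokenizer.from_pretrained("bert-large-cased")
    encoded = tokenizer.encode_plus(
        "One of our number will carry out your instructions minutely.",
        "A member of my team will execute your orders with immense precision.",
    )
    print(tokenizer.convert_ids_to_tokens(encoded["input_ids"])[:3])
    # token_type_ids mark which of the two sentences each position belongs to
    print(encoded["token_type_ids"])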
@@ -104,39 +89,9 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bert-base-uncased',\n",
" 'bert-large-uncased',\n",
" 'bert-base-cased',\n",
" 'bert-large-cased',\n",
" 'bert-base-multilingual-uncased',\n",
" 'bert-base-multilingual-cased',\n",
" 'bert-base-chinese',\n",
" 'bert-base-german-cased',\n",
" 'bert-large-uncased-whole-word-masking',\n",
" 'bert-large-cased-whole-word-masking',\n",
" 'bert-large-uncased-whole-word-masking-finetuned-squad',\n",
" 'bert-large-cased-whole-word-masking-finetuned-squad',\n",
" 'bert-base-cased-finetuned-mrpc',\n",
" 'roberta-base',\n",
" 'roberta-large',\n",
" 'roberta-large-mnli',\n",
" 'xlnet-base-cased',\n",
" 'xlnet-large-cased',\n",
" 'distilbert-base-uncased',\n",
" 'distilbert-base-uncased-distilled-squad']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"SequenceClassifier.list_supported_models()"
]
@@ -150,7 +105,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {
"tags": [
"parameters"
@@ -194,8 +149,7 @@
"LABEL_COL = \"gold_label\"\n",
"LABEL_COL_NUM = \"gold_label_num\"\n",
"\n",
"CACHE_DIR = TemporaryDirectory().name\n",
"CACHE_DIR = \"./temp\""
"CACHE_DIR = TemporaryDirectory().name"
]
},
{
@@ -209,7 +163,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -220,7 +174,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -230,33 +184,9 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training dataset size: 392702\n",
"Development (matched) dataset size: 9815\n",
"Development (mismatched) dataset size: 9832\n",
"\n",
" gold_label sentence1 \\\n",
"0 neutral Conceptually cream skimming has two basic dime... \n",
"1 entailment you know during the season and i guess at at y... \n",
"2 entailment One of our number will carry out your instruct... \n",
"3 entailment How do you know? All this is their information... \n",
"4 neutral yeah i tell you what though if you go price so... \n",
"\n",
" sentence2 \n",
"0 Product and geography are what make cream skim... \n",
"1 You lose the things to the following level if ... \n",
"2 A member of my team will execute your orders w... \n",
"3 This information belongs to them. \n",
"4 The tennis shoes have a range of prices. \n"
]
}
],
"outputs": [],
"source": [
"print(\"Training dataset size: {}\".format(train_df.shape[0]))\n",
"print(\"Development (matched) dataset size: {}\".format(dev_df_matched.shape[0]))\n",
@@ -267,7 +197,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -278,7 +208,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -293,25 +223,18 @@
"metadata": {},
"source": [
"## Tokenize and Preprocess\n",
"Before training, we tokenize the sentence texts and convert them to lists of tokens. The following steps instantiate a BERT tokenizer given the language, and tokenize the text of the training and testing sets."
"Before training, we tokenize and preprocess the sentence texts to convert them into the format required by transformer model classes. \n",
"The `create_dataloader_from_df` method of the `Processor` class performs the following preprocessing steps and returns a PyTorch `DataLoader`:\n",
"* Tokenize input texts using the tokenizer of the pre-trained model specified by `model_name`. \n",
"* Convert the tokens into token indices corresponding to the tokenizer's vocabulary.\n",
"* Pad or truncate the token lists to the specified max length."
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"I1110 19:14:11.376676 140117887072000 tokenization_utils.py:373] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt from cache at ./temp/cee054f6aafe5e2cf816d2228704e326446785f940f5451a5b26033516a4ac3d.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1\n",
"100%|██████████| 392702/392702 [03:48<00:00, 1715.17it/s]\n",
"100%|██████████| 9815/9815 [00:05<00:00, 1797.48it/s]\n",
"100%|██████████| 9832/9832 [00:05<00:00, 1709.69it/s]\n"
]
}
],
"outputs": [],
"source": [
"processor = Processor(model_name=MODEL_NAME, cache_dir=CACHE_DIR, to_lower=TO_LOWER)\n",
"train_dataloader = processor.create_dataloader_from_df(\n",
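The pad-or-truncate step listed in the markdown cell above is the usual fixed-length batching trick; a generic sketch of the idea, not the repo's exact implementation:

    def pad_or_truncate(token_ids, max_len, pad_id=0):
        """Force a list of token ids to exactly max_len entries."""
        if len(token_ids) >= max_len:
            return token_ids[:max_len]
        return token_ids + [pad_id] * (max_len - len(token_ids))

    assert len(pad_or_truncate(list(range(3)), 5)) == 5  # padded
    assert len(pad_or_truncate(list(range(9)), 5)) == 5  # truncated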
@@ -341,21 +264,6 @@
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In addition, we perform the following preprocessing steps in the cell below:\n",
"\n",
"* Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary\n",
"* Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence\n",
"* Pad or truncate the token lists to the specified max length\n",
"* Return mask lists that indicate paddings' positions\n",
"* Return token type id lists that indicate which sentence the tokens belong to\n",
"\n",
"*See the original [implementation](https://github.com/google-research/bert/blob/master/run_classifier.py) for more information on BERT's input format.*"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -416,31 +324,9 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating: 100%|██████████| 614/614 [04:53<00:00, 2.12it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prediction time : 0.082 hrs\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"outputs": [],
"source": [
"with Timer() as t:\n",
"    predictions_matched = classifier.predict(dev_dataloader_matched)\n",
@@ -449,31 +335,9 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Evaluating: 100%|██████████| 615/615 [04:53<00:00, 2.12it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prediction time : 0.082 hrs\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"outputs": [],
"source": [
"with Timer() as t:\n",
"    predictions_mismatched = classifier.predict(dev_dataloader_mismatched)\n",
@@ -489,26 +353,9 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
"contradiction 0.872 0.894 0.883 3213\n",
" entailment 0.913 0.862 0.887 3479\n",
" neutral 0.813 0.842 0.828 3123\n",
"\n",
" micro avg 0.866 0.866 0.866 9815\n",
" macro avg 0.866 0.866 0.866 9815\n",
" weighted avg 0.868 0.866 0.867 9815\n",
"\n"
]
}
],
"outputs": [],
"source": [
"predictions_matched = label_encoder.inverse_transform(predictions_matched)\n",
"print(classification_report(dev_df_matched[LABEL_COL], predictions_matched, digits=3))"
@@ -516,28 +363,11 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
"contradiction 0.891 0.888 0.889 3240\n",
" entailment 0.899 0.862 0.880 3463\n",
" neutral 0.810 0.850 0.830 3129\n",
"\n",
" micro avg 0.867 0.867 0.867 9832\n",
" macro avg 0.867 0.867 0.866 9832\n",
" weighted avg 0.868 0.867 0.867 9832\n",
"\n"
]
}
],
"outputs": [],
"source": [
"predictions_mismatched = label_encoder.inverse_transform(predictions_mismatched)\n",
"print(classification_report(dev_df_mismatched[LABEL_COL], predictions_mismatched, digits=3))"
@@ -559,6 +389,22 @@
"|xlnet-large-cased|5.15 hrs|0.11 hrs|0.887|0.890|\n",
"|bert-large-cased|4.01 hrs|0.08 hrs|0.867|0.867|"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result_matched_dict = classification_report(dev_df_matched[LABEL_COL], predictions_matched, digits=3, output_dict=True)\n",
"result_mismatched_dict = classification_report(dev_df_mismatched[LABEL_COL], predictions_mismatched, digits=3, output_dict=True)\n",
"sb.glue(\"matched_precision\", result_matched_dict[\"weighted avg\"][\"precision\"])\n",
"sb.glue(\"matched_recall\", result_matched_dict[\"weighted avg\"][\"recall\"])\n",
"sb.glue(\"matched_f1\", result_matched_dict[\"weighted avg\"][\"f1-score\"])\n",
"sb.glue(\"mismatched_precision\", result_mismatched_dict[\"weighted avg\"][\"precision\"])\n",
"sb.glue(\"mismatched_recall\", result_mismatched_dict[\"weighted avg\"][\"recall\"])\n",
"sb.glue(\"mismatched_f1\", result_mismatched_dict[\"weighted avg\"][\"f1-score\"])"
]
}
],
"metadata": {
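The sb.glue calls above persist the metrics into the executed notebook; a consumer such as an integration test can then read them back with scrapbook. A sketch, with a hypothetical output path:

    import scrapbook as sb

    # Read back metrics glued into an executed notebook (path is illustrative).
    nb = sb.read_notebook("entailment_multinli_transformers.output.ipynb")
    print(nb.scraps["matched_f1"].data)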
@@ -63,7 +63,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -78,7 +78,7 @@
"    sys.path.insert(0, nlp_path)\n",
"\n",
"from utils_nlp.dataset.squad import load_pandas_df\n",
"from utils_nlp.dataset.pytorch import QADataset\n",
"from utils_nlp.models.transformers.datasets import QADataset\n",
"from utils_nlp.models.transformers.question_answering import (\n",
"    QAProcessor,\n",
"    AnswerExtractor\n",
@@ -175,6 +175,7 @@
"DOC_STRIDE = 128\n",
"PER_GPU_BATCH_SIZE = 4\n",
"GRADIENT_ACCUMULATION_STEPS = 1\n",
"NUM_GPUS = torch.cuda.device_count()\n",
"\n",
"if QUICK_RUN:\n",
"    TRAIN_DATA_USED_PERCENT = 0.001\n",
@@ -558,7 +559,7 @@
"* Pad the concatenated token sequence to `max_seq_length` if it's shorter.\n",
"* Convert the tokens into token indices corresponding to the tokenizer's vocabulary.\n",
"\n",
"`QAProcessor.preprocess` returns a Pytorch TensorDataset. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required by postprocessing the predicted answer start and end indices to get the final answer text. You can change the default file directory by specifying `feature_cache_dir`. "
"`QAProcessor.preprocess` returns a PyTorch `DataLoader`. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required for postprocessing the predicted answer start and end indices into the final answer text. You can change the default file directory by specifying `feature_cache_dir`. "
]
},
{
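Long passages are additionally split into overlapping windows controlled by `doc_stride` (visible in the preprocess call below). A generic sketch of that BERT-style windowing scheme, not this repo's exact code:

    def doc_spans(num_doc_tokens, max_tokens_for_doc, doc_stride):
        """Yield (start, length) windows over a long document."""
        start = 0
        while start < num_doc_tokens:
            length = min(num_doc_tokens - start, max_tokens_for_doc)
            yield start, length
            if start + length == num_doc_tokens:
                break
            start += min(length, doc_stride)

    # A 300-token document with 128-token windows and stride 64: windows
    # overlap so every token appears with enough surrounding context.
    print(list(doc_spans(300, 128, 64)))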
@@ -576,16 +577,20 @@
],
"source": [
"qa_processor = QAProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE)\n",
"train_features = qa_processor.preprocess(\n",
"train_dataloader = qa_processor.preprocess(\n",
"    train_dataset, \n",
"    batch_size=PER_GPU_BATCH_SIZE,\n",
"    num_gpus=NUM_GPUS,\n",
"    is_training=True,\n",
"    max_question_length=MAX_QUESTION_LENGTH,\n",
"    max_seq_length=MAX_SEQ_LENGTH,\n",
"    doc_stride=DOC_STRIDE\n",
")\n",
"\n",
"dev_features = qa_processor.preprocess(\n",
"dev_dataloader = qa_processor.preprocess(\n",
"    dev_dataset, \n",
"    batch_size=PER_GPU_BATCH_SIZE,\n",
"    num_gpus=NUM_GPUS,\n",
"    is_training=False,\n",
"    max_question_length=MAX_QUESTION_LENGTH,\n",
"    max_seq_length=MAX_SEQ_LENGTH,\n",
@@ -616,10 +621,9 @@
"outputs": [],
"source": [
"with Timer() as t:\n",
"    qa_extractor.fit(train_dataset=train_features,\n",
"    qa_extractor.fit(train_dataloader,\n",
"        num_epochs=NUM_EPOCHS,\n",
"        learning_rate=LEARNING_RATE,\n",
"        per_gpu_batch_size=PER_GPU_BATCH_SIZE,\n",
"        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,\n",
"        seed=RANDOM_SEED,\n",
"        cache_model=True)\n",
@@ -648,7 +652,7 @@
}
],
"source": [
"qa_results = qa_extractor.predict(dev_features, per_gpu_batch_size=PER_GPU_BATCH_SIZE)"
"qa_results = qa_extractor.predict(dev_dataloader)"
]
},
{
@@ -824,9 +828,9 @@
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python [default]",
"display_name": "nlp_gpu",
"language": "python",
"name": "python3"
"name": "nlp_gpu"
},
"language_info": {
"codemirror_mode": {
@@ -838,7 +842,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.5"
"version": "3.6.8"
}
},
"nbformat": 4,
@@ -3,7 +3,7 @@

import pytest
import os
from utils_nlp.dataset.pytorch import QADataset
from utils_nlp.models.transformers.datasets import QADataset
from utils_nlp.models.transformers.question_answering import (
    QAProcessor,
    AnswerExtractor,
@@ -11,6 +11,11 @@ from utils_nlp.models.transformers.question_answering import (
    CACHED_FEATURES_TEST_FILE,
)

import torch

NUM_GPUS = max(1, torch.cuda.device_count())
BATCH_SIZE = 8


@pytest.fixture()
def qa_test_data(qa_test_df, tmp):
@@ -61,6 +66,8 @@ def qa_test_data(qa_test_df, tmp):
    qa_processor_bert = QAProcessor()
    train_features_bert = qa_processor_bert.preprocess(
        train_dataset,
        batch_size=BATCH_SIZE,
        num_gpus=NUM_GPUS,
        is_training=True,
        max_question_length=16,
        max_seq_length=64,
@@ -70,6 +77,8 @@ def qa_test_data(qa_test_df, tmp):

    test_features_bert = qa_processor_bert.preprocess(
        test_dataset,
        batch_size=BATCH_SIZE,
        num_gpus=NUM_GPUS,
        is_training=False,
        max_question_length=16,
        max_seq_length=64,
@@ -80,6 +89,8 @@ def qa_test_data(qa_test_df, tmp):
    qa_processor_xlnet = QAProcessor(model_name="xlnet-base-cased")
    train_features_xlnet = qa_processor_xlnet.preprocess(
        train_dataset,
        batch_size=BATCH_SIZE,
        num_gpus=NUM_GPUS,
        is_training=True,
        max_question_length=16,
        max_seq_length=64,
@@ -89,6 +100,8 @@ def qa_test_data(qa_test_df, tmp):

    test_features_xlnet = qa_processor_xlnet.preprocess(
        test_dataset,
        batch_size=BATCH_SIZE,
        num_gpus=NUM_GPUS,
        is_training=False,
        max_question_length=16,
        max_seq_length=64,
@@ -99,6 +112,8 @@ def qa_test_data(qa_test_df, tmp):
    qa_processor_distilbert = QAProcessor(model_name="distilbert-base-uncased")
    train_features_distilbert = qa_processor_distilbert.preprocess(
        train_dataset,
        batch_size=BATCH_SIZE,
        num_gpus=NUM_GPUS,
        is_training=True,
        max_question_length=16,
        max_seq_length=64,
@@ -108,6 +123,8 @@ def qa_test_data(qa_test_df, tmp):

    test_features_distilbert = qa_processor_distilbert.preprocess(
        test_dataset,
        batch_size=BATCH_SIZE,
        num_gpus=NUM_GPUS,
        is_training=False,
        max_question_length=16,
        max_seq_length=64,
@@ -157,9 +174,7 @@ def test_QAProcessor(qa_test_data, tmp):
def test_AnswerExtractor(qa_test_data, tmp):
    # test bert
    qa_extractor_bert = AnswerExtractor(cache_dir=tmp)
    qa_extractor_bert.fit(
        qa_test_data["train_features_bert"], cache_model=True, per_gpu_batch_size=8
    )
    qa_extractor_bert.fit(qa_test_data["train_features_bert"], cache_model=True)

    # test saving fine-tuned model
    model_output_dir = os.path.join(tmp, "fine_tuned")
@@ -170,15 +185,11 @@ def test_AnswerExtractor(qa_test_data, tmp):
    qa_extractor_from_cache.predict(qa_test_data["test_features_bert"])

    qa_extractor_xlnet = AnswerExtractor(model_name="xlnet-base-cased", cache_dir=tmp)
    qa_extractor_xlnet.fit(
        qa_test_data["train_features_xlnet"], cache_model=False, per_gpu_batch_size=8
    )
    qa_extractor_xlnet.fit(qa_test_data["train_features_xlnet"], cache_model=False)
    qa_extractor_xlnet.predict(qa_test_data["test_features_xlnet"])

    qa_extractor_distilbert = AnswerExtractor(model_name="distilbert-base-uncased", cache_dir=tmp)
    qa_extractor_distilbert.fit(
        qa_test_data["train_features_distilbert"], cache_model=False, per_gpu_batch_size=8
    )
    qa_extractor_distilbert.fit(qa_test_data["train_features_distilbert"], cache_model=False)
    qa_extractor_distilbert.predict(qa_test_data["test_features_distilbert"])
@@ -26,7 +26,8 @@ import math
import jsonlines

import torch
from torch.utils.data import TensorDataset, SequentialSampler, DataLoader
from torch.utils.data import TensorDataset, SequentialSampler, DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler

from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForQuestionAnswering
@@ -40,11 +41,7 @@ from transformers.modeling_distilbert import (
)

from utils_nlp.common.pytorch_utils import get_device
from utils_nlp.models.transformers.common import (
    MAX_SEQ_LEN,
    TOKENIZER_CLASS,
    Transformer,
)
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer

MODEL_CLASS = {}
MODEL_CLASS.update({k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
@@ -146,6 +143,9 @@ class QAProcessor:
        self,
        qa_dataset,
        is_training,
        batch_size=32,
        num_gpus=None,
        distributed=False,
        max_question_length=64,
        max_seq_length=MAX_SEQ_LEN,
        doc_stride=128,
@@ -243,37 +243,42 @@ class QAProcessor:
        examples_writer.write_all(qa_examples_json)
        features_writer.write_all(features_json)

        # TODO: maybe generalize the following code
        input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

        if is_training:
            start_positions = torch.tensor(
                [f.start_position for f in features], dtype=torch.long
            )
            end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            qa_dataset = TensorDataset(
                input_ids,
                input_mask,
                segment_ids,
                start_positions,
                end_positions,
                cls_index,
                p_mask,
            )
        else:
            unique_id_all = torch.tensor(unique_id_all, dtype=torch.long)
            qa_dataset = TensorDataset(
                input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all
            )

        logger.info("QA examples are saved to {}".format(examples_file))
        logger.info("QA features are saved to {}".format(features_file))

        return qa_dataset
        # TODO: maybe generalize the following code
        input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

        if is_training:
            start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            qa_dataset = TensorDataset(
                input_ids,
                input_mask,
                segment_ids,
                start_positions,
                end_positions,
                cls_index,
                p_mask,
            )
        else:
            unique_id_all = torch.tensor(unique_id_all, dtype=torch.long)
            qa_dataset = TensorDataset(
                input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all
            )

        if num_gpus is not None:
            batch_size = batch_size * max(1, num_gpus)
        if distributed:
            sampler = DistributedSampler(qa_dataset)
        else:
            sampler = RandomSampler(qa_dataset) if is_training else SequentialSampler(qa_dataset)

        return DataLoader(qa_dataset, sampler=sampler, batch_size=batch_size)

    def postprocess(
        self,
@@ -469,9 +474,8 @@ class AnswerExtractor(Transformer):

    def fit(
        self,
        train_dataset,
        train_dataloader,
        num_gpus=None,
        per_gpu_batch_size=8,
        num_epochs=1,
        learning_rate=5e-5,
        max_grad_norm=1.0,
@@ -491,12 +495,10 @@ class AnswerExtractor(Transformer):
        Fine-tune pre-trained transformer models for question answering.

        Args:
            train_dataset (QADataset): Training dataset of type
                :class:`utils_nlp.dataset.pytorch.QADataset`.
            train_dataloader (DataLoader): DataLoader for the training data.
            num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
                be used. If set to 0 or GPUs are not available, CPU device will
                be used. Defaults to None.
            per_gpu_batch_size (int, optional): Training batch size on each GPU. Defaults to 8.
            num_epochs (int, optional): Number of training epochs. Defaults to 1.
            learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
                5e-5.
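Downstream, the reworked dataloader-based API is exercised roughly as follows; a sketch assembled from the notebook changes in this commit, with illustrative hyperparameter values:

    # Sketch: fine-tune and predict with the dataloader-based API.
    qa_extractor = AnswerExtractor(model_name="bert-large-cased", cache_dir=CACHE_DIR)
    qa_extractor.fit(
        train_dataloader,
        num_epochs=2,        # illustrative
        learning_rate=3e-5,  # illustrative
        gradient_accumulation_steps=1,
        cache_model=True,
    )
    qa_results = qa_extractor.predict(dev_dataloader)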
@@ -530,14 +532,13 @@ class AnswerExtractor(Transformer):

        self.model.to(device)
        super().fine_tune(
            train_dataset=train_dataset,
            train_dataloader=train_dataloader,
            get_inputs=QAProcessor.get_inputs,
            device=device,
            max_steps=max_steps,
            num_train_epochs=num_epochs,
            max_grad_norm=max_grad_norm,
            gradient_accumulation_steps=gradient_accumulation_steps,
            per_gpu_train_batch_size=per_gpu_batch_size,
            n_gpu=num_gpus,
            weight_decay=weight_decay,
            learning_rate=learning_rate,
@@ -552,22 +553,13 @@ class AnswerExtractor(Transformer):
        if cache_model:
            self.save_model()

    def predict(
        self,
        test_dataset,
        per_gpu_batch_size=16,
        num_gpus=None,
        local_rank=-1,
        verbose=True,
    ):
    def predict(self, test_dataloader, num_gpus=None, local_rank=-1, verbose=True):

        """
        Predicts answer start and end logits.

        Args:
            test_dataset (QADataset): Testing dataset of type
                :class:`utils_nlp.dataset.pytorch.QADataset`.
            per_gpu_batch_size (int, optional): Testing batch size on each GPU. Defaults to 16.
            test_dataloader (DataLoader): DataLoader for the testing data.
            num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
                be used. If set to 0 or GPUs are not available, CPU device will
                be used. Defaults to None.
@@ -583,16 +575,12 @@ class AnswerExtractor(Transformer):
            return tensor.detach().cpu().tolist()

        device, num_gpus = get_device(num_gpus=num_gpus, local_rank=local_rank)
        batch_size = per_gpu_batch_size * max(1, num_gpus)

        self.model.to(device)

        # score
        self.model.eval()

        sampler = SequentialSampler(test_dataset)
        test_dataloader = DataLoader(test_dataset, sampler=sampler, batch_size=batch_size)

        all_results = []
        for batch in tqdm(test_dataloader, desc="Evaluating", disable=not verbose):
            batch = tuple(t.to(device) for t in batch)