Merge pull request #586 from microsoft/bleik/add-models

update utils and examples
This commit is contained in:
Said Bleik 2020-05-13 00:20:56 -04:00 committed by GitHub
Parents 30afaa4619 2e92e319b3
Commit e02e3b5525
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 807 additions and 381 deletions

View file

@ -15,37 +15,6 @@
"# Named Entity Recognition Using Transformer Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Before You Start\n",
"\n",
"The running time shown in this notebook is on a Standard_NC6 Azure Deep Learning Virtual Machine with 1 NVIDIA Tesla K80 GPU. \n",
"> **Tip**: If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. \n",
"\n",
"The table below provides some reference running time on different machine configurations. \n",
"\n",
"|QUICK_RUN|Machine Configurations|Running time|\n",
"|:---------|:----------------------|:------------|\n",
"|True|4 **CPU**s, 14GB memory| ~ 2 minutes|\n",
"|False|4 **CPU**s, 14GB memory| ~1.5 hours|\n",
"|True|1 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 1 minute|\n",
"|False|1 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 7 minutes |\n",
"\n",
"If you run into CUDA out-of-memory error or the jupyter kernel dies constantly, try reducing the `BATCH_SIZE` and `MAX_SEQ_LENGTH`, but note that model performance will be compromised. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\n",
"QUICK_RUN = False"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -61,29 +30,30 @@
"<img src=\"https://nlpbp.blob.core.windows.net/images/bert_architecture.png\">"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preparation"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"import random\n",
"import string\n",
"import sys\n",
"from tempfile import TemporaryDirectory\n",
"\n",
"import pandas as pd\n",
"import scrapbook as sb\n",
"import torch\n",
"\n",
"from tempfile import TemporaryDirectory\n",
"from utils_nlp.dataset import wikigold\n",
"from utils_nlp.common.timer import Timer\n",
"from seqeval.metrics import classification_report\n",
"from utils_nlp.models.transformers.named_entity_recognition import TokenClassifier"
"from sklearn.model_selection import train_test_split\n",
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
"from utils_nlp.common.timer import Timer\n",
"from utils_nlp.dataset import wikigold\n",
"from utils_nlp.dataset.ner_utils import read_conll_file\n",
"from utils_nlp.dataset.url_utils import maybe_download\n",
"from utils_nlp.models.transformers.named_entity_recognition import (\n",
" TokenClassificationProcessor, TokenClassifier)\n"
]
},
{
@ -93,9 +63,38 @@
"## Configuration"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The running time shown in this notebook is on a Standard_NC12 Azure Virtual Machine with 2 NVIDIA Tesla K80 GPUs. \n",
"> **Tip**: If you want to run through the notebook quickly, you can set the **`QUICK_RUN`** flag in the cell below to **`True`** to run the notebook on a small subset of the data and a smaller number of epochs. \n",
"\n",
"The table below provides some reference running time on different machine configurations. \n",
"\n",
"|QUICK_RUN|Machine Configurations|Running time|\n",
"|:---------|:----------------------|:------------|\n",
"|True|4 CPUs, 14GB memory| ~ 2 minutes|\n",
"|False|4 CPUs, 14GB memory| ~1.5 hours|\n",
"|True|1 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 1 minute|\n",
"|False|1 NVIDIA Tesla K80 GPUs, 12GB GPU memory| ~ 7 minutes |\n",
"\n",
"If you run into CUDA out-of-memory error or the jupyter kernel dies constantly, try reducing the `BATCH_SIZE` and `MAX_SEQ_LENGTH`, but note that model performance will be compromised. "
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Set QUICK_RUN = True to run the notebook on a small subset of data and a smaller number of epochs.\n",
"QUICK_RUN = False"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"tags": [
"parameters"
@ -103,22 +102,17 @@
},
"outputs": [],
"source": [
"# Wikigold dataset\n",
"DATA_URL = (\n",
" \"https://raw.githubusercontent.com/juand-r/entity-recognition-datasets\"\n",
" \"/master/data/wikigold/CONLL-format/data/wikigold.conll.txt\"\n",
")\n",
"\n",
"# fraction of the dataset used for testing\n",
"TEST_DATA_FRACTION = 0.3\n",
"\n",
"# sub-sampling ratio for training\n",
"TRAIN_SAMPLE_RATIO = 1\n",
"\n",
"# sub-sampling ratio for testing\n",
"TEST_SAMPLE_RATIO = 1\n",
"\n",
"NUM_TRAIN_EPOCHS = 5\n",
"\n",
"# update variables for quick run option\n",
"if QUICK_RUN:\n",
" TRAIN_SAMPLE_RATIO = 0.1\n",
" TEST_SAMPLE_RATIO = 0.1\n",
" NUM_TRAIN_EPOCHS = 1\n",
"# sub-sampling ratio\n",
"SAMPLE_RATIO = 1\n",
"\n",
"# the data path used to save the downloaded data file\n",
"DATA_PATH = TemporaryDirectory().name\n",
@ -131,16 +125,18 @@
"torch.manual_seed(RANDOM_SEED)\n",
"\n",
"# model configurations\n",
"NUM_TRAIN_EPOCHS = 5\n",
"MODEL_NAME = \"bert-base-cased\"\n",
"DO_LOWER_CASE = False\n",
"MAX_SEQ_LENGTH = 200\n",
"TRAILING_PIECE_TAG = \"X\"\n",
"DEVICE = \"cuda\"\n",
"\n",
"if torch.cuda.is_available():\n",
"NUM_GPUS = None # uses all if available\n",
"BATCH_SIZE = 16\n",
"else:\n",
" BATCH_SIZE = 8\n"
"\n",
"# update variables for quick run option\n",
"if QUICK_RUN:\n",
" SAMPLE_RATIO = 0.1\n",
" NUM_TRAIN_EPOCHS = 1"
]
},
{
@ -151,45 +147,275 @@
"\n",
"The dataset used in this notebook is the [wikigold dataset](https://www.aclweb.org/anthology/W09-3302). The wikigold dataset consists of 145 mannually labelled Wikipedia articles, including 1841 sentences and 40k tokens in total. The dataset can be directly downloaded from [here](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold). \n",
"\n",
"A helper function `load_dataset` downloads the raw wikigold data, splits it into training and testing datasets (also sub-sampling if the sampling ratio is smaller than 1.0), and then process for the transformer model. Everything is done in one function call, and you can use the processed training and testing Pytorch datasets to fine tune the model and evaluate the performance of the model."
"In the following cell, we download the data file, parse the tokens and labels, sample a given number of sentences, and split the dataset for training and testing."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Maximum sequence length is: 144\n"
]
}
],
"source": [
"train_dataloader, test_dataloader, label_map, test_dataset = wikigold.load_dataset(\n",
" local_path=DATA_PATH,\n",
" test_fraction=TEST_DATA_FRACTION,\n",
" random_seed=RANDOM_SEED,\n",
" train_sample_ratio=TRAIN_SAMPLE_RATIO,\n",
" test_sample_ratio=TEST_SAMPLE_RATIO,\n",
" model_name=MODEL_NAME,\n",
" to_lower=DO_LOWER_CASE,\n",
" cache_dir=CACHE_DIR,\n",
" max_len=MAX_SEQ_LENGTH,\n",
" trailing_piece_tag=TRAILING_PIECE_TAG,\n",
" batch_size=BATCH_SIZE,\n",
" num_gpus=None\n",
"# download data\n",
"file_name = DATA_URL.split(\"/\")[-1] # a name for the downloaded file\n",
"maybe_download(DATA_URL, file_name, DATA_PATH)\n",
"data_file = os.path.join(DATA_PATH, file_name)\n",
"\n",
"# parse CoNll file\n",
"sentence_list, labels_list = read_conll_file(data_file, sep=\" \")\n",
"\n",
"# sub-sample (optional)\n",
"random.seed(RANDOM_SEED)\n",
"sample_size = int(SAMPLE_RATIO * len(sentence_list))\n",
"sentence_list, labels_list = list(\n",
" zip(*random.sample(list(zip(sentence_list, labels_list)), k=sample_size))\n",
")\n",
"\n",
"# train-test split\n",
"train_sentence_list, test_sentence_list, train_labels_list, test_labels_list = train_test_split(\n",
" sentence_list, labels_list, test_size=TEST_DATA_FRACTION, random_state=RANDOM_SEED\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following is an example input sentence of the training set."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>token</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>In</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1999</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>,</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>the</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Caloi</td>\n",
" <td>I-PER</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>family</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>sold</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>the</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>majority</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>of</td>\n",
" <td>O</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Caloi</td>\n",
" <td>I-ORG</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" token label\n",
"0 In O\n",
"1 1999 O\n",
"2 , O\n",
"3 the O\n",
"4 Caloi I-PER\n",
"5 family O\n",
"6 sold O\n",
"7 the O\n",
"8 majority O\n",
"9 of O\n",
"10 Caloi I-ORG"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame({\"token\": train_sentence_list[0], \"label\": train_labels_list[0]}).head(11)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> If your data is unlabeled, try using an annotation tool to simplify the process of labeling. The example [here](../annotation/Doccano.md) introduces [Doccanno](https://github.com/chakki-works/doccano) and shows how it can be used for NER annotation."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create PyTorch Datasets and Dataloaders\n",
"Given the tokenized input and corresponding labels, we use a custom processer to convert our input lists into a PyTorch dataset that can be used with our token classifier. Next, we create PyTorch dataloaders for training and testing."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:Token lists with length > 512 will be truncated\n",
"WARNING:root:Token lists with length > 512 will be truncated\n"
]
}
],
"source": [
"processor = TokenClassificationProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE, cache_dir=CACHE_DIR)\n",
"\n",
"label_map = TokenClassificationProcessor.create_label_map(\n",
" label_lists=labels_list, trailing_piece_tag=TRAILING_PIECE_TAG\n",
")\n",
"\n",
"train_dataset = processor.preprocess(\n",
" text=train_sentence_list,\n",
" max_len=MAX_SEQ_LENGTH,\n",
" labels=train_labels_list,\n",
" label_map=label_map,\n",
" trailing_piece_tag=TRAILING_PIECE_TAG,\n",
")\n",
"train_dataloader = dataloader_from_dataset(\n",
" train_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True, distributed=False\n",
")\n",
"\n",
"test_dataset = processor.preprocess(\n",
" text=test_sentence_list,\n",
" max_len=MAX_SEQ_LENGTH,\n",
" labels=test_labels_list,\n",
" label_map=label_map,\n",
" trailing_piece_tag=TRAILING_PIECE_TAG,\n",
")\n",
"test_dataloader = dataloader_from_dataset(\n",
" test_dataset, batch_size=BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False, distributed=False\n",
")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train Model\n",
"\n",
"There are two steps to train a NER model using pretrained transformer model: 1). instantiate a TokenClassifier class which is a wrapper of the transformer using BERT architecture, and 2), fit the model using the preprocessed training dataset. The member method `fit` of TokenClassifier class is used to fine tune the model."
"There are two steps to train a NER model using pretrained transformer model: 1) Instantiate a TokenClassifier class which is a wrapper of a transformer-based network, and 2) Fit the model using the preprocessed training dataloader. The member method `fit` of TokenClassifier class is used to fine-tune the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d7c19dfe849b4bb3b195e792b0ccc809",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, description='Downloading', max=435779157, style=ProgressStyle(description_…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/media/bleik2/backup/miniconda3/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/parallel/_functions.py:61: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
" warnings.warn('Was asked to gather along dimension 0, but all '\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training time : 0.075 hrs\n"
]
}
],
"source": [
"# Instantiate a TokenClassifier class for NER using pretrained transformer model\n",
"model = TokenClassifier(\n",
@ -203,13 +429,13 @@
" model.fit(\n",
" train_dataloader=train_dataloader,\n",
" num_epochs=NUM_TRAIN_EPOCHS,\n",
" num_gpus=None,\n",
" num_gpus=NUM_GPUS,\n",
" local_rank=-1,\n",
" weight_decay=0.0,\n",
" learning_rate=5e-5,\n",
" adam_epsilon=1e-8,\n",
" warmup_steps=0,\n",
" verbose=True,\n",
" verbose=False,\n",
" seed=RANDOM_SEED\n",
" )\n",
"\n",
@ -227,9 +453,31 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Scoring: 100%|██████████| 18/18 [00:08<00:00, 2.49it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Prediction time : 0.002 hrs\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"with Timer() as t:\n",
" preds = model.predict(\n",
@ -245,12 +493,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the true token labels of the testing dataset. "
"Get the true token labels of the testing dataset:"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@ -266,9 +514,26 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" MISC 0.68 0.67 0.68 221\n",
" LOC 0.79 0.85 0.82 317\n",
" ORG 0.73 0.81 0.76 274\n",
" PER 0.92 0.93 0.92 257\n",
"\n",
"micro avg 0.78 0.82 0.80 1069\n",
"macro avg 0.78 0.82 0.80 1069\n",
"\n"
]
}
],
"source": [
"predicted_labels = model.get_predicted_token_labels(\n",
" predictions=preds,\n",
@ -284,6 +549,94 @@
"print(report)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Score Example Sentences\n",
"Finally, we test the model on some random input sentences."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:Token lists with length > 512 will be truncated\n",
"Scoring: 100%|██████████| 1/1 [00:00<00:00, 7.56it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
" Is it true that Jane works at Microsoft?\n",
" tokens labels\n",
"0 Is O\n",
"1 it O\n",
"2 true O\n",
"3 that O\n",
"4 Jane I-PER\n",
"5 works O\n",
"6 at O\n",
"7 Microsoft? I-ORG\n",
"\n",
" Joe now lives in Copenhagen.\n",
" tokens labels\n",
"0 Joe I-PER\n",
"1 now O\n",
"2 lives O\n",
"3 in O\n",
"4 Copenhagen. I-LOC\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# test\n",
"sample_text = [ \n",
" \"Is it true that Jane works at Microsoft?\",\n",
" \"Joe now lives in Copenhagen.\"\n",
"]\n",
"sample_tokens = [x.split() for x in sample_text]\n",
"\n",
"sample_dataset = processor.preprocess(\n",
" text=sample_tokens,\n",
" max_len=MAX_SEQ_LENGTH,\n",
" labels=None,\n",
" label_map=label_map,\n",
" trailing_piece_tag=TRAILING_PIECE_TAG,\n",
")\n",
"sample_dataloader = dataloader_from_dataset(\n",
" sample_dataset, batch_size=BATCH_SIZE, num_gpus=None, shuffle=False, distributed=False\n",
")\n",
"preds = model.predict(\n",
" test_dataloader=sample_dataloader,\n",
" num_gpus=None,\n",
" verbose=True\n",
")\n",
"predicted_labels = model.get_predicted_token_labels(\n",
" predictions=preds,\n",
" label_map=label_map,\n",
" dataset=sample_dataset\n",
")\n",
"\n",
"for i in range(len(sample_text)):\n",
" print(\"\\n\", sample_text[i])\n",
" print(pd.DataFrame({\"tokens\": sample_tokens[i] , \"labels\":predicted_labels[i]})) "
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -293,9 +646,64 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.78,
"encoder": "json",
"name": "precision",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "precision"
}
},
"output_type": "display_data"
},
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.82,
"encoder": "json",
"name": "recall",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "recall"
}
},
"output_type": "display_data"
},
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": 0.8,
"encoder": "json",
"name": "f1",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "f1"
}
},
"output_type": "display_data"
}
],
"source": [
"report_splits = report.split('\\n')[-2].split()\n",
"\n",
@ -306,11 +714,10 @@
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3.6 - AzureML",
"display_name": "nlp_gpu",
"language": "python",
"name": "python3-azureml"
"name": "nlp_gpu"
},
"language_info": {
"codemirror_mode": {
@ -326,5 +733,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

View file

@ -112,7 +112,7 @@ def test_wikigold(tmp_path):
def test_ner_utils(ner_utils_test_data):
output = preprocess_conll(ner_utils_test_data["input"])
output = preprocess_conll(ner_utils_test_data["input"], sep=" ")
assert output == ner_utils_test_data["expected_output"]

View file

@ -85,7 +85,9 @@ def qa_test_data(qa_test_df, tmp_module):
)
# xlnet
qa_processor_xlnet = QAProcessor(model_name="xlnet-base-cased", cache_dir=tmp_module)
qa_processor_xlnet = QAProcessor(
model_name="xlnet-base-cased", cache_dir=tmp_module
)
train_features_xlnet = qa_processor_xlnet.preprocess(
train_dataset,
is_training=True,
@ -148,13 +150,19 @@ def test_QAProcessor(qa_test_data, tmp_module):
]:
qa_processor = QAProcessor(model_name=model_name, cache_dir=tmp_module)
qa_processor.preprocess(
qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module,
qa_test_data["train_dataset"],
is_training=True,
feature_cache_dir=tmp_module,
)
qa_processor.preprocess(
qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module,
qa_test_data["train_dataset_list"],
is_training=True,
feature_cache_dir=tmp_module,
)
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module,
qa_test_data["test_dataset"],
is_training=False,
feature_cache_dir=tmp_module,
)
# test unsupported model type
@ -188,7 +196,9 @@ def test_AnswerExtractor(qa_test_data, tmp_module):
# bert
qa_extractor_bert = AnswerExtractor(cache_dir=tmp_module)
train_loader_bert = dataloader_from_dataset(qa_test_data["train_features_bert"])
test_loader_bert = dataloader_from_dataset(qa_test_data["test_features_bert"], shuffle=False)
test_loader_bert = dataloader_from_dataset(
qa_test_data["test_features_bert"], shuffle=False
)
qa_extractor_bert.fit(train_loader_bert, verbose=False, cache_model=True)
# test saving fine-tuned model
@ -203,13 +213,19 @@ def test_AnswerExtractor(qa_test_data, tmp_module):
# xlnet
train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_xlnet"])
test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_xlnet"], shuffle=False)
qa_extractor_xlnet = AnswerExtractor(model_name="xlnet-base-cased", cache_dir=tmp_module)
test_loader_xlnet = dataloader_from_dataset(
qa_test_data["test_features_xlnet"], shuffle=False
)
qa_extractor_xlnet = AnswerExtractor(
model_name="xlnet-base-cased", cache_dir=tmp_module
)
qa_extractor_xlnet.fit(train_loader_xlnet, verbose=False, cache_model=False)
qa_extractor_xlnet.predict(test_loader_xlnet, verbose=False)
# distilbert
train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_distilbert"])
train_loader_xlnet = dataloader_from_dataset(
qa_test_data["train_features_distilbert"]
)
test_loader_xlnet = dataloader_from_dataset(
qa_test_data["test_features_distilbert"], shuffle=False
)

View file

@ -23,7 +23,7 @@ def test_token_classifier_fit_predict(tmpdir, ner_test_data):
)
# test fit, no warmup
train_dataset = processor.preprocess_for_bert(
train_dataset = processor.preprocess(
text=ner_test_data["INPUT_TEXT"],
max_len=max_seq_len,
labels=ner_test_data["INPUT_LABELS"],

View file

@ -81,7 +81,7 @@ PIP_BASE = {
"https://github.com/explosion/spacy-models/releases/download/"
"en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz"
),
"transformers": "transformers==2.5.0",
"transformers": "transformers==2.9.0",
"gensim": "gensim>=3.7.0",
"nltk": "nltk>=3.4",
"seqeval": "seqeval>=0.0.12",

View file

@ -4,10 +4,9 @@
"""Common helper functions for preprocessing Named Entity Recognition (NER) datasets."""
def preprocess_conll(text, data_type=""):
def preprocess_conll(text, sep="\t"):
"""
Helper function converting data in conll format to word lists
and token label lists.
Converts data in CoNLL format to word and label lists.
Args:
text (str): Text string in conll format, e.g.
@ -20,8 +19,8 @@ def preprocess_conll(text, data_type=""):
of I-ORG
Minnesota I-ORG
. O"
data_type (str, optional): String that briefly describes the data,
e.g. "train"
sep (str, optional): Column separator
Defaults to \t
Returns:
tuple:
(list of word lists, list of token label lists)
@ -37,11 +36,29 @@ def preprocess_conll(text, data_type=""):
# split each sentence string into "word label" pairs
s_split = s.split("\n")
# split "word label" pairs
s_split_split = [t.split() for t in s_split]
s_split_split = [t.split(sep) for t in s_split]
sentence_list.append([t[0] for t in s_split_split if len(t) > 1])
labels_list.append([t[1] for t in s_split_split if len(t) > 1])
if len(s_split_split) > max_seq_len:
max_seq_len = len(s_split_split)
print("Maximum sequence length in the {0} data is: {1}".format(data_type, max_seq_len))
print("Maximum sequence length is: {0}".format(max_seq_len))
return sentence_list, labels_list
def read_conll_file(file_path, sep="\t", encoding=None):
"""
Reads a data file in CoNLL format and returns word and label lists.
Args:
file_path (str): Data file path.
sep (str, optional): Column separator. Defaults to "\t".
encoding (str): File encoding used when reading the file.
Defaults to None.
Returns:
(list, list): A tuple of word and label lists (list of lists).
"""
with open(file_path, encoding=encoding) as f:
data = f.read()
return preprocess_conll(data, sep=sep)
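
A minimal usage sketch of the new `read_conll_file` helper, assuming a local copy of the space-separated wikigold file (the file path is illustrative):

# Usage sketch for read_conll_file (hypothetical local file path;
# wikigold uses a single space as the column separator).
from utils_nlp.dataset.ner_utils import read_conll_file

sentences, labels = read_conll_file("wikigold.conll.txt", sep=" ")
print(sentences[0][:5])  # first five tokens of the first sentence
print(labels[0][:5])     # their corresponding NER tags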

View file

@ -18,7 +18,9 @@ from utils_nlp.common.pytorch_utils import dataloader_from_dataset
from utils_nlp.dataset.ner_utils import preprocess_conll
from utils_nlp.dataset.url_utils import maybe_download
from utils_nlp.models.transformers.common import MAX_SEQ_LEN
from utils_nlp.models.transformers.named_entity_recognition import TokenClassificationProcessor
from utils_nlp.models.transformers.named_entity_recognition import (
TokenClassificationProcessor,
)
URL = (
"https://raw.githubusercontent.com/juand-r/entity-recognition-datasets"
@ -68,7 +70,9 @@ def load_train_test_dfs(local_cache_path="./", test_fraction=0.5, random_seed=No
train_sentence_list = sentence_list[test_sentence_count:]
train_labels_list = labels_list[test_sentence_count:]
train_df = pd.DataFrame({"sentence": train_sentence_list, "labels": train_labels_list})
train_df = pd.DataFrame(
{"sentence": train_sentence_list, "labels": train_labels_list}
)
test_df = pd.DataFrame({"sentence": test_sentence_list, "labels": test_labels_list})
@ -152,7 +156,9 @@ def load_dataset(
"""
train_df, test_df = load_train_test_dfs(
local_cache_path=local_path, test_fraction=test_fraction, random_seed=random_seed
local_cache_path=local_path,
test_fraction=test_fraction,
random_seed=random_seed,
)
if train_sample_ratio > 1.0:
@ -160,7 +166,9 @@ def load_dataset(
logging.warning("Setting the training sample ratio to 1.0")
elif train_sample_ratio < 0:
logging.error("Invalid training sample ration: {}".format(train_sample_ratio))
raise ValueError("Invalid training sample ration: {}".format(train_sample_ratio))
raise ValueError(
"Invalid training sample ration: {}".format(train_sample_ratio)
)
if test_sample_ratio > 1.0:
test_sample_ratio = 1.0
@ -174,7 +182,9 @@ def load_dataset(
if test_sample_ratio < 1.0:
test_df = test_df.sample(frac=test_sample_ratio).reset_index(drop=True)
processor = TokenClassificationProcessor(model_name=model_name, to_lower=to_lower, cache_dir=cache_dir)
processor = TokenClassificationProcessor(
model_name=model_name, to_lower=to_lower, cache_dir=cache_dir
)
label_map = TokenClassificationProcessor.create_label_map(
label_lists=train_df["labels"], trailing_piece_tag=trailing_piece_tag
@ -197,11 +207,19 @@ def load_dataset(
)
train_dataloader = dataloader_from_dataset(
train_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=True, distributed=False
train_dataset,
batch_size=batch_size,
num_gpus=num_gpus,
shuffle=True,
distributed=False,
)
test_dataloader = dataloader_from_dataset(
test_dataset, batch_size=batch_size, num_gpus=num_gpus, shuffle=False, distributed=False
test_dataset,
batch_size=batch_size,
num_gpus=num_gpus,
shuffle=False,
distributed=False,
)
return (train_dataloader, test_dataloader, label_map, test_dataset)
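
For reference, a sketch of the one-call `load_dataset` path that the notebook used before this change; the argument values mirror the removed notebook cell and are illustrative:

# Sketch: end-to-end data loading via wikigold.load_dataset
# (download, split, sub-sample, preprocess, and build dataloaders).
from tempfile import TemporaryDirectory

from utils_nlp.dataset import wikigold

train_dataloader, test_dataloader, label_map, test_dataset = wikigold.load_dataset(
    local_path=TemporaryDirectory().name,
    test_fraction=0.3,
    random_seed=42,
    train_sample_ratio=1.0,
    test_sample_ratio=1.0,
    model_name="bert-base-cased",
    to_lower=False,
    cache_dir=TemporaryDirectory().name,
    max_len=200,
    trailing_piece_tag="X",
    batch_size=16,
    num_gpus=None,  # use all available GPUs
)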

View file

@ -5,34 +5,29 @@
# This script reuses some code from https://github.com/huggingface/transformers/
# Add to noticefile
from collections import namedtuple
import logging
import os
import pickle
from tqdm import tqdm
from collections import namedtuple
import torch
from torch.utils.data import (
DataLoader,
SequentialSampler,
RandomSampler,
)
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers import BertModel
from tqdm import tqdm
from transformers import AutoTokenizer, BertModel
from utils_nlp.common.pytorch_utils import (
compute_training_steps,
get_device,
get_amp,
get_device,
move_model_to_device,
parallelize_model,
)
from utils_nlp.eval import compute_rouge_python
from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
from utils_nlp.models.transformers.bertsum import model_builder
from utils_nlp.models.transformers.bertsum.model_builder import AbsSummarizer
from utils_nlp.models.transformers.bertsum.predictor import build_predictor
from utils_nlp.models.transformers.common import Transformer
MODEL_CLASS = {"bert-base-uncased": BertModel}
@ -134,8 +129,11 @@ class BertSumAbsProcessor:
"""
self.model_name = model_name
self.tokenizer = TOKENIZER_CLASS[self.model_name].from_pretrained(
self.model_name, do_lower_case=to_lower, cache_dir=cache_dir
self.tokenizer = AutoTokenizer.from_pretrained(
model_name,
do_lower_case=to_lower,
cache_dir=cache_dir,
output_loading_info=False,
)
self.symbols = {
@ -156,7 +154,7 @@ class BertSumAbsProcessor:
@staticmethod
def list_supported_models():
return list(MODEL_CLASS.keys())
return list(MODEL_CLASS)
@property
def model_name(self):
@ -184,7 +182,7 @@ class BertSumAbsProcessor:
also contains the target ids and the number of tokens
in the target and target text.
device (torch.device): A PyTorch device.
model_name (bool, optional): Model name used to format the inputs.
model_name (str): Model name used to format the inputs.
train_mode (bool, optional): Training mode flag.
Defaults to True.
@ -403,7 +401,8 @@ class BertSumAbs(Transformer):
check MODEL_CLASS for supported models. Defaults to "bert-base-uncased".
finetune_bert (bool, optional): Whether the BERT model in the encoder is
fine-tuned or not. Defaults to True.
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
cache_dir (str, optional): Directory to cache the tokenizer.
Defaults to ".".
label_smoothing (float, optional): The amount of label smoothing.
Value range is [0, 1]. Defaults to 0.1.
test (bool, optional): Whether the class is instantiated for testing or not.
@ -412,13 +411,11 @@ class BertSumAbs(Transformer):
max_pos_length (int, optional): maximum positional embedding length for the
input. Defaults to 768.
"""
super().__init__(
model_class=MODEL_CLASS,
model_name=model_name,
num_labels=0,
cache_dir=cache_dir,
model = MODEL_CLASS[model_name].from_pretrained(
model_name, cache_dir=cache_dir, num_labels=0, output_loading_info=False
)
super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)
if model_name not in self.list_supported_models():
raise ValueError(
"Model name {} is not supported by BertSumAbs. "
@ -616,10 +613,7 @@ class BertSumAbs(Transformer):
)
train_dataloader = DataLoader(
train_dataset,
sampler=sampler,
batch_size=batch_size,
collate_fn=collate_fn,
train_dataset, sampler=sampler, batch_size=batch_size, collate_fn=collate_fn
)
# compute the max number of training steps

View file

@ -13,7 +13,7 @@ from torch.utils.data.distributed import DistributedSampler
from transformers import RobertaConfig, BertConfig
from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
from utils_nlp.models.transformers.common import Transformer
from utils_nlp.common.pytorch_utils import (
get_device,
move_model_to_device,
@ -52,8 +52,7 @@ MODEL_CLASS.update(
{k: BertForSequenceToSequence for k in MINILM_PRETRAINED_MODEL_ARCHIVE_MAP}
)
TOKENIZER_CLASS = {}
TOKENIZER_CLASS.update({k: UnilmTokenizer for k in UNILM_PRETRAINED_CONFIG_ARCHIVE_MAP})
TOKENIZER_CLASS.update({k: MinilmTokenizer for k in MINILM_PRETRAINED_CONFIG_ARCHIVE_MAP})

View file

@ -14,32 +14,14 @@ import numpy as np
import torch
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
from transformers.tokenization_bert import BertTokenizer
from transformers.tokenization_distilbert import DistilBertTokenizer
from transformers.tokenization_roberta import RobertaTokenizer
from transformers.tokenization_xlnet import XLNetTokenizer
from utils_nlp.common.pytorch_utils import (
get_amp,
get_device,
move_model_to_device,
get_amp,
parallelize_model,
)
TOKENIZER_CLASS = {}
TOKENIZER_CLASS.update({k: BertTokenizer for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
TOKENIZER_CLASS.update(
{k: RobertaTokenizer for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP}
)
TOKENIZER_CLASS.update({k: XLNetTokenizer for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
TOKENIZER_CLASS.update(
{k: DistilBertTokenizer for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MAX_SEQ_LEN = 512
logger = logging.getLogger(__name__)
@ -48,35 +30,14 @@ logger = logging.getLogger(__name__)
class Transformer:
def __init__(
self,
model_class,
model_name="bert-base-cased",
num_labels=2,
cache_dir=".",
load_model_from_dir=None,
model_name,
model,
cache_dir,
):
if model_name not in self.list_supported_models():
raise ValueError(
"Model name {0} is not supported by {1}. "
"Call '{1}.list_supported_models()' to get all supported model "
"names.".format(model_name, self.__class__.__name__)
)
self._model_name = model_name
self._model_type = model_name.split("-")[0]
self.model = model
self.cache_dir = cache_dir
self.load_model_from_dir = load_model_from_dir
if load_model_from_dir is None:
self.model = model_class[model_name].from_pretrained(
model_name,
cache_dir=cache_dir,
num_labels=num_labels,
output_loading_info=False,
)
else:
logger.info("Loading cached model from {}".format(load_model_from_dir))
self.model = model_class[model_name].from_pretrained(
load_model_from_dir, num_labels=num_labels, output_loading_info=False
)
@property
def model_name(self):
@ -241,7 +202,8 @@ class Transformer:
if isinstance(outputs, tuple):
loss = outputs[0]
else:
# Accomondate models based on older versions of Transformers, e.g. UniLM
# Accommodate models based on older versions of Transformers,
# e.g. UniLM
loss = outputs
if num_gpus > 1:
@ -317,7 +279,7 @@ class Transformer:
saved_model_path = os.path.join(
self.cache_dir, f"{self.model_name}_step_{global_step}.pt"
)
self.save_model(global_step, saved_model_path)
self.save_model(saved_model_path)
if validation_function:
validation_log = validation_function(self)
logger.info(validation_log)
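
The constructor change above shifts model construction to the subclasses: each one now builds its pretrained model and passes the instance to `Transformer.__init__`. A sketch of the new contract, where the subclass name and one-entry mapping are illustrative, not part of the repo:

# Sketch of the refactored Transformer contract.
from transformers import BertModel

from utils_nlp.models.transformers.common import Transformer

MODEL_CLASS = {"bert-base-uncased": BertModel}  # illustrative per-module mapping


class SomeTaskModel(Transformer):  # hypothetical subclass
    def __init__(self, model_name="bert-base-uncased", cache_dir="."):
        # The subclass, not the base class, decides how to instantiate
        # the pretrained model.
        model = MODEL_CLASS[model_name].from_pretrained(
            model_name, cache_dir=cache_dir, output_loading_info=False
        )
        super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)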

Просмотреть файл

@ -12,14 +12,20 @@ from multiprocessing import Pool, cpu_count
import numpy as np
import torch
from torch.utils.data import (
DataLoader,
SequentialSampler,
RandomSampler,
)
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from transformers import BertModel, DistilBertModel
from transformers import AutoTokenizer, BertModel, DistilBertModel
from utils_nlp.common.pytorch_utils import (
compute_training_steps,
get_device,
move_model_to_device,
parallelize_model,
)
from utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection
from utils_nlp.models.transformers.abstractive_summarization_bertsum import (
fit_to_block_size,
)
from utils_nlp.models.transformers.bertsum import model_builder
from utils_nlp.models.transformers.bertsum.data_loader import (
@ -32,17 +38,7 @@ from utils_nlp.models.transformers.bertsum.dataset import (
ExtSumProcessedIterableDataset,
)
from utils_nlp.models.transformers.bertsum.model_builder import BertSumExt
from utils_nlp.common.pytorch_utils import (
compute_training_steps,
get_device,
move_model_to_device,
parallelize_model,
)
from utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection
from utils_nlp.models.transformers.common import TOKENIZER_CLASS, Transformer
from utils_nlp.models.transformers.abstractive_summarization_bertsum import (
fit_to_block_size,
)
from utils_nlp.models.transformers.common import Transformer
MODEL_CLASS = {
"bert-base-uncased": BertModel,
@ -302,7 +298,7 @@ def parallel_preprocess(input_data, preprocess, num_pool=-1):
p = Pool(num_pool)
results = p.map(
preprocess, input_data, chunksize=min(1, int(len(input_data) / num_pool)),
preprocess, input_data, chunksize=min(1, int(len(input_data) / num_pool))
)
p.close()
p.join()
@ -347,8 +343,11 @@ class ExtSumProcessor:
"""
self.model_name = model_name
self.tokenizer = TOKENIZER_CLASS[self.model_name].from_pretrained(
self.model_name, do_lower_case=to_lower, cache_dir=cache_dir
self.tokenizer = AutoTokenizer.from_pretrained(
model_name,
do_lower_case=to_lower,
cache_dir=cache_dir,
output_loading_info=False,
)
self.sep_vid = self.tokenizer.vocab["[SEP]"]
self.cls_vid = self.tokenizer.vocab["[CLS]"]
@ -361,7 +360,7 @@ class ExtSumProcessor:
@staticmethod
def list_supported_models():
return list(TOKENIZER_CLASS.keys())
return list(MODEL_CLASS)
@property
def model_name(self):
@ -389,7 +388,7 @@ class ExtSumProcessor:
text. If train_model is True, it also contains the labels and target
text.
device (torch.device): A PyTorch device.
model_name (bool, optional): Model name used to format the inputs.
model_name (str): Model name used to format the inputs.
train_mode (bool, optional): Training mode flag.
Defaults to True.
@ -500,7 +499,6 @@ class ExtSumProcessor:
if len(src) == 0:
raise ValueError("source doesn't have any sentences")
return None
original_src_txt = [" ".join(s) for s in src]
# no filtering for prediction
@ -588,12 +586,11 @@ class ExtractiveSummarizer(Transformer):
Defaults to ".".
"""
super().__init__(
model_class=MODEL_CLASS,
model_name=model_name,
num_labels=0,
cache_dir=cache_dir,
model = MODEL_CLASS[model_name].from_pretrained(
model_name, cache_dir=cache_dir, num_labels=0, output_loading_info=False
)
super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)
if model_name not in self.list_supported_models():
raise ValueError(
"Model name {} is not supported by ExtractiveSummarizer. "
@ -621,7 +618,7 @@ class ExtractiveSummarizer(Transformer):
@staticmethod
def list_supported_models():
return list(MODEL_CLASS.keys())
return list(MODEL_CLASS)
def fit(
self,

View file

@ -7,32 +7,21 @@ from collections import Iterable
import numpy as np
import torch
from torch.utils.data import TensorDataset
from transformers.modeling_bert import (
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
BertForTokenClassification,
)
from transformers.modeling_distilbert import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForTokenClassification,
from transformers import (
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
AutoConfig,
AutoModelForTokenClassification,
AutoTokenizer,
)
from utils_nlp.common.pytorch_utils import compute_training_steps
from utils_nlp.models.transformers.common import (
MAX_SEQ_LEN,
TOKENIZER_CLASS,
Transformer,
)
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, Transformer
TC_MODEL_CLASS = {}
TC_MODEL_CLASS.update(
{k: BertForTokenClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
TC_MODEL_CLASS.update(
{
k: DistilBertForTokenClassification
for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
}
)
supported_models = [
list(x.pretrained_config_archive_map)
for x in MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
]
supported_models = sorted([x for y in supported_models for x in y])
class TokenClassificationProcessor:
@ -52,7 +41,7 @@ class TokenClassificationProcessor:
self.model_name = model_name
self.to_lower = to_lower
self.cache_dir = cache_dir
self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
self.tokenizer = AutoTokenizer.from_pretrained(
model_name,
do_lower_case=to_lower,
cache_dir=cache_dir,
@ -68,7 +57,7 @@ class TokenClassificationProcessor:
batch (tuple): A tuple containing input ids, attention mask,
segment ids, and labels tensors.
device (torch.device): A PyTorch device.
model_name (bool, optional): Model name used to format the inputs.
model_name (str): Model name used to format the inputs.
train_mode (bool, optional): Training mode flag.
Defaults to True.
@ -77,7 +66,7 @@ class TokenClassificationProcessor:
Labels are only returned when train_mode is True.
"""
batch = tuple(t.to(device) for t in batch)
if model_name.split("-")[0] in ["bert", "distilbert"]:
if model_name in supported_models:
if train_mode:
inputs = {
"input_ids": batch[0],
@ -110,17 +99,15 @@ class TokenClassificationProcessor:
dict: A dictionary object to map a label (str) to an ID (int).
"""
label_set = set()
for labels in label_lists:
label_set.update(labels)
unique_labels = sorted(set([x for y in label_lists for x in y]))
label_map = {label: i for i, label in enumerate(unique_labels)}
label_map = {label: i for i, label in enumerate(label_set)}
if trailing_piece_tag not in unique_labels:
label_map[trailing_piece_tag] = len(unique_labels)
if trailing_piece_tag not in label_set:
label_map[trailing_piece_tag] = len(label_set)
return label_map
def preprocess_for_bert(
def preprocess(
self,
text,
max_len=MAX_SEQ_LEN,
@ -187,6 +174,10 @@ class TokenClassificationProcessor:
)
max_len = MAX_SEQ_LEN
logging.warn(
"Token lists with length > {} will be truncated".format(MAX_SEQ_LEN)
)
if not _is_iterable_but_not_string(text):
# The input text must be a non-string Iterable
raise ValueError("Input text must be an iterable and not a string.")
@ -233,11 +224,6 @@ class TokenClassificationProcessor:
new_tokens.append(sub_word)
if len(new_tokens) > max_len:
logging.warn(
"Text after tokenization with length {} has been truncated".format(
len(new_tokens)
)
)
new_tokens = new_tokens[:max_len]
new_labels = new_labels[:max_len]
input_ids = self.tokenizer.convert_tokens_to_ids(new_tokens)
@ -269,16 +255,16 @@ class TokenClassificationProcessor:
if label_available:
td = TensorDataset(
torch.tensor(input_ids_all, dtype=torch.long),
torch.tensor(input_mask_all, dtype=torch.long),
torch.tensor(trailing_token_mask_all, dtype=torch.long),
torch.tensor(label_ids_all, dtype=torch.long),
torch.LongTensor(input_ids_all),
torch.LongTensor(input_mask_all),
torch.LongTensor(trailing_token_mask_all),
torch.LongTensor(label_ids_all),
)
else:
td = TensorDataset(
torch.tensor(input_ids_all, dtype=torch.long),
torch.tensor(input_mask_all, dtype=torch.long),
torch.tensor(trailing_token_mask_all, dtype=torch.long),
torch.LongTensor(input_ids_all),
torch.LongTensor(input_mask_all),
torch.LongTensor(trailing_token_mask_all),
)
return td
@ -297,16 +283,17 @@ class TokenClassifier(Transformer):
"""
def __init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."):
super().__init__(
model_class=TC_MODEL_CLASS,
model_name=model_name,
num_labels=num_labels,
cache_dir=cache_dir,
config = AutoConfig.from_pretrained(
model_name, num_labels=num_labels, cache_dir=cache_dir
)
model = AutoModelForTokenClassification.from_pretrained(
model_name, cache_dir=cache_dir, config=config, output_loading_info=False
)
super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)
@staticmethod
def list_supported_models():
return list(TC_MODEL_CLASS)
return supported_models
def fit(
self,
@ -398,7 +385,9 @@ class TokenClassifier(Transformer):
# init scheduler
scheduler = Transformer.get_default_scheduler(
optimizer=self.optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps
optimizer=self.optimizer,
warmup_steps=warmup_steps,
num_training_steps=max_steps,
)
# fine tune

View file

@ -27,6 +27,8 @@ import jsonlines
import torch
from torch.utils.data import TensorDataset
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers.modeling_albert import (
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
AlbertForQuestionAnswering,
@ -51,19 +53,21 @@ from utils_nlp.common.pytorch_utils import (
move_model_to_device,
parallelize_model,
)
from utils_nlp.models.transformers.common import (
MAX_SEQ_LEN,
TOKENIZER_CLASS,
Transformer,
)
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, Transformer
MODEL_CLASS = {}
MODEL_CLASS.update({k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
{k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update(
{k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update(
{k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update({k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
{k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
# cached files during preprocessing
# these are used in postprocessing to generate the final answer texts
@ -103,11 +107,18 @@ class QAProcessor:
"""
def __init__(
self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir=".",
self,
model_name="bert-base-cased",
to_lower=False,
custom_tokenize=None,
cache_dir=".",
):
self.model_name = model_name
self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False,
self.tokenizer = AutoTokenizer.from_pretrained(
model_name,
do_lower_case=to_lower,
cache_dir=cache_dir,
output_loading_info=False,
)
self.do_lower_case = to_lower
self.custom_tokenize = custom_tokenize
@ -218,7 +229,9 @@ class QAProcessor:
os.makedirs(feature_cache_dir)
if is_training and not qa_dataset.actual_answer_available:
raise Exception("answer_start and answer_text must be provided for training data.")
raise Exception(
"answer_start and answer_text must be provided for training data."
)
if is_training:
examples_file = os.path.join(feature_cache_dir, CACHED_EXAMPLES_TRAIN_FILE)
@ -245,7 +258,10 @@ class QAProcessor:
qa_examples.append(qa_example_cur)
qa_examples_json.append(
{"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens}
{
"qa_id": qa_example_cur.qa_id,
"doc_tokens": qa_example_cur.doc_tokens,
}
)
features_cur = _create_qa_features(
@ -289,8 +305,12 @@ class QAProcessor:
p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.long)
if is_training:
start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
start_positions = torch.tensor(
[f.start_position for f in features], dtype=torch.long
)
end_positions = torch.tensor(
[f.end_position for f in features], dtype=torch.long
)
qa_dataset = TensorDataset(
input_ids,
input_mask,
@ -421,7 +441,9 @@ class QAProcessor:
return final_answers, answer_probs, nbest_answers
QAResult_ = collections.namedtuple("QAResult", ["unique_id", "start_logits", "end_logits"])
QAResult_ = collections.namedtuple(
"QAResult", ["unique_id", "start_logits", "end_logits"]
)
# create a wrapper class so that we can add docstrings
@ -503,15 +525,15 @@ class AnswerExtractor(Transformer):
"""
def __init__(self, model_name="bert-base-cased", cache_dir=".", load_model_from_dir=None):
super().__init__(
model_class=MODEL_CLASS,
model_name=model_name,
num_labels=2,
def __init__(
self, model_name="bert-base-cased", cache_dir=".", load_model_from_dir=None
):
model = MODEL_CLASS[model_name].from_pretrained(
model_name if load_model_from_dir is None else load_model_from_dir,
cache_dir=cache_dir,
load_model_from_dir=load_model_from_dir,
output_loading_info=False,
)
super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)
@staticmethod
def list_supported_models():
@ -613,7 +635,9 @@ class AnswerExtractor(Transformer):
# init scheduler
scheduler = Transformer.get_default_scheduler(
optimizer=self.optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps
optimizer=self.optimizer,
warmup_steps=warmup_steps,
num_training_steps=max_steps,
)
# fine tune
@ -668,13 +692,19 @@ class AnswerExtractor(Transformer):
# parallelize model
self.model = parallelize_model(
model=self.model, device=device, num_gpus=num_gpus, gpu_ids=gpu_ids, local_rank=-1,
model=self.model,
device=device,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
local_rank=-1,
)
all_results = []
for batch in tqdm(test_dataloader, desc="Evaluating", disable=not verbose):
with torch.no_grad():
inputs = QAProcessor.get_inputs(batch, device, self.model_name, train_mode=False)
inputs = QAProcessor.get_inputs(
batch, device, self.model_name, train_mode=False
)
outputs = self.model(**inputs)
unique_id_tensor = batch[5]
@ -865,7 +895,9 @@ def postprocess_bert_answer(
# Sort by the sum of the start and end logits in ascending order,
# so that the first element is the most probable answer
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True,
)
seen_predictions = {}
@ -890,7 +922,9 @@ def postprocess_bert_answer(
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = _get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
final_text = _get_final_text(
tok_text, orig_text, do_lower_case, verbose_logging
)
if final_text in seen_predictions:
continue
@ -901,7 +935,9 @@ def postprocess_bert_answer(
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit,
)
)
# if we didn't include the empty option in the n-best, include it
@ -916,7 +952,9 @@ def postprocess_bert_answer(
# In very rare edge cases we could only have single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
if len(nbest) == 1:
nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
nbest.insert(
0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)
)
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
@ -956,7 +994,9 @@ def postprocess_bert_answer(
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = (
score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
score_null
- best_non_null_entry.start_logit
- (best_non_null_entry.end_logit)
)
scores_diff_json[example["qa_id"]] = score_diff
if score_diff > null_score_diff_threshold:
@ -1129,7 +1169,9 @@ def postprocess_xlnet_answer(
)
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
prelim_predictions,
key=lambda x: (x.start_logit + x.end_logit),
reverse=True,
)
seen_predictions = {}
@ -1172,7 +1214,9 @@ def postprocess_xlnet_answer(
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
text=final_text,
start_logit=pred.start_logit,
end_logit=pred.end_logit,
)
)
@ -1300,7 +1344,9 @@ def _create_qa_example(qa_input, is_training):
if _is_iterable_but_not_string(a_start):
if not _is_iterable_but_not_string(a_text):
raise Exception("The answer text must be a list when answer start is a list.")
raise Exception(
"The answer text must be a list when answer start is a list."
)
if len(a_start) != 1 and is_training and not impossible:
raise Exception("For training, each question should have exactly 1 answer.")
a_start = a_start[0]
@ -1323,7 +1369,9 @@ def _create_qa_example(qa_input, is_training):
cleaned_answer_text = " ".join(whitespace_tokenize(a_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning(
"Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text,
"Could not find answer: '%s' vs. '%s'",
actual_text,
cleaned_answer_text,
)
return
else:
@ -1549,7 +1597,10 @@ def _create_qa_features(
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, example.orig_answer_text,
all_doc_tokens,
tok_start_position,
tok_end_position,
example.orig_answer_text,
)
# The -3 accounts for [CLS], [SEP] and [SEP]
@ -1583,7 +1634,8 @@ def _create_qa_features(
# p_mask: mask with 1 for tokens that cannot be in the answer
# (0 for tokens which can be in an answer)
# Original TF implem also keep the classification token (set to 0) (not sure why...)
# Original TF implementation also keeps the classification token (set to 0)
# (not sure why...)
# TODO: Should we set p_mask = 1 for cls token?
p_mask = []
@ -1612,9 +1664,11 @@ def _create_qa_features(
split_token_index = doc_span.start + i
token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
## TODO: maybe this can be improved to compute
# TODO: maybe this can be improved to compute
# is_max_context for each token only once.
is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index)
is_max_context = _check_is_max_context(
doc_spans, doc_span_index, split_token_index
)
token_is_max_context[len(tokens)] = is_max_context
tokens.append(all_doc_tokens[split_token_index])
if model_type == "xlnet":
@ -1720,10 +1774,13 @@ def _create_qa_features(
# -------------------------------------------------------------------------------------------------
# Post processing helper functions
_PrelimPrediction = collections.namedtuple(
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"],
"PrelimPrediction",
["feature_index", "start_index", "end_index", "start_logit", "end_logit"],
)
_NbestPrediction = collections.namedtuple("NbestPrediction", ["text", "start_logit", "end_logit"])
_NbestPrediction = collections.namedtuple(
"NbestPrediction", ["text", "start_logit", "end_logit"]
)
def _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
@ -1786,7 +1843,9 @@ def _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info(
"Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text,
"Length not equal after stripping spaces: '%s' vs '%s'",
orig_ns_text,
tok_ns_text,
)
return orig_text

View file

@ -2,54 +2,22 @@
# Licensed under the MIT License.
import numpy as np
from transformers.modeling_albert import (
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
AlbertForSequenceClassification,
)
from transformers.modeling_bert import (
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
BertForSequenceClassification,
)
from transformers.modeling_distilbert import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForSequenceClassification,
)
from transformers.modeling_roberta import (
ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
RobertaForSequenceClassification,
)
from transformers.modeling_xlnet import (
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
XLNetForSequenceClassification,
from transformers import (
MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
AutoConfig,
AutoModelForSequenceClassification,
AutoTokenizer,
)
from utils_nlp.common.pytorch_utils import compute_training_steps
from utils_nlp.models.transformers.common import (
MAX_SEQ_LEN,
TOKENIZER_CLASS,
Transformer,
)
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, Transformer
from utils_nlp.models.transformers.datasets import SCDataSet, SPCDataSet
MODEL_CLASS = {}
MODEL_CLASS.update(
{k: BertForSequenceClassification for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update(
{k: RobertaForSequenceClassification for k in ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update(
{k: XLNetForSequenceClassification for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update(
{
k: DistilBertForSequenceClassification
for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
}
)
MODEL_CLASS.update(
{k: AlbertForSequenceClassification for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
supported_models = [
list(x.pretrained_config_archive_map)
for x in MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
]
supported_models = sorted([x for y in supported_models for x in y])
class Processor:
@ -68,7 +36,10 @@ class Processor:
"""
def __init__(self, model_name="bert-base-cased", to_lower=False, cache_dir="."):
self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
self.model_name = model_name
self.to_lower = to_lower
self.cache_dir = cache_dir
self.tokenizer = AutoTokenizer.from_pretrained(
model_name,
do_lower_case=to_lower,
cache_dir=cache_dir,
@ -84,7 +55,7 @@ class Processor:
batch (tuple): A tuple containing input ids, attention mask,
segment ids, and labels tensors.
device (torch.device): A PyTorch device.
model_name (bool, optional): Model name used to format the inputs.
model_name (str): Model name used to format the inputs.
train_mode (bool, optional): Training mode flag.
Defaults to True.
@ -93,13 +64,7 @@ class Processor:
Labels are only returned when train_mode is True.
"""
batch = tuple(t.to(device) for t in batch)
if model_name.split("-")[0] in [
"bert",
"xlnet",
"roberta",
"distilbert",
"albert",
]:
if model_name in supported_models:
if train_mode:
inputs = {
"input_ids": batch[0],
@ -109,8 +74,8 @@ class Processor:
else:
inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
# distilbert doesn't support segment ids
if model_name.split("-")[0] not in ["distilbert"]:
# distilbert, bart don't support segment ids
if model_name.split("-")[0] not in ["distilbert", "bart"]:
inputs["token_type_ids"] = batch[2]
return inputs
@ -244,16 +209,17 @@ class Processor:
class SequenceClassifier(Transformer):
def __init__(self, model_name="bert-base-cased", num_labels=2, cache_dir="."):
super().__init__(
model_class=MODEL_CLASS,
model_name=model_name,
num_labels=num_labels,
cache_dir=cache_dir,
config = AutoConfig.from_pretrained(
model_name, num_labels=num_labels, cache_dir=cache_dir
)
model = AutoModelForSequenceClassification.from_pretrained(
model_name, cache_dir=cache_dir, config=config, output_loading_info=False
)
super().__init__(model_name=model_name, model=model, cache_dir=cache_dir)
@staticmethod
def list_supported_models():
return list(MODEL_CLASS)
return supported_models
def fit(
self,
@ -345,7 +311,9 @@ class SequenceClassifier(Transformer):
# init scheduler
scheduler = Transformer.get_default_scheduler(
optimizer=self.optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps
optimizer=self.optimizer,
warmup_steps=warmup_steps,
num_training_steps=max_steps,
)
# fine tune
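
With `supported_models` derived from `MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING`, the accepted checkpoint names can be inspected directly; a quick sketch:

# Sketch: list the checkpoint names SequenceClassifier now accepts,
# derived from the Auto-model mapping rather than a hand-built table.
from utils_nlp.models.transformers.sequence_classification import SequenceClassifier

names = SequenceClassifier.list_supported_models()
print(len(names), names[:3])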