Merge branch 'staging' into hlu/unilm_abstractive_summarization

Hong Lu 2020-02-13 18:52:24 -05:00 committed by GitHub
Parents e5d31ecf46 10598d0c8e
Commit 2f6aaf7a41
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 1445 additions and 238 deletions

View file

@ -14,4 +14,5 @@
# E722: do not use bare except
# E231: missing white space after "," --> black generates autoformat [,] which fails flake8
ignore = E203, E266, W503, F403, F405, E402, E731, F821, E722, E231
max-line-length = 100
max-line-length = 88

View file

@ -87,7 +87,7 @@ The following is a list of related repositories that we like and think are usefu
|[MASS](https://github.com/microsoft/MASS)|MASS: Masked Sequence to Sequence Pre-training for Language Generation.|
|[MT-DNN](https://github.com/namisan/mt-dnn)|Multi-Task Deep Neural Networks for Natural Language Understanding.|
|[UniLM](https://github.com/microsoft/unilm)|Unified Language Model Pre-training.|
|[DialoGPT](https://github.com/microsoft/DialoGPT)|DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation|
## Build Status

View file

@ -53,7 +53,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"scrolled": false
},
@ -76,6 +76,7 @@
"\n",
"from utils_nlp.models.transformers.sequence_classification import Processor, SequenceClassifier\n",
"from utils_nlp.dataset.multinli import load_pandas_df\n",
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
"from utils_nlp.common.timer import Timer"
]
},
@ -201,6 +202,7 @@
"metadata": {},
"outputs": [],
"source": [
"# sample\n",
"train_df = train_df.sample(frac=TRAIN_DATA_USED_FRACTION).reset_index(drop=True)\n",
"dev_df_matched = dev_df_matched.sample(frac=DEV_DATA_USED_FRACTION).reset_index(drop=True)\n",
"dev_df_mismatched = dev_df_mismatched.sample(frac=DEV_DATA_USED_FRACTION).reset_index(drop=True)"
@ -224,7 +226,7 @@
"source": [
"## Tokenize and Preprocess\n",
"Before training, we tokenize and preprocess the sentence texts to convert them into the format required by transformer model classes. \n",
"The `create_dataloader_from_df` method of the `Processor` class performs the following preprocessing steps and returns a Pytorch `DataLoader`\n",
"The `dataset_from_dataframe` method of the `Processor` class performs the following preprocessing steps and returns a Pytorch `DataSet`\n",
"* Tokenize input texts using the tokenizer of the pre-trained model specified by `model_name`. \n",
"* Convert the tokens into token indices corresponding to the tokenizer's vocabulary.\n",
"* Pad or truncate the token lists to the specified max length."
@ -237,30 +239,35 @@
"outputs": [],
"source": [
"processor = Processor(model_name=MODEL_NAME, cache_dir=CACHE_DIR, to_lower=TO_LOWER)\n",
"train_dataloader = processor.create_dataloader_from_df(\n",
"\n",
"train_dataset = processor.dataset_from_dataframe(\n",
" df=train_df,\n",
" text_col=TEXT_COL_1,\n",
" label_col=LABEL_COL_NUM,\n",
" shuffle=True,\n",
" text2_col=TEXT_COL_2,\n",
" max_len=MAX_SEQ_LENGTH,\n",
" batch_size=BATCH_SIZE,\n",
")\n",
"dev_dataloader_matched = processor.create_dataloader_from_df(\n",
"dev_dataset_matched = processor.dataset_from_dataframe(\n",
" df=dev_df_matched,\n",
" text_col=TEXT_COL_1,\n",
" shuffle=False,\n",
" text_col=TEXT_COL_1, \n",
" text2_col=TEXT_COL_2,\n",
" max_len=MAX_SEQ_LENGTH,\n",
" batch_size=BATCH_SIZE,\n",
")\n",
"dev_dataloader_mismatched = processor.create_dataloader_from_df(\n",
"dev_dataset_mismatched = processor.dataset_from_dataframe(\n",
" df=dev_df_mismatched,\n",
" text_col=TEXT_COL_1,\n",
" shuffle=False,\n",
" text_col=TEXT_COL_1, \n",
" text2_col=TEXT_COL_2,\n",
" max_len=MAX_SEQ_LENGTH,\n",
" batch_size=BATCH_SIZE,\n",
")\n",
"\n",
"train_dataloader = dataloader_from_dataset(\n",
" train_dataset, batch_size=BATCH_SIZE, shuffle=True\n",
")\n",
"dev_dataloader_matched = dataloader_from_dataset(\n",
" dev_dataset_matched, batch_size=BATCH_SIZE, shuffle=False\n",
")\n",
"dev_dataloader_mismatched = dataloader_from_dataset(\n",
" dev_dataset_mismatched, batch_size=BATCH_SIZE, shuffle=False\n",
")"
]
},

View file

@ -69,23 +69,19 @@
"source": [
"import os\n",
"import sys\n",
"import scrapbook as sb\n",
"\n",
"import scrapbook as sb\n",
"import torch\n",
"\n",
"nlp_path = os.path.abspath('../../')\n",
"if nlp_path not in sys.path:\n",
" sys.path.insert(0, nlp_path)\n",
"\n",
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
"from utils_nlp.common.timer import Timer\n",
"from utils_nlp.dataset.squad import load_pandas_df\n",
"from utils_nlp.eval.question_answering import evaluate_qa\n",
"from utils_nlp.models.transformers.datasets import QADataset\n",
"from utils_nlp.models.transformers.question_answering import (\n",
" AnswerExtractor,\n",
" QAProcessor,\n",
" AnswerExtractor\n",
")\n",
" \n",
"from utils_nlp.eval.question_answering import evaluate_qa\n",
"from utils_nlp.common.timer import Timer"
")"
]
},
{
@ -559,7 +555,7 @@
"* Pad the concatenated token sequence to `max_seq_length` if it's shorter.\n",
"* Convert the tokens into token indices corresponding to the tokenizer's vocabulary.\n",
"\n",
"`QAProcessor.preprocess` returns a Pytorch Dataloader. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required by postprocessing the predicted answer start and end indices to get the final answer text. You can change the default file directory by specifying `feature_cache_dir`. "
"`QAProcessor.preprocess` returns a Pytorch DataSet. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required by postprocessing the predicted answer start and end indices to get the final answer text. You can change the default file directory by specifying `feature_cache_dir`. "
]
},
{
@ -577,24 +573,28 @@
],
"source": [
"qa_processor = QAProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE)\n",
"train_dataloader = qa_processor.preprocess(\n",
" train_dataset, \n",
" batch_size=PER_GPU_BATCH_SIZE,\n",
" num_gpus=NUM_GPUS,\n",
"train_dataset = qa_processor.preprocess(\n",
" train_dataset,\n",
" is_training=True,\n",
" max_question_length=MAX_QUESTION_LENGTH,\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" doc_stride=DOC_STRIDE\n",
" doc_stride=DOC_STRIDE,\n",
")\n",
"\n",
"dev_dataloader = qa_processor.preprocess(\n",
" dev_dataset, \n",
" batch_size=PER_GPU_BATCH_SIZE,\n",
" num_gpus=NUM_GPUS,\n",
"# we keep a copy of the oroginal dev_dataset as it is needed for evaluation\n",
"dev_dataset_processed = qa_processor.preprocess(\n",
" dev_dataset,\n",
" is_training=False,\n",
" max_question_length=MAX_QUESTION_LENGTH,\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" doc_stride=DOC_STRIDE\n",
" doc_stride=DOC_STRIDE,\n",
")\n",
"\n",
"train_dataloader = dataloader_from_dataset(\n",
" train_dataset, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n",
")\n",
"dev_dataloader = dataloader_from_dataset(\n",
" dev_dataset_processed, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n",
")"
]
},

View file

@ -525,7 +525,7 @@
"source": [
"with Timer() as t:\n",
" preds = model.predict(\n",
" eval_dataloader=test_dataloader,\n",
" test_dataloader=test_dataloader,\n",
" num_gpus=CONFIG['num_gpus'],\n",
" verbose=CONFIG['verbose']\n",
" )\n",

View file

@ -0,0 +1,768 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Distributed Training For Extractive Summarization on CNN/DM Dataset\n",
"\n",
"## Summary\n",
"This notebook demonstrates how to use Azure Machine Learning to run distributed training using Distributed Data Parallel in Pytorch for extractive summarization. For more detailed model related information, please see [extractive_summarization_cnndm_transformer.ipynb](extractive_summarization_cnndm_transformer.ipynb)\n",
"\n",
"## Prerequisites\n",
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, refer to the [Configuration Notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) first if you haven't already to establish your connection to the AzureML Workspace. Prerequisites are:\n",
"\n",
"- Azure subscription\n",
"- Azure Machine Learning Workspace\n",
"- Azure Machine Learning SDK\n",
"\n",
"To run rouge evaluation, please refer to the section of compute_rouge_perl in [summarization_evaluation.ipynb](summarization_evaluation.ipynb). "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import Libraries"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"from tempfile import TemporaryDirectory\n",
"import torch\n",
"\n",
"import azureml.core\n",
"from azureml.core import Experiment, Workspace, Run\n",
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
"from azureml.core.compute_target import ComputeTargetException\n",
"from azureml.train.dnn import PyTorch\n",
"from azureml.train.dnn import Nccl\n",
"from azureml.widgets import RunDetails\n",
"\n",
"nlp_path = os.path.abspath(\"../../\")\n",
"if nlp_path not in sys.path:\n",
" sys.path.insert(0, nlp_path)\n",
"from utils_nlp.azureml.azureml_utils import get_or_create_workspace\n",
"from utils_nlp.dataset.cnndm import CNNDMBertSumProcessedData, CNNDMSummarizationDataset\n",
"\n",
"# Check core SDK version number\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configuration "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# for Azure ML Workspacen\n",
"SUBSRIPTION_ID = \"YOUR_SUBSCRIPTION_ID\"\n",
"LOCATION = \"YOUR_RESOURCE_GROUP_NAME\" # example \"eastus2\"\n",
"RESOURCE_GROUP = \"YOUR_WORKSPACE_NAME\" # modifiy to use your own\n",
"WORKSPACE_NAME = \"YOUR_WORKSPACE_REGION\" # modifiy to use your own\n",
"\n",
"\n",
"# for creating Azure ML Compute Cluster\n",
"AMLCOMPUTE_CLUSTER_NAME = \"extsum5\" # modifiy to use your own\n",
"NODE_COUNT = 4\n",
"VM_SIZE = \"STANDARD_NC6\" # this should be the VM that's supported by Azure and Azure ML\n",
"\n",
"\n",
"# for creating Azure ML Experiment\n",
"EXPERIMENT_NAME = \"NLP-ExtSum\" # modifiy to use your own\n",
"\n",
"\n",
"# local folder to save the downloaded data\n",
"LOCAL_DATA_FOLDER = (\n",
" \"./bertsumdata/\"\n",
") # modify to use your own; the parent folder must already exist\n",
"\n",
"# Training related parameter\n",
"MODEL_NAME = \"distilbert-base-uncased\" # limited choice\n",
"ENCODER = \"transformer\"\n",
"# folder in the workspace where the data is uploaded to\n",
"TARGET_DATA_FOLDER = \"/bertsum_processed_data\" # modify to use your own\n",
"TARGET_OUTPUT_DIR = f\"output/{EXPERIMENT_NAME}/\"\n",
"# cache dir in the workspace\n",
"TARGET_CACHE_DIR = f\"cache/{EXPERIMENT_NAME}/\"\n",
"# file name for saving the prediction\n",
"SUMMARY_FILENAME = \"generated_summaries.txt\"\n",
"# file name for saving the trained model\n",
"MODEL_FILENAME = \"dist_extsum.pt\"\n",
"\n",
"\n",
"# local path to download the output from the cluster\n",
"LOCAL_OUTPUT_DIR = \"./output\" # modifiy to use your own, the penultimate level folder\n",
"\n",
"\n",
"# local folder to store all the related files to be copied to the workspace\n",
"PROJECT_FOLDER = \"./azureml_exp\"\n",
"# conda environment name, the yaml file will be copied to the workspace\n",
"CONDA_ENV_NAME = \"nlp_gpu\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create an AML Workspace"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Create the workspace using the specified parameters\n",
"ws = get_or_create_workspace(\n",
" workspace_name=WORKSPACE_NAME,\n",
" subscription_id=SUBSRIPTION_ID,\n",
" resource_group=RESOURCE_GROUP,\n",
" workspace_region=LOCATION,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\n",
" \"Workspace name: \" + ws.name,\n",
" \"Azure region: \" + ws.location,\n",
" \"Subscription id: \" + ws.subscription_id,\n",
" \"Resource group: \" + ws.resource_group,\n",
" sep=\"\\n\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create an AML GPU Compute Cluster"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found existing compute target.\n",
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-01-30T22:43:39.856000+00:00', 'errors': None, 'creationTime': '2020-01-23T04:50:26.160743+00:00', 'modifiedTime': '2020-01-23T20:31:35.349184+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT1200S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
]
}
],
"source": [
"try:\n",
" gpu_compute_target = ComputeTarget(workspace=ws, name=AMLCOMPUTE_CLUSTER_NAME)\n",
" print(\"Found existing compute target.\")\n",
"except ComputeTargetException:\n",
" print(\"Creating a new compute target...\")\n",
" compute_config = AmlCompute.provisioning_configuration(\n",
" vm_size=VM_SIZE, max_nodes=NODE_COUNT, NodeIdleTimeBeforeScaleDown=\"PT1200S\"\n",
" )\n",
"\n",
" # create the cluster\n",
" gpu_compute_target = ComputeTarget.create(\n",
" ws, AMLCOMPUTE_CLUSTER_NAME, compute_config\n",
" )\n",
"\n",
" gpu_compute_target.wait_for_completion(show_output=True)\n",
"\n",
"# use get_status() to get a detailed status for the current AmlCompute.\n",
"print(gpu_compute_target.get_status().serialize())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create an Experiment"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"experiment = Experiment(ws, name=EXPERIMENT_NAME)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Dataset to Local File System"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"!mkdir -p {LOCAL_DATA_FOLDER}"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"bertsum_data.zip: 869MB [00:29, 29.7MB/s] \n"
]
},
{
"data": {
"text/plain": [
"'./bertsumdata/'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"CNNDMBertSumProcessedData.download(local_path=LOCAL_DATA_FOLDER)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Upload the Downloaded Dataset to AML Workspace"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"ds = ws.get_default_datastore()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"ds.upload(src_dir=LOCAL_DATA_FOLDER, target_path=TARGET_DATA_FOLDER)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare for the Experiment Run\n",
"Prepare the local project folder which is mirror to the workspace for the experiment"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Generated conda file: nlp_gpu.yaml\r\n",
"\r\n",
"To create the conda environment:\r\n",
"$ conda env create -f nlp_gpu.yaml\r\n",
"\r\n",
"To update the conda environment:\r\n",
"$ conda env update -f nlp_gpu.yaml\r\n",
"\r\n",
"To register the conda environment in Jupyter:\r\n",
"$ conda activate nlp_gpu\r\n",
"$ python -m ipykernel install --user --name nlp_gpu --display-name \"Python (nlp_gpu)\"\r\n",
"\r\n"
]
}
],
"source": [
"ENTRY_SCRIPT = \"extractive_summarization_cnndm_distributed_train.py\"\n",
"!mkdir -p {PROJECT_FOLDER}\n",
"!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n",
"!cp ./nlp_gpu.yaml {PROJECT_FOLDER}\n",
"!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n",
"!cp -r ../../utils_nlp {PROJECT_FOLDER}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Submit Run"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"os.makedirs(LOCAL_OUTPUT_DIR, exist_ok=True)\n",
"os.makedirs(os.path.join(LOCAL_OUTPUT_DIR, EXPERIMENT_NAME), exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:azureml.train.estimator._framework_base_estimator:If environment_definition or conda_dependencies_file is specified, Azure ML will not install any framework related packages on behalf of the user.\n",
"WARNING:azureml.train.estimator._framework_base_estimator:framework_version is not specified, defaulting to version 1.3.\n"
]
}
],
"source": [
"NcclConfig=Nccl()\n",
"estimator = PyTorch(source_directory=PROJECT_FOLDER,\n",
" compute_target=gpu_compute_target,\n",
" script_params={\n",
" \"--dist_url\": \"$AZ_BATCHAI_PYTORCH_INIT_METHOD\",\n",
" \"--rank\": \"$AZ_BATCHAI_TASK_INDEX\",\n",
" \"--node_count\": NODE_COUNT,\n",
" \"--data_dir\":ds.path(f'{TARGET_DATA_FOLDER}').as_mount(),\n",
" \"--cache_dir\": ds.path(f'{TARGET_CACHE_DIR}').as_mount(),\n",
" '--output_dir':ds.path(f'{TARGET_OUTPUT_DIR}').as_mount(),\n",
" \"--quick_run\": 'true',\n",
" \"--summary_filename\": f'{SUMMARY_FILENAME}',\n",
" \"--model_filename\": f'{MODEL_FILENAME}',\n",
" \"--model_name\": MODEL_NAME,\n",
" \"--encoder\": ENCODER\n",
" },\n",
" entry_script= ENTRY_SCRIPT,\n",
" node_count=NODE_COUNT,\n",
" distributed_training=NcclConfig,\n",
" conda_dependencies_file=f'{CONDA_ENV_NAME}.yaml',\n",
" use_gpu=True)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"run = experiment.submit(estimator)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "97f3678284a44f7aab5c27fa3e19bb11",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"RunDetails(run).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"If you stop the notebook and come back, \n",
"you'll need to use the run_id in the output of the previous cell \n",
"to get run details.\n",
"\"\"\"\n",
"# fetched_run = Run(experiment, \"NLP-ExtSum_1579816237_ea238f69\")\n",
"# RunDetails(fetched_run).show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Generated Summaries "
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# need to clear the local output dir as the ds.download won't download if the path exists\n",
"!rm -rf {LOCAL_OUTPUT_DIR}/* "
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading output/NLP-ExtSum/generated_summaries.txt\n",
"Downloaded output/NLP-ExtSum/generated_summaries.txt, 1 files out of an estimated total of 1\n"
]
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ds.download(target_path=LOCAL_OUTPUT_DIR,\n",
" prefix=f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}',\n",
" show_progress=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"from utils_nlp.eval.evaluate_summarization import get_rouge\n",
"from utils_nlp.models.transformers.extractive_summarization import ExtSumProcessedData\n",
"import pickle\n",
"from utils_nlp.models.transformers.extractive_summarization import ExtractiveSummarizer"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"train_dataset, test_dataset = ExtSumProcessedData().splits(root=LOCAL_DATA_FOLDER)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"target = [i['tgt_txt'] for i in test_dataset]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"prediction = []\n",
"with open(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}'), \"r\") as filehandle:\n",
" for cnt, line in enumerate(filehandle):\n",
" prediction.append(line[0:-1]) # remove the ending \"\\n\""
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading output/NLP-ExtSum/dist_extsum.pt\n",
"Downloaded output/NLP-ExtSum/dist_extsum.pt, 1 files out of an estimated total of 1\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 546/546 [00:00<00:00, 306489.56B/s]\n",
"100%|██████████| 267967963/267967963 [00:04<00:00, 63548158.24B/s]\n",
"Scoring: 100%|██████████| 90/90 [00:41<00:00, 3.68it/s]\n"
]
}
],
"source": [
"## you can also download the saved model and run prediction if you are running the notebook on a gpu machine\n",
"#\"\"\"\n",
"ds.download(target_path=LOCAL_OUTPUT_DIR,\n",
" prefix=f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}',\n",
" show_progress=True)\n",
"summarizer = ExtractiveSummarizer(MODEL_NAME, ENCODER, LOCAL_OUTPUT_DIR)\n",
"summarizer.model.load_state_dict(\n",
" torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'))\n",
")\n",
"prediction = summarizer.predict(test_dataset, num_gpus=torch.cuda.device_count(), batch_size=128)\n",
"#\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['a university of iowa student has died nearly three months after a fall in rome in a suspected robbery attack in rome .',\n",
" 'andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program in italy when the incident happened in january .',\n",
" 'he was flown back to chicago via air ambulance on march 20 , but he died on sunday .',\n",
" 'andrew mogni , 20 , from glen ellyn , illinois , a university of iowa student has died nearly three months after a fall in rome in a suspected robbery',\n",
" 'he was taken to a medical facility in the chicago area , close to his family home in glen ellyn .',\n",
" \"he died on sunday at northwestern memorial hospital - medical examiner 's office spokesman frank shuftan says a cause of death wo n't be released until monday at the earliest .\",\n",
" 'initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed .',\n",
" \"on sunday , his cousin abby wrote online : ` this morning my cousin andrew 's soul was lifted up to heaven .\",\n",
" 'initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed',\n",
" '` at the beginning of january he went to rome to study aboard and on the way home from a party he was brutally attacked and thrown off a 40ft bridge and hit the concrete below .',\n",
" \"` he was in a coma and in critical condition for months . '\",\n",
" 'paula barnett , who said she is a close family friend , told my suburban life , that mogni had only been in the country for six hours when the incident happened .',\n",
" 'she said he was was alone at the time of the alleged assault and personal items were stolen .',\n",
" 'she added that he was in a non-medically induced coma , having suffered serious infection and internal bleeding .',\n",
" 'mogni was a third-year finance major from glen ellyn , ill. , who was participating in a semester-long program at john cabot university .',\n",
" \"mogni belonged to the school 's chapter of the sigma nu fraternity , reports the chicago tribune who posted a sign outside a building reading ` pray for mogni . '\",\n",
" \"the fraternity 's iowa chapter announced sunday afternoon via twitter that a memorial service will be held on campus to remember mogni .\"]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_dataset[0]['src_txt']"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'he was flown back to chicago via air ambulance on march 20 , but he died on sunday .<q>andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program in italy when the incident happened in january .<q>a university of iowa student has died nearly three months after a fall in rome in a suspected robbery attack in rome .'"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prediction[0]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program when the incident happened in january<q>he was flown back to chicago via air on march 20 but he died on sunday<q>initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed<q>his cousin claims he was attacked and thrown 40ft from a bridge'"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target[0]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"RESULT_DIR = TemporaryDirectory().name"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"11489\n",
"11489\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2020-01-31 03:38:44,480 [MainThread ] [INFO ] Writing summaries.\n",
"2020-01-31 03:38:44,481 [MainThread ] [INFO ] Processing summaries. Saving system files to /tmp/tmpz9fasy6o/tmpnhtvedhc/system and model files to /tmp/tmpz9fasy6o/tmpnhtvedhc/model.\n",
"2020-01-31 03:38:44,481 [MainThread ] [INFO ] Processing files in /tmp/tmpz9fasy6o/rouge-tmp-2020-01-31-03-38-43/candidate/.\n",
"2020-01-31 03:38:45,679 [MainThread ] [INFO ] Saved processed files to /tmp/tmpz9fasy6o/tmpnhtvedhc/system.\n",
"2020-01-31 03:38:45,681 [MainThread ] [INFO ] Processing files in /tmp/tmpz9fasy6o/rouge-tmp-2020-01-31-03-38-43/reference/.\n",
"2020-01-31 03:38:46,904 [MainThread ] [INFO ] Saved processed files to /tmp/tmpz9fasy6o/tmpnhtvedhc/model.\n",
"2020-01-31 03:38:46,989 [MainThread ] [INFO ] Written ROUGE configuration to /tmp/tmpz9fasy6o/tmpicwmu3se/rouge_conf.xml\n",
"2020-01-31 03:38:46,989 [MainThread ] [INFO ] Running ROUGE with command /dadendev/pyrouge/tools/ROUGE-1.5.5/ROUGE-1.5.5.pl -e /dadendev/pyrouge/tools/ROUGE-1.5.5/data -c 95 -m -r 1000 -n 2 -a /tmp/tmpz9fasy6o/tmpicwmu3se/rouge_conf.xml\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"---------------------------------------------\n",
"1 ROUGE-1 Average_R: 0.52322 (95%-conf.int. 0.52036 - 0.52598)\n",
"1 ROUGE-1 Average_P: 0.35403 (95%-conf.int. 0.35171 - 0.35628)\n",
"1 ROUGE-1 Average_F: 0.40840 (95%-conf.int. 0.40623 - 0.41040)\n",
"---------------------------------------------\n",
"1 ROUGE-2 Average_R: 0.23066 (95%-conf.int. 0.22789 - 0.23341)\n",
"1 ROUGE-2 Average_P: 0.15558 (95%-conf.int. 0.15365 - 0.15744)\n",
"1 ROUGE-2 Average_F: 0.17947 (95%-conf.int. 0.17740 - 0.18147)\n",
"---------------------------------------------\n",
"1 ROUGE-L Average_R: 0.47554 (95%-conf.int. 0.47274 - 0.47834)\n",
"1 ROUGE-L Average_P: 0.32224 (95%-conf.int. 0.32001 - 0.32440)\n",
"1 ROUGE-L Average_F: 0.37150 (95%-conf.int. 0.36935 - 0.37361)\n",
"\n"
]
}
],
"source": [
"rouge_score = get_rouge(prediction, target, RESULT_DIR)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleanup"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"import shutil\n",
"if os.path.exists(LOCAL_DATA_FOLDER):\n",
" shutil.rmtree(LOCAL_DATA_FOLDER, ignore_errors=True)\n",
"if os.path.exists(LOCAL_OUTPUT_DIR):\n",
" shutil.rmtree(LOCAL_OUTPUT_DIR, ignore_errors=True)\n",
"if os.path.exists(PROJECT_FOLDER):\n",
" shutil.rmtree(PROJECT_FOLDER, ignore_errors=True)\n",
"if os.path.exists(RESULT_DIR):\n",
" shutil.rmtree(RESULT_DIR, ignore_errors=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (nlp_gpu)",
"language": "python",
"name": "nlp_gpu"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,165 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import argparse
import os
import sys
import time
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
nlp_path = os.path.abspath("../../")
if nlp_path not in sys.path:
sys.path.insert(0, nlp_path)
from utils_nlp.dataset.cnndm import CNNDMSummarizationDataset
from utils_nlp.models.transformers.extractive_summarization import (
ExtractiveSummarizer,
ExtSumProcessedData,
ExtSumProcessor,
)
# os.environ["NCCL_BLOCKING_WAIT"] = "1"
os.environ["NCCL_IB_DISABLE"] = "0"
parser = argparse.ArgumentParser()
parser.add_argument("--rank", type=int, default=0,
help="The rank of the current node in the cluster")
parser.add_argument("--dist_url", type=str, default="tcp://127.0.0.1:29500",
help="URL specifying how to initialize the process groupi.")
parser.add_argument("--node_count", type=int, default=1,
help="Number of nodes in the cluster.")
parser.add_argument("--cache_dir", type=str, default="./",
help="Directory to cache the tokenizer.")
parser.add_argument("--data_dir", type=str, default="./",
help="Directory to download the preprocessed data.")
parser.add_argument("--output_dir", type=str, default="./",
help="Directory to save the output model and prediction results.")
parser.add_argument("--quick_run", type=str.lower, default='false', choices=['true', 'false'],
help="Whether to have a quick run")
parser.add_argument("--model_name", type=str, default="distilbert-base-uncased",
help="Transformer model used in the extractive summarization, only \
\"bert-uncased\" and \"distilbert-base-uncased\" are supported.")
parser.add_argument("--encoder", type=str.lower, default='transformer',
choices=['baseline', 'classifier', 'transformer', 'rnn'],
help="Encoder types in the extractive summarizer.")
parser.add_argument("--learning_rate", type=float, default=1e-3,
help="Learning rate.")
parser.add_argument("--batch_size", type=int, default=3000,
help="batch size in terms of input token numbers in training")
parser.add_argument("--max_steps", type=int, default=1e4,
help="Maximum number of training steps run in training. If quick_run is set,\
it's not used.")
parser.add_argument("--warmup_steps", type=int, default=5e3,
help="Warm-up number of training steps run in training. If quick_run is set,\
it's not used.")
parser.add_argument("--top_n", type=int, default=3,
help="Number of sentences selected in prediction for evaluation.")
parser.add_argument("--summary_filename", type=str, default="generated_summaries.txt",
help="Summary file name generated by prediction for evaluation.")
parser.add_argument("--model_filename", type=str, default="dist_extsum_model.pt",
help="model file name saved for evaluation.")
def cleanup():
dist.destroy_process_group()
# How often the statistics reports show up in training, unit is step.
REPORT_EVERY = 100
SAVE_EVERY = 1000
def main():
print("NCCL_IB_DISABLE: {}".format(os.getenv("NCCL_IB_DISABLE")))
args = parser.parse_args()
print("quick_run is {}".format(args.quick_run))
print("output_dir is {}".format(args.output_dir))
print("data_dir is {}".format(args.data_dir))
print("cache_dir is {}".format(args.cache_dir))
#shutil.rmtree(args.output_dir)
os.makedirs(args.output_dir, exist_ok=True)
os.makedirs(args.cache_dir, exist_ok=True)
ngpus_per_node = torch.cuda.device_count()
summarizer = ExtractiveSummarizer(args.model_name, args.encoder, args.cache_dir)
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, summarizer, args))
def main_worker(local_rank, ngpus_per_node, summarizer, args):
rank = args.rank * ngpus_per_node + local_rank
world_size = args.node_count * ngpus_per_node
print("init_method: {}".format(args.dist_url))
print("ngpus_per_node: {}".format(ngpus_per_node))
print("rank: {}".format(rank))
print("local_rank: {}".format(local_rank))
print("world_size: {}".format(world_size))
torch.distributed.init_process_group(
backend="nccl",
init_method=args.dist_url,
world_size=world_size,
rank=rank,
)
train_dataset, test_dataset = ExtSumProcessedData().splits(root=args.data_dir)
# total number of steps for training
MAX_STEPS = 1e3
# number of steps for warm up
WARMUP_STEPS = 5e2
if args.quick_run.lower() == "false":
MAX_STEPS = args.max_steps
WARMUP_STEPS = args.warmup_steps
print("max steps is {}".format(MAX_STEPS))
print("warmup steps is {}".format(WARMUP_STEPS))
start = time.time()
if rank not in [-1, 0]:
save_every = -1
else:
save_every = SAVE_EVERY
summarizer.fit(
train_dataset,
num_gpus=world_size,
batch_size=args.batch_size,
gradient_accumulation_steps=2,
max_steps=MAX_STEPS / world_size,
learning_rate=args.learning_rate,
warmup_steps=WARMUP_STEPS,
verbose=True,
report_every=REPORT_EVERY,
clip_grad_norm=False,
local_rank=rank,
save_every=save_every,
world_size=world_size
)
end = time.time()
print("rank {0}, duration {1:.6f}s".format(rank, end - start))
if rank in [-1, 0]:
summarizer.save_model(os.path.join(args.output_dir, args.model_filename))
prediction = summarizer.predict(test_dataset, num_gpus=ngpus_per_node, batch_size=128)
def _write_list_to_file(list_items, filename):
with open(filename, "w") as filehandle:
# for cnt, line in enumerate(filehandle):
for item in list_items:
filehandle.write("%s\n" % item)
print("writing generated summaries")
_write_list_to_file(prediction, os.path.join(args.output_dir, args.summary_filename))
# only use the following line when you use your own cluster.
# AML distributed training does the cleanup for you.
# cleanup()
if __name__ == "__main__":
main()
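For reference, a single-node invocation of this script might look like the following. The flags are the ones defined by the argparse section above, the paths are illustrative, and the script is assumed to be launched from its own folder so that the relative "../../" path to utils_nlp resolves; mp.spawn still starts one worker per local GPU.

python extractive_summarization_cnndm_distributed_train.py \
    --rank 0 --node_count 1 --dist_url tcp://127.0.0.1:29500 \
    --data_dir ./bertsumdata/ --cache_dir ./cache/ --output_dir ./output/ \
    --quick_run true --model_name distilbert-base-uncased --encoder transformer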

View file

@ -1,2 +1,2 @@
[tool.black]
line-length = 100
line-length = 88

View file

@ -0,0 +1,28 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import pytest
from utils_nlp.models.transformers.extractive_summarization import IterableDistributedSampler
@pytest.mark.cpu
def test_sampler():
sampler = IterableDistributedSampler(1, -1)
samples = list(sampler.iter('abcdefg'))
assert ''.join(samples) == 'abcdefg'
sampler = IterableDistributedSampler(2, -1)
samples = list(sampler.iter('abcdefg'))
assert ''.join(samples) == 'abcdefg'
sampler = IterableDistributedSampler(4, 1)
samples = list(sampler.iter('abcdefg'))
assert ''.join(samples) == 'bf'
sampler = IterableDistributedSampler(4, 2)
samples = list(sampler.iter('abcdefg'))
assert ''.join(samples) == 'cg'
sampler = IterableDistributedSampler(4, 3)
samples = list(sampler.iter('abcdefg'))
assert ''.join(samples) == 'd'
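These assertions follow directly from the sampler's use of itertools.islice(iterable, rank, None, world_size) (see the implementation later in this diff); for example, the world_size=4, rank=1 case can be checked by hand:

import itertools

# rank 1 of 4 keeps every 4th element starting at index 1: "b" and "f"
assert "".join(itertools.islice("abcdefg", 1, None, 4)) == "bf"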

View file

@ -65,22 +65,18 @@ def qa_test_data(qa_test_df, tmp_module):
qa_id_col=qa_test_df["qa_id_col"],
)
# bert
qa_processor_bert = QAProcessor(cache_dir=tmp_module)
train_features_bert = qa_processor_bert.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_features_bert = qa_processor_bert.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@ -88,22 +84,18 @@ def qa_test_data(qa_test_df, tmp_module):
feature_cache_dir=tmp_module,
)
# xlnet
qa_processor_xlnet = QAProcessor(model_name="xlnet-base-cased", cache_dir=tmp_module)
train_features_xlnet = qa_processor_xlnet.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_features_xlnet = qa_processor_xlnet.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@ -111,22 +103,20 @@ def qa_test_data(qa_test_df, tmp_module):
feature_cache_dir=tmp_module,
)
qa_processor_distilbert = QAProcessor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
# distilbert
qa_processor_distilbert = QAProcessor(
model_name="distilbert-base-uncased", cache_dir=tmp_module
)
train_features_distilbert = qa_processor_distilbert.preprocess(
train_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=True,
max_question_length=16,
max_seq_length=64,
doc_stride=32,
feature_cache_dir=tmp_module,
)
test_features_distilbert = qa_processor_distilbert.preprocess(
test_dataset,
batch_size=BATCH_SIZE,
num_gpus=NUM_GPUS,
is_training=False,
max_question_length=16,
max_seq_length=64,
@ -151,11 +141,21 @@ def qa_test_data(qa_test_df, tmp_module):
@pytest.mark.gpu
def test_QAProcessor(qa_test_data, tmp_module):
for model_name in ["bert-base-cased", "xlnet-base-cased", "distilbert-base-uncased"]:
for model_name in [
"bert-base-cased",
"xlnet-base-cased",
"distilbert-base-uncased",
]:
qa_processor = QAProcessor(model_name=model_name, cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module)
qa_processor.preprocess(
qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module,
)
qa_processor.preprocess(
qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module,
)
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module,
)
# test unsupported model type
with pytest.raises(ValueError):
@ -163,18 +163,24 @@ def test_QAProcessor(qa_test_data, tmp_module):
# test training data has no ground truth exception
with pytest.raises(Exception):
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module)
qa_processor.preprocess(
qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module
)
# test when answer start is a list, but answer text is not
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["train_dataset_start_text_mismatch"], is_training=True, feature_cache_dir=tmp_module,
qa_test_data["train_dataset_start_text_mismatch"],
is_training=True,
feature_cache_dir=tmp_module,
)
# test when training data has multiple answers
with pytest.raises(Exception):
qa_processor.preprocess(
qa_test_data["train_dataset_multi_answers"], is_training=True, feature_cache_dir=tmp_module,
qa_test_data["train_dataset_multi_answers"],
is_training=True,
feature_cache_dir=tmp_module,
)
@ -190,7 +196,9 @@ def test_AnswerExtractor(qa_test_data, tmp_module):
assert os.path.exists(os.path.join(model_output_dir, "pytorch_model.bin"))
assert os.path.exists(os.path.join(model_output_dir, "config.json"))
qa_extractor_from_cache = AnswerExtractor(cache_dir=tmp_module, load_model_from_dir=model_output_dir)
qa_extractor_from_cache = AnswerExtractor(
cache_dir=tmp_module, load_model_from_dir=model_output_dir
)
qa_extractor_from_cache.predict(test_loader_bert, verbose=False)
# xlnet
@ -202,8 +210,12 @@ def test_AnswerExtractor(qa_test_data, tmp_module):
# distilbert
train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_distilbert"])
test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_distilbert"], shuffle=False)
qa_extractor_distilbert = AnswerExtractor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
test_loader_xlnet = dataloader_from_dataset(
qa_test_data["test_features_distilbert"], shuffle=False
)
qa_extractor_distilbert = AnswerExtractor(
model_name="distilbert-base-uncased", cache_dir=tmp_module
)
qa_extractor_distilbert.fit(train_loader_xlnet, verbose=False, cache_model=False)
qa_extractor_distilbert.predict(test_loader_xlnet, verbose=False)

View file

@ -4,6 +4,7 @@
# This script reuses some code from
# https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py
import datetime
import logging
import os
import random
@ -191,6 +192,7 @@ class Transformer:
verbose=True,
seed=None,
report_every=10,
save_every=-1,
clip_grad_norm=True,
):
@ -251,9 +253,14 @@ class Transformer:
if global_step % report_every == 0 and verbose:
end = time.time()
endtime_string = datetime.datetime.fromtimestamp(end).strftime(
"%d/%m/%Y %H:%M:%S"
)
print(
"loss:{0:.6f}, time:{1:f}, examples:{2:.0f}, "
"step:{3:.0f}/{4:.0f}".format(
"""timestamp: {0:s}, loss: {1:.6f}, time duration: {2:f},
number of examples in current step: {3:.0f}, step {4:.0f}
out of total {5:.0f}""".format(
endtime_string,
accum_loss / report_every,
end - start,
len(batch),
@ -268,7 +275,10 @@ class Transformer:
if scheduler:
scheduler.step()
self.model.zero_grad()
if save_every != -1 and global_step % save_every == 0 and verbose:
self.save_model(
os.path.join(self.cache_dir, f"{self.model_name}_step_{global_step}.pt")
)
if global_step > max_steps:
epoch_iterator.close()
break
@ -292,16 +302,31 @@ class Transformer:
logits = outputs[0]
yield logits.detach().cpu().numpy()
def save_model(self):
output_model_dir = os.path.join(self.cache_dir, "fine_tuned")
def save_model(self, full_name=None):
"""
Save the trained model.
os.makedirs(self.cache_dir, exist_ok=True)
os.makedirs(output_model_dir, exist_ok=True)
Args:
full_name (str, optional): File name for saving the model's `state_dict()`, which can
later be loaded with `torch.load()`. If it's None, the trained model, configuration
and tokenizer are saved with `save_pretrained()` under the "fine_tuned" folder of
the object's cache directory. Defaults to None.
"""
logger.info("Saving model checkpoint to %s", output_model_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = (
self.model.module if hasattr(self.model, "module") else self.model
) # Take care of distributed/parallel training
model_to_save.save_pretrained(output_model_dir)
if full_name:
logger.info("Saving model checkpoint to %s", full_name)
torch.save(model_to_save.state_dict(), full_name)
else:
output_model_dir = os.path.join(self.cache_dir, "fine_tuned")
os.makedirs(self.cache_dir, exist_ok=True)
os.makedirs(output_model_dir, exist_ok=True)
logger.info("Saving model checkpoint to %s", output_model_dir)
model_to_save.save_pretrained(output_model_dir)

View file

@ -3,9 +3,11 @@
# This script reuses some code from https://github.com/nlpyang/BertSum
import gc
import itertools
import logging
import os
import pickle
import random
import numpy as np
@ -15,8 +17,8 @@ from torch.utils.data import DataLoader, Dataset, IterableDataset, SequentialSam
# from torch.utils.data.distributed import DistributedSampler
from transformers import BertModel, DistilBertModel
from bertsum.models import data_loader, model_builder
from bertsum.models.data_loader import Batch
from bertsum.models import model_builder
from bertsum.models.data_loader import Batch, DataIterator
from bertsum.models.model_builder import Summarizer
from utils_nlp.common.pytorch_utils import compute_training_steps, get_device
from utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection
@ -27,6 +29,79 @@ MODEL_CLASS = {"bert-base-uncased": BertModel, "distilbert-base-uncased": Distil
logger = logging.getLogger(__name__)
# https://pytorch.org/docs/1.1.0/_modules/torch/utils/data/dataloader.html
class IterableDistributedSampler(object):
""" Distributed sampler for iterable dataset.
Args:
world_size (int): Total number of GPUs that will be used. Defaults to 1.
rank (int): Rank of the current GPU. Defaults to -1.
"""
def __init__(self, world_size=1, rank=-1):
self.world_size = world_size
self.rank = rank
def iter(self, iterable):
if self.rank != -1:
return itertools.islice(iterable, self.rank, None, self.world_size)
else:
return iterable
class ChunkDataLoader(object):
""" Data Loader for Chunked Dataset.
Args:
datasets (list): list of data item list.
batch_size (int): Number of tokens per batch.
shuffle (bool): Whether the data is shuffled.
is_labeled (bool): Whether the data is labeled.
sampler (obj): Data sampler.
"""
def __init__(self, datasets, batch_size, shuffle, is_labeled, sampler):
self.datasets = datasets
self.batch_size = batch_size
self.shuffle = shuffle
self.is_labeled = is_labeled
self.cur_iter = self._next_dataset_iterator(datasets)
assert self.cur_iter is not None
self.sampler = sampler
def eachiter(self):
dataset_iter = (d for d in self.datasets)
while self.cur_iter is not None:
for batch in self.cur_iter:
yield batch
self.cur_iter = self._next_dataset_iterator(dataset_iter)
def __iter__(self):
return self.sampler.iter(self.eachiter())
def _next_dataset_iterator(self, dataset_iter):
try:
# Drop the current dataset for decreasing memory
if hasattr(self, "cur_dataset"):
self.cur_dataset = None
gc.collect()
del self.cur_dataset
gc.collect()
self.cur_dataset = next(dataset_iter)
except StopIteration:
return None
return DataIterator(
dataset=self.cur_dataset,
batch_size=self.batch_size,
shuffle=self.shuffle,
is_labeled=self.is_labeled,
)
class Bunch(object):
""" Class which convert a dictionary to an object """
@ -34,21 +109,28 @@ class Bunch(object):
self.__dict__.update(adict)
def get_dataloader(data_iter, shuffle=True, is_labeled=False, batch_size=3000):
def get_dataloader(
data_iter, shuffle=True, is_labeled=False, batch_size=3000, world_size=1, rank=-1
):
"""
Function to get data iterator over a list of data objects.
Args:
data_iter (generator): data generator.
shuffle (bool): whether the data is shuffled.
is_labeled (bool): specifies whether the data objects are labeled data.
batch_size (int): number of tokens per batch.
data_iter (generator): Data generator.
shuffle (bool): Whether the data is shuffled. Defaults to True.
is_labeled (bool): Whether the data objects are labeled data.
Defaults to False.
batch_size (int): Number of tokens per batch. Defaults to 3000.
world_size (int): Total number of GPUs that will be used. Defaults to 1.
rank (int): Rank of the current GPU. Defaults to -1.
Returns:
DataIterator
"""
return data_loader.Dataloader(data_iter, batch_size, shuffle=shuffle, is_labeled=is_labeled)
sampler = IterableDistributedSampler(world_size, rank)
return ChunkDataLoader(
data_iter, batch_size, shuffle=shuffle, is_labeled=is_labeled, sampler=sampler
)
def get_dataset(file):
@ -77,7 +159,9 @@ class ExtSumProcessedIterableDataset(IterableDataset):
if self.is_shuffle:
return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(self.file_list)))
else:
return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(random.shuffle(self.file_list))))
return itertools.chain.from_iterable(
map(get_dataset, itertools.cycle(random.shuffle(self.file_list)))
)
def __iter__(self):
return self.get_stream()
@ -96,12 +180,12 @@ class ExtSumProcessedDataset(Dataset):
files is shuffled when the dataset is loaded. Defaults to False.
"""
self.file_list = file_list
self.file_list = sorted(file_list)
if is_shuffle:
random.shuffle(file_list)
random.shuffle(self.file_list)
self.data = []
for file in file_list:
self.data.extend(torch.load(file))
for f in self.file_list:
self.data.extend(torch.load(f))
def __len__(self):
return len(self.data)
@ -110,7 +194,9 @@ class ExtSumProcessedDataset(Dataset):
return self.data[idx]
def get_pred(example, sent_scores, cal_lead=False, sentence_separator="<q>", block_trigram=True, top_n=3):
def get_pred(
example, sent_scores, cal_lead=False, sentence_separator="<q>", block_trigram=True, top_n=3
):
"""
Get the summarization prediction for the paragraph example based on the scores
returned by the transformer summarization model.
@ -223,7 +309,9 @@ class ExtSumProcessedData:
def _get_files(self, root):
train_files = []
test_files = []
files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))]
files = [
os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
]
for fname in files:
if fname.find("train") != -1:
train_files.append(fname)
@ -484,7 +572,9 @@ class ExtractiveSummarizer(Transformer):
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
"""
super().__init__(model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir)
super().__init__(
model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir
)
if model_name not in self.list_supported_models():
raise ValueError(
"Model name {} is not supported by ExtractiveSummarizer. "
@ -530,6 +620,8 @@ class ExtractiveSummarizer(Transformer):
report_every=50,
verbose=True,
seed=None,
save_every=-1,
world_size=1,
**kwargs,
):
"""
@ -582,11 +674,19 @@ class ExtractiveSummarizer(Transformer):
)
# batch_size is the number of tokens in a batch
train_dataloader = get_dataloader(train_dataset.get_stream(), is_labeled=True, batch_size=batch_size)
train_dataloader = get_dataloader(
train_dataset.get_stream(),
is_labeled=True,
batch_size=batch_size,
world_size=world_size,
rank=local_rank,
)
# compute the max number of training steps
max_steps = compute_training_steps(
train_dataloader, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps,
train_dataloader,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
)
super().fine_tune(
@ -603,6 +703,7 @@ class ExtractiveSummarizer(Transformer):
seed=seed,
report_every=report_every,
clip_grad_norm=False,
save_every=save_every,
)
def predict(
@ -647,32 +748,22 @@ class ExtractiveSummarizer(Transformer):
# tuple_batch = [list(col) for col in zip(*[d.values() for d in dict_list]
if dict_list is None or len(dict_list) <= 0:
return None
is_labeled = False
if "labels" in dict_list[0]:
is_labeled = True
tuple_batch = [list(d.values()) for d in dict_list]
## generate mask and mask_cls, and only select tensors for the model input
batch = Batch(tuple_batch, is_labeled=True)
if is_labeled:
return {
"src": batch.src,
"segs": batch.segs,
"clss": batch.clss,
"mask": batch.mask,
"mask_cls": batch.mask_cls,
"labels": batch.labels,
}
else:
return {
"src": batch.src,
"segs": batch.segs,
"clss": batch.clss,
"mask": batch.mask,
"mask_cls": batch.mask_cls,
}
## the labels are never used in prediction, so set is_labeled to False
batch = Batch(tuple_batch, is_labeled=False)
return {
"src": batch.src,
"segs": batch.segs,
"clss": batch.clss,
"mask": batch.mask,
"mask_cls": batch.mask_cls,
}
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn)
test_dataloader = DataLoader(
test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn
)
sent_scores = self.predict_scores(test_dataloader, num_gpus=num_gpus, gpu_ids=gpu_ids)
sent_scores_list = list(sent_scores)
scores_list = []
@ -722,12 +813,34 @@ class ExtractiveSummarizer(Transformer):
)
return preds
def save_model(self, name):
output_model_dir = os.path.join(self.cache_dir, "fine_tuned")
def save_model(self, full_name=None):
"""
Save the trained model.
os.makedirs(self.cache_dir, exist_ok=True)
os.makedirs(output_model_dir, exist_ok=True)
Args:
full_name (str, optional): File name for saving the model's `state_dict()`. If it's None,
the model is saved under the "fine_tuned" folder of the object's cache directory.
Defaults to None.
"""
model_to_save = (
self.model.module if hasattr(self.model, "module") else self.model
) # Take care of distributed/parallel training
if full_name is None:
output_model_dir = os.path.join(self.cache_dir, "fine_tuned")
os.makedirs(self.cache_dir, exist_ok=True)
os.makedirs(output_model_dir, exist_ok=True)
full_name = os.path.join(output_model_dir, name)
full_name = os.path.join(output_model_dir, name)
logger.info("Saving model checkpoint to %s", full_name)
torch.save(self.model, name)
try:
print("saving through pytorch")
torch.save(model_to_save.state_dict(), full_name)
except OSError:
try:
print("saving as pickle")
pickle.dump(model_to_save.state_dict(), open(full_name, "wb"))
except Exception:
raise
except Exception:
raise
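A minimal round-trip sketch of the new full_name path, mirroring what the AML notebook earlier in this diff does when it reloads the downloaded checkpoint (model name, encoder, and paths are illustrative):

import torch
from utils_nlp.models.transformers.extractive_summarization import ExtractiveSummarizer

summarizer = ExtractiveSummarizer("distilbert-base-uncased", "transformer", "./cache")
summarizer.save_model("./dist_extsum.pt")  # torch.save of the model's state_dict()
summarizer.model.load_state_dict(torch.load("./dist_extsum.pt"))  # reload the weights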

View file

@ -27,19 +27,41 @@ import jsonlines
import torch
from torch.utils.data import TensorDataset
from tqdm import tqdm
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForQuestionAnswering
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForQuestionAnswering
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForQuestionAnswering
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForQuestionAnswering
from transformers.modeling_albert import (
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
AlbertForQuestionAnswering,
)
from transformers.modeling_bert import (
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
BertForQuestionAnswering,
)
from transformers.modeling_distilbert import (
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
DistilBertForQuestionAnswering,
)
from transformers.modeling_xlnet import (
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
XLNetForQuestionAnswering,
)
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
from utils_nlp.common.pytorch_utils import (
compute_training_steps,
get_device,
move_model_to_device,
)
from utils_nlp.models.transformers.common import (
MAX_SEQ_LEN,
TOKENIZER_CLASS,
Transformer,
)
MODEL_CLASS = {}
MODEL_CLASS.update({k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update({k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
MODEL_CLASS.update(
{k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
)
MODEL_CLASS.update({k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
# cached files during preprocessing
@ -62,25 +84,29 @@ class QAProcessor:
Class for preprocessing and postprocessing question answering data.
Args:
model_name (str, optional): Name of the model. Call QAProcessor.list_supported_models() to
get all supported models. Defaults to "bert-base-cased".
model_name (str, optional): Name of the model.
Call QAProcessor.list_supported_models() to get all supported models.
Defaults to "bert-base-cased".
to_lower (bool, optional): Whether to convert all letters to lower case during
tokenization. This is determined by if a cased model is used. Defaults to False,
which corresponds to a cased model.
custom_tokenize (function, optional): A custom tokenize function used to tokenize the
input text. If not provided, the default tokenizer corresponding to the model_name
is loaded and its `tokenize` method is used. NOTE that even this function is
provided, the numerical token ids are still generated by the `convert_tokens_to_ids`
method of the default tokenizer, so there is a risk that tokens generated by the
custom_tokenize function don't have correponding token ids in the default toeknizer.
tokenization. This is determined by whether a cased model is used.
Defaults to False, which corresponds to a cased model.
custom_tokenize (function, optional): A custom tokenize function
used to tokenize the input text. If not provided, the default tokenizer
corresponding to the model_name is loaded and its `tokenize` method is used.
NOTE that even this function is provided, the numerical token ids are still
generated by the `convert_tokens_to_ids` method of the default tokenizer,
so there is a risk that tokens generated by the custom_tokenize
function don't have correponding token ids in the default toeknizer.
Defaults to None.
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
"""
def __init__(self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."):
def __init__(
self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir=".",
):
self.model_name = model_name
self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False,
)
self.do_lower_case = to_lower
self.custom_tokenize = custom_tokenize
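
A minimal sketch of constructing the processor described above. The whitespace tokenizer is purely illustrative of the optional custom_tokenize hook; as the docstring notes, token ids are still produced by the default tokenizer's `convert_tokens_to_ids`:

# Sketch only: instantiate the processor for a cased BERT checkpoint.
def simple_whitespace_tokenize(text):
    return text.split()  # hypothetical custom tokenizer; omit to use the default

qa_processor = QAProcessor(
    model_name="bert-base-cased",
    to_lower=False,  # cased model
    custom_tokenize=simple_whitespace_tokenize,
    cache_dir=".",
)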
@ -149,9 +175,6 @@ class QAProcessor:
self,
qa_dataset,
is_training,
batch_size=32,
num_gpus=None,
distributed=False,
max_question_length=64,
max_seq_length=MAX_SEQ_LEN,
doc_stride=128,
@ -161,28 +184,31 @@ class QAProcessor:
Preprocesses raw question answering data and generates train/test features.
Args:
qa_dataset (:class:`utils_nlp.dataset.pytorch.QADataset`): Question answering data in
standard QADataset format.
qa_dataset (:class:`utils_nlp.dataset.pytorch.QADataset`):
Question answering data in standard QADataset format.
is_training (bool): Whether the input data is training data.
max_question_length (int, optional): Maximum number of tokens of the question sequence
after tokenization, so the number of words in the raw question is usually less than
max_question_length. Defaults to 64.
max_seq_length (int, optional): Maximum number of tokens of the entire feature token
sequence after tokenization. The entire feature token sequence is composed
of [CLS] + [Question tokens] + [SEP] + [Document tokens] + [SEP] for models other
than XLNet, and [Document tokens] + [SEP] + [Question tokens] + [SEP] + [CLS} for
max_question_length (int, optional): Maximum number of tokens
of the question sequence after tokenization. Since tokenization
can split a word into several tokens, the raw question should
usually contain fewer than max_question_length words.
Defaults to 64.
max_seq_length (int, optional): Maximum number of tokens of the entire
feature token sequence after tokenization. The entire feature token
sequence is composed of:
[CLS] + [Question tokens] + [SEP] + [Document tokens] + [SEP]
for models other than XLNet,
and [Document tokens] + [SEP] + [Question tokens] + [SEP] + [CLS] for
XLNet. Defaults to MAX_SEQ_LEN.
doc_stride (int, optional): Size (number of tokens) of the sliding window when
breaking down a long document paragraph in to multiple document spans. Defaults
to 128.
feature_cache_dir (int, optional): Directory to save some intermediate preprocessing
results.
doc_stride (int, optional): Size (number of tokens) of the sliding window
when breaking down a long document paragraph into multiple document
spans. Defaults to 128.
feature_cache_dir (str, optional): Directory to save some intermediate
preprocessing results.
If `is_training` is True, CACHED_EXAMPLES_TRAIN_FILE and
CACHED_FEATURES_TRAIN_FILE are saved to this directory. Otherwise,
CACHED_EXAMPLES_TEST_FILE and CACHED_FEATURES_TEST_FILE are saved to this
directory. These files are required during postprocessing to generate the final
answer texts from predicted answer start and answer end indices. Defaults to
"./cached_qa_features".
CACHED_EXAMPLES_TEST_FILE and CACHED_FEATURES_TEST_FILE are saved
to this directory. These files are required during postprocessing to
generate the final answer texts from predicted answer start and answer
end indices. Defaults to "./cached_qa_features".
Returns:
DataSet: A Pytorch DataSet.
"""
@ -217,7 +243,9 @@ class QAProcessor:
qa_examples.append(qa_example_cur)
qa_examples_json.append({"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens})
qa_examples_json.append(
{"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens}
)
features_cur = _create_qa_features(
qa_example_cur,
@ -257,17 +285,25 @@ class QAProcessor:
input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.long)
if is_training:
start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
qa_dataset = TensorDataset(
input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask,
input_ids,
input_mask,
segment_ids,
start_positions,
end_positions,
cls_index,
p_mask,
)
else:
unique_id_all = torch.tensor(unique_id_all, dtype=torch.long)
qa_dataset = TensorDataset(input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all)
qa_dataset = TensorDataset(
input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all
)
return qa_dataset
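
Because batching arguments were dropped from `preprocess` (see the earlier hunk removing `batch_size`/`num_gpus`), the returned TensorDataset is wrapped in a DataLoader by the caller. A sketch with plain PyTorch; the batch size and the `train_features`/`test_features` names are illustrative:

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Sketch only: build loaders outside preprocess; shuffle training data, keep test order.
train_dataloader = DataLoader(train_features, sampler=RandomSampler(train_features), batch_size=8)
test_dataloader = DataLoader(test_features, sampler=SequentialSampler(test_features), batch_size=8)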
@ -397,7 +433,14 @@ class QAResult(QAResult_):
QAResultExtended_ = collections.namedtuple(
"QAResultExtended",
["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits",],
[
"unique_id",
"start_top_log_probs",
"start_top_index",
"end_top_log_probs",
"end_top_index",
"cls_logits",
],
)
@ -481,36 +524,45 @@ class AnswerExtractor(Transformer):
num_epochs (int, optional): Number of training epochs. Defaults to 1.
max_steps (int, optional): Total number of training steps.
If set to a positive value, it overrides num_epochs.
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
Otherwise, it's determined by the dataset length,
gradient_accumulation_steps, and num_epochs.
Defaults to -1.
gradient_accumulation_steps (int, optional): Number of steps to accumulate
before performing a backward/update pass.
Defaults to 1.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will be used.
num_gpus (int, optional): The number of GPUs to use.
If None, all available GPUs will be used.
If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
-1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each parameter update.
Defaults to 0.0.
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
5e-5.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
to `learning rate`. Defaults to 0.
verbose (bool, optional): Whether to print out the training log. Defaults to True.
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
cache_model (bool, optional): Whether to save the fine-tuned model. If True,
the fine-tuned model is saved to a `fine_tuned` folder under of the `cache_dir`
of AnswerExtractor. Defaults to True.
local_rank (int, optional): Local rank for distributed training on GPUs.
Defaults to -1, which means non-distributed training.
weight_decay (float, optional): Weight decay to apply after each
parameter update. Defaults to 0.0.
learning_rate (float, optional): Learning rate of the AdamW optimizer.
Defaults to 5e-5.
adam_epsilon (float, optional): Epsilon of the AdamW optimizer.
Defaults to 1e-8.
warmup_steps (int, optional): Number of steps taken to increase
learning rate from 0 to `learning rate`.
Defaults to 0.
verbose (bool, optional): Whether to print out the training log.
Defaults to True.
seed (int, optional): Random seed used to improve reproducibility.
Defaults to None.
cache_model (bool, optional): Whether to save the fine-tuned model.
If True, the fine-tuned model is saved to a `fine_tuned` folder
under the `cache_dir` of AnswerExtractor.
Defaults to True.
"""
# init optimizer
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
optimizer = Transformer.get_default_optimizer(
self.model, weight_decay, learning_rate, adam_epsilon
)
# compute the max number of training steps
max_steps = compute_training_steps(
@ -522,7 +574,7 @@ class AnswerExtractor(Transformer):
# init scheduler
scheduler = Transformer.get_default_scheduler(
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps
)
# fine tune
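
A minimal fine-tuning sketch for the fit method documented above; the AnswerExtractor constructor arguments and the hyperparameter values are illustrative rather than prescribed:

# Sketch only: fine-tune on the preprocessed training loader.
qa_extractor = AnswerExtractor(model_name="bert-base-cased", cache_dir=".")
qa_extractor.fit(
    train_dataloader=train_dataloader,
    num_epochs=1,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
    warmup_steps=0,
    cache_model=True,  # saves the fine-tuned model under cache_dir/fine_tuned
)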
@ -530,7 +582,7 @@ class AnswerExtractor(Transformer):
train_dataloader=train_dataloader,
get_inputs=QAProcessor.get_inputs,
num_gpus=num_gpus,
gpu_ids=gpu_ids,
gpu_ids=gpu_ids,
max_steps=max_steps,
gradient_accumulation_steps=gradient_accumulation_steps,
optimizer=optimizer,
@ -550,13 +602,15 @@ class AnswerExtractor(Transformer):
Args:
test_dataloader (DataLoader): DataLoader for scoring the data.
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
be used. If set to 0 or GPUs are not available, CPU device will
be used. Defaults to None.
num_gpus (int, optional): The number of GPUs to use.
If None, all available GPUs will be used.
If set to 0 or GPUs are not available, CPU device will be used.
Defaults to None.
gpu_ids (list): List of GPU IDs to be used.
If set to None, the first num_gpus GPUs will be used.
Defaults to None.
verbose (bool, optional): Whether to print out the predicting log. Defaults to True.
verbose (bool, optional): Whether to print out the predicting log.
Defaults to True.
Returns:
list: List of :class:`QAResult` or :class:`QAResultExtended`.
@ -589,7 +643,9 @@ class AnswerExtractor(Transformer):
)
else:
result = QAResult(
unique_id=u_id.item(), start_logits=_to_list(outputs[0][i]), end_logits=_to_list(outputs[1][i]),
unique_id=u_id.item(),
start_logits=_to_list(outputs[0][i]),
end_logits=_to_list(outputs[1][i]),
)
all_results.append(result)
torch.cuda.empty_cache()
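
A scoring sketch for the method documented above, assuming it is exposed as `AnswerExtractor.predict`; each returned element is a QAResult (or QAResultExtended for XLNet):

# Sketch only: score the test loader; results feed the postprocessing functions below.
qa_results = qa_extractor.predict(test_dataloader, num_gpus=1, verbose=True)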
@ -612,53 +668,61 @@ def postprocess_bert_answer(
verbose_logging=False,
):
"""
Postprocesses start and end logits generated by :meth:`AnswerExtractor.fit` for BERT.
Postprocesses start and end logits
generated by :meth:`AnswerExtractor.predict` for BERT.
Args:
results (list): List of :class:`QAResult`.
examples_file (str): One of the files cached by :meth:`QAProcessor.preprocess`. This file
contains the original document tokens that are used to generate the final answers
from the predicted start and end positions.
features_file (str): One of the files cached by :meth:`QAProcessor.preprocess`. This file
contains the mapping from indices in the processed token list to the original
document tokens that are used to generate the final predicted answers.
do_lower_case (bool): Whether an uncased tokenizer was used during data preprocessing.
This is required during answer finalization by comparing the predicted answer text
and the original text span in :func:`_get_final_text`.
unanswerable_exists (bool, optional): Whether there are unanswerable questions in the
data. If True, the start and end logits of the [CLS] token, which indicate the
probability of the answer being empty, are included in the candidate answer list.
examples_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.
This file contains the original document tokens that are used to generate
the final answers from the predicted start and end positions.
features_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.
This file contains the mapping from indices in the processed token list
to the original document tokens that are used to generate the final
predicted answers.
do_lower_case (bool): Whether an uncased tokenizer was used during
data preprocessing. This is required during answer finalization
by comparing the predicted answer text and the original
text span in :func:`_get_final_text`.
unanswerable_exists (bool, optional): Whether there are unanswerable
questions in the data. If True, the start and end logits of the [CLS]
token, which indicate the probability of the answer being empty,
are included in the candidate answer list.
Defaults to False.
n_best_size (int, optional): The number of candidates to choose from each QAResult to
generate the final prediction. It's also the maximum number of n-best answers to
output for each question. Note that the number of n-best answers can be smaller than
`n_best_size` because some unqualified answers, e.g. answer that are too long,
are removed.
n_best_size (int, optional): The number of candidates to choose from each
QAResult to generate the final prediction. It's also the maximum number
of n-best answers to output for each question.
Note that the number of n-best answers can be smaller than `n_best_size`
because some unqualified answers,
e.g. answers that are too long, are removed.
max_answer_length (int, optional): Maximum length of the answer. Defaults to 30.
output_prediction_file (str, optional): Path of the file to save the predicted answers.
Defaults to "./qa_predictions.json".
output_nbest_file (str, optional): Path of the file to save the n-best answers. Defaults
to "./nbest_predictions.json".
output_null_log_odds_file (str, optional): If unanswerable_exists is True, the score
difference between empty prediction and best non-empty prediction are saved to this
file. These scores can be used to find the best threshold for predicting an empty
answer. Defaults to "./null_odds.json".
null_score_diff_threshold (float, optional): If unanswerable_exists=True and the score
difference between empty prediction and best non-empty prediction is higher than this
threshold, the final predicted answer is empty. Defaults to 0.0.
verbose_logging (bool, optional): Whether to log details of answer postprocessing.
Defaults to False.
output_prediction_file (str, optional): Path of the file to save the
predicted answers. Defaults to "./qa_predictions.json".
output_nbest_file (str, optional): Path of the file to save the n-best answers.
Defaults to "./nbest_predictions.json".
output_null_log_odds_file (str, optional): If unanswerable_exists is True,
the score difference between empty prediction and best non-empty prediction
is saved to this file. These scores can be used to find the best threshold
for predicting an empty answer. Defaults to "./null_odds.json".
null_score_diff_threshold (float, optional): If unanswerable_exists=True
and the score difference between empty prediction and best non-empty
prediction is higher than this threshold, the final predicted
answer is empty.
Defaults to 0.0.
verbose_logging (bool, optional): Whether to log details of
answer postprocessing. Defaults to False.
Returns:
tuple: (OrderedDict, OrderedDict, OrderedDict)
The keys of the dictionaries are the `qa_id` in the original
:class:`utils_nlp.dataset.pytorch.QADataset`
The values of the first dictionary are the predicted answer texts in string type.
The values of the second dictionary are the softmax probabilities of the predicted
answers.
The values of the third dictionary are the n-best answers for each qa_id. Note that
the number of n-best answers can be smaller than `n_best_size` because some
unqualified answers, e.g. answers that are too long, are removed.
The values of the first dictionary are the predicted answer texts
in string type. The values of the second dictionary are the softmax
probabilities of the predicted answers.
The values of the third dictionary are the n-best answers for each qa_id.
Note that the number of n-best answers can be smaller than `n_best_size`
because some unqualified answers, e.g. answers that are too long,
are removed.
"""
with jsonlines.open(examples_file) as reader:
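
A usage sketch for the postprocessing step, assuming the cached example/feature files were written by `QAProcessor.preprocess` with the default `feature_cache_dir`; joining that directory with the module-level file-name constants is an assumption about how the paths are laid out:

import os

# Sketch only: convert predicted logits into final answer texts.
examples_file = os.path.join("./cached_qa_features", CACHED_EXAMPLES_TEST_FILE)
features_file = os.path.join("./cached_qa_features", CACHED_FEATURES_TEST_FILE)
answers, answer_probs, nbest_answers = postprocess_bert_answer(
    results=qa_results,
    examples_file=examples_file,
    features_file=features_file,
    do_lower_case=False,
    unanswerable_exists=False,
)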
@ -753,7 +817,9 @@ def postprocess_bert_answer(
# Sort by the sum of the start and end logits in ascending order,
# so that the first element is the most probable answer
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
)
seen_predictions = {}
nbest = []
@ -786,11 +852,19 @@ def postprocess_bert_answer(
final_text = ""
seen_predictions[final_text] = True
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
)
)
# if we didn't include the empty option in the n-best, include it
if unanswerable_exists:
if "" not in seen_predictions:
nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
nbest.append(
_NbestPrediction(
text="", start_logit=null_start_logit, end_logit=null_end_logit
)
)
# In very rare edge cases we could only have single null prediction.
# So we just create a nonce prediction in this case to avoid failure.
@ -834,7 +908,9 @@ def postprocess_bert_answer(
all_probs[example["qa_id"]] = nbest_json[0]["probability"]
else:
# predict "" iff the null score - the score of best non-null > threshold
score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
score_diff = (
score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
)
scores_diff_json[example["qa_id"]] = score_diff
if score_diff > null_score_diff_threshold:
all_predictions[example["qa_id"]] = ""
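
To make the threshold rule above concrete, a tiny sketch with illustrative numbers:

# Sketch only: predict an empty answer when the null score beats the best span by more
# than null_score_diff_threshold (0.0 here).
score_null = 2.5  # start+end logits of the [CLS] (empty) prediction
best_start_logit, best_end_logit = 1.0, 0.8
score_diff = score_null - best_start_logit - best_end_logit  # 0.7
predict_empty = score_diff > 0.0  # True -> final answer is ""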
@ -1000,7 +1076,9 @@ def postprocess_xlnet_answer(
)
)
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
prelim_predictions = sorted(
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
)
seen_predictions = {}
nbest = []
@ -1031,14 +1109,20 @@ def postprocess_xlnet_answer(
tok_text = " ".join(tok_text.split())
orig_text = " ".join(orig_tokens)
final_text = _get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging)
final_text = _get_final_text(
tok_text, orig_text, tokenizer.do_lower_case, verbose_logging
)
if final_text in seen_predictions:
continue
seen_predictions[final_text] = True
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
nbest.append(
_NbestPrediction(
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
)
)
# In very rare edge cases we could have no valid predictions. So we
# just create a nonce prediction in this case to avoid failure.
@ -1185,7 +1269,9 @@ def _create_qa_example(qa_input, is_training):
actual_text = " ".join(d_tokens[start_position : (end_position + 1)])
cleaned_answer_text = " ".join(whitespace_tokenize(a_text))
if actual_text.find(cleaned_answer_text) == -1:
logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
logger.warning(
"Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text,
)
return
else:
start_position = -1
@ -1408,7 +1494,7 @@ def _create_qa_features(
else:
tok_end_position = len(all_doc_tokens) - 1
(tok_start_position, tok_end_position) = _improve_answer_span(
all_doc_tokens, tok_start_position, tok_end_position, example.orig_answer_text
all_doc_tokens, tok_start_position, tok_end_position, example.orig_answer_text,
)
# The -3 accounts for [CLS], [SEP] and [SEP]
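
A small arithmetic sketch of the span budget implied by the comment above; the numbers are illustrative:

# Sketch only: with max_seq_length=384 and a 20-token query, each document span can
# hold 384 - 20 - 3 = 361 tokens; [CLS] and the two [SEP] markers take the remaining 3.
# Longer paragraphs are split into overlapping spans that advance by doc_stride tokens.
max_tokens_for_doc = 384 - 20 - 3  # 361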
@ -1579,7 +1665,7 @@ def _create_qa_features(
# -------------------------------------------------------------------------------------------------
# Post processing helper functions
_PrelimPrediction = collections.namedtuple(
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"],
)
_NbestPrediction = collections.namedtuple("NbestPrediction", ["text", "start_logit", "end_logit"])
@ -1644,7 +1730,9 @@ def _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
if len(orig_ns_text) != len(tok_ns_text):
if verbose_logging:
logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
logger.info(
"Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text,
)
return orig_text
# We then project the characters in `pred_text` back to `orig_text` using