Merge branch 'staging' into hlu/unilm_abstractive_summarization
This commit is contained in:
Commit
2f6aaf7a41
.flake8 (3 changes)
@ -14,4 +14,5 @@
# E722: do not use bare except
# E231: missing white space after "," --> black generates autoformat [,] which fails flake8
ignore = E203, E266, W503, F403, F405, E402, E731, F821, E722, E231
max-line-length = 100
max-line-length = 88

@ -87,7 +87,7 @@ The following is a list of related repositories that we like and think are useful
|[MASS](https://github.com/microsoft/MASS)|MASS: Masked Sequence to Sequence Pre-training for Language Generation.|
|[MT-DNN](https://github.com/namisan/mt-dnn)|Multi-Task Deep Neural Networks for Natural Language Understanding.|
|[UniLM](https://github.com/microsoft/unilm)|Unified Language Model Pre-training.|
|[DialoGPT](https://github.com/microsoft/DialoGPT)|DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation|

## Build Status

@ -53,7 +53,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {
"scrolled": false
},

@ -76,6 +76,7 @@
"\n",
"from utils_nlp.models.transformers.sequence_classification import Processor, SequenceClassifier\n",
"from utils_nlp.dataset.multinli import load_pandas_df\n",
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
"from utils_nlp.common.timer import Timer"
]
},

@ -201,6 +202,7 @@
"metadata": {},
"outputs": [],
"source": [
"# sample\n",
"train_df = train_df.sample(frac=TRAIN_DATA_USED_FRACTION).reset_index(drop=True)\n",
"dev_df_matched = dev_df_matched.sample(frac=DEV_DATA_USED_FRACTION).reset_index(drop=True)\n",
"dev_df_mismatched = dev_df_mismatched.sample(frac=DEV_DATA_USED_FRACTION).reset_index(drop=True)"

@ -224,7 +226,7 @@
"source": [
"## Tokenize and Preprocess\n",
"Before training, we tokenize and preprocess the sentence texts to convert them into the format required by transformer model classes. \n",
"The `create_dataloader_from_df` method of the `Processor` class performs the following preprocessing steps and returns a Pytorch `DataLoader`\n",
"The `dataset_from_dataframe` method of the `Processor` class performs the following preprocessing steps and returns a PyTorch `Dataset`:\n",
"* Tokenize input texts using the tokenizer of the pre-trained model specified by `model_name`. \n",
"* Convert the tokens into token indices corresponding to the tokenizer's vocabulary.\n",
"* Pad or truncate the token lists to the specified max length."

@ -237,30 +239,35 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"processor = Processor(model_name=MODEL_NAME, cache_dir=CACHE_DIR, to_lower=TO_LOWER)\n",
|
||||
"train_dataloader = processor.create_dataloader_from_df(\n",
|
||||
"\n",
|
||||
"train_dataset = processor.dataset_from_dataframe(\n",
|
||||
" df=train_df,\n",
|
||||
" text_col=TEXT_COL_1,\n",
|
||||
" label_col=LABEL_COL_NUM,\n",
|
||||
" shuffle=True,\n",
|
||||
" text2_col=TEXT_COL_2,\n",
|
||||
" max_len=MAX_SEQ_LENGTH,\n",
|
||||
" batch_size=BATCH_SIZE,\n",
|
||||
")\n",
|
||||
"dev_dataloader_matched = processor.create_dataloader_from_df(\n",
|
||||
"dev_dataset_matched = processor.dataset_from_dataframe(\n",
|
||||
" df=dev_df_matched,\n",
|
||||
" text_col=TEXT_COL_1,\n",
|
||||
" shuffle=False,\n",
|
||||
" text_col=TEXT_COL_1, \n",
|
||||
" text2_col=TEXT_COL_2,\n",
|
||||
" max_len=MAX_SEQ_LENGTH,\n",
|
||||
" batch_size=BATCH_SIZE,\n",
|
||||
")\n",
|
||||
"dev_dataloader_mismatched = processor.create_dataloader_from_df(\n",
|
||||
"dev_dataset_mismatched = processor.dataset_from_dataframe(\n",
|
||||
" df=dev_df_mismatched,\n",
|
||||
" text_col=TEXT_COL_1,\n",
|
||||
" shuffle=False,\n",
|
||||
" text_col=TEXT_COL_1, \n",
|
||||
" text2_col=TEXT_COL_2,\n",
|
||||
" max_len=MAX_SEQ_LENGTH,\n",
|
||||
" batch_size=BATCH_SIZE,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"train_dataloader = dataloader_from_dataset(\n",
|
||||
" train_dataset, batch_size=BATCH_SIZE, shuffle=True\n",
|
||||
")\n",
|
||||
"dev_dataloader_matched = dataloader_from_dataset(\n",
|
||||
" dev_dataset_matched, batch_size=BATCH_SIZE, shuffle=False\n",
|
||||
")\n",
|
||||
"dev_dataloader_mismatched = dataloader_from_dataset(\n",
|
||||
" dev_dataset_mismatched, batch_size=BATCH_SIZE, shuffle=False\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
|
|
|
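Condensed, the hunk above switches the sequence-classification notebook from the old one-step `create_dataloader_from_df` call to a two-step pattern; the sketch below is simply that pattern pulled out of the diff (all names, including the BATCH_SIZE and MAX_SEQ_LENGTH constants, are the notebook's own):

# 1) build a plain PyTorch dataset from the dataframe ...
train_dataset = processor.dataset_from_dataframe(
    df=train_df, text_col=TEXT_COL_1, label_col=LABEL_COL_NUM,
    text2_col=TEXT_COL_2, max_len=MAX_SEQ_LENGTH,
)
# 2) ... then wrap it in a dataloader; batching and shuffling now live here
train_dataloader = dataloader_from_dataset(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
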
@ -69,23 +69,19 @@
|
|||
"source": [
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"import scrapbook as sb\n",
|
||||
"\n",
|
||||
"import scrapbook as sb\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"nlp_path = os.path.abspath('../../')\n",
|
||||
"if nlp_path not in sys.path:\n",
|
||||
" sys.path.insert(0, nlp_path)\n",
|
||||
"\n",
|
||||
"from utils_nlp.common.pytorch_utils import dataloader_from_dataset\n",
|
||||
"from utils_nlp.common.timer import Timer\n",
|
||||
"from utils_nlp.dataset.squad import load_pandas_df\n",
|
||||
"from utils_nlp.eval.question_answering import evaluate_qa\n",
|
||||
"from utils_nlp.models.transformers.datasets import QADataset\n",
|
||||
"from utils_nlp.models.transformers.question_answering import (\n",
|
||||
" AnswerExtractor,\n",
|
||||
" QAProcessor,\n",
|
||||
" AnswerExtractor\n",
|
||||
")\n",
|
||||
" \n",
|
||||
"from utils_nlp.eval.question_answering import evaluate_qa\n",
|
||||
"from utils_nlp.common.timer import Timer"
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -559,7 +555,7 @@
|
|||
"* Pad the concatenated token sequence to `max_seq_length` if it's shorter.\n",
|
||||
"* Convert the tokens into token indices corresponding to the tokenizer's vocabulary.\n",
|
||||
"\n",
|
||||
"`QAProcessor.preprocess` returns a Pytorch Dataloader. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required by postprocessing the predicted answer start and end indices to get the final answer text. You can change the default file directory by specifying `feature_cache_dir`. "
"`QAProcessor.preprocess` returns a PyTorch Dataset. By default, it saves `cached_examples_train/test.jsonl` and `cached_features_train/test.jsonl` to `./cached_qa_features`. These files are required for postprocessing the predicted answer start and end indices into the final answer text. You can change the default directory by specifying `feature_cache_dir`. "
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -577,24 +573,28 @@
|
|||
],
|
||||
"source": [
|
||||
"qa_processor = QAProcessor(model_name=MODEL_NAME, to_lower=DO_LOWER_CASE)\n",
|
||||
"train_dataloader = qa_processor.preprocess(\n",
|
||||
" train_dataset, \n",
|
||||
" batch_size=PER_GPU_BATCH_SIZE,\n",
|
||||
" num_gpus=NUM_GPUS,\n",
|
||||
"train_dataset = qa_processor.preprocess(\n",
|
||||
" train_dataset,\n",
|
||||
" is_training=True,\n",
|
||||
" max_question_length=MAX_QUESTION_LENGTH,\n",
|
||||
" max_seq_length=MAX_SEQ_LENGTH,\n",
|
||||
" doc_stride=DOC_STRIDE\n",
|
||||
" doc_stride=DOC_STRIDE,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"dev_dataloader = qa_processor.preprocess(\n",
|
||||
" dev_dataset, \n",
|
||||
" batch_size=PER_GPU_BATCH_SIZE,\n",
|
||||
" num_gpus=NUM_GPUS,\n",
"# we keep a copy of the original dev_dataset as it is needed for evaluation\n",
|
||||
"dev_dataset_processed = qa_processor.preprocess(\n",
|
||||
" dev_dataset,\n",
|
||||
" is_training=False,\n",
|
||||
" max_question_length=MAX_QUESTION_LENGTH,\n",
|
||||
" max_seq_length=MAX_SEQ_LENGTH,\n",
|
||||
" doc_stride=DOC_STRIDE\n",
|
||||
" doc_stride=DOC_STRIDE,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"train_dataloader = dataloader_from_dataset(\n",
|
||||
" train_dataset, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=True\n",
|
||||
")\n",
|
||||
"dev_dataloader = dataloader_from_dataset(\n",
|
||||
" dev_dataset_processed, batch_size=PER_GPU_BATCH_SIZE, num_gpus=NUM_GPUS, shuffle=False\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
|
|
|
@ -525,7 +525,7 @@
|
|||
"source": [
|
||||
"with Timer() as t:\n",
|
||||
" preds = model.predict(\n",
|
||||
" eval_dataloader=test_dataloader,\n",
|
||||
" test_dataloader=test_dataloader,\n",
|
||||
" num_gpus=CONFIG['num_gpus'],\n",
|
||||
" verbose=CONFIG['verbose']\n",
|
||||
" )\n",
|
||||
|
|
|
@ -0,0 +1,768 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Distributed Training For Extractive Summarization on CNN/DM Dataset\n",
|
||||
"\n",
|
||||
"## Summary\n",
|
||||
"This notebook demonstrates how to use Azure Machine Learning to run distributed training using Distributed Data Parallel in Pytorch for extractive summarization. For more detailed model related information, please see [extractive_summarization_cnndm_transformer.ipynb](extractive_summarization_cnndm_transformer.ipynb)\n",
|
||||
"\n",
|
||||
"## Prerequisites\n",
|
||||
"If you are using an Azure Machine Learning Notebook VM, you are all set. Otherwise, refer to the [Configuration Notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) first if you haven't already to establish your connection to the AzureML Workspace. Prerequisites are:\n",
|
||||
"\n",
|
||||
"- Azure subscription\n",
|
||||
"- Azure Machine Learning Workspace\n",
|
||||
"- Azure Machine Learning SDK\n",
|
||||
"\n",
"To run ROUGE evaluation, please refer to the compute_rouge_perl section in [summarization_evaluation.ipynb](summarization_evaluation.ipynb). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Import Libraries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%load_ext autoreload"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%autoreload 2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import sys\n",
|
||||
"from tempfile import TemporaryDirectory\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"import azureml.core\n",
|
||||
"from azureml.core import Experiment, Workspace, Run\n",
|
||||
"from azureml.core.compute import ComputeTarget, AmlCompute\n",
|
||||
"from azureml.core.compute_target import ComputeTargetException\n",
|
||||
"from azureml.train.dnn import PyTorch\n",
|
||||
"from azureml.train.dnn import Nccl\n",
|
||||
"from azureml.widgets import RunDetails\n",
|
||||
"\n",
|
||||
"nlp_path = os.path.abspath(\"../../\")\n",
|
||||
"if nlp_path not in sys.path:\n",
|
||||
" sys.path.insert(0, nlp_path)\n",
|
||||
"from utils_nlp.azureml.azureml_utils import get_or_create_workspace\n",
|
||||
"from utils_nlp.dataset.cnndm import CNNDMBertSumProcessedData, CNNDMSummarizationDataset\n",
|
||||
"\n",
|
||||
"# Check core SDK version number\n",
|
||||
"print(\"SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Configuration "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
"# for Azure ML Workspace\n",
"SUBSRIPTION_ID = \"YOUR_SUBSCRIPTION_ID\"\n",
"LOCATION = \"YOUR_WORKSPACE_REGION\" # example \"eastus2\"\n",
"RESOURCE_GROUP = \"YOUR_RESOURCE_GROUP_NAME\" # modify to use your own\n",
"WORKSPACE_NAME = \"YOUR_WORKSPACE_NAME\" # modify to use your own\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# for creating Azure ML Compute Cluster\n",
"AMLCOMPUTE_CLUSTER_NAME = \"extsum5\" # modify to use your own\n",
|
||||
"NODE_COUNT = 4\n",
|
||||
"VM_SIZE = \"STANDARD_NC6\" # this should be the VM that's supported by Azure and Azure ML\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# for creating Azure ML Experiment\n",
"EXPERIMENT_NAME = \"NLP-ExtSum\" # modify to use your own\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# local folder to save the downloaded data\n",
|
||||
"LOCAL_DATA_FOLDER = (\n",
|
||||
" \"./bertsumdata/\"\n",
|
||||
") # modify to use your own, the penultimate level folder should exist\n",
|
||||
"\n",
|
||||
"# Training related parameter\n",
|
||||
"MODEL_NAME = \"distilbert-base-uncased\" # limited choice\n",
|
||||
"ENCODER = \"transformer\"\n",
|
||||
"# folder in the workspace where the data is uploaded to\n",
|
||||
"TARGET_DATA_FOLDER = \"/bertsum_processed_data\" # modify to use your own\n",
|
||||
"TARGET_OUTPUT_DIR = f\"output/{EXPERIMENT_NAME}/\"\n",
|
||||
"# cache dir in the workspace\n",
|
||||
"TARGET_CACHE_DIR = f\"cache/{EXPERIMENT_NAME}/\"\n",
|
||||
"# file name for saving the prediction\n",
|
||||
"SUMMARY_FILENAME = \"generated_summaries.txt\"\n",
|
||||
"# file name for saving the trained model\n",
|
||||
"MODEL_FILENAME = \"dist_extsum.pt\"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# local path to download the output from the cluster\n",
"LOCAL_OUTPUT_DIR = \"./output\" # modify to use your own, the penultimate level folder should exist\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# local folder to store all the related files to be copied to the workspace\n",
|
||||
"PROJECT_FOLDER = \"./azureml_exp\"\n",
|
||||
"# conda environment name, the yaml file will be copied to the workspace\n",
|
||||
"CONDA_ENV_NAME = \"nlp_gpu\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create an AML Workspace"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create the workspace using the specified parameters\n",
|
||||
"ws = get_or_create_workspace(\n",
|
||||
" workspace_name=WORKSPACE_NAME,\n",
|
||||
" subscription_id=SUBSRIPTION_ID,\n",
|
||||
" resource_group=RESOURCE_GROUP,\n",
|
||||
" workspace_region=LOCATION,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\n",
|
||||
" \"Workspace name: \" + ws.name,\n",
|
||||
" \"Azure region: \" + ws.location,\n",
|
||||
" \"Subscription id: \" + ws.subscription_id,\n",
|
||||
" \"Resource group: \" + ws.resource_group,\n",
|
||||
" sep=\"\\n\",\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create an AML GPU Compute Cluster"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found existing compute target.\n",
|
||||
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-01-30T22:43:39.856000+00:00', 'errors': None, 'creationTime': '2020-01-23T04:50:26.160743+00:00', 'modifiedTime': '2020-01-23T20:31:35.349184+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT1200S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" gpu_compute_target = ComputeTarget(workspace=ws, name=AMLCOMPUTE_CLUSTER_NAME)\n",
|
||||
" print(\"Found existing compute target.\")\n",
|
||||
"except ComputeTargetException:\n",
|
||||
" print(\"Creating a new compute target...\")\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=VM_SIZE, max_nodes=NODE_COUNT, NodeIdleTimeBeforeScaleDown=\"PT1200S\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # create the cluster\n",
|
||||
" gpu_compute_target = ComputeTarget.create(\n",
|
||||
" ws, AMLCOMPUTE_CLUSTER_NAME, compute_config\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" gpu_compute_target.wait_for_completion(show_output=True)\n",
|
||||
"\n",
|
||||
"# use get_status() to get a detailed status for the current AmlCompute.\n",
|
||||
"print(gpu_compute_target.get_status().serialize())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Create an Experiment"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"experiment = Experiment(ws, name=EXPERIMENT_NAME)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Download Dataset to Local File System"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!mkdir -p {LOCAL_DATA_FOLDER}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"bertsum_data.zip: 869MB [00:29, 29.7MB/s] \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'./bertsumdata/'"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"CNNDMBertSumProcessedData.download(local_path=LOCAL_DATA_FOLDER)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Upload the Downloaded Dataset to AML Workspace"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ds = ws.get_default_datastore()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ds.upload(src_dir=LOCAL_DATA_FOLDER, target_path=TARGET_DATA_FOLDER)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Prepare for the Experiment Run\n",
"Prepare the local project folder that is mirrored to the workspace for the experiment."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Generated conda file: nlp_gpu.yaml\r\n",
|
||||
"\r\n",
|
||||
"To create the conda environment:\r\n",
|
||||
"$ conda env create -f nlp_gpu.yaml\r\n",
|
||||
"\r\n",
|
||||
"To update the conda environment:\r\n",
|
||||
"$ conda env update -f nlp_gpu.yaml\r\n",
|
||||
"\r\n",
|
||||
"To register the conda environment in Jupyter:\r\n",
|
||||
"$ conda activate nlp_gpu\r\n",
|
||||
"$ python -m ipykernel install --user --name nlp_gpu --display-name \"Python (nlp_gpu)\"\r\n",
|
||||
"\r\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ENTRY_SCRIPT = \"extractive_summarization_cnndm_distributed_train.py\"\n",
|
||||
"!mkdir -p {PROJECT_FOLDER}\n",
|
||||
"!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n",
|
||||
"!cp ./nlp_gpu.yaml {PROJECT_FOLDER}\n",
|
||||
"!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n",
|
||||
"!cp -r ../../utils_nlp {PROJECT_FOLDER}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Submit Run"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"os.makedirs(LOCAL_OUTPUT_DIR, exist_ok=True)\n",
|
||||
"os.makedirs(os.path.join(LOCAL_OUTPUT_DIR, EXPERIMENT_NAME), exist_ok=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"WARNING:azureml.train.estimator._framework_base_estimator:If environment_definition or conda_dependencies_file is specified, Azure ML will not install any framework related packages on behalf of the user.\n",
|
||||
"WARNING:azureml.train.estimator._framework_base_estimator:framework_version is not specified, defaulting to version 1.3.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"NcclConfig=Nccl()\n",
|
||||
"estimator = PyTorch(source_directory=PROJECT_FOLDER,\n",
|
||||
" compute_target=gpu_compute_target,\n",
|
||||
" script_params={\n",
|
||||
" \"--dist_url\": \"$AZ_BATCHAI_PYTORCH_INIT_METHOD\",\n",
|
||||
" \"--rank\": \"$AZ_BATCHAI_TASK_INDEX\",\n",
|
||||
" \"--node_count\": NODE_COUNT,\n",
|
||||
" \"--data_dir\":ds.path(f'{TARGET_DATA_FOLDER}').as_mount(),\n",
|
||||
" \"--cache_dir\": ds.path(f'{TARGET_CACHE_DIR}').as_mount(),\n",
|
||||
" '--output_dir':ds.path(f'{TARGET_OUTPUT_DIR}').as_mount(),\n",
|
||||
" \"--quick_run\": 'true',\n",
|
||||
" \"--summary_filename\": f'{SUMMARY_FILENAME}',\n",
|
||||
" \"--model_filename\": f'{MODEL_FILENAME}',\n",
|
||||
" \"--model_name\": MODEL_NAME,\n",
|
||||
" \"--encoder\": ENCODER\n",
|
||||
" },\n",
|
||||
" entry_script= ENTRY_SCRIPT,\n",
|
||||
" node_count=NODE_COUNT,\n",
|
||||
" distributed_training=NcclConfig,\n",
|
||||
" conda_dependencies_file=f'{CONDA_ENV_NAME}.yaml',\n",
|
||||
" use_gpu=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run = experiment.submit(estimator)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "97f3678284a44f7aab5c27fa3e19bb11",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"RunDetails(run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\"\"\"\n",
|
||||
"If you stop the notebook and come back, \n",
|
||||
"you'll need to use the run_id in the output of the previous cell \n",
|
||||
"to get run details.\n",
|
||||
"\"\"\"\n",
|
||||
"# fetched_run = Run(experiment, \"NLP-ExtSum_1579816237_ea238f69\")\n",
|
||||
"# RunDetails(fetched_run).show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Download Generated Summaries "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
"# need to clear the local output dir as ds.download won't download if the path already exists\n",
|
||||
"!rm -rf {LOCAL_OUTPUT_DIR}/* "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading output/NLP-ExtSum/generated_summaries.txt\n",
|
||||
"Downloaded output/NLP-ExtSum/generated_summaries.txt, 1 files out of an estimated total of 1\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1"
|
||||
]
|
||||
},
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ds.download(target_path=LOCAL_OUTPUT_DIR,\n",
|
||||
" prefix=f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}',\n",
|
||||
" show_progress=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from utils_nlp.eval.evaluate_summarization import get_rouge\n",
|
||||
"from utils_nlp.models.transformers.extractive_summarization import ExtSumProcessedData\n",
|
||||
"import pickle\n",
|
||||
"from utils_nlp.models.transformers.extractive_summarization import ExtractiveSummarizer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_dataset, test_dataset = ExtSumProcessedData().splits(root=LOCAL_DATA_FOLDER)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"target = [i['tgt_txt'] for i in test_dataset]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prediction = []\n",
|
||||
"with open(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{SUMMARY_FILENAME}'), \"r\") as filehandle:\n",
|
||||
" for cnt, line in enumerate(filehandle):\n",
|
||||
" prediction.append(line[0:-1]) # remove the ending \"\\n\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloading output/NLP-ExtSum/dist_extsum.pt\n",
|
||||
"Downloaded output/NLP-ExtSum/dist_extsum.pt, 1 files out of an estimated total of 1\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"100%|██████████| 546/546 [00:00<00:00, 306489.56B/s]\n",
|
||||
"100%|██████████| 267967963/267967963 [00:04<00:00, 63548158.24B/s]\n",
|
||||
"Scoring: 100%|██████████| 90/90 [00:41<00:00, 3.68it/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"## you can also download the saved model and run prediction if you are running the notebook on a gpu machine\n",
|
||||
"#\"\"\"\n",
|
||||
"ds.download(target_path=LOCAL_OUTPUT_DIR,\n",
|
||||
" prefix=f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}',\n",
|
||||
" show_progress=True)\n",
|
||||
"summarizer = ExtractiveSummarizer(MODEL_NAME, ENCODER, LOCAL_OUTPUT_DIR)\n",
|
||||
"summarizer.model.load_state_dict(\n",
|
||||
" torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'))\n",
|
||||
")\n",
|
||||
"prediction = summarizer.predict(test_dataset, num_gpus=torch.cuda.device_count(), batch_size=128)\n",
|
||||
"#\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['a university of iowa student has died nearly three months after a fall in rome in a suspected robbery attack in rome .',\n",
|
||||
" 'andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program in italy when the incident happened in january .',\n",
|
||||
" 'he was flown back to chicago via air ambulance on march 20 , but he died on sunday .',\n",
|
||||
" 'andrew mogni , 20 , from glen ellyn , illinois , a university of iowa student has died nearly three months after a fall in rome in a suspected robbery',\n",
|
||||
" 'he was taken to a medical facility in the chicago area , close to his family home in glen ellyn .',\n",
|
||||
" \"he died on sunday at northwestern memorial hospital - medical examiner 's office spokesman frank shuftan says a cause of death wo n't be released until monday at the earliest .\",\n",
|
||||
" 'initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed .',\n",
|
||||
" \"on sunday , his cousin abby wrote online : ` this morning my cousin andrew 's soul was lifted up to heaven .\",\n",
|
||||
" 'initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed',\n",
|
||||
" '` at the beginning of january he went to rome to study aboard and on the way home from a party he was brutally attacked and thrown off a 40ft bridge and hit the concrete below .',\n",
|
||||
" \"` he was in a coma and in critical condition for months . '\",\n",
|
||||
" 'paula barnett , who said she is a close family friend , told my suburban life , that mogni had only been in the country for six hours when the incident happened .',\n",
|
||||
" 'she said he was was alone at the time of the alleged assault and personal items were stolen .',\n",
|
||||
" 'she added that he was in a non-medically induced coma , having suffered serious infection and internal bleeding .',\n",
|
||||
" 'mogni was a third-year finance major from glen ellyn , ill. , who was participating in a semester-long program at john cabot university .',\n",
|
||||
" \"mogni belonged to the school 's chapter of the sigma nu fraternity , reports the chicago tribune who posted a sign outside a building reading ` pray for mogni . '\",\n",
|
||||
" \"the fraternity 's iowa chapter announced sunday afternoon via twitter that a memorial service will be held on campus to remember mogni .\"]"
|
||||
]
|
||||
},
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"test_dataset[0]['src_txt']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'he was flown back to chicago via air ambulance on march 20 , but he died on sunday .<q>andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program in italy when the incident happened in january .<q>a university of iowa student has died nearly three months after a fall in rome in a suspected robbery attack in rome .'"
|
||||
]
|
||||
},
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"prediction[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'andrew mogni , 20 , from glen ellyn , illinois , had only just arrived for a semester program when the incident happened in january<q>he was flown back to chicago via air on march 20 but he died on sunday<q>initial police reports indicated the fall was an accident but authorities are investigating the possibility that mogni was robbed<q>his cousin claims he was attacked and thrown 40ft from a bridge'"
|
||||
]
|
||||
},
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"target[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"RESULT_DIR = TemporaryDirectory().name"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"11489\n",
|
||||
"11489\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2020-01-31 03:38:44,480 [MainThread ] [INFO ] Writing summaries.\n",
|
||||
"2020-01-31 03:38:44,481 [MainThread ] [INFO ] Processing summaries. Saving system files to /tmp/tmpz9fasy6o/tmpnhtvedhc/system and model files to /tmp/tmpz9fasy6o/tmpnhtvedhc/model.\n",
|
||||
"2020-01-31 03:38:44,481 [MainThread ] [INFO ] Processing files in /tmp/tmpz9fasy6o/rouge-tmp-2020-01-31-03-38-43/candidate/.\n",
|
||||
"2020-01-31 03:38:45,679 [MainThread ] [INFO ] Saved processed files to /tmp/tmpz9fasy6o/tmpnhtvedhc/system.\n",
|
||||
"2020-01-31 03:38:45,681 [MainThread ] [INFO ] Processing files in /tmp/tmpz9fasy6o/rouge-tmp-2020-01-31-03-38-43/reference/.\n",
|
||||
"2020-01-31 03:38:46,904 [MainThread ] [INFO ] Saved processed files to /tmp/tmpz9fasy6o/tmpnhtvedhc/model.\n",
|
||||
"2020-01-31 03:38:46,989 [MainThread ] [INFO ] Written ROUGE configuration to /tmp/tmpz9fasy6o/tmpicwmu3se/rouge_conf.xml\n",
|
||||
"2020-01-31 03:38:46,989 [MainThread ] [INFO ] Running ROUGE with command /dadendev/pyrouge/tools/ROUGE-1.5.5/ROUGE-1.5.5.pl -e /dadendev/pyrouge/tools/ROUGE-1.5.5/data -c 95 -m -r 1000 -n 2 -a /tmp/tmpz9fasy6o/tmpicwmu3se/rouge_conf.xml\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"---------------------------------------------\n",
|
||||
"1 ROUGE-1 Average_R: 0.52322 (95%-conf.int. 0.52036 - 0.52598)\n",
|
||||
"1 ROUGE-1 Average_P: 0.35403 (95%-conf.int. 0.35171 - 0.35628)\n",
|
||||
"1 ROUGE-1 Average_F: 0.40840 (95%-conf.int. 0.40623 - 0.41040)\n",
|
||||
"---------------------------------------------\n",
|
||||
"1 ROUGE-2 Average_R: 0.23066 (95%-conf.int. 0.22789 - 0.23341)\n",
|
||||
"1 ROUGE-2 Average_P: 0.15558 (95%-conf.int. 0.15365 - 0.15744)\n",
|
||||
"1 ROUGE-2 Average_F: 0.17947 (95%-conf.int. 0.17740 - 0.18147)\n",
|
||||
"---------------------------------------------\n",
|
||||
"1 ROUGE-L Average_R: 0.47554 (95%-conf.int. 0.47274 - 0.47834)\n",
|
||||
"1 ROUGE-L Average_P: 0.32224 (95%-conf.int. 0.32001 - 0.32440)\n",
|
||||
"1 ROUGE-L Average_F: 0.37150 (95%-conf.int. 0.36935 - 0.37361)\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"rouge_score = get_rouge(prediction, target, RESULT_DIR)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Cleanup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import shutil\n",
|
||||
"if os.path.exists(LOCAL_DATA_FOLDER):\n",
|
||||
" shutil.rmtree(LOCAL_DATA_FOLDER, ignore_errors=True)\n",
|
||||
"if os.path.exists(LOCAL_OUTPUT_DIR):\n",
|
||||
" shutil.rmtree(LOCAL_OUTPUT_DIR, ignore_errors=True)\n",
|
||||
"if os.path.exists(PROJECT_FOLDER):\n",
|
||||
" shutil.rmtree(PROJECT_FOLDER, ignore_errors=True)\n",
|
||||
"if os.path.exists(RESULT_DIR):\n",
|
||||
" shutil.rmtree(RESULT_DIR, ignore_errors=True)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python (nlp_gpu)",
|
||||
"language": "python",
|
||||
"name": "nlp_gpu"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
|
@ -0,0 +1,165 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.multiprocessing as mp
|
||||
|
||||
nlp_path = os.path.abspath("../../")
|
||||
if nlp_path not in sys.path:
|
||||
sys.path.insert(0, nlp_path)
|
||||
|
||||
|
||||
from utils_nlp.dataset.cnndm import CNNDMSummarizationDataset
|
||||
from utils_nlp.models.transformers.extractive_summarization import (
|
||||
ExtractiveSummarizer,
|
||||
ExtSumProcessedData,
|
||||
ExtSumProcessor,
|
||||
)
|
||||
|
||||
# os.environ["NCCL_BLOCKING_WAIT"] = "1"
|
||||
|
||||
os.environ["NCCL_IB_DISABLE"] = "0"
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--rank", type=int, default=0,
|
||||
help="The rank of the current node in the cluster")
|
||||
parser.add_argument("--dist_url", type=str, default="tcp://127.0.0.1:29500",
help="URL specifying how to initialize the process group.")
|
||||
parser.add_argument("--node_count", type=int, default=1,
|
||||
help="Number of nodes in the cluster.")
|
||||
parser.add_argument("--cache_dir", type=str, default="./",
|
||||
help="Directory to cache the tokenizer.")
|
||||
parser.add_argument("--data_dir", type=str, default="./",
|
||||
help="Directory to download the preprocessed data.")
|
||||
parser.add_argument("--output_dir", type=str, default="./",
|
||||
help="Directory to save the output model and prediction results.")
|
||||
parser.add_argument("--quick_run", type=str.lower, default='false', choices=['true', 'false'],
|
||||
help="Whether to have a quick run")
|
||||
parser.add_argument("--model_name", type=str, default="distilbert-base-uncased",
|
help="Transformer model used in the extractive summarization; only \
\"bert-base-uncased\" and \"distilbert-base-uncased\" are supported.")
|
||||
parser.add_argument("--encoder", type=str.lower, default='transformer',
|
||||
choices=['baseline', 'classifier', 'transformer', 'rnn'],
|
||||
help="Encoder types in the extractive summarizer.")
|
||||
parser.add_argument("--learning_rate", type=float, default=1e-3,
|
||||
help="Learning rate.")
|
||||
parser.add_argument("--batch_size", type=int, default=3000,
help="Batch size in terms of the number of input tokens used in training.")
|
||||
parser.add_argument("--max_steps", type=int, default=1e4,
|
||||
help="Maximum number of training steps run in training. If quick_run is set,\
|
||||
it's not used.")
|
||||
parser.add_argument("--warmup_steps", type=int, default=5e3,
|
||||
help="Warm-up number of training steps run in training. If quick_run is set,\
|
||||
it's not used.")
|
||||
parser.add_argument("--top_n", type=int, default=3,
|
||||
help="Number of sentences selected in prediction for evaluation.")
|
||||
parser.add_argument("--summary_filename", type=str, default="generated_summaries.txt",
|
||||
help="Summary file name generated by prediction for evaluation.")
|
||||
parser.add_argument("--model_filename", type=str, default="dist_extsum_model.pt",
|
||||
help="model file name saved for evaluation.")
|
||||
|
||||
|
||||
|
||||
def cleanup():
|
||||
dist.destroy_process_group()
|
||||
|
# How often training statistics are reported, in number of steps.
|
||||
REPORT_EVERY = 100
|
||||
SAVE_EVERY = 1000
|
||||
|
||||
def main():
|
||||
print("NCCL_IB_DISABLE: {}".format(os.getenv("NCCL_IB_DISABLE")))
|
||||
args = parser.parse_args()
|
||||
print("quick_run is {}".format(args.quick_run))
|
||||
print("output_dir is {}".format(args.output_dir))
|
||||
print("data_dir is {}".format(args.data_dir))
|
||||
print("cache_dir is {}".format(args.cache_dir))
|
||||
|
||||
#shutil.rmtree(args.output_dir)
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
os.makedirs(args.cache_dir, exist_ok=True)
|
||||
|
||||
ngpus_per_node = torch.cuda.device_count()
|
||||
|
||||
summarizer = ExtractiveSummarizer(args.model_name, args.encoder, args.cache_dir)
|
||||
|
||||
mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, summarizer, args))
|
||||
|
||||
|
||||
def main_worker(local_rank, ngpus_per_node, summarizer, args):
|
||||
rank = args.rank * ngpus_per_node + local_rank
|
||||
world_size = args.node_count * ngpus_per_node
|
||||
|
||||
print("init_method: {}".format(args.dist_url))
|
||||
print("ngpus_per_node: {}".format(ngpus_per_node))
|
||||
print("rank: {}".format(rank))
|
||||
print("local_rank: {}".format(local_rank))
|
||||
print("world_size: {}".format(world_size))
|
||||
|
||||
torch.distributed.init_process_group(
|
||||
backend="nccl",
|
||||
init_method=args.dist_url,
|
||||
world_size=world_size,
|
||||
rank=rank,
|
||||
)
|
||||
|
||||
train_dataset, test_dataset = ExtSumProcessedData().splits(root=args.data_dir)
|
||||
# total number of steps for training
|
||||
MAX_STEPS = 1e3
|
||||
# number of steps for warm up
|
||||
WARMUP_STEPS = 5e2
|
||||
if args.quick_run.lower() == "false":
|
||||
MAX_STEPS = args.max_steps
|
||||
WARMUP_STEPS = args.warmup_steps
|
||||
|
||||
print("max steps is {}".format(MAX_STEPS))
|
||||
print("warmup steps is {}".format(WARMUP_STEPS))
|
||||
start = time.time()
|
||||
|
||||
if rank not in [-1, 0]:
|
||||
save_every = -1
|
||||
else:
|
||||
save_every = SAVE_EVERY
|
||||
summarizer.fit(
|
||||
train_dataset,
|
||||
num_gpus=world_size,
|
||||
batch_size=args.batch_size,
|
||||
gradient_accumulation_steps=2,
|
||||
max_steps=MAX_STEPS / world_size,
|
||||
learning_rate=args.learning_rate,
|
||||
warmup_steps=WARMUP_STEPS,
|
||||
verbose=True,
|
||||
report_every=REPORT_EVERY,
|
||||
clip_grad_norm=False,
|
||||
local_rank=rank,
|
||||
save_every=save_every,
|
||||
world_size=world_size
|
||||
)
|
||||
|
||||
end = time.time()
|
||||
print("rank {0}, duration {1:.6f}s".format(rank, end - start))
|
||||
if rank in [-1, 0]:
|
||||
summarizer.save_model(os.path.join(args.output_dir, args.model_filename))
|
||||
prediction = summarizer.predict(test_dataset, num_gpus=ngpus_per_node, batch_size=128)
|
||||
|
||||
def _write_list_to_file(list_items, filename):
|
||||
with open(filename, "w") as filehandle:
|
||||
# for cnt, line in enumerate(filehandle):
|
||||
for item in list_items:
|
||||
filehandle.write("%s\n" % item)
|
||||
|
||||
print("writing generated summaries")
|
||||
_write_list_to_file(prediction, os.path.join(args.output_dir, args.summary_filename))
|
||||
# only use the following line when you use your own cluster.
# An AML distributed training run does the cleanup for you.
|
||||
# cleanup()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
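To make the rank bookkeeping in main_worker concrete, here is a small standalone sketch of the arithmetic, assuming the cluster used in the notebook above (NODE_COUNT = 4 nodes of STANDARD_NC6, i.e. one GPU per node); the numbers are illustrative only:

# Hypothetical cluster: 4 nodes x 1 GPU (STANDARD_NC6).
node_count = 4
ngpus_per_node = 1                          # torch.cuda.device_count() on each node
world_size = node_count * ngpus_per_node    # 4 worker processes in total

# mp.spawn starts one worker per local GPU; each worker derives its global rank.
for node_rank in range(node_count):             # passed in via --rank ($AZ_BATCHAI_TASK_INDEX)
    for local_rank in range(ngpus_per_node):    # index handed to main_worker by mp.spawn
        rank = node_rank * ngpus_per_node + local_rank
        print(f"node {node_rank}, local_rank {local_rank} -> global rank {rank} of {world_size}")
# Only the worker with global rank 0 saves the model and writes the generated summaries.
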
|
@ -1,2 +1,2 @@
[tool.black]
line-length = 100
line-length = 88
|
||||
|
|
|
@ -0,0 +1,28 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import pytest
from utils_nlp.models.transformers.extractive_summarization import IterableDistributedSampler


@pytest.mark.cpu
def test_sampler():
    sampler = IterableDistributedSampler(1, -1)
    samples = list(sampler.iter('abcdefg'))
    assert ''.join(samples) == 'abcdefg'

    sampler = IterableDistributedSampler(2, -1)
    samples = list(sampler.iter('abcdefg'))
    assert ''.join(samples) == 'abcdefg'

    sampler = IterableDistributedSampler(4, 1)
    samples = list(sampler.iter('abcdefg'))
    assert ''.join(samples) == 'bf'

    sampler = IterableDistributedSampler(4, 2)
    samples = list(sampler.iter('abcdefg'))
    assert ''.join(samples) == 'cg'

    sampler = IterableDistributedSampler(4, 3)
    samples = list(sampler.iter('abcdefg'))
    assert ''.join(samples) == 'd'

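For readers skimming the diff, a minimal standalone sketch of the round-robin sharding this new test exercises; IterableDistributedSampler.iter is a thin wrapper around itertools.islice (see the extractive_summarization hunk further below), and the toy iterable mirrors the test above:

import itertools

def shard(iterable, world_size, rank):
    # rank == -1 means "no sharding"; otherwise each rank takes every
    # world_size-th item starting at its own rank.
    if rank == -1:
        return iterable
    return itertools.islice(iterable, rank, None, world_size)

for rank in range(4):
    print(rank, "".join(shard("abcdefg", 4, rank)))
# 0 ae
# 1 bf
# 2 cg
# 3 d
# Together the four ranks cover the iterable exactly once, which is why the
# asserts above expect 'bf', 'cg' and 'd' for ranks 1, 2 and 3.
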
@ -65,22 +65,18 @@ def qa_test_data(qa_test_df, tmp_module):
|
|||
qa_id_col=qa_test_df["qa_id_col"],
|
||||
)
|
||||
|
||||
# bert
|
||||
qa_processor_bert = QAProcessor(cache_dir=tmp_module)
|
||||
train_features_bert = qa_processor_bert.preprocess(
|
||||
train_dataset,
|
||||
batch_size=BATCH_SIZE,
|
||||
num_gpus=NUM_GPUS,
|
||||
is_training=True,
|
||||
max_question_length=16,
|
||||
max_seq_length=64,
|
||||
doc_stride=32,
|
||||
feature_cache_dir=tmp_module,
|
||||
)
|
||||
|
||||
test_features_bert = qa_processor_bert.preprocess(
|
||||
test_dataset,
|
||||
batch_size=BATCH_SIZE,
|
||||
num_gpus=NUM_GPUS,
|
||||
is_training=False,
|
||||
max_question_length=16,
|
||||
max_seq_length=64,
|
||||
|
@ -88,22 +84,18 @@ def qa_test_data(qa_test_df, tmp_module):
|
|||
feature_cache_dir=tmp_module,
|
||||
)
|
||||
|
||||
# xlnet
|
||||
qa_processor_xlnet = QAProcessor(model_name="xlnet-base-cased", cache_dir=tmp_module)
|
||||
train_features_xlnet = qa_processor_xlnet.preprocess(
|
||||
train_dataset,
|
||||
batch_size=BATCH_SIZE,
|
||||
num_gpus=NUM_GPUS,
|
||||
is_training=True,
|
||||
max_question_length=16,
|
||||
max_seq_length=64,
|
||||
doc_stride=32,
|
||||
feature_cache_dir=tmp_module,
|
||||
)
|
||||
|
||||
test_features_xlnet = qa_processor_xlnet.preprocess(
|
||||
test_dataset,
|
||||
batch_size=BATCH_SIZE,
|
||||
num_gpus=NUM_GPUS,
|
||||
is_training=False,
|
||||
max_question_length=16,
|
||||
max_seq_length=64,
|
||||
|
@ -111,22 +103,20 @@ def qa_test_data(qa_test_df, tmp_module):
|
|||
feature_cache_dir=tmp_module,
|
||||
)
|
||||
|
||||
qa_processor_distilbert = QAProcessor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
|
||||
# distilbert
|
||||
qa_processor_distilbert = QAProcessor(
|
||||
model_name="distilbert-base-uncased", cache_dir=tmp_module
|
||||
)
|
||||
train_features_distilbert = qa_processor_distilbert.preprocess(
|
||||
train_dataset,
|
||||
batch_size=BATCH_SIZE,
|
||||
num_gpus=NUM_GPUS,
|
||||
is_training=True,
|
||||
max_question_length=16,
|
||||
max_seq_length=64,
|
||||
doc_stride=32,
|
||||
feature_cache_dir=tmp_module,
|
||||
)
|
||||
|
||||
test_features_distilbert = qa_processor_distilbert.preprocess(
|
||||
test_dataset,
|
||||
batch_size=BATCH_SIZE,
|
||||
num_gpus=NUM_GPUS,
|
||||
is_training=False,
|
||||
max_question_length=16,
|
||||
max_seq_length=64,
|
||||
|
@ -151,11 +141,21 @@ def qa_test_data(qa_test_df, tmp_module):
|
|||
|
||||
@pytest.mark.gpu
|
||||
def test_QAProcessor(qa_test_data, tmp_module):
|
||||
for model_name in ["bert-base-cased", "xlnet-base-cased", "distilbert-base-uncased"]:
|
||||
for model_name in [
|
||||
"bert-base-cased",
|
||||
"xlnet-base-cased",
|
||||
"distilbert-base-uncased",
|
||||
]:
|
||||
qa_processor = QAProcessor(model_name=model_name, cache_dir=tmp_module)
|
||||
qa_processor.preprocess(qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module)
|
||||
qa_processor.preprocess(qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module)
|
||||
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module)
|
||||
qa_processor.preprocess(
|
||||
qa_test_data["train_dataset"], is_training=True, feature_cache_dir=tmp_module,
|
||||
)
|
||||
qa_processor.preprocess(
|
||||
qa_test_data["train_dataset_list"], is_training=True, feature_cache_dir=tmp_module,
|
||||
)
|
||||
qa_processor.preprocess(
|
||||
qa_test_data["test_dataset"], is_training=False, feature_cache_dir=tmp_module,
|
||||
)
|
||||
|
||||
# test unsupported model type
|
||||
with pytest.raises(ValueError):
|
||||
|
@ -163,18 +163,24 @@ def test_QAProcessor(qa_test_data, tmp_module):
|
|||
|
||||
# test training data has no ground truth exception
|
||||
with pytest.raises(Exception):
|
||||
qa_processor.preprocess(qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module)
|
||||
qa_processor.preprocess(
|
||||
qa_test_data["test_dataset"], is_training=True, feature_cache_dir=tmp_module
|
||||
)
|
||||
|
||||
# test when answer start is a list, but answer text is not
|
||||
with pytest.raises(Exception):
|
||||
qa_processor.preprocess(
|
||||
qa_test_data["train_dataset_start_text_mismatch"], is_training=True, feature_cache_dir=tmp_module,
|
||||
qa_test_data["train_dataset_start_text_mismatch"],
|
||||
is_training=True,
|
||||
feature_cache_dir=tmp_module,
|
||||
)
|
||||
|
||||
# test when training data has multiple answers
|
||||
with pytest.raises(Exception):
|
||||
qa_processor.preprocess(
|
||||
qa_test_data["train_dataset_multi_answers"], is_training=True, feature_cache_dir=tmp_module,
|
||||
qa_test_data["train_dataset_multi_answers"],
|
||||
is_training=True,
|
||||
feature_cache_dir=tmp_module,
|
||||
)
|
||||
|
||||
|
||||
|
@ -190,7 +196,9 @@ def test_AnswerExtractor(qa_test_data, tmp_module):
|
|||
assert os.path.exists(os.path.join(model_output_dir, "pytorch_model.bin"))
|
||||
assert os.path.exists(os.path.join(model_output_dir, "config.json"))
|
||||
|
||||
qa_extractor_from_cache = AnswerExtractor(cache_dir=tmp_module, load_model_from_dir=model_output_dir)
|
||||
qa_extractor_from_cache = AnswerExtractor(
|
||||
cache_dir=tmp_module, load_model_from_dir=model_output_dir
|
||||
)
|
||||
qa_extractor_from_cache.predict(test_loader_bert, verbose=False)
|
||||
|
||||
# xlnet
|
||||
|
@ -202,8 +210,12 @@ def test_AnswerExtractor(qa_test_data, tmp_module):
|
|||
|
||||
# distilbert
|
||||
train_loader_xlnet = dataloader_from_dataset(qa_test_data["train_features_distilbert"])
|
||||
test_loader_xlnet = dataloader_from_dataset(qa_test_data["test_features_distilbert"], shuffle=False)
|
||||
qa_extractor_distilbert = AnswerExtractor(model_name="distilbert-base-uncased", cache_dir=tmp_module)
|
||||
test_loader_xlnet = dataloader_from_dataset(
|
||||
qa_test_data["test_features_distilbert"], shuffle=False
|
||||
)
|
||||
qa_extractor_distilbert = AnswerExtractor(
|
||||
model_name="distilbert-base-uncased", cache_dir=tmp_module
|
||||
)
|
||||
qa_extractor_distilbert.fit(train_loader_xlnet, verbose=False, cache_model=False)
|
||||
qa_extractor_distilbert.predict(test_loader_xlnet, verbose=False)
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
# This script reuses some code from
|
||||
# https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
|
@ -191,6 +192,7 @@ class Transformer:
|
|||
verbose=True,
|
||||
seed=None,
|
||||
report_every=10,
|
||||
save_every=-1,
|
||||
clip_grad_norm=True,
|
||||
):
|
||||
|
||||
|
@ -251,9 +253,14 @@ class Transformer:
|
|||
|
||||
if global_step % report_every == 0 and verbose:
|
||||
end = time.time()
|
||||
endtime_string = datetime.datetime.fromtimestamp(end).strftime(
|
||||
"%d/%m/%Y %H:%M:%S"
|
||||
)
|
||||
print(
|
||||
"loss:{0:.6f}, time:{1:f}, examples:{2:.0f}, "
|
||||
"step:{3:.0f}/{4:.0f}".format(
|
||||
"""timestamp: {0:s}, loss: {1:.6f}, time duration: {2:f},
|
||||
number of examples in current step: {3:.0f}, step {4:.0f}
|
||||
out of total {5:.0f}""".format(
|
||||
endtime_string,
|
||||
accum_loss / report_every,
|
||||
end - start,
|
||||
len(batch),
|
||||
|
@ -268,7 +275,10 @@ class Transformer:
|
|||
if scheduler:
|
||||
scheduler.step()
|
||||
self.model.zero_grad()
|
||||
|
||||
if save_every != -1 and global_step % save_every == 0 and verbose:
|
||||
self.save_model(
|
||||
os.path.join(self.cache_dir, f"{self.model_name}_step_{global_step}.pt")
|
||||
)
|
||||
if global_step > max_steps:
|
||||
epoch_iterator.close()
|
||||
break
|
||||
|
@ -292,16 +302,31 @@ class Transformer:
|
|||
logits = outputs[0]
|
||||
yield logits.detach().cpu().numpy()
|
||||
|
||||
def save_model(self):
|
||||
output_model_dir = os.path.join(self.cache_dir, "fine_tuned")
|
||||
def save_model(self, full_name=None):
|
||||
"""
|
||||
save the trained model.
|
||||
|
||||
os.makedirs(self.cache_dir, exist_ok=True)
|
||||
os.makedirs(output_model_dir, exist_ok=True)
|
||||
Args:
|
    full_name (str, optional): File name for saving the model's `state_dict()` so that
        it can be loaded later with torch.load(). If None, the trained model,
        configuration and tokenizer are saved with `save_pretrained()` under the
        "fine_tuned" folder of the object's cache directory. Defaults to None.
|
||||
"""
|
||||
|
||||
logger.info("Saving model checkpoint to %s", output_model_dir)
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
model_to_save = (
|
||||
self.model.module if hasattr(self.model, "module") else self.model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(output_model_dir)
|
||||
|
||||
if full_name:
|
||||
logger.info("Saving model checkpoint to %s", full_name)
|
||||
torch.save(model_to_save.state_dict(), full_name)
|
||||
else:
|
||||
output_model_dir = os.path.join(self.cache_dir, "fine_tuned")
|
||||
|
||||
os.makedirs(self.cache_dir, exist_ok=True)
|
||||
os.makedirs(output_model_dir, exist_ok=True)
|
||||
|
||||
logger.info("Saving model checkpoint to %s", output_model_dir)
|
||||
model_to_save.save_pretrained(output_model_dir)
|
||||
|
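A short usage sketch of the two code paths the new save_model signature supports, pieced together from calls that appear elsewhere in this diff (the model name, encoder and paths below are illustrative, not prescribed):

import os
import torch
from utils_nlp.models.transformers.extractive_summarization import ExtractiveSummarizer

output_dir = "./output"    # illustrative paths
os.makedirs(output_dir, exist_ok=True)
summarizer = ExtractiveSummarizer("distilbert-base-uncased", "transformer", "./cache")

# 1) Pass full_name to save only the state_dict, as the distributed training script
#    does; it can be reloaded later with load_state_dict, as in the AML notebook.
summarizer.save_model(os.path.join(output_dir, "dist_extsum.pt"))
summarizer.model.load_state_dict(torch.load(os.path.join(output_dir, "dist_extsum.pt")))

# 2) Call it with no argument to keep the old behaviour: the model is written with
#    save_pretrained() under "<cache_dir>/fine_tuned" and can be reloaded with the
#    matching from_pretrained() call.
summarizer.save_model()
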
|
|
@ -3,9 +3,11 @@
|
|||
|
||||
# This script reuses some code from https://github.com/nlpyang/BertSum
|
||||
|
||||
import gc
|
||||
import itertools
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
|
@ -15,8 +17,8 @@ from torch.utils.data import DataLoader, Dataset, IterableDataset, SequentialSam
|
|||
# from torch.utils.data.distributed import DistributedSampler
|
||||
from transformers import BertModel, DistilBertModel
|
||||
|
||||
from bertsum.models import data_loader, model_builder
|
||||
from bertsum.models.data_loader import Batch
|
||||
from bertsum.models import model_builder
|
||||
from bertsum.models.data_loader import Batch, DataIterator
|
||||
from bertsum.models.model_builder import Summarizer
|
||||
from utils_nlp.common.pytorch_utils import compute_training_steps, get_device
|
||||
from utils_nlp.dataset.sentence_selection import combination_selection, greedy_selection
|
||||
|
@ -27,6 +29,79 @@ MODEL_CLASS = {"bert-base-uncased": BertModel, "distilbert-base-uncased": Distil
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# https://pytorch.org/docs/1.1.0/_modules/torch/utils/data/dataloader.html
|
||||
class IterableDistributedSampler(object):
|
||||
""" Distributed sampler for iterable dataset.
|
||||
|
||||
Args:
|
||||
world_size (int): Total number of GPUs that will be used. Defaults to 1.
|
||||
rank (int): Rank of the current GPU. Defaults to -1.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, world_size=1, rank=-1):
|
||||
self.world_size = world_size
|
||||
self.rank = rank
|
||||
|
||||
def iter(self, iterable):
|
||||
if self.rank != -1:
|
||||
return itertools.islice(iterable, self.rank, None, self.world_size)
|
||||
else:
|
||||
return iterable
|
||||
|
||||
|
||||
class ChunkDataLoader(object):
|
||||
""" Data Loader for Chunked Dataset.
|
||||
|
||||
Args:
datasets (list): List of lists of data items.
|
||||
batch_size (int): Number of tokens per batch.
|
||||
shuffle (bool): Whether the data is shuffled.
|
||||
is_labeled (bool): Whether the data is labeled.
|
||||
sampler (obj): Data sampler.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, datasets, batch_size, shuffle, is_labeled, sampler):
|
||||
self.datasets = datasets
|
||||
self.batch_size = batch_size
|
||||
self.shuffle = shuffle
|
||||
self.is_labeled = is_labeled
|
||||
self.cur_iter = self._next_dataset_iterator(datasets)
|
||||
assert self.cur_iter is not None
|
||||
self.sampler = sampler
|
||||
|
||||
def eachiter(self):
|
||||
dataset_iter = (d for d in self.datasets)
|
||||
while self.cur_iter is not None:
|
||||
for batch in self.cur_iter:
|
||||
yield batch
|
||||
self.cur_iter = self._next_dataset_iterator(dataset_iter)
|
||||
|
||||
def __iter__(self):
|
||||
return self.sampler.iter(self.eachiter())
|
||||
|
||||
def _next_dataset_iterator(self, dataset_iter):
|
||||
try:
|
||||
# Drop the current dataset for decreasing memory
|
||||
if hasattr(self, "cur_dataset"):
|
||||
self.cur_dataset = None
|
||||
gc.collect()
|
||||
del self.cur_dataset
|
||||
gc.collect()
|
||||
|
||||
self.cur_dataset = next(dataset_iter)
|
||||
except StopIteration:
|
||||
return None
|
||||
|
||||
return DataIterator(
|
||||
dataset=self.cur_dataset,
|
||||
batch_size=self.batch_size,
|
||||
shuffle=self.shuffle,
|
||||
is_labeled=self.is_labeled,
|
||||
)
|
||||
|
||||
|
||||
class Bunch(object):
|
||||
""" Class which convert a dictionary to an object """
|
||||
|
||||
|
@ -34,21 +109,28 @@ class Bunch(object):
|
|||
self.__dict__.update(adict)
|
||||
|
||||
|
||||
def get_dataloader(data_iter, shuffle=True, is_labeled=False, batch_size=3000):
|
||||
def get_dataloader(
|
||||
data_iter, shuffle=True, is_labeled=False, batch_size=3000, world_size=1, rank=-1
|
||||
):
|
||||
"""
|
||||
Function to get data iterator over a list of data objects.
|
||||
|
||||
Args:
|
||||
data_iter (generator): data generator.
|
||||
shuffle (bool): whether the data is shuffled.
|
||||
is_labeled (bool): specifies whether the data objects are labeled data.
|
||||
batch_size (int): number of tokens per batch.
|
||||
data_iter (generator): Data generator.
|
||||
shuffle (bool): Whether the data is shuffled. Defaults to True.
|
||||
is_labeled (bool): Whether the data objects are labeled data.
|
||||
Defaults to False.
|
||||
batch_size (int): Number of tokens per batch. Defaults to 3000.
|
||||
world_size (int): Total number of GPUs that will be used. Defaults to 1.
|
||||
rank (int): Rank of the current GPU. Defaults to -1.
|
||||
|
||||
Returns:
|
||||
ChunkDataLoader
|
||||
"""
|
||||
|
||||
return data_loader.Dataloader(data_iter, batch_size, shuffle=shuffle, is_labeled=is_labeled)
|
||||
sampler = IterableDistributedSampler(world_size, rank)
|
||||
return ChunkDataLoader(
|
||||
data_iter, batch_size, shuffle=shuffle, is_labeled=is_labeled, sampler=sampler
|
||||
)
|
||||
|
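A hedged usage sketch for get_dataloader; the chunk file names and sizes below are invented:

# stream pre-processed chunks and shard token-level batches across 2 GPUs
train_dataset = ExtSumProcessedIterableDataset(["train_0.pt", "train_1.pt"], is_shuffle=False)
train_loader = get_dataloader(
    train_dataset.get_stream(), is_labeled=True, batch_size=3000, world_size=2, rank=0
)
for batch in train_loader:
    pass  # each batch holds roughly batch_size tokens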
||||
|
||||
def get_dataset(file):
|
||||
|
@ -77,7 +159,9 @@ class ExtSumProcessedIterableDataset(IterableDataset):
|
|||
if self.is_shuffle:
|
||||
return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(self.file_list)))
|
||||
else:
|
||||
return itertools.chain.from_iterable(map(get_dataset, itertools.cycle(random.shuffle(self.file_list))))
|
||||
return itertools.chain.from_iterable(
|
||||
map(get_dataset, itertools.cycle(random.shuffle(self.file_list)))
|
||||
)
|
||||
|
||||
def __iter__(self):
|
||||
return self.get_stream()
|
||||
|
@ -96,12 +180,12 @@ class ExtSumProcessedDataset(Dataset):
|
|||
files is shuffled when the dataset is loaded. Defaults to False.
|
||||
"""
|
||||
|
||||
self.file_list = file_list
|
||||
self.file_list = sorted(file_list)
|
||||
if is_shuffle:
|
||||
random.shuffle(file_list)
|
||||
random.shuffle(self.file_list)
|
||||
self.data = []
|
||||
for file in file_list:
|
||||
self.data.extend(torch.load(file))
|
||||
for f in self.file_list:
|
||||
self.data.extend(torch.load(f))
|
||||
|
||||
def __len__(self):
|
||||
return len(self.data)
|
||||
|
@ -110,7 +194,9 @@ class ExtSumProcessedDataset(Dataset):
|
|||
return self.data[idx]
|
||||
|
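A small usage sketch for the map-style dataset (file names invented):

test_dataset = ExtSumProcessedDataset(["test_0.pt", "test_1.pt"], is_shuffle=False)
print(len(test_dataset))
example = test_dataset[0]  # one pre-processed example, e.g. a dict with src/segs/clss/labels fields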
||||
|
||||
def get_pred(example, sent_scores, cal_lead=False, sentence_separator="<q>", block_trigram=True, top_n=3):
|
||||
def get_pred(
|
||||
example, sent_scores, cal_lead=False, sentence_separator="<q>", block_trigram=True, top_n=3
|
||||
):
|
||||
"""
|
||||
Get the summarization prediction for the paragraph example based on the scores
|
||||
returned by the transformer summarization model.
|
||||
|
@ -223,7 +309,9 @@ class ExtSumProcessedData:
|
|||
def _get_files(self, root):
|
||||
train_files = []
|
||||
test_files = []
|
||||
files = [os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))]
|
||||
files = [
|
||||
os.path.join(root, f) for f in os.listdir(root) if os.path.isfile(os.path.join(root, f))
|
||||
]
|
||||
for fname in files:
|
||||
if fname.find("train") != -1:
|
||||
train_files.append(fname)
|
||||
|
@ -484,7 +572,9 @@ class ExtractiveSummarizer(Transformer):
|
|||
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
|
||||
"""
|
||||
|
||||
super().__init__(model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir)
|
||||
super().__init__(
|
||||
model_class=MODEL_CLASS, model_name=model_name, num_labels=0, cache_dir=cache_dir
|
||||
)
|
||||
if model_name not in self.list_supported_models():
|
||||
raise ValueError(
|
||||
"Model name {} is not supported by ExtractiveSummarizer. "
|
||||
|
@ -530,6 +620,8 @@ class ExtractiveSummarizer(Transformer):
|
|||
report_every=50,
|
||||
verbose=True,
|
||||
seed=None,
|
||||
save_every=-1,
|
||||
world_size=1,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
|
@ -582,11 +674,19 @@ class ExtractiveSummarizer(Transformer):
|
|||
)
|
||||
|
||||
# batch_size is the number of tokens in a batch
|
||||
train_dataloader = get_dataloader(train_dataset.get_stream(), is_labeled=True, batch_size=batch_size)
|
||||
train_dataloader = get_dataloader(
|
||||
train_dataset.get_stream(),
|
||||
is_labeled=True,
|
||||
batch_size=batch_size,
|
||||
world_size=world_size,
|
||||
rank=local_rank,
|
||||
)
|
||||
|
||||
# compute the max number of training steps
|
||||
max_steps = compute_training_steps(
|
||||
train_dataloader, max_steps=max_steps, gradient_accumulation_steps=gradient_accumulation_steps,
|
||||
train_dataloader,
|
||||
max_steps=max_steps,
|
||||
gradient_accumulation_steps=gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
super().fine_tune(
|
||||
|
@ -603,6 +703,7 @@ class ExtractiveSummarizer(Transformer):
|
|||
seed=seed,
|
||||
report_every=report_every,
|
||||
clip_grad_norm=False,
|
||||
save_every=save_every,
|
||||
)
|
||||
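A hedged sketch of a fine-tuning call; argument values are illustrative and the positional train_dataset argument is assumed from the surrounding code:

summarizer = ExtractiveSummarizer(model_name="distilbert-base-uncased", cache_dir=".")
summarizer.fit(
    train_dataset,                    # an ExtSumProcessedIterableDataset as above
    num_gpus=1,
    batch_size=3000,                  # number of tokens per batch
    gradient_accumulation_steps=2,
    report_every=50,
    save_every=-1,
)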
|
||||
def predict(
|
||||
|
@ -647,32 +748,22 @@ class ExtractiveSummarizer(Transformer):
|
|||
# tuple_batch = [list(col) for col in zip(*[d.values() for d in dict_list]
|
||||
if dict_list is None or len(dict_list) <= 0:
|
||||
return None
|
||||
is_labeled = False
|
||||
if "labels" in dict_list[0]:
|
||||
is_labeled = True
|
||||
tuple_batch = [list(d.values()) for d in dict_list]
|
||||
## generate mask and mask_cls, and only select tensors for the model input
|
||||
batch = Batch(tuple_batch, is_labeled=True)
|
||||
if is_labeled:
|
||||
return {
|
||||
"src": batch.src,
|
||||
"segs": batch.segs,
|
||||
"clss": batch.clss,
|
||||
"mask": batch.mask,
|
||||
"mask_cls": batch.mask_cls,
|
||||
"labels": batch.labels,
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"src": batch.src,
|
||||
"segs": batch.segs,
|
||||
"clss": batch.clss,
|
||||
"mask": batch.mask,
|
||||
"mask_cls": batch.mask_cls,
|
||||
}
|
||||
## labels are never used in prediction, so set is_labeled to False
|
||||
batch = Batch(tuple_batch, is_labeled=False)
|
||||
return {
|
||||
"src": batch.src,
|
||||
"segs": batch.segs,
|
||||
"clss": batch.clss,
|
||||
"mask": batch.mask,
|
||||
"mask_cls": batch.mask_cls,
|
||||
}
|
||||
|
||||
test_sampler = SequentialSampler(test_dataset)
|
||||
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn)
|
||||
test_dataloader = DataLoader(
|
||||
test_dataset, sampler=test_sampler, batch_size=batch_size, collate_fn=collate_fn
|
||||
)
|
||||
sent_scores = self.predict_scores(test_dataloader, num_gpus=num_gpus, gpu_ids=gpu_ids)
|
||||
sent_scores_list = list(sent_scores)
|
||||
scores_list = []
|
||||
|
@ -722,12 +813,34 @@ class ExtractiveSummarizer(Transformer):
|
|||
)
|
||||
return preds
|
||||
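Correspondingly, a sketch of prediction on a processed test set (parameter names assumed where they are not visible in this hunk):

test_dataset = ExtSumProcessedDataset(["test_0.pt"], is_shuffle=False)
summaries = summarizer.predict(test_dataset, num_gpus=1, batch_size=32)
print(summaries[0])  # top sentences joined by the sentence separator, e.g. "<q>"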
|
||||
def save_model(self, name):
|
||||
output_model_dir = os.path.join(self.cache_dir, "fine_tuned")
|
||||
def save_model(self, full_name=None):
|
||||
"""
|
||||
Save the trained model.
|
||||
|
||||
os.makedirs(self.cache_dir, exist_ok=True)
|
||||
os.makedirs(output_model_dir, exist_ok=True)
|
||||
Args:
|
||||
full_name (str, optional): File name to save the model's `state_dict()`. If it's None,
|
||||
the model is going to be saved under the "fine_tuned" folder of the cache directory
|
||||
of the object. Defaults to None.
|
||||
"""
|
||||
model_to_save = (
|
||||
self.model.module if hasattr(self.model, "module") else self.model
|
||||
) # Take care of distributed/parallel training
|
||||
|
||||
if full_name is None:
|
||||
output_model_dir = os.path.join(self.cache_dir, "fine_tuned")
|
||||
os.makedirs(self.cache_dir, exist_ok=True)
|
||||
os.makedirs(output_model_dir, exist_ok=True)
|
||||
full_name = os.path.join(output_model_dir, name)
|
||||
|
||||
full_name = os.path.join(output_model_dir, name)
|
||||
logger.info("Saving model checkpoint to %s", full_name)
|
||||
torch.save(self.model, name)
|
||||
try:
|
||||
print("saving through pytorch")
|
||||
torch.save(model_to_save.state_dict(), full_name)
|
||||
except OSError:
|
||||
try:
|
||||
print("saving as pickle")
|
||||
pickle.dump(model_to_save.state_dict(), open(full_name, "wb"))
|
||||
except Exception:
|
||||
raise
|
||||
except Exception:
|
||||
raise
|
||||
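A hedged sketch of saving and restoring the fine-tuned weights (file name invented, unwrapped model assumed):

summarizer.save_model(full_name="./extsum_distilbert_cnndm.pt")
state_dict = torch.load("./extsum_distilbert_cnndm.pt")
summarizer.model.load_state_dict(state_dict)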
|
|
|
@ -27,19 +27,41 @@ import jsonlines
|
|||
import torch
|
||||
from torch.utils.data import TensorDataset
|
||||
from tqdm import tqdm
|
||||
from transformers.modeling_albert import ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForQuestionAnswering
|
||||
from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertForQuestionAnswering
|
||||
from transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForQuestionAnswering
|
||||
from transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, XLNetForQuestionAnswering
|
||||
from transformers.modeling_albert import (
|
||||
ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
AlbertForQuestionAnswering,
|
||||
)
|
||||
from transformers.modeling_bert import (
|
||||
BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
BertForQuestionAnswering,
|
||||
)
|
||||
from transformers.modeling_distilbert import (
|
||||
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
DistilBertForQuestionAnswering,
|
||||
)
|
||||
from transformers.modeling_xlnet import (
|
||||
XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
|
||||
XLNetForQuestionAnswering,
|
||||
)
|
||||
from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
|
||||
|
||||
from utils_nlp.common.pytorch_utils import compute_training_steps, get_device, move_model_to_device
|
||||
from utils_nlp.models.transformers.common import MAX_SEQ_LEN, TOKENIZER_CLASS, Transformer
|
||||
from utils_nlp.common.pytorch_utils import (
|
||||
compute_training_steps,
|
||||
get_device,
|
||||
move_model_to_device,
|
||||
)
|
||||
from utils_nlp.models.transformers.common import (
|
||||
MAX_SEQ_LEN,
|
||||
TOKENIZER_CLASS,
|
||||
Transformer,
|
||||
)
|
||||
|
||||
MODEL_CLASS = {}
|
||||
MODEL_CLASS.update({k: BertForQuestionAnswering for k in BERT_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
MODEL_CLASS.update({k: XLNetForQuestionAnswering for k in XLNET_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
MODEL_CLASS.update({k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
MODEL_CLASS.update(
|
||||
{k: DistilBertForQuestionAnswering for k in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP}
|
||||
)
|
||||
MODEL_CLASS.update({k: AlbertForQuestionAnswering for k in ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP})
|
||||
|
||||
# cached files during preprocessing
|
||||
|
@ -62,25 +84,29 @@ class QAProcessor:
|
|||
Class for preprocessing and postprocessing question answering data.
|
||||
|
||||
Args:
|
||||
model_name (str, optional): Name of the model. Call QAProcessor.list_supported_models() to
|
||||
get all supported models. Defaults to "bert-base-cased".
|
||||
model_name (str, optional): Name of the model.
|
||||
Call QAProcessor.list_supported_models() to get all supported models.
|
||||
Defaults to "bert-base-cased".
|
||||
to_lower (bool, optional): Whether to convert all letters to lower case during
|
||||
tokenization. This is determined by if a cased model is used. Defaults to False,
|
||||
which corresponds to a cased model.
|
||||
custom_tokenize (function, optional): A custom tokenize function used to tokenize the
|
||||
input text. If not provided, the default tokenizer corresponding to the model_name
|
||||
is loaded and its `tokenize` method is used. NOTE that even this function is
|
||||
provided, the numerical token ids are still generated by the `convert_tokens_to_ids`
|
||||
method of the default tokenizer, so there is a risk that tokens generated by the
|
||||
custom_tokenize function don't have correponding token ids in the default toeknizer.
|
||||
tokenization. This is determined by whether a cased model is used.
|
||||
Defaults to False, which corresponds to a cased model.
|
||||
custom_tokenize (function, optional): A custom tokenize function
|
||||
used to tokenize the input text. If not provided, the default tokenizer
|
||||
corresponding to the model_name is loaded and its `tokenize` method is used.
|
||||
NOTE that even if this function is provided, the numerical token ids are still
|
||||
generated by the `convert_tokens_to_ids` method of the default tokenizer,
|
||||
so there is a risk that tokens generated by the custom_tokenize
|
||||
function don't have corresponding token ids in the default tokenizer.
|
||||
Defaults to None.
|
||||
cache_dir (str, optional): Directory to cache the tokenizer. Defaults to ".".
|
||||
"""
|
||||
|
||||
def __init__(self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir="."):
|
||||
def __init__(
|
||||
self, model_name="bert-base-cased", to_lower=False, custom_tokenize=None, cache_dir=".",
|
||||
):
|
||||
self.model_name = model_name
|
||||
self.tokenizer = TOKENIZER_CLASS[model_name].from_pretrained(
|
||||
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False
|
||||
model_name, do_lower_case=to_lower, cache_dir=cache_dir, output_loading_info=False,
|
||||
)
|
||||
self.do_lower_case = to_lower
|
||||
self.custom_tokenize = custom_tokenize
|
||||
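Illustration of the custom_tokenize hook (the tokenize function below is hypothetical); as the docstring warns, token ids still come from the default tokenizer's convert_tokens_to_ids:

def whitespace_lower_tokenize(text):
    return text.lower().split()

processor = QAProcessor(
    model_name="bert-base-uncased", to_lower=True, custom_tokenize=whitespace_lower_tokenize
)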
|
@ -149,9 +175,6 @@ class QAProcessor:
|
|||
self,
|
||||
qa_dataset,
|
||||
is_training,
|
||||
batch_size=32,
|
||||
num_gpus=None,
|
||||
distributed=False,
|
||||
max_question_length=64,
|
||||
max_seq_length=MAX_SEQ_LEN,
|
||||
doc_stride=128,
|
||||
|
@ -161,28 +184,31 @@ class QAProcessor:
|
|||
Preprocesses raw question answering data and generates train/test features.
|
||||
|
||||
Args:
|
||||
qa_dataset (:class:`utils_nlp.dataset.pytorch.QADataset`): Question answering data in
|
||||
standard QADataset format.
|
||||
qa_dataset (:class:`utils_nlp.dataset.pytorch.QADataset`):
|
||||
Question answering data in standard QADataset format.
|
||||
is_training (bool): Whether the input data is training data.
|
||||
max_question_length (int, optional): Maximum number of tokens of the question sequence
|
||||
after tokenization, so the number of words in the raw question is usually less than
|
||||
max_question_length. Defaults to 64.
|
||||
max_seq_length (int, optional): Maximum number of tokens of the entire feature token
|
||||
sequence after tokenization. The entire feature token sequence is composed
|
||||
of [CLS] + [Question tokens] + [SEP] + [Document tokens] + [SEP] for models other
|
||||
than XLNet, and [Document tokens] + [SEP] + [Question tokens] + [SEP] + [CLS} for
|
||||
max_question_length (int, optional): Maximum number of tokens
|
||||
of the question sequence after tokenization, so the number of words
|
||||
in the raw question is usually less than max_question_length.
|
||||
Defaults to 64.
|
||||
max_seq_length (int, optional): Maximum number of tokens of the entire
|
||||
feature token sequence after tokenization. The entire feature token
|
||||
sequence is composed of:
|
||||
[CLS] + [Question tokens] + [SEP] + [Document tokens] + [SEP]
|
||||
for models other than XLNet,
|
||||
and [Document tokens] + [SEP] + [Question tokens] + [SEP] + [CLS] for
|
||||
XLNet. Defaults to MAX_SEQ_LEN.
|
||||
doc_stride (int, optional): Size (number of tokens) of the sliding window when
|
||||
breaking down a long document paragraph in to multiple document spans. Defaults
|
||||
to 128.
|
||||
feature_cache_dir (int, optional): Directory to save some intermediate preprocessing
|
||||
results.
|
||||
doc_stride (int, optional): Size (number of tokens) of the sliding window
|
||||
when breaking down a long document paragraph into multiple document
|
||||
spans. Defaults to 128.
|
||||
feature_cache_dir (str, optional): Directory to save some intermediate
|
||||
preprocessing results.
|
||||
If `is_training` is True, CACHED_EXAMPLES_TRAIN_FILE and
|
||||
CACHED_FEATURES_TRAIN_FILE are saved to this directory. Otherwise,
|
||||
CACHED_EXAMPLES_TEST_FILE and CACHED_FEATURES_TEST_FILE are saved to this
|
||||
directory. These files are required during postprocessing to generate the final
|
||||
answer texts from predicted answer start and answer end indices. Defaults to
|
||||
"./cached_qa_features".
|
||||
CACHED_EXAMPLES_TEST_FILE and CACHED_FEATURES_TEST_FILE are saved
|
||||
to this directory. These files are required during postprocessing to
|
||||
generate the final answer texts from predicted answer start and answer
|
||||
end indices. Defaults to "./cached_qa_features".
|
||||
Returns:
|
||||
DataSet: A Pytorch DataSet.
|
||||
"""
|
||||
|
@ -217,7 +243,9 @@ class QAProcessor:
|
|||
|
||||
qa_examples.append(qa_example_cur)
|
||||
|
||||
qa_examples_json.append({"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens})
|
||||
qa_examples_json.append(
|
||||
{"qa_id": qa_example_cur.qa_id, "doc_tokens": qa_example_cur.doc_tokens}
|
||||
)
|
||||
|
||||
features_cur = _create_qa_features(
|
||||
qa_example_cur,
|
||||
|
@ -257,17 +285,25 @@ class QAProcessor:
|
|||
input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
|
||||
segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
|
||||
cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
|
||||
p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
|
||||
p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.long)
|
||||
|
||||
if is_training:
|
||||
start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
|
||||
end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
|
||||
qa_dataset = TensorDataset(
|
||||
input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask,
|
||||
input_ids,
|
||||
input_mask,
|
||||
segment_ids,
|
||||
start_positions,
|
||||
end_positions,
|
||||
cls_index,
|
||||
p_mask,
|
||||
)
|
||||
else:
|
||||
unique_id_all = torch.tensor(unique_id_all, dtype=torch.long)
|
||||
qa_dataset = TensorDataset(input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all)
|
||||
qa_dataset = TensorDataset(
|
||||
input_ids, input_mask, segment_ids, cls_index, p_mask, unique_id_all
|
||||
)
|
||||
|
||||
return qa_dataset
|
||||
|
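A hedged sketch of the preprocessing call (the QADataset construction is omitted and the argument values are illustrative):

train_features = processor.preprocess(
    train_qa_dataset,            # a utils_nlp.dataset.pytorch.QADataset
    is_training=True,
    max_question_length=64,
    max_seq_length=384,
    doc_stride=128,
    feature_cache_dir="./cached_qa_features",
)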
||||
|
@ -397,7 +433,14 @@ class QAResult(QAResult_):
|
|||
|
||||
QAResultExtended_ = collections.namedtuple(
|
||||
"QAResultExtended",
|
||||
["unique_id", "start_top_log_probs", "start_top_index", "end_top_log_probs", "end_top_index", "cls_logits",],
|
||||
[
|
||||
"unique_id",
|
||||
"start_top_log_probs",
|
||||
"start_top_index",
|
||||
"end_top_log_probs",
|
||||
"end_top_index",
|
||||
"cls_logits",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
|
@ -481,36 +524,45 @@ class AnswerExtractor(Transformer):
|
|||
num_epochs (int, optional): Number of training epochs. Defaults to 1.
|
||||
max_steps (int, optional): Total number of training steps.
|
||||
If set to a positive value, it overrides num_epochs.
|
||||
Otherwise, it's determined by the dataset length, gradient_accumulation_steps, and num_epochs.
|
||||
Otherwise, it's determined by the dataset length,
|
||||
gradient_accumulation_steps, and num_epochs.
|
||||
Defaults to -1.
|
||||
gradient_accumulation_steps (int, optional): Number of steps to accumulate
|
||||
before performing a backward/update pass.
|
||||
Defaults to 1.
|
||||
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
|
||||
be used. If set to 0 or GPUs are not available, CPU device will be used.
|
||||
num_gpus (int, optional): The number of GPUs to use.
|
||||
If None, all available GPUs will be used.
|
||||
If set to 0 or GPUs are not available, CPU device will be used.
|
||||
Defaults to None.
|
||||
gpu_ids (list): List of GPU IDs to be used.
|
||||
If set to None, the first num_gpus GPUs will be used.
|
||||
Defaults to None.
|
||||
local_rank (int, optional): Local_rank for distributed training on GPUs. Defaults to
|
||||
-1, which means non-distributed training.
|
||||
weight_decay (float, optional): Weight decay to apply after each parameter update.
|
||||
Defaults to 0.0.
|
||||
learning_rate (float, optional): Learning rate of the AdamW optimizer. Defaults to
|
||||
5e-5.
|
||||
adam_epsilon (float, optional): Epsilon of the AdamW optimizer. Defaults to 1e-8.
|
||||
warmup_steps (int, optional): Number of steps taken to increase learning rate from 0
|
||||
to `learning rate`. Defaults to 0.
|
||||
verbose (bool, optional): Whether to print out the training log. Defaults to True.
|
||||
seed (int, optional): Random seed used to improve reproducibility. Defaults to None.
|
||||
cache_model (bool, optional): Whether to save the fine-tuned model. If True,
|
||||
the fine-tuned model is saved to a `fine_tuned` folder under of the `cache_dir`
|
||||
of AnswerExtractor. Defaults to True.
|
||||
local_rank (int, optional): Local_rank for distributed training on GPUs.
|
||||
Defaults to -1, which means non-distributed training.
|
||||
weight_decay (float, optional): Weight decay to apply after each
|
||||
parameter update. Defaults to 0.0.
|
||||
learning_rate (float, optional): Learning rate of the AdamW optimizer.
|
||||
Defaults to 5e-5.
|
||||
adam_epsilon (float, optional): Epsilon of the AdamW optimizer.
|
||||
Defaults to 1e-8.
|
||||
warmup_steps (int, optional): Number of steps taken to increase
|
||||
learning rate from 0 to `learning rate`.
|
||||
Defaults to 0.
|
||||
verbose (bool, optional): Whether to print out the training log.
|
||||
Defaults to True.
|
||||
seed (int, optional): Random seed used to improve reproducibility.
|
||||
Defaults to None.
|
||||
cache_model (bool, optional): Whether to save the fine-tuned model.
|
||||
If True, the fine-tuned model is saved to a `fine_tuned` folder
|
||||
under the `cache_dir` of AnswerExtractor.
|
||||
Defaults to True.
|
||||
|
||||
"""
|
||||
|
||||
# init optimizer
|
||||
optimizer = Transformer.get_default_optimizer(self.model, weight_decay, learning_rate, adam_epsilon)
|
||||
optimizer = Transformer.get_default_optimizer(
|
||||
self.model, weight_decay, learning_rate, adam_epsilon
|
||||
)
|
||||
|
||||
# compute the max number of training steps
|
||||
max_steps = compute_training_steps(
|
||||
|
@ -522,7 +574,7 @@ class AnswerExtractor(Transformer):
|
|||
|
||||
# init scheduler
|
||||
scheduler = Transformer.get_default_scheduler(
|
||||
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps,
|
||||
optimizer=optimizer, warmup_steps=warmup_steps, num_training_steps=max_steps
|
||||
)
|
||||
|
||||
# fine tune
|
||||
|
@ -530,7 +582,7 @@ class AnswerExtractor(Transformer):
|
|||
train_dataloader=train_dataloader,
|
||||
get_inputs=QAProcessor.get_inputs,
|
||||
num_gpus=num_gpus,
|
||||
gpu_ids=gpu_ids,
|
||||
gpu_ids=gpu_ids,
|
||||
max_steps=max_steps,
|
||||
gradient_accumulation_steps=gradient_accumulation_steps,
|
||||
optimizer=optimizer,
|
||||
|
@ -550,13 +602,15 @@ class AnswerExtractor(Transformer):
|
|||
|
||||
Args:
|
||||
test_dataloader (DataLoader): DataLoader for scoring the data.
|
||||
num_gpus (int, optional): The number of GPUs to use. If None, all available GPUs will
|
||||
be used. If set to 0 or GPUs are not available, CPU device will
|
||||
be used. Defaults to None.
|
||||
num_gpus (int, optional): The number of GPUs to use.
|
||||
If None, all available GPUs will be used.
|
||||
If set to 0 or GPUs are not available, CPU device will be used.
|
||||
Defaults to None.
|
||||
gpu_ids (list): List of GPU IDs to be used.
|
||||
If set to None, the first num_gpus GPUs will be used.
|
||||
Defaults to None.
|
||||
verbose (bool, optional): Whether to print out the predicting log. Defaults to True.
|
||||
verbose (bool, optional): Whether to print out the predicting log.
|
||||
Defaults to True.
|
||||
|
||||
Returns:
|
||||
list: List of :class:`QAResult` or :class:`QAResultExtended`.
|
||||
|
@ -589,7 +643,9 @@ class AnswerExtractor(Transformer):
|
|||
)
|
||||
else:
|
||||
result = QAResult(
|
||||
unique_id=u_id.item(), start_logits=_to_list(outputs[0][i]), end_logits=_to_list(outputs[1][i]),
|
||||
unique_id=u_id.item(),
|
||||
start_logits=_to_list(outputs[0][i]),
|
||||
end_logits=_to_list(outputs[1][i]),
|
||||
)
|
||||
all_results.append(result)
|
||||
torch.cuda.empty_cache()
|
||||
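A hedged sketch of scoring and post-processing; the cached file paths are placeholders, not the module's actual constants:

results = extractor.predict(test_dataloader, num_gpus=1)          # extractor: an AnswerExtractor
answers, probs, nbest = postprocess_bert_answer(
    results,
    examples_file="./cached_qa_features/examples_test.jsonl",     # placeholder path
    features_file="./cached_qa_features/features_test.jsonl",     # placeholder path
    do_lower_case=False,
)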
|
@ -612,53 +668,61 @@ def postprocess_bert_answer(
|
|||
verbose_logging=False,
|
||||
):
|
||||
"""
|
||||
Postprocesses start and end logits generated by :meth:`AnswerExtractor.fit` for BERT.
|
||||
Postprocesses start and end logits
|
||||
generated by :meth:`AnswerExtractor.predict` for BERT.
|
||||
|
||||
Args:
|
||||
results (list): List of :class:`QAResult`.
|
||||
examples_file (str): One of the files cached by :meth:`QAProcessor.preprocess`. This file
|
||||
contains the original document tokens that are used to generate the final answers
|
||||
from the predicted start and end positions.
|
||||
features_file (str): One of the files cached by :meth:`QAProcessor.preprocess`. This file
|
||||
contains the mapping from indices in the processed token list to the original
|
||||
document tokens that are used to generate the final predicted answers.
|
||||
do_lower_case (bool): Whether an uncased tokenizer was used during data preprocessing.
|
||||
This is required during answer finalization by comparing the predicted answer text
|
||||
and the original text span in :func:`_get_final_text`.
|
||||
unanswerable_exists (bool, optional): Whether there are unanswerable questions in the
|
||||
data. If True, the start and end logits of the [CLS] token, which indicate the
|
||||
probability of the answer being empty, are included in the candidate answer list.
|
||||
examples_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.
|
||||
This file contains the original document tokens that are used to generate
|
||||
the final answers from the predicted start and end positions.
|
||||
features_file (str): One of the files cached by :meth:`QAProcessor.preprocess`.
|
||||
This file contains the mapping from indices in the processed token list
|
||||
to the original document tokens that are used to generate the final
|
||||
predicted answers.
|
||||
do_lower_case (bool): Whether an uncased tokenizer was used during
|
||||
data preprocessing. This is required during answer finalization
|
||||
by comparing the predicted answer text and the original
|
||||
text span in :func:`_get_final_text`.
|
||||
unanswerable_exists (bool, optional): Whether there are unanswerable
|
||||
questions in the data. If True, the start and end logits of the [CLS]
|
||||
token, which indicate the probability of the answer being empty,
|
||||
are included in the candidate answer list.
|
||||
Defaults to False.
|
||||
n_best_size (int, optional): The number of candidates to choose from each QAResult to
|
||||
generate the final prediction. It's also the maximum number of n-best answers to
|
||||
output for each question. Note that the number of n-best answers can be smaller than
|
||||
`n_best_size` because some unqualified answers, e.g. answer that are too long,
|
||||
are removed.
|
||||
n_best_size (int, optional): The number of candidates to choose from each
|
||||
QAResult to generate the final prediction. It's also the maximum number
|
||||
of n-best answers to output for each question.
|
||||
Note that the number of n-best answers can be smaller than `n_best_size`
|
||||
because some unqualified answers,
|
||||
e.g. answers that are too long, are removed.
|
||||
max_answer_length (int, optional): Maximum length of the answer. Defaults to 30.
|
||||
output_prediction_file (str, optional): Path of the file to save the predicted answers.
|
||||
Defaults to "./qa_predictions.json".
|
||||
output_nbest_file (str, optional): Path of the file to save the n-best answers. Defaults
|
||||
to "./nbest_predictions.json".
|
||||
output_null_log_odds_file (str, optional): If unanswerable_exists is True, the score
|
||||
difference between empty prediction and best non-empty prediction are saved to this
|
||||
file. These scores can be used to find the best threshold for predicting an empty
|
||||
answer. Defaults to "./null_odds.json".
|
||||
null_score_diff_threshold (float, optional): If unanswerable_exists=True and the score
|
||||
difference between empty prediction and best non-empty prediction is higher than this
|
||||
threshold, the final predicted answer is empty. Defaults to 0.0.
|
||||
verbose_logging (bool, optional): Whether to log details of answer postprocessing.
|
||||
Defaults to False.
|
||||
output_prediction_file (str, optional): Path of the file to save the
|
||||
predicted answers. Defaults to "./qa_predictions.json".
|
||||
output_nbest_file (str, optional): Path of the file to save the n-best answers.
|
||||
Defaults to "./nbest_predictions.json".
|
||||
output_null_log_odds_file (str, optional): If unanswerable_exists is True,
|
||||
the score difference between empty prediction and best non-empty prediction
|
||||
is saved to this file. These scores can be used to find the best threshold
|
||||
for predicting an empty answer. Defaults to "./null_odds.json".
|
||||
null_score_diff_threshold (float, optional): If unanswerable_exists=True
|
||||
and the score difference between empty prediction and best non-empty
|
||||
prediction is higher than this threshold, the final predicted
|
||||
answer is empty.
|
||||
Defaults to 0.0.
|
||||
verbose_logging (bool, optional): Whether to log details of
|
||||
answer postprocessing. Defaults to False.
|
||||
|
||||
Returns:
|
||||
tuple: (OrderedDict, OrderedDict, OrderedDict)
|
||||
The keys of the dictionaries are the `qa_id` in the original
|
||||
:class:`utils_nlp.dataset.pytorch.QADataset`
|
||||
The values of the first dictionary are the predicted answer texts in string type.
|
||||
The values of the second dictionary are the softmax probabilities of the predicted
|
||||
answers.
|
||||
The values of the third dictionary are the n-best answers for each qa_id. Note that
|
||||
the number of n-best answers can be smaller than `n_best_size` because some
|
||||
unqualified answers, e.g. answers that are too long, are removed.
|
||||
The values of the first dictionary are the predicted answer texts
|
||||
in string type. The values of the second dictionary are the softmax
|
||||
probabilities of the predicted answers.
|
||||
The values of the third dictionary are the n-best answers for each qa_id.
|
||||
Note that the number of n-best answers can be smaller than `n_best_size`
|
||||
because some unqualified answers, e.g. answers that are too long,
|
||||
are removed.
|
||||
|
||||
"""
|
||||
with jsonlines.open(examples_file) as reader:
|
||||
|
@ -753,7 +817,9 @@ def postprocess_bert_answer(
|
|||
|
||||
# Sort by the sum of the start and end logits in descending order,
|
||||
# so that the first element is the most probable answer
|
||||
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
|
||||
prelim_predictions = sorted(
|
||||
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
|
||||
)
|
||||
|
||||
seen_predictions = {}
|
||||
nbest = []
|
||||
|
@ -786,11 +852,19 @@ def postprocess_bert_answer(
|
|||
final_text = ""
|
||||
seen_predictions[final_text] = True
|
||||
|
||||
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
|
||||
nbest.append(
|
||||
_NbestPrediction(
|
||||
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
|
||||
)
|
||||
)
|
||||
# if we didn't include the empty option in the n-best, include it
|
||||
if unanswerable_exists:
|
||||
if "" not in seen_predictions:
|
||||
nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))
|
||||
nbest.append(
|
||||
_NbestPrediction(
|
||||
text="", start_logit=null_start_logit, end_logit=null_end_logit
|
||||
)
|
||||
)
|
||||
|
||||
# In very rare edge cases we could only have single null prediction.
|
||||
# So we just create a nonce prediction in this case to avoid failure.
|
||||
|
@ -834,7 +908,9 @@ def postprocess_bert_answer(
|
|||
all_probs[example["qa_id"]] = nbest_json[0]["probability"]
|
||||
else:
|
||||
# predict "" iff the null score - the score of best non-null > threshold
|
||||
score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
|
||||
score_diff = (
|
||||
score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit)
|
||||
)
|
||||
scores_diff_json[example["qa_id"]] = score_diff
|
||||
if score_diff > null_score_diff_threshold:
|
||||
all_predictions[example["qa_id"]] = ""
|
||||
|
@ -1000,7 +1076,9 @@ def postprocess_xlnet_answer(
|
|||
)
|
||||
)
|
||||
|
||||
prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
|
||||
prelim_predictions = sorted(
|
||||
prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,
|
||||
)
|
||||
|
||||
seen_predictions = {}
|
||||
nbest = []
|
||||
|
@ -1031,14 +1109,20 @@ def postprocess_xlnet_answer(
|
|||
tok_text = " ".join(tok_text.split())
|
||||
orig_text = " ".join(orig_tokens)
|
||||
|
||||
final_text = _get_final_text(tok_text, orig_text, tokenizer.do_lower_case, verbose_logging)
|
||||
final_text = _get_final_text(
|
||||
tok_text, orig_text, tokenizer.do_lower_case, verbose_logging
|
||||
)
|
||||
|
||||
if final_text in seen_predictions:
|
||||
continue
|
||||
|
||||
seen_predictions[final_text] = True
|
||||
|
||||
nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
|
||||
nbest.append(
|
||||
_NbestPrediction(
|
||||
text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,
|
||||
)
|
||||
)
|
||||
|
||||
# In very rare edge cases we could have no valid predictions. So we
|
||||
# just create a nonce prediction in this case to avoid failure.
|
||||
|
@ -1185,7 +1269,9 @@ def _create_qa_example(qa_input, is_training):
|
|||
actual_text = " ".join(d_tokens[start_position : (end_position + 1)])
|
||||
cleaned_answer_text = " ".join(whitespace_tokenize(a_text))
|
||||
if actual_text.find(cleaned_answer_text) == -1:
|
||||
logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
|
||||
logger.warning(
|
||||
"Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text,
|
||||
)
|
||||
return
|
||||
else:
|
||||
start_position = -1
|
||||
|
@ -1408,7 +1494,7 @@ def _create_qa_features(
|
|||
else:
|
||||
tok_end_position = len(all_doc_tokens) - 1
|
||||
(tok_start_position, tok_end_position) = _improve_answer_span(
|
||||
all_doc_tokens, tok_start_position, tok_end_position, example.orig_answer_text
|
||||
all_doc_tokens, tok_start_position, tok_end_position, example.orig_answer_text,
|
||||
)
|
||||
|
||||
# The -3 accounts for [CLS], [SEP] and [SEP]
|
||||
|
@ -1579,7 +1665,7 @@ def _create_qa_features(
|
|||
# -------------------------------------------------------------------------------------------------
|
||||
# Post processing helper functions
|
||||
_PrelimPrediction = collections.namedtuple(
|
||||
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
|
||||
"PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"],
|
||||
)
|
||||
|
||||
_NbestPrediction = collections.namedtuple("NbestPrediction", ["text", "start_logit", "end_logit"])
|
||||
|
@ -1644,7 +1730,9 @@ def _get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
|
|||
|
||||
if len(orig_ns_text) != len(tok_ns_text):
|
||||
if verbose_logging:
|
||||
logger.info("Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text)
|
||||
logger.info(
|
||||
"Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text,
|
||||
)
|
||||
return orig_text
|
||||
|
||||
# We then project the characters in `pred_text` back to `orig_text` using
|
||||
|
|