Merge branch 'daden/presumm' of https://github.com/microsoft/nlp-recipes into daden/presumm
Commit c836249713
@@ -37,7 +37,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -46,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -55,7 +55,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -99,7 +99,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
@@ -110,16 +110,31 @@
"WORKSPACE_NAME = \"YOUR_WORKSPACE_REGION\" # modifiy to use your own\n",
"\n",
"SUBSRIPTION_ID = \"9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\"\n",
"\n",
"#\"\"\"\n",
"LOCATION = \"canadacentral\" # example \"eastus2\"\n",
"RESOURCE_GROUP = \"daden1amlcc\" # modifiy to use your own\n",
"WORKSPACE_NAME = \"daden1amlccws\" # modifiy to use your own\n",
"\n",
"# for creating Azure ML Compute Cluster\n",
"AMLCOMPUTE_CLUSTER_NAME = \"bertabs2\" # modifiy to use your own\n",
"NODE_COUNT = 2\n",
"VM_SIZE = \"Standard_NC12s_v3\" # this should be the VM that's supported by Azure and Azure ML\n",
"VM_SIZE = \"STANDARD_NC24RS_V3\"\n",
"CONFIG_PATH = \"./.azureml_canadacentral\"\n",
"#\"\"\"\n",
"\n",
"\"\"\"\n",
"LOCATION = \"eastus2\" # example \"eastus2\"\n",
"RESOURCE_GROUP = \"daden1aml\" # modifiy to use your own\n",
"WORKSPACE_NAME = \"daden1amlws\" # modifiy to use your own\n",
"\n",
"\n",
"# for creating Azure ML Compute Cluster\n",
"AMLCOMPUTE_CLUSTER_NAME = \"bertabs1\" # modifiy to use your own\n",
"NODE_COUNT = 2\n",
"#VM_SIZE = \"STANDARD_NC12\" # this should be the VM that's supported by Azure and Azure ML\n",
"VM_SIZE = \"Standard NC12s_v3\"\n",
"VM_SIZE = \"STANDARD_NC12\" # this should be the VM that's supported by Azure and Azure ML\n",
"CONFIG_PATH = \"./.azureml_eastus\"\n",
"\"\"\"\n",
"\n",
"# for creating Azure ML Experiment\n",
"EXPERIMENT_NAME = \"NLP-BertAbs\" # modifiy to use your own\n",
@@ -169,12 +184,13 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Create the workspace using the specified parameters\n",
"ws = get_or_create_workspace(\n",
" config_path=CONFIG_PATH,\n",
" workspace_name=WORKSPACE_NAME,\n",
" subscription_id=SUBSRIPTION_ID,\n",
" resource_group=RESOURCE_GROUP,\n",
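Note: the CONFIG_PATH, WORKSPACE_NAME, SUBSRIPTION_ID and RESOURCE_GROUP values set above are what get_or_create_workspace() consumes. A minimal sketch of that pattern against the azureml-core v1 SDK follows; the helper body is assumed for illustration, not the repo's actual implementation, and LOCATION is only needed on the create path.

from azureml.core import Workspace

def get_or_create_workspace_sketch(config_path, workspace_name, subscription_id,
                                   resource_group, location):
    # Reuse a previously saved workspace config if one exists under config_path.
    try:
        return Workspace.from_config(path=config_path)
    except Exception:
        # Otherwise create (or attach to) the workspace and cache its config locally.
        ws = Workspace.create(
            name=workspace_name,
            subscription_id=subscription_id,
            resource_group=resource_group,
            location=location,
            create_resource_group=True,
            exist_ok=True,
        )
        ws.write_config(path=config_path)
        return ws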
@@ -184,17 +200,17 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Workspace name: daden1amlws\n",
"Azure region: eastus2\n",
"Workspace name: daden1amlccws\n",
"Azure region: canadacentral\n",
"Subscription id: 9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\n",
"Resource group: daden1aml\n"
"Resource group: daden1amlcc\n"
]
}
],
@@ -217,15 +233,20 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found existing compute target.\n",
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-02-21T17:58:05.717000+00:00', 'errors': None, 'creationTime': '2020-02-21T17:58:02.904515+00:00', 'modifiedTime': '2020-02-21T17:58:18.620746+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 5, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC12'}\n"
"Creating a new compute target...\n",
"Creating\n",
"Succeeded\n",
"AmlCompute wait for completion finished\n",
"\n",
"Minimum number of nodes requested have been provisioned\n",
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-02-24T04:56:51.926000+00:00', 'errors': None, 'creationTime': '2020-02-24T04:56:49.122785+00:00', 'modifiedTime': '2020-02-24T04:57:05.105894+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 2, 'nodeIdleTimeBeforeScaleDown': 'PT600S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC24RS_V3'}\n"
]
}
],
@@ -237,7 +258,7 @@
" print(\"Creating a new compute target...\")\n",
" compute_config = AmlCompute.provisioning_configuration(\n",
" vm_size=VM_SIZE, max_nodes=NODE_COUNT, \n",
" idle_seconds_before_scaledown=\"PT1200S\"\n",
" idle_seconds_before_scaledown=\"600\"\n",
" )\n",
"\n",
" # create the cluster\n",
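Note: the cell above changes idle_seconds_before_scaledown from "PT1200S" to "600". The v1 SDK documents this parameter as an integer number of seconds; the service echoes it back in ISO-8601 form, which is why the cluster details above report 'nodeIdleTimeBeforeScaleDown': 'PT600S'. A minimal get-or-create sketch for such a cluster (generic, not the notebook's exact cell):

from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

def get_or_create_cluster(ws, name, vm_size, max_nodes, idle_seconds=600):
    try:
        # Attach to the cluster if it already exists in the workspace.
        target = ComputeTarget(workspace=ws, name=name)
        print("Found existing compute target.")
    except ComputeTargetException:
        print("Creating a new compute target...")
        config = AmlCompute.provisioning_configuration(
            vm_size=vm_size,
            max_nodes=max_nodes,
            idle_seconds_before_scaledown=idle_seconds,  # integer seconds
        )
        target = ComputeTarget.create(ws, name, config)
        target.wait_for_completion(show_output=True)
    return target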
@@ -260,7 +281,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
@@ -346,7 +367,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -355,7 +376,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 62,
"metadata": {},
"outputs": [
{
@@ -364,18 +385,19 @@
"text": [
"Uploading an estimated of 2 files\n",
"Uploading ./bert_abs_data/train_abssum_dataset_full.pt\n",
"Target already exists. Skipping upload for bertabs_processed_data/test_abssum_dataset_full.pt\n",
"Uploaded ./bert_abs_data/train_abssum_dataset_full.pt, 1 files out of an estimated total of 2\n",
"Uploaded 1 files\n"
"Uploading ./bert_abs_data/test_abssum_dataset_full.pt\n",
"Uploaded ./bert_abs_data/test_abssum_dataset_full.pt, 1 files out of an estimated total of 2\n",
"Uploaded ./bert_abs_data/train_abssum_dataset_full.pt, 2 files out of an estimated total of 2\n",
"Uploaded 2 files\n"
]
},
{
"data": {
"text/plain": [
"$AZUREML_DATAREFERENCE_fb1ab46646bf409496082009759b990b"
"$AZUREML_DATAREFERENCE_f6e4f6e119d94f68ac66a4557e5f179c"
]
},
"execution_count": 25,
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
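Note: the output above comes from uploading the two preprocessed CNN/DM tensors to the workspace datastore and returning a data reference ($AZUREML_DATAREFERENCE_...). A hedged sketch of that kind of call with the v1 SDK, with file and target paths taken from the log and everything else assumed:

ds = ws.get_default_datastore()
ds.upload_files(
    files=[
        "./bert_abs_data/train_abssum_dataset_full.pt",
        "./bert_abs_data/test_abssum_dataset_full.pt",
    ],
    target_path="bertabs_processed_data",
    overwrite=False,       # overwrite=False explains "Target already exists. Skipping upload ..."
    show_progress=True,
)
data_ref = ds.path("bertabs_processed_data").as_mount()  # the DataReference echoed by the cell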
@@ -414,7 +436,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 29,
"metadata": {},
"outputs": [
{
@@ -439,9 +461,9 @@
"source": [
"ENTRY_SCRIPT = \"bertabs_cnndm_distributed_train.py\"\n",
"!mkdir -p {PROJECT_FOLDER}\n",
"#!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n",
"!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n",
"#!cp ./nlp_gpu.yaml {PROJECT_FOLDER}\n",
"!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n",
"#!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n",
"!cp -r ../../utils_nlp {PROJECT_FOLDER}"
]
},
@@ -464,7 +486,27 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"NODE_COUNT"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
@@ -490,6 +532,7 @@
" \"--quick_run\": 'true',\n",
" \"--summary_filename\": f'{SUMMARY_FILENAME}',\n",
" \"--model_filename\": f'{MODEL_FILENAME}',\n",
" \"--batch_size\": 4,\n",
" },\n",
" entry_script= ENTRY_SCRIPT,\n",
" node_count=NODE_COUNT,\n",
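Note: the script_params above become the command-line arguments passed to ENTRY_SCRIPT on each node. A hedged sketch of the estimator they belong to, using the azureml.train.dnn v1 API; only the keys visible in this hunk come from the notebook, and the remaining arguments (data reference, compute target, distributed backend) are placeholders:

from azureml.train.dnn import PyTorch, Nccl

est = PyTorch(
    source_directory=PROJECT_FOLDER,
    entry_script=ENTRY_SCRIPT,
    script_params={
        "--data_folder": data_ref,          # placeholder mounted datastore reference
        "--quick_run": "true",
        "--summary_filename": SUMMARY_FILENAME,
        "--model_filename": MODEL_FILENAME,
        "--batch_size": 4,
    },
    compute_target=compute_target,          # the AmlCompute cluster created earlier
    node_count=NODE_COUNT,
    distributed_training=Nccl(),            # assumed backend; the notebook's choice is not shown in this hunk
    use_gpu=True,
)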
@@ -500,7 +543,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
@@ -509,7 +552,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {
"scrolled": true
},
@@ -517,7 +560,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f96b6c43c0414398a236fe3c9127757d",
"model_id": "2e15a2c42f9d46a7a9d983f137ba1b11",
"version_major": 2,
"version_minor": 0
},

@@ -3,6 +3,7 @@

import argparse
import os
import pickle
import sys
import time
import torch
@@ -68,6 +69,18 @@ parser.add_argument("--summary_filename", type=str, default="generated_summaries
help="Summary file name generated by prediction for evaluation.")
parser.add_argument("--model_filename", type=str, default="dist_extsum_model.pt",
help="model file name saved for evaluation.")
parser.add_argument("--checkpoint_filename", type=str, default=None,
help="filename of a checkpoint where the trainging resumes from. \
default path is at cache_dir")
parser.add_argument("--report_every", type=int, default=10,
help="number of steps between each loss report")
parser.add_argument("--save_every", type=int, default=500,
help="number of steps between each model save and validation")
parser.add_argument("--fp16", type=str.lower, default='false', choices=['true', 'false'],
help="Whether to use mixed precision training")
parser.add_argument("--fp16_opt_level", type=str.upper, default='O2', choices=['O0', 'O1', 'O2', 'O3'],
help="optimization level, refer to https://nvidia.github.io/apex/amp.html#opt-levels for details ")



def pretrained_model():
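Note: --fp16 and --fp16_opt_level are deliberately string-valued; type=str.lower / str.upper normalizes the input before the choices check, and the script later converts the flag with args.fp16.lower() == 'true'. A standalone illustration of that parsing pattern (not the repo's full parser):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--fp16", type=str.lower, default="false", choices=["true", "false"])
parser.add_argument("--fp16_opt_level", type=str.upper, default="O2",
                    choices=["O0", "O1", "O2", "O3"])

args = parser.parse_args(["--fp16", "True", "--fp16_opt_level", "o1"])
fp16 = args.fp16 == "true"           # already lower-cased by the type callable
print(fp16, args.fp16_opt_level)     # True O1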
@@ -93,7 +106,6 @@ def main():
print("output_dir is {}".format(args.output_dir))
print("data_dir is {}".format(args.data_dir))
print("cache_dir is {}".format(args.cache_dir))

ngpus_per_node = torch.cuda.device_count()
processor = AbsSumProcessor(cache_dir=args.cache_dir)
summarizer = AbsSum(
@@ -115,7 +127,10 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
world_size=world_size,
rank=rank,
)

## should not load checkpoint from this place, otherwise, huge memory increase
if args.checkpoint_filename:
checkpoint = os.path.join(args.cache_dir, args.checkpoint_filename)
else:
checkpoint = None
train_sum_dataset, test_sum_dataset = load_processed_cnndm_abs(args.data_dir)
def this_validate(class_obj):
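Note: the world_size/rank pair at the top of this hunk is the standard input to torch.distributed process-group initialization when running one process per GPU across nodes. A generic sketch of that setup, not this script's exact call; the environment-variable names and the NCCL backend are assumptions:

import os
import torch
import torch.distributed as dist

def init_distributed(local_rank, ngpus_per_node):
    node_rank = int(os.environ.get("NODE_RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", str(ngpus_per_node)))
    rank = node_rank * ngpus_per_node + local_rank   # global rank of this GPU's process
    dist.init_process_group(
        backend="nccl",
        init_method=os.environ.get("INIT_METHOD", "env://"),
        world_size=world_size,
        rank=rank,
    )
    torch.cuda.set_device(local_rank)
    return rank, world_size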
@@ -125,8 +140,10 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
save_every = -1
this_validate = None
else:
save_every = 400
save_every = args.save_every

fp16 = args.fp16.lower()== 'true'
print("fp16 is {}".format(fp16))
# total number of steps for training
MAX_STEPS = 400
# number of steps for warm up
@@ -156,17 +173,17 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
warmup_steps_bert=WARMUP_STEPS_BERT,
warmup_steps_dec=WARMUP_STEPS_DEC,
save_every=save_every,
report_every=10,
report_every=args.report_every,
validation_function=this_validate,
fp16=True,
fp16_opt_level="O2",
checkpoint=None
fp16=fp16,
fp16_opt_level=args.fp16_opt_level,
checkpoint=checkpoint
)

end = time.time()
print("rank {0}, duration {1:.6f}s".format(rank, end - start))
if rank == 0 or local_rank == -1:
saved_model_path = os.path.join(args.output_dir, "summarizer_step{}_with_glocal_step.pt".format(MAX_STEPS))
saved_model_path = os.path.join(args.output_dir, "{}_step{}".format(args.model_filename, MAX_STEPS))
summarizer.save_model(MAX_STEPS, saved_model_path)
top_n = 8
prediction = summarizer.predict(shorten_dataset(test_sum_dataset, top_n=top_n),
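Note: fp16 and fp16_opt_level now come from the command line instead of being hard-coded to True/"O2", and the checkpoint argument is threaded through from --checkpoint_filename. The O0-O3 levels refer to NVIDIA Apex AMP; a minimal, self-contained Apex example of what an opt_level controls (generic usage, not the internals of summarizer.fit, and it requires a CUDA build of Apex):

import torch
from apex import amp

model = torch.nn.Linear(10, 2).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# opt_level is exactly what --fp16_opt_level carries ("O0" ... "O3").
model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

x = torch.randn(4, 10).cuda()
loss = model(x).sum()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()    # loss scaling happens here under O1/O2
optimizer.step()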
@@ -126,7 +126,7 @@ def main():

#shutil.rmtree(args.output_dir)
args = parser.parse_args()
ngpus_per_node = torch.cuda.device_count()
ngpus_per_node = 1 #torch.cuda.device_count()
processor = AbsSumProcessor(cache_dir=CACHE_PATH)
summarizer = AbsSum(
processor, cache_dir=CACHE_PATH
@@ -168,7 +168,7 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
num_gpus=None,
local_rank=local_rank,
rank=rank,
batch_size=6,
batch_size=8,
max_steps=50000/world_size,
learning_rate_bert=0.003,
learning_rate_dec=0.3,
@@ -178,7 +178,7 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
report_every=10,
validation_function=this_validate,
fp16=True,
fp16_opt_level="O1",
fp16_opt_level="O2",
checkpoint=None
)
if rank == 0 or local_rank == -1:
@@ -208,16 +208,16 @@ def test_train_model():
return validate(class_obj, test_sum_dataset, CACHE_PATH)
summarizer.fit(
train_sum_dataset,
batch_size=8,
batch_size=6,
max_steps=30000,
local_rank=-1,
learning_rate_bert=0.002,
learning_rate_dec=0.2,
warmup_steps_bert=20000,
warmup_steps_dec=10000,
num_gpus=2,
num_gpus=1,
report_every=10,
save_every=100,
save_every=400,
validation_function=this_validate,
fp16=False,
fp16_opt_level="O1",
@@ -259,7 +259,7 @@ def test_pretrained_model():
checkpoint = torch.load(os.path.join(MODEL_PATH, "new_model_step_148000_torch1.4.0.pt"))

#checkpoint = torch.load(os.path.join(MODEL_PATH, "summarizer_step20000_with_global_step.pt"))

checkpoint = torch.load(os.path.join(MODEL_PATH, "bert-base-uncased_step_400.pt"))
summarizer = AbsSum(
processor,
cache_dir=CACHE_PATH,
@@ -284,13 +284,14 @@ def test_pretrained_model():
return
"""

top_n = 10
top_n = 8
src = test_sum_dataset.source[0:top_n]
reference_summaries = ["".join(t).rstrip("\n") for t in test_sum_dataset.target[0:top_n]]
print("start prediction")
generated_summaries = summarizer.predict(
shorten_dataset(test_sum_dataset, top_n=top_n), batch_size=3, num_gpus=2
shorten_dataset(test_sum_dataset, top_n=top_n), batch_size=4, num_gpus=2
)
print(generated_summaries[0])
assert len(generated_summaries) == len(reference_summaries)
RESULT_DIR = TemporaryDirectory().name
rouge_score = get_rouge(generated_summaries, reference_summaries, RESULT_DIR)

@@ -9,6 +9,7 @@ from collections import namedtuple
import itertools
import logging
import os
import pickle
import random

import numpy as np
@@ -224,6 +224,7 @@ class Transformer:
# init training
tr_loss = 0.0
accum_loss = 0
train_size = 0
self.model.train()
self.model.zero_grad()

@@ -254,7 +255,7 @@ class Transformer:

tr_loss += loss.item()
accum_loss += loss.item()

train_size += list(inputs.values())[0].size()[0]
if (step + 1) % gradient_accumulation_steps == 0:

global_step += 1
@@ -274,13 +275,14 @@ class Transformer:
endtime_string = datetime.datetime.fromtimestamp(end).strftime(
"%d/%m/%Y %H:%M:%S"
)
log_line = """timestamp: {0:s}, loss: {1:.6f}, time duration: {2:f},
number of examples in current step: {3:.0f}, step {4:.0f}
log_line = """timestamp: {0:s}, average loss: {1:.6f}, time duration: {2:f},
number of examples in current reporting: {3:.0f}, step {4:.0f}
out of total {5:.0f}""".format(
endtime_string,
accum_loss / report_every,
end - start,
list(inputs.values())[0].size()[0],
#list(inputs.values())[0].size()[0],
train_size,
global_step,
max_steps,

@@ -288,6 +290,7 @@ class Transformer:
logger.info(log_line)
print(log_line)
accum_loss = 0
train_size = 0
start = end
if optimizer:
if type(optimizer) == list:
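Note: this change makes the periodic log line report an average loss over the reporting window (accum_loss / report_every) and the number of examples seen since the last report (train_size), with both accumulators reset after each report, instead of logging a single step's loss and batch size. A standalone illustration of the pattern with synthetic losses:

import random

report_every = 10
accum_loss, train_size = 0.0, 0

for step in range(1, 51):
    loss = random.random()      # stand-in for loss.item()
    batch_examples = 4          # stand-in for list(inputs.values())[0].size()[0]
    accum_loss += loss
    train_size += batch_examples
    if step % report_every == 0:
        print("step {}: average loss {:.6f} over {} examples".format(
            step, accum_loss / report_every, train_size))
        accum_loss, train_size = 0.0, 0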