Daisy Deng 2020-02-24 21:48:35 +00:00
Parents 0bfbba1000 10185d6f3c
Commit c836249713
5 changed files: 119 additions and 54 deletions

View file

@ -37,7 +37,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -46,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@ -55,7 +55,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@ -99,7 +99,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
@ -110,16 +110,31 @@
"WORKSPACE_NAME = \"YOUR_WORKSPACE_REGION\" # modifiy to use your own\n",
"\n",
"SUBSRIPTION_ID = \"9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\"\n",
"\n",
"#\"\"\"\n",
"LOCATION = \"canadacentral\" # example \"eastus2\"\n",
"RESOURCE_GROUP = \"daden1amlcc\" # modifiy to use your own\n",
"WORKSPACE_NAME = \"daden1amlccws\" # modifiy to use your own\n",
"\n",
"# for creating Azure ML Compute Cluster\n",
"AMLCOMPUTE_CLUSTER_NAME = \"bertabs2\" # modifiy to use your own\n",
"NODE_COUNT = 2\n",
"VM_SIZE = \"Standard_NC12s_v3\" # this should be the VM that's supported by Azure and Azure ML\n",
"VM_SIZE = \"STANDARD_NC24RS_V3\"\n",
"CONFIG_PATH = \"./.azureml_canadacentral\"\n",
"#\"\"\"\n",
"\n",
"\"\"\"\n",
"LOCATION = \"eastus2\" # example \"eastus2\"\n",
"RESOURCE_GROUP = \"daden1aml\" # modifiy to use your own\n",
"WORKSPACE_NAME = \"daden1amlws\" # modifiy to use your own\n",
"\n",
"\n",
"# for creating Azure ML Compute Cluster\n",
"AMLCOMPUTE_CLUSTER_NAME = \"bertabs1\" # modifiy to use your own\n",
"NODE_COUNT = 2\n",
"#VM_SIZE = \"STANDARD_NC12\" # this should be the VM that's supported by Azure and Azure ML\n",
"VM_SIZE = \"Standard NC12s_v3\"\n",
"VM_SIZE = \"STANDARD_NC12\" # this should be the VM that's supported by Azure and Azure ML\n",
"CONFIG_PATH = \"./.azureml_eastus\"\n",
"\"\"\"\n",
"\n",
"# for creating Azure ML Experiment\n",
"EXPERIMENT_NAME = \"NLP-BertAbs\" # modifiy to use your own\n",
@ -169,12 +184,13 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Create the workspace using the specified parameters\n",
"ws = get_or_create_workspace(\n",
" config_path=CONFIG_PATH,\n",
" workspace_name=WORKSPACE_NAME,\n",
" subscription_id=SUBSRIPTION_ID,\n",
" resource_group=RESOURCE_GROUP,\n",
@ -184,17 +200,17 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Workspace name: daden1amlws\n",
"Azure region: eastus2\n",
"Workspace name: daden1amlccws\n",
"Azure region: canadacentral\n",
"Subscription id: 9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\n",
"Resource group: daden1aml\n"
"Resource group: daden1amlcc\n"
]
}
],
@ -217,15 +233,20 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found existing compute target.\n",
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-02-21T17:58:05.717000+00:00', 'errors': None, 'creationTime': '2020-02-21T17:58:02.904515+00:00', 'modifiedTime': '2020-02-21T17:58:18.620746+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 5, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC12'}\n"
"Creating a new compute target...\n",
"Creating\n",
"Succeeded\n",
"AmlCompute wait for completion finished\n",
"\n",
"Minimum number of nodes requested have been provisioned\n",
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-02-24T04:56:51.926000+00:00', 'errors': None, 'creationTime': '2020-02-24T04:56:49.122785+00:00', 'modifiedTime': '2020-02-24T04:57:05.105894+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 2, 'nodeIdleTimeBeforeScaleDown': 'PT600S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC24RS_V3'}\n"
]
}
],
@ -237,7 +258,7 @@
" print(\"Creating a new compute target...\")\n",
" compute_config = AmlCompute.provisioning_configuration(\n",
" vm_size=VM_SIZE, max_nodes=NODE_COUNT, \n",
" idle_seconds_before_scaledown=\"PT1200S\"\n",
" idle_seconds_before_scaledown=\"600\"\n",
" )\n",
"\n",
" # create the cluster\n",
@ -260,7 +281,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
@ -346,7 +367,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@ -355,7 +376,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 62,
"metadata": {},
"outputs": [
{
@ -364,18 +385,19 @@
"text": [
"Uploading an estimated of 2 files\n",
"Uploading ./bert_abs_data/train_abssum_dataset_full.pt\n",
"Target already exists. Skipping upload for bertabs_processed_data/test_abssum_dataset_full.pt\n",
"Uploaded ./bert_abs_data/train_abssum_dataset_full.pt, 1 files out of an estimated total of 2\n",
"Uploaded 1 files\n"
"Uploading ./bert_abs_data/test_abssum_dataset_full.pt\n",
"Uploaded ./bert_abs_data/test_abssum_dataset_full.pt, 1 files out of an estimated total of 2\n",
"Uploaded ./bert_abs_data/train_abssum_dataset_full.pt, 2 files out of an estimated total of 2\n",
"Uploaded 2 files\n"
]
},
{
"data": {
"text/plain": [
"$AZUREML_DATAREFERENCE_fb1ab46646bf409496082009759b990b"
"$AZUREML_DATAREFERENCE_f6e4f6e119d94f68ac66a4557e5f179c"
]
},
"execution_count": 25,
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
@ -414,7 +436,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 29,
"metadata": {},
"outputs": [
{
@ -439,9 +461,9 @@
"source": [
"ENTRY_SCRIPT = \"bertabs_cnndm_distributed_train.py\"\n",
"!mkdir -p {PROJECT_FOLDER}\n",
"#!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n",
"!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n",
"#!cp ./nlp_gpu.yaml {PROJECT_FOLDER}\n",
"!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n",
"#!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n",
"!cp -r ../../utils_nlp {PROJECT_FOLDER}"
]
},
@ -464,7 +486,27 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"NODE_COUNT"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
@ -490,6 +532,7 @@
" \"--quick_run\": 'true',\n",
" \"--summary_filename\": f'{SUMMARY_FILENAME}',\n",
" \"--model_filename\": f'{MODEL_FILENAME}',\n",
" \"--batch_size\": 4,\n",
" },\n",
" entry_script= ENTRY_SCRIPT,\n",
" node_count=NODE_COUNT,\n",
@ -500,7 +543,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
@ -509,7 +552,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {
"scrolled": true
},
@ -517,7 +560,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f96b6c43c0414398a236fe3c9127757d",
"model_id": "2e15a2c42f9d46a7a9d983f137ba1b11",
"version_major": 2,
"version_minor": 0
},

View file

@ -3,6 +3,7 @@
import argparse
import os
import pickle
import sys
import time
import torch
@ -68,6 +69,18 @@ parser.add_argument("--summary_filename", type=str, default="generated_summaries
help="Summary file name generated by prediction for evaluation.")
parser.add_argument("--model_filename", type=str, default="dist_extsum_model.pt",
help="model file name saved for evaluation.")
parser.add_argument("--checkpoint_filename", type=str, default=None,
help="filename of a checkpoint where the trainging resumes from. \
default path is at cache_dir")
parser.add_argument("--report_every", type=int, default=10,
help="number of steps between each loss report")
parser.add_argument("--save_every", type=int, default=500,
help="number of steps between each model save and validation")
parser.add_argument("--fp16", type=str.lower, default='false', choices=['true', 'false'],
help="Whether to use mixed precision training")
parser.add_argument("--fp16_opt_level", type=str.upper, default='O2', choices=['O0', 'O1', 'O2', 'O3'],
help="optimization level, refer to https://nvidia.github.io/apex/amp.html#opt-levels for details ")
def pretrained_model():
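A quick usage sketch for the flags added above, assuming the parser's remaining arguments all carry defaults. argparse applies type=str.lower / str.upper before validating against choices, so mixed-case input is accepted:

args = parser.parse_args(
    ["--fp16", "True", "--fp16_opt_level", "o2",
     "--report_every", "10", "--save_every", "500"]
)
assert args.fp16 == "true" and args.fp16_opt_level == "O2"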
@ -93,7 +106,6 @@ def main():
print("output_dir is {}".format(args.output_dir))
print("data_dir is {}".format(args.data_dir))
print("cache_dir is {}".format(args.cache_dir))
ngpus_per_node = torch.cuda.device_count()
processor = AbsSumProcessor(cache_dir=args.cache_dir)
summarizer = AbsSum(
@ -115,8 +127,11 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
world_size=world_size,
rank=rank,
)
checkpoint = None
# Resolve only the checkpoint path here; loading the checkpoint itself in this process causes a huge memory increase.
if args.checkpoint_filename:
checkpoint = os.path.join(args.cache_dir, args.checkpoint_filename)
else:
checkpoint = None
train_sum_dataset, test_sum_dataset = load_processed_cnndm_abs(args.data_dir)
def this_validate(class_obj):
return validate(class_obj, test_sum_dataset, args.cache_dir)
@ -125,8 +140,10 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
save_every = -1
this_validate = None
else:
save_every = 400
save_every = args.save_every
fp16 = args.fp16.lower() == 'true'
print("fp16 is {}".format(fp16))
# total number of steps for training
MAX_STEPS = 400
# number of steps for warm up
@ -156,17 +173,17 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
warmup_steps_bert=WARMUP_STEPS_BERT,
warmup_steps_dec=WARMUP_STEPS_DEC,
save_every=save_every,
report_every=10,
report_every=args.report_every,
validation_function=this_validate,
fp16=True,
fp16_opt_level="O2",
checkpoint=None
fp16=fp16,
fp16_opt_level=args.fp16_opt_level,
checkpoint=checkpoint
)
end = time.time()
print("rank {0}, duration {1:.6f}s".format(rank, end - start))
if rank == 0 or local_rank == -1:
saved_model_path = os.path.join(args.output_dir, "summarizer_step{}_with_glocal_step.pt".format(MAX_STEPS))
saved_model_path = os.path.join(args.output_dir, "{}_step{}".format(args.model_filename, MAX_STEPS))
summarizer.save_model(MAX_STEPS, saved_model_path)
top_n = 8
prediction = summarizer.predict(shorten_dataset(test_sum_dataset, top_n=top_n),

View file

@ -126,7 +126,7 @@ def main():
#shutil.rmtree(args.output_dir)
args = parser.parse_args()
ngpus_per_node = torch.cuda.device_count()
ngpus_per_node = 1 #torch.cuda.device_count()
processor = AbsSumProcessor(cache_dir=CACHE_PATH)
summarizer = AbsSum(
processor, cache_dir=CACHE_PATH
@ -168,7 +168,7 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
num_gpus=None,
local_rank=local_rank,
rank=rank,
batch_size=6,
batch_size=8,
max_steps=50000/world_size,
learning_rate_bert=0.003,
learning_rate_dec=0.3,
@ -178,7 +178,7 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
report_every=10,
validation_function=this_validate,
fp16=True,
fp16_opt_level="O1",
fp16_opt_level="O2",
checkpoint=None
)
if rank == 0 or local_rank == -1:
@ -208,16 +208,16 @@ def test_train_model():
return validate(class_obj, test_sum_dataset, CACHE_PATH)
summarizer.fit(
train_sum_dataset,
batch_size=8,
batch_size=6,
max_steps=30000,
local_rank=-1,
learning_rate_bert=0.002,
learning_rate_dec=0.2,
warmup_steps_bert=20000,
warmup_steps_dec=10000,
num_gpus=2,
num_gpus=1,
report_every=10,
save_every=100,
save_every=400,
validation_function=this_validate,
fp16=False,
fp16_opt_level="O1",
@ -259,7 +259,7 @@ def test_pretrained_model():
checkpoint = torch.load(os.path.join(MODEL_PATH, "new_model_step_148000_torch1.4.0.pt"))
#checkpoint = torch.load(os.path.join(MODEL_PATH, "summarizer_step20000_with_global_step.pt"))
checkpoint = torch.load(os.path.join(MODEL_PATH, "bert-base-uncased_step_400.pt"))
summarizer = AbsSum(
processor,
cache_dir=CACHE_PATH,
@ -284,13 +284,14 @@ def test_pretrained_model():
return
"""
top_n = 10
top_n = 8
src = test_sum_dataset.source[0:top_n]
reference_summaries = ["".join(t).rstrip("\n") for t in test_sum_dataset.target[0:top_n]]
print("start prediction")
generated_summaries = summarizer.predict(
shorten_dataset(test_sum_dataset, top_n=top_n), batch_size=3, num_gpus=2
shorten_dataset(test_sum_dataset, top_n=top_n), batch_size=4, num_gpus=2
)
print(generated_summaries[0])
assert len(generated_summaries) == len(reference_summaries)
RESULT_DIR = TemporaryDirectory().name
rouge_score = get_rouge(generated_summaries, reference_summaries, RESULT_DIR)

View file

@ -9,6 +9,7 @@ from collections import namedtuple
import itertools
import logging
import os
import pickle
import random
import numpy as np

View file

@ -224,6 +224,7 @@ class Transformer:
# init training
tr_loss = 0.0
accum_loss = 0
train_size = 0
self.model.train()
self.model.zero_grad()
@ -254,7 +255,7 @@ class Transformer:
tr_loss += loss.item()
accum_loss += loss.item()
train_size += list(inputs.values())[0].size()[0]
if (step + 1) % gradient_accumulation_steps == 0:
global_step += 1
@ -274,13 +275,14 @@ class Transformer:
endtime_string = datetime.datetime.fromtimestamp(end).strftime(
"%d/%m/%Y %H:%M:%S"
)
log_line = """timestamp: {0:s}, loss: {1:.6f}, time duration: {2:f},
number of examples in current step: {3:.0f}, step {4:.0f}
log_line = """timestamp: {0:s}, average loss: {1:.6f}, time duration: {2:f},
number of examples in current reporting window: {3:.0f}, step {4:.0f}
out of total {5:.0f}""".format(
endtime_string,
accum_loss / report_every,
end - start,
list(inputs.values())[0].size()[0],
#list(inputs.values())[0].size()[0],
train_size,
global_step,
max_steps,
@ -288,6 +290,7 @@ class Transformer:
logger.info(log_line)
print(log_line)
accum_loss = 0
train_size = 0
start = end
if optimizer:
if type(optimizer) == list:
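A toy sketch of the reporting change in this hunk (the batch stream and values are illustrative): both the loss and the example count now accumulate across the reporting window and reset after each report, so the log shows a window-average loss and the total examples seen since the last report, rather than the size of the final batch only:

report_every = 10
batches = [(0.5, 4)] * 100  # hypothetical (loss, examples-in-batch) pairs

accum_loss, train_size = 0.0, 0
for step, (batch_loss, batch_examples) in enumerate(batches):
    accum_loss += batch_loss
    train_size += batch_examples
    if (step + 1) % report_every == 0:
        print("average loss: {0:.6f}, examples in current reporting window: {1:d}".format(
            accum_loss / report_every, train_size))
        accum_loss, train_size = 0.0, 0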