From 6597c9314832349205e08dad8b8ec12482abc643 Mon Sep 17 00:00:00 2001
From: Daisy Deng
Date: Mon, 24 Feb 2020 21:25:02 +0000
Subject: [PATCH 1/5] update train size

---
 utils_nlp/models/transformers/common.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py
index 7cd08e7..a957b09 100755
--- a/utils_nlp/models/transformers/common.py
+++ b/utils_nlp/models/transformers/common.py
@@ -224,6 +224,7 @@ class Transformer:
         # init training
         tr_loss = 0.0
         accum_loss = 0
+        train_size = 0
         self.model.train()
         self.model.zero_grad()
 
@@ -254,7 +255,7 @@ class Transformer:
 
                 tr_loss += loss.item()
                 accum_loss += loss.item()
-
+                train_size += list(inputs.values())[0].size()[0]
                 if (step + 1) % gradient_accumulation_steps == 0:
                     global_step += 1
 
@@ -274,13 +275,14 @@ class Transformer:
                         endtime_string = datetime.datetime.fromtimestamp(end).strftime(
                             "%d/%m/%Y %H:%M:%S"
                         )
-                        log_line = """timestamp: {0:s}, loss: {1:.6f}, time duration: {2:f},
-                            number of examples in current step: {3:.0f}, step {4:.0f}
+                        log_line = """timestamp: {0:s}, average loss: {1:.6f}, time duration: {2:f},
+                            number of examples since last report: {3:.0f}, step {4:.0f}
                             out of total {5:.0f}""".format(
                             endtime_string,
                             accum_loss / report_every,
                             end - start,
-                            list(inputs.values())[0].size()[0],
+                            #list(inputs.values())[0].size()[0],
+                            train_size,
                             global_step,
                             max_steps,
 
@@ -288,6 +290,7 @@ class Transformer:
                         logger.info(log_line)
                         print(log_line)
                         accum_loss = 0
+                        train_size = 0
                         start = end
                     if type(optimizer) == list:
                         for o in optimizer:

From badbebb96d43b9ba9cd5c7d11ed774af25a5f6a9 Mon Sep 17 00:00:00 2001
From: Daisy Deng
Date: Mon, 24 Feb 2020 21:27:31 +0000
Subject: [PATCH 2/5] only use 1 gpu for validation

---
 utils_nlp/models/transformers/abssum.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/utils_nlp/models/transformers/abssum.py b/utils_nlp/models/transformers/abssum.py
index 80751c2..14ba566 100644
--- a/utils_nlp/models/transformers/abssum.py
+++ b/utils_nlp/models/transformers/abssum.py
@@ -9,6 +9,7 @@ from collections import namedtuple
 import itertools
 import logging
 import os
+import pickle
 import random
 
 import numpy as np
@@ -335,9 +336,9 @@ def validate(summarizer, validate_sum_dataset, cache_dir):
 
     TOP_N = 8
     src = validate_sum_dataset.source[0:TOP_N]
-    reference_summaries = ["".join(t).rstrip("\n") for t in validate_sum_dataset.target[0:TOP_N]]
+    reference_summaries = [" ".join(t).rstrip("\n") for t in validate_sum_dataset.target[0:TOP_N]]
     generated_summaries = summarizer.predict(
-        shorten_dataset(validate_sum_dataset, top_n=TOP_N), num_gpus=2, batch_size=4
+        shorten_dataset(validate_sum_dataset, top_n=TOP_N), num_gpus=1, batch_size=4
     )
     assert len(generated_summaries) == len(reference_summaries)
     for i in generated_summaries[0:1]:

From 027825062f907520bb68656cdd86dde09b0aa6d8 Mon Sep 17 00:00:00 2001
From: Daisy Deng
Date: Mon, 24 Feb 2020 21:30:11 +0000
Subject: [PATCH 3/5] add a few more options

---
 .../bertabs_cnndm_distributed_train.py | 37 ++++++++++++++-----
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/examples/text_summarization/bertabs_cnndm_distributed_train.py b/examples/text_summarization/bertabs_cnndm_distributed_train.py
index 1ab6b8c..64c7886 100644
--- a/examples/text_summarization/bertabs_cnndm_distributed_train.py
+++ b/examples/text_summarization/bertabs_cnndm_distributed_train.py
@@ -3,6 +3,7 @@
 import argparse
 import os
+import pickle
 import sys
 import time
 
 import torch
@@ -68,6 +69,18 @@ parser.add_argument("--summary_filename", type=str, default="generated_summaries
                     help="Summary file name generated by prediction for evaluation.")
 parser.add_argument("--model_filename", type=str, default="dist_extsum_model.pt",
                     help="model file name saved for evaluation.")
+parser.add_argument("--checkpoint_filename", type=str, default=None,
+                    help="filename of a checkpoint to resume training from. \
+                    The default path is under cache_dir.")
+parser.add_argument("--report_every", type=int, default=10,
+                    help="number of steps between each loss report")
+parser.add_argument("--save_every", type=int, default=500,
+                    help="number of steps between each model save and validation")
+parser.add_argument("--fp16", type=str.lower, default='false', choices=['true', 'false'],
+                    help="Whether to use mixed precision training")
+parser.add_argument("--fp16_opt_level", type=str.upper, default='O2', choices=['O0', 'O1', 'O2', 'O3'],
+                    help="optimization level; refer to https://nvidia.github.io/apex/amp.html#opt-levels for details")
+
 
 
 def pretrained_model():
@@ -93,7 +106,6 @@ def main():
     print("output_dir is {}".format(args.output_dir))
     print("data_dir is {}".format(args.data_dir))
     print("cache_dir is {}".format(args.cache_dir))
-
     ngpus_per_node = torch.cuda.device_count()
     processor = AbsSumProcessor(cache_dir=args.cache_dir)
     summarizer = AbsSum(
@@ -115,8 +127,11 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
         world_size=world_size,
         rank=rank,
     )
-
-    checkpoint = None
+    ## only build the checkpoint path here; loading the checkpoint itself at this point causes a huge memory increase
+    if args.checkpoint_filename:
+        checkpoint = os.path.join(args.cache_dir, args.checkpoint_filename)
+    else:
+        checkpoint = None
     train_sum_dataset, test_sum_dataset = load_processed_cnndm_abs(args.data_dir)
     def this_validate(class_obj):
         return validate(class_obj, test_sum_dataset, args.cache_dir)
@@ -125,8 +140,10 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
         save_every = -1
         this_validate = None
     else:
-        save_every = 400
-
+        save_every = args.save_every
+
+    fp16 = args.fp16.lower() == 'true'
+    print("fp16 is {}".format(fp16))
     # total number of steps for training
     MAX_STEPS = 400
     # number of steps for warm up
@@ -156,17 +173,17 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
         warmup_steps_bert=WARMUP_STEPS_BERT,
         warmup_steps_dec=WARMUP_STEPS_DEC,
         save_every=save_every,
-        report_every=10,
+        report_every=args.report_every,
         validation_function=this_validate,
-        fp16=True,
-        fp16_opt_level="O2",
-        checkpoint=None
+        fp16=fp16,
+        fp16_opt_level=args.fp16_opt_level,
+        checkpoint=checkpoint
     )
     end = time.time()
     print("rank {0}, duration {1:.6f}s".format(rank, end - start))
 
     if rank == 0 or local_rank == -1:
-        saved_model_path = os.path.join(args.output_dir, "summarizer_step{}_with_glocal_step.pt".format(MAX_STEPS))
+        saved_model_path = os.path.join(args.output_dir, "{}_step{}".format(args.model_filename, MAX_STEPS))
         summarizer.save_model(MAX_STEPS, saved_model_path)
         top_n = 8
         prediction = summarizer.predict(shorten_dataset(test_sum_dataset, top_n=top_n),

From f283e84e801854d081e4ba63fd6cf53c3ae608a6 Mon Sep 17 00:00:00 2001
From: Daisy Deng
Date: Mon, 24 Feb 2020 21:30:40 +0000
Subject: [PATCH 4/5] save work

---
 .../bertabs_cnndm_aml_distributed.ipynb | 105 ++++++++++++------
 1 file changed, 74 insertions(+), 31 deletions(-)

diff --git a/examples/text_summarization/bertabs_cnndm_aml_distributed.ipynb b/examples/text_summarization/bertabs_cnndm_aml_distributed.ipynb
index 7f1d9b1..a9376d0 100644
--- a/examples/text_summarization/bertabs_cnndm_aml_distributed.ipynb
+++ b/examples/text_summarization/bertabs_cnndm_aml_distributed.ipynb
@@ -37,7 +37,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 2,
+    "execution_count": 1,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -46,7 +46,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 3,
+    "execution_count": 2,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -55,7 +55,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 1,
+    "execution_count": 3,
     "metadata": {},
     "outputs": [
      {
@@ -99,7 +99,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 18,
+    "execution_count": 35,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -110,16 +110,31 @@
     "WORKSPACE_NAME = \"YOUR_WORKSPACE_REGION\" # modifiy to use your own\n",
     "\n",
     "SUBSRIPTION_ID = \"9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\"\n",
+    "\n",
+    "#\"\"\"\n",
+    "LOCATION = \"canadacentral\" # example \"eastus2\"\n",
+    "RESOURCE_GROUP = \"daden1amlcc\" # modify to use your own\n",
+    "WORKSPACE_NAME = \"daden1amlccws\" # modify to use your own\n",
+    "\n",
+    "# for creating Azure ML Compute Cluster\n",
+    "AMLCOMPUTE_CLUSTER_NAME = \"bertabs2\" # modify to use your own\n",
+    "NODE_COUNT = 2\n",
+    "VM_SIZE = \"Standard_NC12s_v3\" # this should be the VM that's supported by Azure and Azure ML\n",
+    "VM_SIZE = \"STANDARD_NC24RS_V3\"\n",
+    "CONFIG_PATH = \"./.azureml_canadacentral\"\n",
+    "#\"\"\"\n",
+    "\n",
+    "\"\"\"\n",
     "LOCATION = \"eastus2\" # example \"eastus2\"\n",
     "RESOURCE_GROUP = \"daden1aml\" # modifiy to use your own\n",
     "WORKSPACE_NAME = \"daden1amlws\" # modifiy to use your own\n",
     "\n",
-    "\n",
     "# for creating Azure ML Compute Cluster\n",
     "AMLCOMPUTE_CLUSTER_NAME = \"bertabs1\" # modifiy to use your own\n",
     "NODE_COUNT = 2\n",
-    "#VM_SIZE = \"STANDARD_NC12\" # this should be the VM that's supported by Azure and Azure ML\n",
-    "VM_SIZE = \"Standard NC12s_v3\"\n",
+    "VM_SIZE = \"STANDARD_NC12\" # this should be the VM that's supported by Azure and Azure ML\n",
+    "CONFIG_PATH = \"./.azureml_eastus\"\n",
+    "\"\"\"\n",
     "\n",
     "# for creating Azure ML Experiment\n",
     "EXPERIMENT_NAME = \"NLP-BertAbs\" # modifiy to use your own\n",
@@ -169,12 +184,13 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 3,
+    "execution_count": 25,
     "metadata": {},
     "outputs": [],
     "source": [
     "# Create the workspace using the specified parameters\n",
     "ws = get_or_create_workspace(\n",
+    "    config_path=CONFIG_PATH,\n",
     "    workspace_name=WORKSPACE_NAME,\n",
     "    subscription_id=SUBSRIPTION_ID,\n",
     "    resource_group=RESOURCE_GROUP,\n",
@@ -184,17 +200,17 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 4,
+    "execution_count": 26,
     "metadata": {},
     "outputs": [
      {
       "name": "stdout",
       "output_type": "stream",
       "text": [
        "Workspace name: daden1amlccws\n",
        "Azure region: canadacentral\n",
        "Subscription id: 9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\n",
        "Resource group: daden1amlcc\n"
       ]
      }
     ],
@@ -217,15 +233,20 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 5,
+    "execution_count": 36,
     "metadata": {},
     "outputs": [
      {
       "name": "stdout",
       "output_type": "stream",
       "text": [
-       "Found existing compute target.\n",
-       "{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-02-21T17:58:05.717000+00:00', 'errors': None, 'creationTime':
'2020-02-21T17:58:02.904515+00:00', 'modifiedTime': '2020-02-21T17:58:18.620746+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 5, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC12'}\n" + "Creating a new compute target...\n", + "Creating\n", + "Succeeded\n", + "AmlCompute wait for completion finished\n", + "\n", + "Minimum number of nodes requested have been provisioned\n", + "{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-02-24T04:56:51.926000+00:00', 'errors': None, 'creationTime': '2020-02-24T04:56:49.122785+00:00', 'modifiedTime': '2020-02-24T04:57:05.105894+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 2, 'nodeIdleTimeBeforeScaleDown': 'PT600S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC24RS_V3'}\n" ] } ], @@ -237,7 +258,7 @@ " print(\"Creating a new compute target...\")\n", " compute_config = AmlCompute.provisioning_configuration(\n", " vm_size=VM_SIZE, max_nodes=NODE_COUNT, \n", - " idle_seconds_before_scaledown=\"PT1200S\"\n", + " idle_seconds_before_scaledown=\"600\"\n", " )\n", "\n", " # create the cluster\n", @@ -260,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -346,7 +367,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -355,7 +376,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 62, "metadata": {}, "outputs": [ { @@ -364,18 +385,19 @@ "text": [ "Uploading an estimated of 2 files\n", "Uploading ./bert_abs_data/train_abssum_dataset_full.pt\n", - "Target already exists. 
Skipping upload for bertabs_processed_data/test_abssum_dataset_full.pt\n", - "Uploaded ./bert_abs_data/train_abssum_dataset_full.pt, 1 files out of an estimated total of 2\n", - "Uploaded 1 files\n" + "Uploading ./bert_abs_data/test_abssum_dataset_full.pt\n", + "Uploaded ./bert_abs_data/test_abssum_dataset_full.pt, 1 files out of an estimated total of 2\n", + "Uploaded ./bert_abs_data/train_abssum_dataset_full.pt, 2 files out of an estimated total of 2\n", + "Uploaded 2 files\n" ] }, { "data": { "text/plain": [ - "$AZUREML_DATAREFERENCE_fb1ab46646bf409496082009759b990b" + "$AZUREML_DATAREFERENCE_f6e4f6e119d94f68ac66a4557e5f179c" ] }, - "execution_count": 25, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -414,7 +436,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -439,9 +461,9 @@ "source": [ "ENTRY_SCRIPT = \"bertabs_cnndm_distributed_train.py\"\n", "!mkdir -p {PROJECT_FOLDER}\n", - "#!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n", + "!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n", "#!cp ./nlp_gpu.yaml {PROJECT_FOLDER}\n", - "!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n", + "#!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n", "!cp -r ../../utils_nlp {PROJECT_FOLDER}" ] }, @@ -464,7 +486,27 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "NODE_COUNT" + ] + }, + { + "cell_type": "code", + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -490,6 +532,7 @@ " \"--quick_run\": 'true',\n", " \"--summary_filename\": f'{SUMMARY_FILENAME}',\n", " \"--model_filename\": f'{MODEL_FILENAME}',\n", + " \"--batch_size\": 4,\n", " },\n", " entry_script= ENTRY_SCRIPT,\n", " node_count=NODE_COUNT,\n", @@ -500,7 +543,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -509,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": { "scrolled": true }, @@ -517,7 +560,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f96b6c43c0414398a236fe3c9127757d", + "model_id": "2e15a2c42f9d46a7a9d983f137ba1b11", "version_major": 2, "version_minor": 0 }, From 0aa7c0bf2f8b85b4062a90a73ade46a00d1af927 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Mon, 24 Feb 2020 21:31:33 +0000 Subject: [PATCH 5/5] save work --- tests/unit/test_bertabs_abssum.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/unit/test_bertabs_abssum.py b/tests/unit/test_bertabs_abssum.py index fd63caa..fe8c4ad 100644 --- a/tests/unit/test_bertabs_abssum.py +++ b/tests/unit/test_bertabs_abssum.py @@ -126,7 +126,7 @@ def main(): #shutil.rmtree(args.output_dir) args = parser.parse_args() - ngpus_per_node = torch.cuda.device_count() + ngpus_per_node = 1 #torch.cuda.device_count() processor = AbsSumProcessor(cache_dir=CACHE_PATH) summarizer = AbsSum( processor, cache_dir=CACHE_PATH @@ -168,7 +168,7 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args): num_gpus=None, local_rank=local_rank, rank=rank, - batch_size=6, + batch_size=8, max_steps=50000/world_size, learning_rate_bert=0.003, learning_rate_dec=0.3, @@ -178,7 +178,7 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args): report_every=10, 
validation_function=this_validate, fp16=True, - fp16_opt_level="O1", + fp16_opt_level="O2", checkpoint=None ) if rank == 0 or local_rank == -1: @@ -208,16 +208,16 @@ def test_train_model(): return validate(class_obj, test_sum_dataset, CACHE_PATH) summarizer.fit( train_sum_dataset, - batch_size=8, + batch_size=6, max_steps=30000, local_rank=-1, learning_rate_bert=0.002, learning_rate_dec=0.2, warmup_steps_bert=20000, warmup_steps_dec=10000, - num_gpus=2, + num_gpus=1, report_every=10, - save_every=100, + save_every=400, validation_function=this_validate, fp16=False, fp16_opt_level="O1", @@ -259,7 +259,7 @@ def test_pretrained_model(): checkpoint = torch.load(os.path.join(MODEL_PATH, "new_model_step_148000_torch1.4.0.pt")) #checkpoint = torch.load(os.path.join(MODEL_PATH, "summarizer_step20000_with_global_step.pt")) - + checkpoint = torch.load(os.path.join(MODEL_PATH, "bert-base-uncased_step_400.pt")) summarizer = AbsSum( processor, cache_dir=CACHE_PATH, @@ -284,13 +284,14 @@ def test_pretrained_model(): return """ - top_n = 10 + top_n = 8 src = test_sum_dataset.source[0:top_n] reference_summaries = ["".join(t).rstrip("\n") for t in test_sum_dataset.target[0:top_n]] print("start prediction") generated_summaries = summarizer.predict( - shorten_dataset(test_sum_dataset, top_n=top_n), batch_size=3, num_gpus=2 + shorten_dataset(test_sum_dataset, top_n=top_n), batch_size=4, num_gpus=2 ) + print(generated_summaries[0]) assert len(generated_summaries) == len(reference_summaries) RESULT_DIR = TemporaryDirectory().name rouge_score = get_rouge(generated_summaries, reference_summaries, RESULT_DIR)
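
Notes on the patterns used in these patches (illustrative sketches, not part of the diffs above).

PATCH 1 changes the training log from a per-step figure to a per-window one: the loss is averaged over report_every steps, and the number of examples is accumulated from the batch dimension of the inputs and reset together with the accumulated loss. A minimal runnable sketch of that pattern; compute_loss and the toy batches list are stand-ins, not code from the repository:

    import torch

    def compute_loss(inputs):
        # stand-in for the real forward/backward pass
        return inputs["src"].float().mean()

    batches = [{"src": torch.ones(4, 10), "mask": torch.ones(4, 10)} for _ in range(20)]
    report_every = 10
    accum_loss, train_size = 0.0, 0
    for step, inputs in enumerate(batches):
        loss = compute_loss(inputs)
        accum_loss += loss.item()
        # every tensor in `inputs` shares the batch dimension, so any of them works
        train_size += list(inputs.values())[0].size()[0]
        if (step + 1) % report_every == 0:
            print("average loss: {0:.6f}, examples since last report: {1:d}".format(
                accum_loss / report_every, train_size))
            accum_loss, train_size = 0.0, 0  # reset the reporting window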
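PATCH 3 exposes mixed precision through a string-valued --fp16 flag rather than a boolean one, so shell scripts can pass any casing of "true"/"false". A small self-contained sketch of that argparse pattern (the list passed to parse_args is only there to make the example runnable):

    import argparse

    parser = argparse.ArgumentParser()
    # type=str.lower runs before the choices check, so "True" and "FALSE" both pass
    parser.add_argument("--fp16", type=str.lower, default="false",
                        choices=["true", "false"])
    parser.add_argument("--fp16_opt_level", type=str.upper, default="O2",
                        choices=["O0", "O1", "O2", "O3"])
    args = parser.parse_args(["--fp16", "True"])
    fp16 = args.fp16 == "true"  # the value is already lower-cased by the converter
    print(fp16, args.fp16_opt_level)  # True O2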
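PATCH 3 also hands only a checkpoint path into the worker and leaves the torch.load to later code, which is what its "huge memory increase" comment is about: if the training script launches workers with torch.multiprocessing.spawn, anything loaded in the parent is pickled and copied into every child process. A hedged sketch of the shape of that pattern; the worker body is hypothetical and nothing here comes from the patched script itself:

    import torch
    import torch.multiprocessing as mp

    def worker(local_rank, checkpoint_path):
        # local_rank is the process index that mp.spawn passes in.
        # Load inside the spawned process so each rank holds exactly one copy.
        checkpoint = torch.load(checkpoint_path, map_location="cpu") if checkpoint_path else None
        # ... initialize the process group and run training with `checkpoint` ...

    if __name__ == "__main__":
        mp.spawn(worker, nprocs=2, args=(None,))  # pass a real checkpoint path to resume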