From 6597c9314832349205e08dad8b8ec12482abc643 Mon Sep 17 00:00:00 2001
From: Daisy Deng
Date: Mon, 24 Feb 2020 21:25:02 +0000
Subject: [PATCH 1/5] update train size

---
 utils_nlp/models/transformers/common.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/utils_nlp/models/transformers/common.py b/utils_nlp/models/transformers/common.py
index 7cd08e7..a957b09 100755
--- a/utils_nlp/models/transformers/common.py
+++ b/utils_nlp/models/transformers/common.py
@@ -224,6 +224,7 @@ class Transformer:
         # init training
         tr_loss = 0.0
         accum_loss = 0
+        train_size = 0
         self.model.train()
         self.model.zero_grad()
 
@@ -254,7 +255,7 @@ class Transformer:
 
                 tr_loss += loss.item()
                 accum_loss += loss.item()
-
+                train_size += list(inputs.values())[0].size()[0]
                 if (step + 1) % gradient_accumulation_steps == 0:
                     global_step += 1
 
@@ -274,13 +275,14 @@ class Transformer:
                         endtime_string = datetime.datetime.fromtimestamp(end).strftime(
                             "%d/%m/%Y %H:%M:%S"
                         )
-                        log_line = """timestamp: {0:s}, loss: {1:.6f}, time duration: {2:f},
-                            number of examples in current step: {3:.0f}, step {4:.0f}
+                        log_line = """timestamp: {0:s}, average loss: {1:.6f}, time duration: {2:f},
+                            number of examples since last report: {3:.0f}, step {4:.0f}
                             out of total {5:.0f}""".format(
                             endtime_string,
                             accum_loss / report_every,
                             end - start,
-                            list(inputs.values())[0].size()[0],
+                            #list(inputs.values())[0].size()[0],
+                            train_size,
                             global_step,
                             max_steps,
 
@@ -288,6 +290,7 @@ class Transformer:
                         logger.info(log_line)
                         print(log_line)
                         accum_loss = 0
+                        train_size = 0
                         start = end
                     if type(optimizer) == list:
                         for o in optimizer:

From badbebb96d43b9ba9cd5c7d11ed774af25a5f6a9 Mon Sep 17 00:00:00 2001
From: Daisy Deng
Date: Mon, 24 Feb 2020 21:27:31 +0000
Subject: [PATCH 2/5] only use 1 gpu for validation

---
 utils_nlp/models/transformers/abssum.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/utils_nlp/models/transformers/abssum.py b/utils_nlp/models/transformers/abssum.py
index 80751c2..14ba566 100644
--- a/utils_nlp/models/transformers/abssum.py
+++ b/utils_nlp/models/transformers/abssum.py
@@ -9,6 +9,7 @@ from collections import namedtuple
 import itertools
 import logging
 import os
+import pickle
 import random
 
 import numpy as np
@@ -335,9 +336,9 @@ def validate(summarizer, validate_sum_dataset, cache_dir):
 
     TOP_N = 8
     src = validate_sum_dataset.source[0:TOP_N]
-    reference_summaries = ["".join(t).rstrip("\n") for t in validate_sum_dataset.target[0:TOP_N]]
+    reference_summaries = [" ".join(t).rstrip("\n") for t in validate_sum_dataset.target[0:TOP_N]]
     generated_summaries = summarizer.predict(
-        shorten_dataset(validate_sum_dataset, top_n=TOP_N), num_gpus=2, batch_size=4
+        shorten_dataset(validate_sum_dataset, top_n=TOP_N), num_gpus=1, batch_size=4
     )
     assert len(generated_summaries) == len(reference_summaries)
     for i in generated_summaries[0:1]:

From 027825062f907520bb68656cdd86dde09b0aa6d8 Mon Sep 17 00:00:00 2001
From: Daisy Deng
Date: Mon, 24 Feb 2020 21:30:11 +0000
Subject: [PATCH 3/5] add a few more options

---
 .../bertabs_cnndm_distributed_train.py | 37 ++++++++++++++-----
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/examples/text_summarization/bertabs_cnndm_distributed_train.py b/examples/text_summarization/bertabs_cnndm_distributed_train.py
index 1ab6b8c..64c7886 100644
--- a/examples/text_summarization/bertabs_cnndm_distributed_train.py
+++ b/examples/text_summarization/bertabs_cnndm_distributed_train.py
@@ -3,6 +3,7 @@
 import argparse
 import os
+import pickle
 import sys
 import time
 
 import torch
@@ -68,6 +69,18 @@ parser.add_argument("--summary_filename", type=str, default="generated_summaries
                     help="Summary file name generated by prediction for evaluation.")
 parser.add_argument("--model_filename", type=str, default="dist_extsum_model.pt",
                     help="model file name saved for evaluation.")
+parser.add_argument("--checkpoint_filename", type=str, default=None,
+                    help="filename of a checkpoint to resume training from. \
+                    The default path is under cache_dir.")
+parser.add_argument("--report_every", type=int, default=10,
+                    help="number of steps between each loss report")
+parser.add_argument("--save_every", type=int, default=500,
+                    help="number of steps between each model save and validation")
+parser.add_argument("--fp16", type=str.lower, default='false', choices=['true', 'false'],
+                    help="Whether to use mixed precision training")
+parser.add_argument("--fp16_opt_level", type=str.upper, default='O2', choices=['O0', 'O1', 'O2', 'O3'],
+                    help="optimization level; refer to https://nvidia.github.io/apex/amp.html#opt-levels for details")
+
 
 
 def pretrained_model():
@@ -93,7 +106,6 @@ def main():
     print("output_dir is {}".format(args.output_dir))
     print("data_dir is {}".format(args.data_dir))
     print("cache_dir is {}".format(args.cache_dir))
-
     ngpus_per_node = torch.cuda.device_count()
     processor = AbsSumProcessor(cache_dir=args.cache_dir)
     summarizer = AbsSum(
@@ -115,8 +127,11 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
         world_size=world_size,
         rank=rank,
     )
-
-    checkpoint = None
+    ## only build the checkpoint path here; loading the checkpoint itself at this point causes a huge memory increase
+    if args.checkpoint_filename:
+        checkpoint = os.path.join(args.cache_dir, args.checkpoint_filename)
+    else:
+        checkpoint = None
     train_sum_dataset, test_sum_dataset = load_processed_cnndm_abs(args.data_dir)
     def this_validate(class_obj):
         return validate(class_obj, test_sum_dataset, args.cache_dir)
@@ -125,8 +140,10 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
         save_every = -1
         this_validate = None
     else:
-        save_every = 400
-
+        save_every = args.save_every
+
+    fp16 = args.fp16.lower() == 'true'
+    print("fp16 is {}".format(fp16))
     # total number of steps for training
     MAX_STEPS = 400
     # number of steps for warm up
@@ -156,17 +173,17 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
         warmup_steps_bert=WARMUP_STEPS_BERT,
         warmup_steps_dec=WARMUP_STEPS_DEC,
         save_every=save_every,
-        report_every=10,
+        report_every=args.report_every,
         validation_function=this_validate,
-        fp16=True,
-        fp16_opt_level="O2",
-        checkpoint=None
+        fp16=fp16,
+        fp16_opt_level=args.fp16_opt_level,
+        checkpoint=checkpoint
     )
     end = time.time()
     print("rank {0}, duration {1:.6f}s".format(rank, end - start))
 
     if rank == 0 or local_rank == -1:
-        saved_model_path = os.path.join(args.output_dir, "summarizer_step{}_with_glocal_step.pt".format(MAX_STEPS))
+        saved_model_path = os.path.join(args.output_dir, "{}_step{}".format(args.model_filename, MAX_STEPS))
         summarizer.save_model(MAX_STEPS, saved_model_path)
         top_n = 8
         prediction = summarizer.predict(shorten_dataset(test_sum_dataset, top_n=top_n),

From f283e84e801854d081e4ba63fd6cf53c3ae608a6 Mon Sep 17 00:00:00 2001
From: Daisy Deng
Date: Mon, 24 Feb 2020 21:30:40 +0000
Subject: [PATCH 4/5] save work

---
 .../bertabs_cnndm_aml_distributed.ipynb | 105 ++++++++++++------
 1 file changed, 74 insertions(+), 31 deletions(-)

diff --git a/examples/text_summarization/bertabs_cnndm_aml_distributed.ipynb b/examples/text_summarization/bertabs_cnndm_aml_distributed.ipynb
index 7f1d9b1..a9376d0 100644
--- a/examples/text_summarization/bertabs_cnndm_aml_distributed.ipynb
+++ b/examples/text_summarization/bertabs_cnndm_aml_distributed.ipynb
@@ -37,7 +37,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 2,
+    "execution_count": 1,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -46,7 +46,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 3,
+    "execution_count": 2,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -55,7 +55,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 1,
+    "execution_count": 3,
     "metadata": {},
     "outputs": [
      {
@@ -99,7 +99,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 18,
+    "execution_count": 35,
     "metadata": {},
     "outputs": [],
     "source": [
@@ -110,16 +110,31 @@
     "WORKSPACE_NAME = \"YOUR_WORKSPACE_REGION\" # modifiy to use your own\n",
     "\n",
     "SUBSRIPTION_ID = \"9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\"\n",
+    "\n",
+    "#\"\"\"\n",
+    "LOCATION = \"canadacentral\" # example \"eastus2\"\n",
+    "RESOURCE_GROUP = \"daden1amlcc\" # modify to use your own\n",
+    "WORKSPACE_NAME = \"daden1amlccws\" # modify to use your own\n",
+    "\n",
+    "# for creating Azure ML Compute Cluster\n",
+    "AMLCOMPUTE_CLUSTER_NAME = \"bertabs2\" # modify to use your own\n",
+    "NODE_COUNT = 2\n",
+    "VM_SIZE = \"Standard_NC12s_v3\" # this should be the VM that's supported by Azure and Azure ML\n",
+    "VM_SIZE = \"STANDARD_NC24RS_V3\"\n",
+    "CONFIG_PATH = \"./.azureml_canadacentral\"\n",
+    "#\"\"\"\n",
+    "\n",
+    "\"\"\"\n",
     "LOCATION = \"eastus2\" # example \"eastus2\"\n",
     "RESOURCE_GROUP = \"daden1aml\" # modifiy to use your own\n",
     "WORKSPACE_NAME = \"daden1amlws\" # modifiy to use your own\n",
     "\n",
-    "\n",
     "# for creating Azure ML Compute Cluster\n",
     "AMLCOMPUTE_CLUSTER_NAME = \"bertabs1\" # modifiy to use your own\n",
     "NODE_COUNT = 2\n",
-    "#VM_SIZE = \"STANDARD_NC12\" # this should be the VM that's supported by Azure and Azure ML\n",
-    "VM_SIZE = \"Standard NC12s_v3\"\n",
+    "VM_SIZE = \"STANDARD_NC12\" # this should be the VM that's supported by Azure and Azure ML\n",
+    "CONFIG_PATH = \"./.azureml_eastus\"\n",
+    "\"\"\"\n",
     "\n",
     "# for creating Azure ML Experiment\n",
     "EXPERIMENT_NAME = \"NLP-BertAbs\" # modifiy to use your own\n",
@@ -169,12 +184,13 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 3,
+    "execution_count": 25,
     "metadata": {},
     "outputs": [],
     "source": [
     "# Create the workspace using the specified parameters\n",
     "ws = get_or_create_workspace(\n",
+    "    config_path=CONFIG_PATH,\n",
     "    workspace_name=WORKSPACE_NAME,\n",
     "    subscription_id=SUBSRIPTION_ID,\n",
     "    resource_group=RESOURCE_GROUP,\n",
@@ -184,17 +200,17 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 4,
+    "execution_count": 26,
     "metadata": {},
     "outputs": [
      {
       "name": "stdout",
       "output_type": "stream",
       "text": [
        "Workspace name: daden1amlccws\n",
        "Azure region: canadacentral\n",
        "Subscription id: 9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\n",
        "Resource group: daden1amlcc\n"
       ]
      }
     ],
@@ -217,15 +233,20 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 5,
+    "execution_count": 36,
     "metadata": {},
     "outputs": [
      {
       "name": "stdout",
       "output_type": "stream",
       "text": [
-       "Found existing compute target.\n",
-       "{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-02-21T17:58:05.717000+00:00', 'errors': None, 'creationTime':
'2020-02-21T17:58:02.904515+00:00', 'modifiedTime': '2020-02-21T17:58:18.620746+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 5, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC12'}\n" + "Creating a new compute target...\n", + "Creating\n", + "Succeeded\n", + "AmlCompute wait for completion finished\n", + "\n", + "Minimum number of nodes requested have been provisioned\n", + "{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-02-24T04:56:51.926000+00:00', 'errors': None, 'creationTime': '2020-02-24T04:56:49.122785+00:00', 'modifiedTime': '2020-02-24T04:57:05.105894+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 2, 'nodeIdleTimeBeforeScaleDown': 'PT600S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC24RS_V3'}\n" ] } ], @@ -237,7 +258,7 @@ " print(\"Creating a new compute target...\")\n", " compute_config = AmlCompute.provisioning_configuration(\n", " vm_size=VM_SIZE, max_nodes=NODE_COUNT, \n", - " idle_seconds_before_scaledown=\"PT1200S\"\n", + " idle_seconds_before_scaledown=\"600\"\n", " )\n", "\n", " # create the cluster\n", @@ -260,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -346,7 +367,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -355,7 +376,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 62, "metadata": {}, "outputs": [ { @@ -364,18 +385,19 @@ "text": [ "Uploading an estimated of 2 files\n", "Uploading ./bert_abs_data/train_abssum_dataset_full.pt\n", - "Target already exists. 
Skipping upload for bertabs_processed_data/test_abssum_dataset_full.pt\n", - "Uploaded ./bert_abs_data/train_abssum_dataset_full.pt, 1 files out of an estimated total of 2\n", - "Uploaded 1 files\n" + "Uploading ./bert_abs_data/test_abssum_dataset_full.pt\n", + "Uploaded ./bert_abs_data/test_abssum_dataset_full.pt, 1 files out of an estimated total of 2\n", + "Uploaded ./bert_abs_data/train_abssum_dataset_full.pt, 2 files out of an estimated total of 2\n", + "Uploaded 2 files\n" ] }, { "data": { "text/plain": [ - "$AZUREML_DATAREFERENCE_fb1ab46646bf409496082009759b990b" + "$AZUREML_DATAREFERENCE_f6e4f6e119d94f68ac66a4557e5f179c" ] }, - "execution_count": 25, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -414,7 +436,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -439,9 +461,9 @@ "source": [ "ENTRY_SCRIPT = \"bertabs_cnndm_distributed_train.py\"\n", "!mkdir -p {PROJECT_FOLDER}\n", - "#!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n", + "!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n", "#!cp ./nlp_gpu.yaml {PROJECT_FOLDER}\n", - "!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n", + "#!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n", "!cp -r ../../utils_nlp {PROJECT_FOLDER}" ] }, @@ -464,7 +486,27 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "NODE_COUNT" + ] + }, + { + "cell_type": "code", + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -490,6 +532,7 @@ " \"--quick_run\": 'true',\n", " \"--summary_filename\": f'{SUMMARY_FILENAME}',\n", " \"--model_filename\": f'{MODEL_FILENAME}',\n", + " \"--batch_size\": 4,\n", " },\n", " entry_script= ENTRY_SCRIPT,\n", " node_count=NODE_COUNT,\n", @@ -500,7 +543,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -509,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": { "scrolled": true }, @@ -517,7 +560,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f96b6c43c0414398a236fe3c9127757d", + "model_id": "2e15a2c42f9d46a7a9d983f137ba1b11", "version_major": 2, "version_minor": 0 }, From 0aa7c0bf2f8b85b4062a90a73ade46a00d1af927 Mon Sep 17 00:00:00 2001 From: Daisy Deng Date: Mon, 24 Feb 2020 21:31:33 +0000 Subject: [PATCH 5/5] save work --- tests/unit/test_bertabs_abssum.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/unit/test_bertabs_abssum.py b/tests/unit/test_bertabs_abssum.py index fd63caa..fe8c4ad 100644 --- a/tests/unit/test_bertabs_abssum.py +++ b/tests/unit/test_bertabs_abssum.py @@ -126,7 +126,7 @@ def main(): #shutil.rmtree(args.output_dir) args = parser.parse_args() - ngpus_per_node = torch.cuda.device_count() + ngpus_per_node = 1 #torch.cuda.device_count() processor = AbsSumProcessor(cache_dir=CACHE_PATH) summarizer = AbsSum( processor, cache_dir=CACHE_PATH @@ -168,7 +168,7 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args): num_gpus=None, local_rank=local_rank, rank=rank, - batch_size=6, + batch_size=8, max_steps=50000/world_size, learning_rate_bert=0.003, learning_rate_dec=0.3, @@ -178,7 +178,7 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args): report_every=10, 
validation_function=this_validate, fp16=True, - fp16_opt_level="O1", + fp16_opt_level="O2", checkpoint=None ) if rank == 0 or local_rank == -1: @@ -208,16 +208,16 @@ def test_train_model(): return validate(class_obj, test_sum_dataset, CACHE_PATH) summarizer.fit( train_sum_dataset, - batch_size=8, + batch_size=6, max_steps=30000, local_rank=-1, learning_rate_bert=0.002, learning_rate_dec=0.2, warmup_steps_bert=20000, warmup_steps_dec=10000, - num_gpus=2, + num_gpus=1, report_every=10, - save_every=100, + save_every=400, validation_function=this_validate, fp16=False, fp16_opt_level="O1", @@ -259,7 +259,7 @@ def test_pretrained_model(): checkpoint = torch.load(os.path.join(MODEL_PATH, "new_model_step_148000_torch1.4.0.pt")) #checkpoint = torch.load(os.path.join(MODEL_PATH, "summarizer_step20000_with_global_step.pt")) - + checkpoint = torch.load(os.path.join(MODEL_PATH, "bert-base-uncased_step_400.pt")) summarizer = AbsSum( processor, cache_dir=CACHE_PATH, @@ -284,13 +284,14 @@ def test_pretrained_model(): return """ - top_n = 10 + top_n = 8 src = test_sum_dataset.source[0:top_n] reference_summaries = ["".join(t).rstrip("\n") for t in test_sum_dataset.target[0:top_n]] print("start prediction") generated_summaries = summarizer.predict( - shorten_dataset(test_sum_dataset, top_n=top_n), batch_size=3, num_gpus=2 + shorten_dataset(test_sum_dataset, top_n=top_n), batch_size=4, num_gpus=2 ) + print(generated_summaries[0]) assert len(generated_summaries) == len(reference_summaries) RESULT_DIR = TemporaryDirectory().name rouge_score = get_rouge(generated_summaries, reference_summaries, RESULT_DIR)
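
Notes on the patterns used in these patches (illustrative sketches, not part of the diffs above).

PATCH 1 changes the training log from a per-step figure to a per-window one: the loss is averaged over report_every steps, and the number of examples is accumulated from the batch dimension of the inputs and reset together with the accumulated loss. A minimal runnable sketch of that pattern; compute_loss and the toy batches list are stand-ins, not code from the repository:

    import torch

    def compute_loss(inputs):
        # stand-in for the real forward/backward pass
        return inputs["src"].float().mean()

    batches = [{"src": torch.ones(4, 10), "mask": torch.ones(4, 10)} for _ in range(20)]
    report_every = 10
    accum_loss, train_size = 0.0, 0
    for step, inputs in enumerate(batches):
        loss = compute_loss(inputs)
        accum_loss += loss.item()
        # every tensor in `inputs` shares the batch dimension, so any of them works
        train_size += list(inputs.values())[0].size()[0]
        if (step + 1) % report_every == 0:
            print("average loss: {0:.6f}, examples since last report: {1:d}".format(
                accum_loss / report_every, train_size))
            accum_loss, train_size = 0.0, 0  # reset the reporting window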
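PATCH 3 exposes mixed precision through a string-valued --fp16 flag rather than a boolean one, so shell scripts can pass any casing of "true"/"false". A small self-contained sketch of that argparse pattern (the list passed to parse_args is only there to make the example runnable):

    import argparse

    parser = argparse.ArgumentParser()
    # type=str.lower runs before the choices check, so "True" and "FALSE" both pass
    parser.add_argument("--fp16", type=str.lower, default="false",
                        choices=["true", "false"])
    parser.add_argument("--fp16_opt_level", type=str.upper, default="O2",
                        choices=["O0", "O1", "O2", "O3"])
    args = parser.parse_args(["--fp16", "True"])
    fp16 = args.fp16 == "true"  # the value is already lower-cased by the converter
    print(fp16, args.fp16_opt_level)  # True O2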
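PATCH 3 also hands only a checkpoint path into the worker and leaves the torch.load to later code, which is what its "huge memory increase" comment is about: if the training script launches workers with torch.multiprocessing.spawn, anything loaded in the parent is pickled and copied into every child process. A hedged sketch of the shape of that pattern; the worker body is hypothetical and nothing here comes from the patched script itself:

    import torch
    import torch.multiprocessing as mp

    def worker(local_rank, checkpoint_path):
        # local_rank is the process index that mp.spawn passes in.
        # Load inside the spawned process so each rank holds exactly one copy.
        checkpoint = torch.load(checkpoint_path, map_location="cpu") if checkpoint_path else None
        # ... initialize the process group and run training with `checkpoint` ...

    if __name__ == "__main__":
        mp.spawn(worker, nprocs=2, args=(None,))  # pass a real checkpoint path to resume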