Daisy Deng 2020-02-24 21:48:35 +00:00
Parents 0bfbba1000 10185d6f3c
Commit c836249713
5 changed files: 119 additions and 54 deletions

View file

@ -37,7 +37,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -46,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@ -55,7 +55,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@ -99,7 +99,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
@ -110,16 +110,31 @@
"WORKSPACE_NAME = \"YOUR_WORKSPACE_REGION\" # modifiy to use your own\n",
"\n",
"SUBSRIPTION_ID = \"9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\"\n",
"\n",
"#\"\"\"\n",
"LOCATION = \"canadacentral\" # example \"eastus2\"\n",
"RESOURCE_GROUP = \"daden1amlcc\" # modifiy to use your own\n",
"WORKSPACE_NAME = \"daden1amlccws\" # modifiy to use your own\n",
"\n",
"# for creating Azure ML Compute Cluster\n",
"AMLCOMPUTE_CLUSTER_NAME = \"bertabs2\" # modifiy to use your own\n",
"NODE_COUNT = 2\n",
"VM_SIZE = \"Standard_NC12s_v3\" # this should be the VM that's supported by Azure and Azure ML\n",
"VM_SIZE = \"STANDARD_NC24RS_V3\"\n",
"CONFIG_PATH = \"./.azureml_canadacentral\"\n",
"#\"\"\"\n",
"\n",
"\"\"\"\n",
"LOCATION = \"eastus2\" # example \"eastus2\"\n",
"RESOURCE_GROUP = \"daden1aml\" # modifiy to use your own\n",
"WORKSPACE_NAME = \"daden1amlws\" # modifiy to use your own\n",
"\n",
"\n",
"# for creating Azure ML Compute Cluster\n",
"AMLCOMPUTE_CLUSTER_NAME = \"bertabs1\" # modifiy to use your own\n",
"NODE_COUNT = 2\n",
"#VM_SIZE = \"STANDARD_NC12\" # this should be the VM that's supported by Azure and Azure ML\n",
"VM_SIZE = \"Standard NC12s_v3\"\n",
"VM_SIZE = \"STANDARD_NC12\" # this should be the VM that's supported by Azure and Azure ML\n",
"CONFIG_PATH = \"./.azureml_eastus\"\n",
"\"\"\"\n",
"\n",
"# for creating Azure ML Experiment\n",
"EXPERIMENT_NAME = \"NLP-BertAbs\" # modifiy to use your own\n",
@ -169,12 +184,13 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# Create the workspace using the specified parameters\n",
"ws = get_or_create_workspace(\n",
" config_path=CONFIG_PATH,\n",
" workspace_name=WORKSPACE_NAME,\n",
" subscription_id=SUBSRIPTION_ID,\n",
" resource_group=RESOURCE_GROUP,\n",
@ -184,17 +200,17 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Workspace name: daden1amlws\n",
"Azure region: eastus2\n",
"Workspace name: daden1amlccws\n",
"Azure region: canadacentral\n",
"Subscription id: 9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\n",
"Resource group: daden1aml\n"
"Resource group: daden1amlcc\n"
]
}
],
@ -217,15 +233,20 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found existing compute target.\n",
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-02-21T17:58:05.717000+00:00', 'errors': None, 'creationTime': '2020-02-21T17:58:02.904515+00:00', 'modifiedTime': '2020-02-21T17:58:18.620746+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 5, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC12'}\n"
"Creating a new compute target...\n",
"Creating\n",
"Succeeded\n",
"AmlCompute wait for completion finished\n",
"\n",
"Minimum number of nodes requested have been provisioned\n",
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-02-24T04:56:51.926000+00:00', 'errors': None, 'creationTime': '2020-02-24T04:56:49.122785+00:00', 'modifiedTime': '2020-02-24T04:57:05.105894+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 2, 'nodeIdleTimeBeforeScaleDown': 'PT600S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC24RS_V3'}\n"
]
}
],
@ -237,7 +258,7 @@
" print(\"Creating a new compute target...\")\n",
" compute_config = AmlCompute.provisioning_configuration(\n",
" vm_size=VM_SIZE, max_nodes=NODE_COUNT, \n",
" idle_seconds_before_scaledown=\"PT1200S\"\n",
" idle_seconds_before_scaledown=\"600\"\n",
" )\n",
"\n",
" # create the cluster\n",
@ -260,7 +281,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
@ -346,7 +367,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@ -355,7 +376,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 62,
"metadata": {},
"outputs": [
{
@ -364,18 +385,19 @@
"text": [
"Uploading an estimated of 2 files\n",
"Uploading ./bert_abs_data/train_abssum_dataset_full.pt\n",
"Target already exists. Skipping upload for bertabs_processed_data/test_abssum_dataset_full.pt\n",
"Uploaded ./bert_abs_data/train_abssum_dataset_full.pt, 1 files out of an estimated total of 2\n",
"Uploaded 1 files\n"
"Uploading ./bert_abs_data/test_abssum_dataset_full.pt\n",
"Uploaded ./bert_abs_data/test_abssum_dataset_full.pt, 1 files out of an estimated total of 2\n",
"Uploaded ./bert_abs_data/train_abssum_dataset_full.pt, 2 files out of an estimated total of 2\n",
"Uploaded 2 files\n"
]
},
{
"data": {
"text/plain": [
"$AZUREML_DATAREFERENCE_fb1ab46646bf409496082009759b990b"
"$AZUREML_DATAREFERENCE_f6e4f6e119d94f68ac66a4557e5f179c"
]
},
"execution_count": 25,
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
@ -414,7 +436,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 29,
"metadata": {},
"outputs": [
{
@ -439,9 +461,9 @@
"source": [
"ENTRY_SCRIPT = \"bertabs_cnndm_distributed_train.py\"\n",
"!mkdir -p {PROJECT_FOLDER}\n",
"#!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n",
"!python ../../tools/generate_conda_file.py --gpu --name {CONDA_ENV_NAME}\n",
"#!cp ./nlp_gpu.yaml {PROJECT_FOLDER}\n",
"!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n",
"#!cp {ENTRY_SCRIPT} {PROJECT_FOLDER}\n",
"!cp -r ../../utils_nlp {PROJECT_FOLDER}"
]
},
@ -464,7 +486,27 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"NODE_COUNT"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
@ -490,6 +532,7 @@
" \"--quick_run\": 'true',\n",
" \"--summary_filename\": f'{SUMMARY_FILENAME}',\n",
" \"--model_filename\": f'{MODEL_FILENAME}',\n",
" \"--batch_size\": 4,\n",
" },\n",
" entry_script= ENTRY_SCRIPT,\n",
" node_count=NODE_COUNT,\n",
@ -500,7 +543,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
@ -509,7 +552,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {
"scrolled": true
},
@ -517,7 +560,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f96b6c43c0414398a236fe3c9127757d",
"model_id": "2e15a2c42f9d46a7a9d983f137ba1b11",
"version_major": 2,
"version_minor": 0
},

View file

@ -3,6 +3,7 @@
import argparse
import os
import pickle
import sys
import time
import torch
@ -68,6 +69,18 @@ parser.add_argument("--summary_filename", type=str, default="generated_summaries
help="Summary file name generated by prediction for evaluation.")
parser.add_argument("--model_filename", type=str, default="dist_extsum_model.pt",
help="model file name saved for evaluation.")
parser.add_argument("--checkpoint_filename", type=str, default=None,
help="filename of a checkpoint where the trainging resumes from. \
default path is at cache_dir")
parser.add_argument("--report_every", type=int, default=10,
help="number of steps between each loss report")
parser.add_argument("--save_every", type=int, default=500,
help="number of steps between each model save and validation")
parser.add_argument("--fp16", type=str.lower, default='false', choices=['true', 'false'],
help="Whether to use mixed precision training")
parser.add_argument("--fp16_opt_level", type=str.upper, default='O2', choices=['O0', 'O1', 'O2', 'O3'],
help="optimization level, refer to https://nvidia.github.io/apex/amp.html#opt-levels for details ")
def pretrained_model():
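A quick usage sketch for the flags added above, assuming the parser's remaining arguments all carry defaults. argparse applies type=str.lower / str.upper before validating against choices, so mixed-case input is accepted:

args = parser.parse_args(
    ["--fp16", "True", "--fp16_opt_level", "o2",
     "--report_every", "10", "--save_every", "500"]
)
assert args.fp16 == "true" and args.fp16_opt_level == "O2"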
@ -93,7 +106,6 @@ def main():
print("output_dir is {}".format(args.output_dir))
print("data_dir is {}".format(args.data_dir))
print("cache_dir is {}".format(args.cache_dir))
ngpus_per_node = torch.cuda.device_count()
processor = AbsSumProcessor(cache_dir=args.cache_dir)
summarizer = AbsSum(
@ -115,8 +127,11 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
world_size=world_size,
rank=rank,
)
checkpoint = None
# Resolve only the checkpoint path here; loading the checkpoint itself in this process causes a huge memory increase.
if args.checkpoint_filename:
checkpoint = os.path.join(args.cache_dir, args.checkpoint_filename)
else:
checkpoint = None
train_sum_dataset, test_sum_dataset = load_processed_cnndm_abs(args.data_dir)
def this_validate(class_obj):
return validate(class_obj, test_sum_dataset, args.cache_dir)
@ -125,8 +140,10 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
save_every = -1
this_validate = None
else:
save_every = 400
save_every = args.save_every
fp16 = args.fp16.lower() == 'true'
print("fp16 is {}".format(fp16))
# total number of steps for training
MAX_STEPS = 400
# number of steps for warm up
@ -156,17 +173,17 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
warmup_steps_bert=WARMUP_STEPS_BERT,
warmup_steps_dec=WARMUP_STEPS_DEC,
save_every=save_every,
report_every=10,
report_every=args.report_every,
validation_function=this_validate,
fp16=True,
fp16_opt_level="O2",
checkpoint=None
fp16=fp16,
fp16_opt_level=args.fp16_opt_level,
checkpoint=checkpoint
)
end = time.time()
print("rank {0}, duration {1:.6f}s".format(rank, end - start))
if rank == 0 or local_rank == -1:
saved_model_path = os.path.join(args.output_dir, "summarizer_step{}_with_glocal_step.pt".format(MAX_STEPS))
saved_model_path = os.path.join(args.output_dir, "{}_step{}".format(args.model_filename, MAX_STEPS))
summarizer.save_model(MAX_STEPS, saved_model_path)
top_n = 8
prediction = summarizer.predict(shorten_dataset(test_sum_dataset, top_n=top_n),

View file

@ -126,7 +126,7 @@ def main():
#shutil.rmtree(args.output_dir)
args = parser.parse_args()
ngpus_per_node = torch.cuda.device_count()
ngpus_per_node = 1 #torch.cuda.device_count()
processor = AbsSumProcessor(cache_dir=CACHE_PATH)
summarizer = AbsSum(
processor, cache_dir=CACHE_PATH
@ -168,7 +168,7 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
num_gpus=None,
local_rank=local_rank,
rank=rank,
batch_size=6,
batch_size=8,
max_steps=50000/world_size,
learning_rate_bert=0.003,
learning_rate_dec=0.3,
@ -178,7 +178,7 @@ def main_worker(local_rank, ngpus_per_node, summarizer, args):
report_every=10,
validation_function=this_validate,
fp16=True,
fp16_opt_level="O1",
fp16_opt_level="O2",
checkpoint=None
)
if rank == 0 or local_rank == -1:
@ -208,16 +208,16 @@ def test_train_model():
return validate(class_obj, test_sum_dataset, CACHE_PATH)
summarizer.fit(
train_sum_dataset,
batch_size=8,
batch_size=6,
max_steps=30000,
local_rank=-1,
learning_rate_bert=0.002,
learning_rate_dec=0.2,
warmup_steps_bert=20000,
warmup_steps_dec=10000,
num_gpus=2,
num_gpus=1,
report_every=10,
save_every=100,
save_every=400,
validation_function=this_validate,
fp16=False,
fp16_opt_level="O1",
@ -259,7 +259,7 @@ def test_pretrained_model():
checkpoint = torch.load(os.path.join(MODEL_PATH, "new_model_step_148000_torch1.4.0.pt"))
#checkpoint = torch.load(os.path.join(MODEL_PATH, "summarizer_step20000_with_global_step.pt"))
checkpoint = torch.load(os.path.join(MODEL_PATH, "bert-base-uncased_step_400.pt"))
summarizer = AbsSum(
processor,
cache_dir=CACHE_PATH,
@ -284,13 +284,14 @@ def test_pretrained_model():
return
"""
top_n = 10
top_n = 8
src = test_sum_dataset.source[0:top_n]
reference_summaries = ["".join(t).rstrip("\n") for t in test_sum_dataset.target[0:top_n]]
print("start prediction")
generated_summaries = summarizer.predict(
shorten_dataset(test_sum_dataset, top_n=top_n), batch_size=3, num_gpus=2
shorten_dataset(test_sum_dataset, top_n=top_n), batch_size=4, num_gpus=2
)
print(generated_summaries[0])
assert len(generated_summaries) == len(reference_summaries)
RESULT_DIR = TemporaryDirectory().name
rouge_score = get_rouge(generated_summaries, reference_summaries, RESULT_DIR)

View file

@ -9,6 +9,7 @@ from collections import namedtuple
import itertools
import logging
import os
import pickle
import random
import numpy as np

View file

@ -224,6 +224,7 @@ class Transformer:
# init training
tr_loss = 0.0
accum_loss = 0
train_size = 0
self.model.train()
self.model.zero_grad()
@ -254,7 +255,7 @@ class Transformer:
tr_loss += loss.item()
accum_loss += loss.item()
train_size += list(inputs.values())[0].size()[0]
if (step + 1) % gradient_accumulation_steps == 0:
global_step += 1
@ -274,13 +275,14 @@ class Transformer:
endtime_string = datetime.datetime.fromtimestamp(end).strftime(
"%d/%m/%Y %H:%M:%S"
)
log_line = """timestamp: {0:s}, loss: {1:.6f}, time duration: {2:f},
number of examples in current step: {3:.0f}, step {4:.0f}
log_line = """timestamp: {0:s}, average loss: {1:.6f}, time duration: {2:f},
number of examples in current reporting window: {3:.0f}, step {4:.0f}
out of total {5:.0f}""".format(
endtime_string,
accum_loss / report_every,
end - start,
list(inputs.values())[0].size()[0],
#list(inputs.values())[0].size()[0],
train_size,
global_step,
max_steps,
@ -288,6 +290,7 @@ class Transformer:
logger.info(log_line)
print(log_line)
accum_loss = 0
train_size = 0
start = end
if optimizer:
if type(optimizer) == list:
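A toy sketch of the reporting change in this hunk (the batch stream and values are illustrative): both the loss and the example count now accumulate across the reporting window and reset after each report, so the log shows a window-average loss and the total examples seen since the last report, rather than the size of the final batch only:

report_every = 10
batches = [(0.5, 4)] * 100  # hypothetical (loss, examples-in-batch) pairs

accum_loss, train_size = 0.0, 0
for step, (batch_loss, batch_examples) in enumerate(batches):
    accum_loss += batch_loss
    train_size += batch_examples
    if (step + 1) % report_every == 0:
        print("average loss: {0:.6f}, examples in current reporting window: {1:d}".format(
            accum_loss / report_every, train_size))
        accum_loss, train_size = 0.0, 0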