This commit is contained in:
Daisy Deng 2020-03-21 05:34:44 +00:00
Родитель 65e5ab36e7
Коммит 084229b9be
1 изменённый файл: 138 добавлений и 138 удалений

Просмотреть файл

@ -37,7 +37,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -46,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@ -76,8 +76,13 @@
"if nlp_path not in sys.path:\n",
" sys.path.insert(0, nlp_path)\n",
"from utils_nlp.azureml.azureml_utils import get_or_create_workspace\n",
"from utils_nlp.dataset.cnndm import CNNDMBertSumProcessedData, CNNDMSummarizationDataset\n",
"\n",
"from utils_nlp.dataset.cnndm import CNNDMSummarizationDataset\n",
"from utils_nlp.eval import compute_rouge_python\n",
"from utils_nlp.models.transformers.extractive_summarization import (\n",
" ExtractiveSummarizer,\n",
" ExtSumProcessedData,\n",
" ExtSumProcessor,\n",
")\n",
"# Check core SDK version number\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
@ -91,7 +96,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -101,10 +106,9 @@
"RESOURCE_GROUP = \"YOUR_WORKSPACE_NAME\" # modify to use your own\n",
"WORKSPACE_NAME = \"YOUR_WORKSPACE_REGION\" # modify to use your own\n",
"\n",
"\n",
"# for creating Azure ML Compute Cluster\n",
"AMLCOMPUTE_CLUSTER_NAME = \"extsum5\" # modifiy to use your own\n",
"NODE_COUNT = 4\n",
"AMLCOMPUTE_CLUSTER_NAME = \"bertsumext\" # modify to use your own\n",
"NODE_COUNT = 2\n",
"VM_SIZE = \"STANDARD_NC6\" # this should be the VM that's supported by Azure and Azure ML\n",
"\n",
"\n",
@ -150,7 +154,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@ -165,9 +169,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Workspace name: daden1amlwseastus\n",
"Azure region: eastus\n",
"Subscription id: 9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\n",
"Resource group: daden1amleastus\n"
]
}
],
"source": [
"print(\n",
" \"Workspace name: \" + ws.name,\n",
@ -187,7 +202,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@ -195,7 +210,7 @@
"output_type": "stream",
"text": [
"Found existing compute target.\n",
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-01-30T22:43:39.856000+00:00', 'errors': None, 'creationTime': '2020-01-23T04:50:26.160743+00:00', 'modifiedTime': '2020-01-23T20:31:35.349184+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT1200S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-03-21T04:18:26.559000+00:00', 'errors': None, 'creationTime': '2020-03-21T04:18:20.466141+00:00', 'modifiedTime': '2020-03-21T04:18:37.162465+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 2, 'nodeIdleTimeBeforeScaleDown': 'PT600S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
]
}
],
@ -206,7 +221,8 @@
"except ComputeTargetException:\n",
" print(\"Creating a new compute target...\")\n",
" compute_config = AmlCompute.provisioning_configuration(\n",
" vm_size=VM_SIZE, max_nodes=NODE_COUNT, NodeIdleTimeBeforeScaleDown=\"PT1200S\"\n",
" vm_size=VM_SIZE, max_nodes=NODE_COUNT, \n",
" idle_seconds_before_scaledown=\"600\"\n",
" )\n",
"\n",
" # create the cluster\n",
@ -229,79 +245,26 @@
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"experiment = Experiment(ws, name=EXPERIMENT_NAME)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Dataset to Local File System"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"!mkdir -p {LOCAL_DATA_FOLDER}"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"bertsum_data.zip: 869MB [00:29, 29.7MB/s] \n"
]
},
{
"data": {
"text/plain": [
"'./bertsumdata/'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"CNNDMBertSumProcessedData.download(local_path=LOCAL_DATA_FOLDER)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Upload the Downloaded Dataset to AML Workspace"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"experiment = Experiment(ws, name=EXPERIMENT_NAME)\n",
"ds = ws.get_default_datastore()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ds.upload(src_dir=LOCAL_DATA_FOLDER, target_path=TARGET_DATA_FOLDER)"
"\"\"\" No need to download data \n",
"!mkdir -p {LOCAL_DATA_FOLDER}\n",
"CNNDMBertSumProcessedData.download(local_path=LOCAL_DATA_FOLDER)\n",
"\n",
"#ds.upload(src_dir=LOCAL_DATA_FOLDER, target_path=TARGET_DATA_FOLDER)\n",
"\"\"\""
]
},
{
@ -314,7 +277,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@ -354,7 +317,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@ -364,15 +327,15 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:azureml.train.estimator._framework_base_estimator:If environment_definition or conda_dependencies_file is specified, Azure ML will not install any framework related packages on behalf of the user.\n",
"WARNING:azureml.train.estimator._framework_base_estimator:framework_version is not specified, defaulting to version 1.3.\n"
"WARNING - If environment_definition or conda_dependencies_file is specified, Azure ML will not install any framework related packages on behalf of the user.\n",
"WARNING - framework_version is not specified, defaulting to version 1.3.\n"
]
}
],
@ -402,7 +365,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@ -411,7 +374,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 13,
"metadata": {
"scrolled": true
},
@ -419,7 +382,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "97f3678284a44f7aab5c27fa3e19bb11",
"model_id": "c654a5b7687141c2bf29b0f8efb27f14",
"version_major": 2,
"version_minor": 0
},
@ -429,6 +392,13 @@
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/aml.mini.widget.v1": "{\"status\": \"Queued\", \"workbench_run_details_uri\": \"https://ml.azure.com/experiments/NLP-ExtSum/runs/NLP-ExtSum_1584763863_e337b0ee?wsid=/subscriptions/9086b59a-02d7-4687-b3fd-e39fa5e0fd9b/resourcegroups/daden1aml/workspaces/daden1amlws\", \"run_id\": \"NLP-ExtSum_1584763863_e337b0ee\", \"run_properties\": {\"run_id\": \"NLP-ExtSum_1584763863_e337b0ee\", \"created_utc\": \"2020-03-21T04:11:09.358987Z\", \"properties\": {\"_azureml.ComputeTargetType\": \"amlcompute\", \"ContentSnapshotId\": \"2394595f-3a71-4c08-9a86-355ade205ff3\", \"azureml.git.repository_uri\": \"https://github.com/microsoft/nlp-recipes.git\", \"mlflow.source.git.repoURL\": \"https://github.com/microsoft/nlp-recipes.git\", \"azureml.git.branch\": \"daden/bertsumext\", \"mlflow.source.git.branch\": \"daden/bertsumext\", \"azureml.git.commit\": \"2e6a9379a7dcb94262d6be7dd3e304a056ec03c5\", \"mlflow.source.git.commit\": \"2e6a9379a7dcb94262d6be7dd3e304a056ec03c5\", \"azureml.git.dirty\": \"True\", \"AzureML.DerivedImageName\": \"azureml/azureml_901415173bf81e758fea3fcc8a8a9c07\", \"ProcessInfoFile\": \"azureml-logs/process_info.json\", \"ProcessStatusFile\": \"azureml-logs/process_status.json\"}, \"tags\": {\"_aml_system_ComputeTargetStatus\": \"{\\\"AllocationState\\\":\\\"steady\\\",\\\"PreparingNodeCount\\\":0,\\\"RunningNodeCount\\\":0,\\\"CurrentNodeCount\\\":0}\"}, \"script_name\": null, \"arguments\": null, \"end_time_utc\": null, \"status\": \"Queued\", \"log_files\": {}, \"log_groups\": [], \"run_duration\": \"0:08:09\"}, \"child_runs\": [], \"children_metrics\": {}, \"run_metrics\": [], \"run_logs\": \"Your job is submitted in Azure cloud and we are monitoring to get logs...\", \"graph\": {}, \"widget_settings\": {\"childWidgetDisplay\": \"popup\", \"send_telemetry\": false, \"log_level\": \"INFO\", \"sdk_version\": \"1.0.85\"}, \"loading\": false}"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
@ -497,43 +467,6 @@
" show_progress=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"from utils_nlp.eval.evaluate_summarization import get_rouge\n",
"from utils_nlp.models.transformers.extractive_summarization import ExtSumProcessedData\n",
"import pickle\n",
"from utils_nlp.models.transformers.extractive_summarization import ExtractiveSummarizer"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"train_dataset, test_dataset = ExtSumProcessedData().splits(root=LOCAL_DATA_FOLDER)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"target = [i['tgt_txt'] for i in test_dataset]"
]
},
{
"cell_type": "code",
"execution_count": 30,
@ -546,6 +479,80 @@
" prediction.append(line[0:-1]) # remove the ending \"\\n\""
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"prediction[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Compare with gold summaries"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"TOP_N = 100"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"processor = ExtSumProcessor()\n",
"_, test_dataset = CNNDMSummarizationDataset(top_n=TOP_N, local_cache_path=LOCAL_DATA_FOLDER)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"source = []\n",
"temp_target = []\n",
"for i in ext_sum_test:\n",
" source.append(i[\"src_txt\"]) \n",
" temp_target.append(\" \".join(j) for j in i['tgt']) \n",
"target = [''.join(i) for i in list(temp_target)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"target[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"source[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download and evaluation the trained model"
]
},
{
"cell_type": "code",
"execution_count": 36,
@ -575,11 +582,14 @@
"ds.download(target_path=LOCAL_OUTPUT_DIR,\n",
" prefix=f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}',\n",
" show_progress=True)\n",
"summarizer = ExtractiveSummarizer(MODEL_NAME, ENCODER, LOCAL_OUTPUT_DIR)\n",
"\n",
"processor = ExtSumProcessor()\n",
"summarizer = ExtractiveSummarizer(processor, encoder=ENCODER, cache_dir=LOCAL_OUTPUT_DIR)\n",
"summarizer.model.load_state_dict(\n",
" torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'))\n",
" torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'),\n",
" map_location=\"cpu\"))\n",
")\n",
"prediction = summarizer.predict(test_dataset, num_gpus=torch.cuda.device_count(), batch_size=128)\n",
"prediction = summarizer.predict(test_dataset[0:TOP_N], num_gpus=torch.cuda.device_count(), batch_size=128, sentence_separator = \"\\n\")\n",
"#\"\"\""
]
},
@ -616,7 +626,7 @@
}
],
"source": [
"test_dataset[0]['src_txt']"
"source[0]"
]
},
{
@ -659,15 +669,6 @@
"target[0]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"RESULT_DIR = TemporaryDirectory().name"
]
},
{
"cell_type": "code",
"execution_count": 40,
@ -716,7 +717,8 @@
}
],
"source": [
"rouge_score = get_rouge(prediction, target, RESULT_DIR)"
"rouge_scores = compute_rouge_python(cand=prediction, ref=target)\n",
"pprint.pprint(rouge_scores)"
]
},
{
@ -738,9 +740,7 @@
"if os.path.exists(LOCAL_OUTPUT_DIR):\n",
" shutil.rmtree(LOCAL_OUTPUT_DIR, ignore_errors=True)\n",
"if os.path.exists(PROJECT_FOLDER):\n",
" shutil.rmtree(PROJECT_FOLDER, ignore_errors=True)\n",
"if os.path.exists(RESULT_DIR):\n",
" shutil.rmtree(RESULT_DIR, ignore_errors=True)"
" shutil.rmtree(PROJECT_FOLDER, ignore_errors=True)"
]
}
],