This commit is contained in:
Daisy Deng 2020-03-21 05:34:44 +00:00
Родитель 65e5ab36e7
Коммит 084229b9be
1 изменённый файл: 138 добавлений и 138 удалений

Просмотреть файл

@ -37,7 +37,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -46,7 +46,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@ -76,8 +76,13 @@
"if nlp_path not in sys.path:\n",
" sys.path.insert(0, nlp_path)\n",
"from utils_nlp.azureml.azureml_utils import get_or_create_workspace\n",
"from utils_nlp.dataset.cnndm import CNNDMBertSumProcessedData, CNNDMSummarizationDataset\n",
"\n",
"from utils_nlp.dataset.cnndm import CNNDMSummarizationDataset\n",
"from utils_nlp.eval import compute_rouge_python\n",
"from utils_nlp.models.transformers.extractive_summarization import (\n",
" ExtractiveSummarizer,\n",
" ExtSumProcessedData,\n",
" ExtSumProcessor,\n",
")\n",
"# Check core SDK version number\n",
"print(\"SDK version:\", azureml.core.VERSION)"
]
@ -91,7 +96,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@ -101,10 +106,9 @@
"RESOURCE_GROUP = \"YOUR_WORKSPACE_NAME\" # modify to use your own\n",
"WORKSPACE_NAME = \"YOUR_WORKSPACE_REGION\" # modify to use your own\n",
"\n",
"\n",
"# for creating Azure ML Compute Cluster\n",
"AMLCOMPUTE_CLUSTER_NAME = \"extsum5\" # modifiy to use your own\n",
"NODE_COUNT = 4\n",
"AMLCOMPUTE_CLUSTER_NAME = \"bertsumext\" # modify to use your own\n",
"NODE_COUNT = 2\n",
"VM_SIZE = \"STANDARD_NC6\" # this should be the VM that's supported by Azure and Azure ML\n",
"\n",
"\n",
@ -150,7 +154,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@ -165,9 +169,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Workspace name: daden1amlwseastus\n",
"Azure region: eastus\n",
"Subscription id: 9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\n",
"Resource group: daden1amleastus\n"
]
}
],
"source": [
"print(\n",
" \"Workspace name: \" + ws.name,\n",
@ -187,7 +202,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@ -195,7 +210,7 @@
"output_type": "stream",
"text": [
"Found existing compute target.\n",
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-01-30T22:43:39.856000+00:00', 'errors': None, 'creationTime': '2020-01-23T04:50:26.160743+00:00', 'modifiedTime': '2020-01-23T20:31:35.349184+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT1200S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-03-21T04:18:26.559000+00:00', 'errors': None, 'creationTime': '2020-03-21T04:18:20.466141+00:00', 'modifiedTime': '2020-03-21T04:18:37.162465+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 2, 'nodeIdleTimeBeforeScaleDown': 'PT600S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
]
}
],
@ -206,7 +221,8 @@
"except ComputeTargetException:\n",
" print(\"Creating a new compute target...\")\n",
" compute_config = AmlCompute.provisioning_configuration(\n",
" vm_size=VM_SIZE, max_nodes=NODE_COUNT, NodeIdleTimeBeforeScaleDown=\"PT1200S\"\n",
" vm_size=VM_SIZE, max_nodes=NODE_COUNT, \n",
" idle_seconds_before_scaledown=\"600\"\n",
" )\n",
"\n",
" # create the cluster\n",
@ -229,79 +245,26 @@
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"experiment = Experiment(ws, name=EXPERIMENT_NAME)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download Dataset to Local File System"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"!mkdir -p {LOCAL_DATA_FOLDER}"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"bertsum_data.zip: 869MB [00:29, 29.7MB/s] \n"
]
},
{
"data": {
"text/plain": [
"'./bertsumdata/'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"CNNDMBertSumProcessedData.download(local_path=LOCAL_DATA_FOLDER)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Upload the Downloaded Dataset to AML Workspace"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"experiment = Experiment(ws, name=EXPERIMENT_NAME)\n",
"ds = ws.get_default_datastore()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ds.upload(src_dir=LOCAL_DATA_FOLDER, target_path=TARGET_DATA_FOLDER)"
"\"\"\" No need to download data \n",
"!mkdir -p {LOCAL_DATA_FOLDER}\n",
"CNNDMBertSumProcessedData.download(local_path=LOCAL_DATA_FOLDER)\n",
"\n",
"#ds.upload(src_dir=LOCAL_DATA_FOLDER, target_path=TARGET_DATA_FOLDER)\n",
"\"\"\""
]
},
{
@ -314,7 +277,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@ -354,7 +317,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@ -364,15 +327,15 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:azureml.train.estimator._framework_base_estimator:If environment_definition or conda_dependencies_file is specified, Azure ML will not install any framework related packages on behalf of the user.\n",
"WARNING:azureml.train.estimator._framework_base_estimator:framework_version is not specified, defaulting to version 1.3.\n"
"WARNING - If environment_definition or conda_dependencies_file is specified, Azure ML will not install any framework related packages on behalf of the user.\n",
"WARNING - framework_version is not specified, defaulting to version 1.3.\n"
]
}
],
@ -402,7 +365,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@ -411,7 +374,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 13,
"metadata": {
"scrolled": true
},
@ -419,7 +382,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "97f3678284a44f7aab5c27fa3e19bb11",
"model_id": "c654a5b7687141c2bf29b0f8efb27f14",
"version_major": 2,
"version_minor": 0
},
@ -429,6 +392,13 @@
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/aml.mini.widget.v1": "{\"status\": \"Queued\", \"workbench_run_details_uri\": \"https://ml.azure.com/experiments/NLP-ExtSum/runs/NLP-ExtSum_1584763863_e337b0ee?wsid=/subscriptions/9086b59a-02d7-4687-b3fd-e39fa5e0fd9b/resourcegroups/daden1aml/workspaces/daden1amlws\", \"run_id\": \"NLP-ExtSum_1584763863_e337b0ee\", \"run_properties\": {\"run_id\": \"NLP-ExtSum_1584763863_e337b0ee\", \"created_utc\": \"2020-03-21T04:11:09.358987Z\", \"properties\": {\"_azureml.ComputeTargetType\": \"amlcompute\", \"ContentSnapshotId\": \"2394595f-3a71-4c08-9a86-355ade205ff3\", \"azureml.git.repository_uri\": \"https://github.com/microsoft/nlp-recipes.git\", \"mlflow.source.git.repoURL\": \"https://github.com/microsoft/nlp-recipes.git\", \"azureml.git.branch\": \"daden/bertsumext\", \"mlflow.source.git.branch\": \"daden/bertsumext\", \"azureml.git.commit\": \"2e6a9379a7dcb94262d6be7dd3e304a056ec03c5\", \"mlflow.source.git.commit\": \"2e6a9379a7dcb94262d6be7dd3e304a056ec03c5\", \"azureml.git.dirty\": \"True\", \"AzureML.DerivedImageName\": \"azureml/azureml_901415173bf81e758fea3fcc8a8a9c07\", \"ProcessInfoFile\": \"azureml-logs/process_info.json\", \"ProcessStatusFile\": \"azureml-logs/process_status.json\"}, \"tags\": {\"_aml_system_ComputeTargetStatus\": \"{\\\"AllocationState\\\":\\\"steady\\\",\\\"PreparingNodeCount\\\":0,\\\"RunningNodeCount\\\":0,\\\"CurrentNodeCount\\\":0}\"}, \"script_name\": null, \"arguments\": null, \"end_time_utc\": null, \"status\": \"Queued\", \"log_files\": {}, \"log_groups\": [], \"run_duration\": \"0:08:09\"}, \"child_runs\": [], \"children_metrics\": {}, \"run_metrics\": [], \"run_logs\": \"Your job is submitted in Azure cloud and we are monitoring to get logs...\", \"graph\": {}, \"widget_settings\": {\"childWidgetDisplay\": \"popup\", \"send_telemetry\": false, \"log_level\": \"INFO\", \"sdk_version\": \"1.0.85\"}, \"loading\": false}"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
@ -497,43 +467,6 @@
" show_progress=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"from utils_nlp.eval.evaluate_summarization import get_rouge\n",
"from utils_nlp.models.transformers.extractive_summarization import ExtSumProcessedData\n",
"import pickle\n",
"from utils_nlp.models.transformers.extractive_summarization import ExtractiveSummarizer"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"train_dataset, test_dataset = ExtSumProcessedData().splits(root=LOCAL_DATA_FOLDER)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"target = [i['tgt_txt'] for i in test_dataset]"
]
},
{
"cell_type": "code",
"execution_count": 30,
@ -546,6 +479,80 @@
" prediction.append(line[0:-1]) # remove the ending \"\\n\""
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"prediction[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Compare with gold summaries"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"TOP_N = 100"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"processor = ExtSumProcessor()\n",
"_, test_dataset = CNNDMSummarizationDataset(top_n=TOP_N, local_cache_path=LOCAL_DATA_FOLDER)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"source = []\n",
"temp_target = []\n",
"for i in ext_sum_test:\n",
" source.append(i[\"src_txt\"]) \n",
" temp_target.append(\" \".join(j) for j in i['tgt']) \n",
"target = [''.join(i) for i in list(temp_target)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"target[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"source[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download and evaluation the trained model"
]
},
{
"cell_type": "code",
"execution_count": 36,
@ -575,11 +582,14 @@
"ds.download(target_path=LOCAL_OUTPUT_DIR,\n",
" prefix=f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}',\n",
" show_progress=True)\n",
"summarizer = ExtractiveSummarizer(MODEL_NAME, ENCODER, LOCAL_OUTPUT_DIR)\n",
"\n",
"processor = ExtSumProcessor()\n",
"summarizer = ExtractiveSummarizer(processor, encoder=ENCODER, cache_dir=LOCAL_OUTPUT_DIR)\n",
"summarizer.model.load_state_dict(\n",
" torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'))\n",
" torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'),\n",
" map_location=\"cpu\"))\n",
")\n",
"prediction = summarizer.predict(test_dataset, num_gpus=torch.cuda.device_count(), batch_size=128)\n",
"prediction = summarizer.predict(test_dataset[0:TOP_N], num_gpus=torch.cuda.device_count(), batch_size=128, sentence_separator = \"\\n\")\n",
"#\"\"\""
]
},
@ -616,7 +626,7 @@
}
],
"source": [
"test_dataset[0]['src_txt']"
"source[0]"
]
},
{
@ -659,15 +669,6 @@
"target[0]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"RESULT_DIR = TemporaryDirectory().name"
]
},
{
"cell_type": "code",
"execution_count": 40,
@ -716,7 +717,8 @@
}
],
"source": [
"rouge_score = get_rouge(prediction, target, RESULT_DIR)"
"rouge_scores = compute_rouge_python(cand=prediction, ref=target)\n",
"pprint.pprint(rouge_scores)"
]
},
{
@ -738,9 +740,7 @@
"if os.path.exists(LOCAL_OUTPUT_DIR):\n",
" shutil.rmtree(LOCAL_OUTPUT_DIR, ignore_errors=True)\n",
"if os.path.exists(PROJECT_FOLDER):\n",
" shutil.rmtree(PROJECT_FOLDER, ignore_errors=True)\n",
"if os.path.exists(RESULT_DIR):\n",
" shutil.rmtree(RESULT_DIR, ignore_errors=True)"
" shutil.rmtree(PROJECT_FOLDER, ignore_errors=True)"
]
}
],