update notebook
This commit is contained in:
Родитель
65e5ab36e7
Коммит
084229b9be
|
@ -37,7 +37,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -46,7 +46,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -76,8 +76,13 @@
|
|||
"if nlp_path not in sys.path:\n",
|
||||
" sys.path.insert(0, nlp_path)\n",
|
||||
"from utils_nlp.azureml.azureml_utils import get_or_create_workspace\n",
|
||||
"from utils_nlp.dataset.cnndm import CNNDMBertSumProcessedData, CNNDMSummarizationDataset\n",
|
||||
"\n",
|
||||
"from utils_nlp.dataset.cnndm import CNNDMSummarizationDataset\n",
|
||||
"from utils_nlp.eval import compute_rouge_python\n",
|
||||
"from utils_nlp.models.transformers.extractive_summarization import (\n",
|
||||
" ExtractiveSummarizer,\n",
|
||||
" ExtSumProcessedData,\n",
|
||||
" ExtSumProcessor,\n",
|
||||
")\n",
|
||||
"# Check core SDK version number\n",
|
||||
"print(\"SDK version:\", azureml.core.VERSION)"
|
||||
]
|
||||
|
@ -91,7 +96,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -101,10 +106,9 @@
|
|||
"RESOURCE_GROUP = \"YOUR_WORKSPACE_NAME\" # modify to use your own\n",
|
||||
"WORKSPACE_NAME = \"YOUR_WORKSPACE_REGION\" # modify to use your own\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# for creating Azure ML Compute Cluster\n",
|
||||
"AMLCOMPUTE_CLUSTER_NAME = \"extsum5\" # modify to use your own\n",
|
||||
"NODE_COUNT = 4\n",
|
||||
"AMLCOMPUTE_CLUSTER_NAME = \"bertsumext\" # modify to use your own\n",
|
||||
"NODE_COUNT = 2\n",
|
||||
"VM_SIZE = \"STANDARD_NC6\" # this should be the VM that's supported by Azure and Azure ML\n",
|
||||
"\n",
|
||||
"\n",
|
||||
|
@ -150,7 +154,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -165,9 +169,20 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Workspace name: daden1amlwseastus\n",
|
||||
"Azure region: eastus\n",
|
||||
"Subscription id: 9086b59a-02d7-4687-b3fd-e39fa5e0fd9b\n",
|
||||
"Resource group: daden1amleastus\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\n",
|
||||
" \"Workspace name: \" + ws.name,\n",
|
||||
|
@ -187,7 +202,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -195,7 +210,7 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"Found existing compute target.\n",
|
||||
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-01-30T22:43:39.856000+00:00', 'errors': None, 'creationTime': '2020-01-23T04:50:26.160743+00:00', 'modifiedTime': '2020-01-23T20:31:35.349184+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT1200S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
|
||||
"{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-03-21T04:18:26.559000+00:00', 'errors': None, 'creationTime': '2020-03-21T04:18:20.466141+00:00', 'modifiedTime': '2020-03-21T04:18:37.162465+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 2, 'nodeIdleTimeBeforeScaleDown': 'PT600S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -206,7 +221,8 @@
|
|||
"except ComputeTargetException:\n",
|
||||
" print(\"Creating a new compute target...\")\n",
|
||||
" compute_config = AmlCompute.provisioning_configuration(\n",
|
||||
" vm_size=VM_SIZE, max_nodes=NODE_COUNT, NodeIdleTimeBeforeScaleDown=\"PT1200S\"\n",
|
||||
" vm_size=VM_SIZE, max_nodes=NODE_COUNT, \n",
|
||||
" idle_seconds_before_scaledown=\"600\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # create the cluster\n",
|
||||
|
@ -229,79 +245,26 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"experiment = Experiment(ws, name=EXPERIMENT_NAME)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Download Dataset to Local File System"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!mkdir -p {LOCAL_DATA_FOLDER}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"bertsum_data.zip: 869MB [00:29, 29.7MB/s] \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'./bertsumdata/'"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"CNNDMBertSumProcessedData.download(local_path=LOCAL_DATA_FOLDER)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Upload the Downloaded Dataset to AML Workspace"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"experiment = Experiment(ws, name=EXPERIMENT_NAME)\n",
|
||||
"ds = ws.get_default_datastore()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ds.upload(src_dir=LOCAL_DATA_FOLDER, target_path=TARGET_DATA_FOLDER)"
|
||||
"\"\"\" No need to download data \n",
|
||||
"!mkdir -p {LOCAL_DATA_FOLDER}\n",
|
||||
"CNNDMBertSumProcessedData.download(local_path=LOCAL_DATA_FOLDER)\n",
|
||||
"\n",
|
||||
"#ds.upload(src_dir=LOCAL_DATA_FOLDER, target_path=TARGET_DATA_FOLDER)\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -314,7 +277,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -354,7 +317,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -364,15 +327,15 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"WARNING:azureml.train.estimator._framework_base_estimator:If environment_definition or conda_dependencies_file is specified, Azure ML will not install any framework related packages on behalf of the user.\n",
|
||||
"WARNING:azureml.train.estimator._framework_base_estimator:framework_version is not specified, defaulting to version 1.3.\n"
|
||||
"WARNING - If environment_definition or conda_dependencies_file is specified, Azure ML will not install any framework related packages on behalf of the user.\n",
|
||||
"WARNING - framework_version is not specified, defaulting to version 1.3.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -402,7 +365,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -411,7 +374,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"execution_count": 13,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
|
@ -419,7 +382,7 @@
|
|||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "97f3678284a44f7aab5c27fa3e19bb11",
|
||||
"model_id": "c654a5b7687141c2bf29b0f8efb27f14",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
|
@ -429,6 +392,13 @@
|
|||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/aml.mini.widget.v1": "{\"status\": \"Queued\", \"workbench_run_details_uri\": \"https://ml.azure.com/experiments/NLP-ExtSum/runs/NLP-ExtSum_1584763863_e337b0ee?wsid=/subscriptions/9086b59a-02d7-4687-b3fd-e39fa5e0fd9b/resourcegroups/daden1aml/workspaces/daden1amlws\", \"run_id\": \"NLP-ExtSum_1584763863_e337b0ee\", \"run_properties\": {\"run_id\": \"NLP-ExtSum_1584763863_e337b0ee\", \"created_utc\": \"2020-03-21T04:11:09.358987Z\", \"properties\": {\"_azureml.ComputeTargetType\": \"amlcompute\", \"ContentSnapshotId\": \"2394595f-3a71-4c08-9a86-355ade205ff3\", \"azureml.git.repository_uri\": \"https://github.com/microsoft/nlp-recipes.git\", \"mlflow.source.git.repoURL\": \"https://github.com/microsoft/nlp-recipes.git\", \"azureml.git.branch\": \"daden/bertsumext\", \"mlflow.source.git.branch\": \"daden/bertsumext\", \"azureml.git.commit\": \"2e6a9379a7dcb94262d6be7dd3e304a056ec03c5\", \"mlflow.source.git.commit\": \"2e6a9379a7dcb94262d6be7dd3e304a056ec03c5\", \"azureml.git.dirty\": \"True\", \"AzureML.DerivedImageName\": \"azureml/azureml_901415173bf81e758fea3fcc8a8a9c07\", \"ProcessInfoFile\": \"azureml-logs/process_info.json\", \"ProcessStatusFile\": \"azureml-logs/process_status.json\"}, \"tags\": {\"_aml_system_ComputeTargetStatus\": \"{\\\"AllocationState\\\":\\\"steady\\\",\\\"PreparingNodeCount\\\":0,\\\"RunningNodeCount\\\":0,\\\"CurrentNodeCount\\\":0}\"}, \"script_name\": null, \"arguments\": null, \"end_time_utc\": null, \"status\": \"Queued\", \"log_files\": {}, \"log_groups\": [], \"run_duration\": \"0:08:09\"}, \"child_runs\": [], \"children_metrics\": {}, \"run_metrics\": [], \"run_logs\": \"Your job is submitted in Azure cloud and we are monitoring to get logs...\", \"graph\": {}, \"widget_settings\": {\"childWidgetDisplay\": \"popup\", \"send_telemetry\": false, \"log_level\": \"INFO\", \"sdk_version\": \"1.0.85\"}, \"loading\": false}"
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
|
@ -497,43 +467,6 @@
|
|||
" show_progress=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Evaluation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from utils_nlp.eval.evaluate_summarization import get_rouge\n",
|
||||
"from utils_nlp.models.transformers.extractive_summarization import ExtSumProcessedData\n",
|
||||
"import pickle\n",
|
||||
"from utils_nlp.models.transformers.extractive_summarization import ExtractiveSummarizer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_dataset, test_dataset = ExtSumProcessedData().splits(root=LOCAL_DATA_FOLDER)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"target = [i['tgt_txt'] for i in test_dataset]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
|
@ -546,6 +479,80 @@
|
|||
" prediction.append(line[0:-1]) # remove the ending \"\\n\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prediction[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Compare with gold summaries"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"TOP_N = 100"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"processor = ExtSumProcessor()\n",
|
||||
"_, test_dataset = CNNDMSummarizationDataset(top_n=TOP_N, local_cache_path=LOCAL_DATA_FOLDER)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"source = []\n",
|
||||
"temp_target = []\n",
|
||||
"for i in ext_sum_test:\n",
|
||||
" source.append(i[\"src_txt\"]) \n",
|
||||
" temp_target.append(\" \".join(j) for j in i['tgt']) \n",
|
||||
"target = [''.join(i) for i in list(temp_target)]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"target[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"source[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Download and evaluate the trained model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
|
@ -575,11 +582,14 @@
|
|||
"ds.download(target_path=LOCAL_OUTPUT_DIR,\n",
|
||||
" prefix=f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}',\n",
|
||||
" show_progress=True)\n",
|
||||
"summarizer = ExtractiveSummarizer(MODEL_NAME, ENCODER, LOCAL_OUTPUT_DIR)\n",
|
||||
"\n",
|
||||
"processor = ExtSumProcessor()\n",
|
||||
"summarizer = ExtractiveSummarizer(processor, encoder=ENCODER, cache_dir=LOCAL_OUTPUT_DIR)\n",
|
||||
"summarizer.model.load_state_dict(\n",
|
||||
" torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'))\n",
|
||||
" torch.load(os.path.join(LOCAL_OUTPUT_DIR, f'{TARGET_OUTPUT_DIR}{MODEL_FILENAME}'),\n",
|
||||
" map_location=\"cpu\"))\n",
|
||||
")\n",
|
||||
"prediction = summarizer.predict(test_dataset, num_gpus=torch.cuda.device_count(), batch_size=128)\n",
|
||||
"prediction = summarizer.predict(test_dataset[0:TOP_N], num_gpus=torch.cuda.device_count(), batch_size=128, sentence_separator = \"\\n\")\n",
|
||||
"#\"\"\""
|
||||
]
|
||||
},
|
||||
|
@ -616,7 +626,7 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"test_dataset[0]['src_txt']"
|
||||
"source[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -659,15 +669,6 @@
|
|||
"target[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"RESULT_DIR = TemporaryDirectory().name"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
|
@ -716,7 +717,8 @@
|
|||
}
|
||||
],
|
||||
"source": [
|
||||
"rouge_score = get_rouge(prediction, target, RESULT_DIR)"
|
||||
"rouge_scores = compute_rouge_python(cand=prediction, ref=target)\n",
|
||||
"pprint.pprint(rouge_scores)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -738,9 +740,7 @@
|
|||
"if os.path.exists(LOCAL_OUTPUT_DIR):\n",
|
||||
" shutil.rmtree(LOCAL_OUTPUT_DIR, ignore_errors=True)\n",
|
||||
"if os.path.exists(PROJECT_FOLDER):\n",
|
||||
" shutil.rmtree(PROJECT_FOLDER, ignore_errors=True)\n",
|
||||
"if os.path.exists(RESULT_DIR):\n",
|
||||
" shutil.rmtree(RESULT_DIR, ignore_errors=True)"
|
||||
" shutil.rmtree(PROJECT_FOLDER, ignore_errors=True)"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
Загрузка…
Ссылка в новой задаче