use azureml_utils for workspace creation

2019-06-11 13:05:39 -04:00 · 2019-06-11 13:05:39 -04:00 · ba3ba5b5a8
--- a/scenarios/sentence_similarity/senteval_azureml.ipynb
+++ b/scenarios/sentence_similarity/senteval_azureml.ipynb
@ -5,13 +5,13 @@
   "metadata": {},
   "source": [
    "# SentEval with AzureML\n",
-    "SentEval is a widely used benchmarking tool for evaluating general-purpose sentence embeddings. It provides a simple interface for evaluating your embeddings on up to 17 supported downstream tasks (such as sentiment classification, natural language inference, semantic similarity, etc.)\n",
+    "[SentEval](https://github.com/facebookresearch/SentEval) is a widely used benchmarking tool for evaluating general-purpose sentence embeddings. It provides a simple interface for evaluating your embeddings on up to 17 supported downstream tasks (such as sentiment classification, natural language inference, semantic similarity, etc.)\n",
    "\n",
-    "This notebook shows how to use SentEval with the AzureML SDK. Running SentEval locally is easy, but not necessarily efficient depending on the model specs. For example, it can quickly become expensive if you are trying to benchmark a model that runs on GPU, even if you are starting with pretrained weights (loading the embeddings and vocabulary for inferencing can take a nontrivial amount of time). In this example we show how to run SentEval for Gensen, where\n",
-    "- the model weights are on AzureML Datastore\n",
-    "- the pretrained embeddings are on AzureML Datastore\n",
-    "- the data for the SentEval transfer tasks are on AzureML Datastore\n",
-    "- evaluation runs on the AzureML Workspace GPU Compute Target (no extra provisioning/config needed)"
+    "This notebook shows how to use SentEval with the AzureML SDK. Running SentEval locally is easy, but not necessarily efficient depending on the model specs. For example, it can quickly become expensive if you are trying to benchmark a model that runs on GPU, even if you are starting with pretrained weights (loading the embeddings and vocabulary for inferencing can take a nontrivial amount of time). In this example we show how to run SentEval for [Gensen](https://github.com/Maluuba/gensen), where\n",
+    "- the model weights are on AzureML Datastore. To download the pre-trained Gensen model, run `bash download_models.sh` from the gensen/data/models directory.\n",
+    "- the embeddings are on AzureML Datastore. To download the pre-trained embeddings, run `bash glove2h5.sh` from the gensen/data/embedding directory.\n",
+    "- the data for the SentEval transfer tasks are on AzureML Datastore. To download these datasets, run `bash get_transfer_data.bash` from the SentEval/data/downstream directory.\n",
+    "- evaluation runs on the AzureML Workspace GPU Compute Target (no extra provisioning/config needed)."
   ]
  },
  {
@ -28,12 +28,28 @@
   "outputs": [],
   "source": [
    "import os\n",
+    "import sys\n",
+    "\n",
+    "import azureml.core\n",
+    "from azureml.core.workspace import Workspace\n",
+    "\n",
+    "from azureml.core.compute import ComputeTarget, AmlCompute\n",
+    "from azureml.core.compute_target import ComputeTargetException\n",
+    "\n",
+    "from azureml.core import Datastore\n",
+    "import azureml.data\n",
+    "from azureml.data.azure_storage_datastore import AzureFileDatastore\n",
+    "\n",
+    "from azureml.train.dnn import PyTorch\n",
+    "from azureml.core.runconfig import MpiConfiguration\n",
+    "from azureml.core import Experiment\n",
+    "from azureml.widgets import RunDetails\n",
+    "\n",
+    "sys.path.append(\"../../\")\n",
+    "from utils_nlp.azureml.azureml_utils import get_or_create_workspace\n",
    "\n",
    "AZUREML_VERBOSE = False\n",
    "\n",
-    "src_dir = \"./senteval-pytorch-gensen\"\n",
-    "os.makedirs(src_dir, exist_ok=True)\n",
-    "\n",
    "PATH_TO_GENSEN = (\n",
    "    \"../../../gensen\"\n",
    ")  # Set this path to where you have cloned the gensen source code\n",
@ -41,8 +57,8 @@
    "    \"../../../SentEval\"\n",
    ")  # Set this path to where you have cloned the senteval source code\n",
    "\n",
-    "cluster_name = \"eval-gpu\"\n",
-    "ds_root = \"senteval_pytorch_gensen\"  # Root path for the datastore"
+    "cluster_name = \"eval-gpu\"  # Name of AzureML Compute Target cluster\n",
+    "ds_root = \"senteval_pytorch_gensen\"  # Name of root directory for the datastore"
   ]
  },
  {
@ -60,10 +76,13 @@
   },
   "outputs": [],
   "source": [
-    "import azureml.core\n",
-    "from azureml.core.workspace import Workspace\n",
+    "ws = get_or_create_workspace(\n",
+    "    subscription_id=\"<SUBSCRIPTION_ID>\",\n",
+    "    resource_group=\"<RESOURCE_GROUP>\",\n",
+    "    workspace_name=\"<WORKSPACE_NAME>\",\n",
+    "    workspace_region=\"<WORKSPACE_REGION>\",\n",
+    ")\n",
    "\n",
-    "ws = Workspace.from_config()\n",
    "if AZUREML_VERBOSE:\n",
    "    print(\"Workspace name: {}\".format(ws.name))\n",
    "    print(\"Resource group: {}\".format(ws.resource_group))"
@ -82,9 +101,6 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from azureml.core.compute import ComputeTarget, AmlCompute\n",
-    "from azureml.core.compute_target import ComputeTargetException\n",
-    "\n",
    "try:\n",
    "    compute_target = ComputeTarget(workspace=ws, name=cluster_name)\n",
    "    print(\"Found compute target: {}\".format(cluster_name))\n",
@ -117,8 +133,6 @@
   },
   "outputs": [],
   "source": [
-    "from azureml.core import Datastore\n",
-    "\n",
    "ds = ws.get_default_datastore()\n",
    "if AZUREML_VERBOSE:\n",
    "    print(\"Default datastore: {}\".format(ds.name))"
@ -132,9 +146,6 @@
   },
   "outputs": [],
   "source": [
-    "import azureml.data\n",
-    "from azureml.data.azure_storage_datastore import AzureFileDatastore\n",
-    "\n",
    "# Upload the gensen dependency\n",
    "ds.upload(\n",
    "    src_dir=os.path.join(PATH_TO_GENSEN),\n",
@ -174,6 +185,16 @@
    "### Create the evaluation script"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "src_dir = \"./senteval-pytorch-gensen\"\n",
+    "os.makedirs(src_dir, exist_ok=True)"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -290,9 +311,6 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from azureml.train.dnn import PyTorch\n",
-    "from azureml.core.runconfig import MpiConfiguration\n",
-    "\n",
    "est = PyTorch(\n",
    "    source_directory=src_dir,\n",
    "    script_params={\n",
@ -325,8 +343,6 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from azureml.core import Experiment\n",
-    "\n",
    "experiment = Experiment(ws, name=\"senteval-pytorch-gensen\")\n",
    "run = experiment.submit(est)"
   ]
@ -344,8 +360,6 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from azureml.widgets import RunDetails\n",
-    "\n",
    "RunDetails(run).show()"
   ]
  },
@ -368,9 +382,9 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python (nlp_cpu)",
   "language": "python",
-   "name": "python3"
+   "name": "nlp_cpu"
  },
  "language_info": {
   "codemirror_mode": {
--- a/utils_nlp/azureml/azureml_utils.py
+++ b/utils_nlp/azureml/azureml_utils.py
@ -0,0 +1,75 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import os
+
+from azureml.core import Workspace
+
+
+def get_or_create_workspace(
+    config_path=None,
+    subscription_id=None,
+    resource_group=None,
+    workspace_name=None,
+    workspace_region=None,
+):
+    """Get or create an AzureML Workspace and save its config for later use.
+
+    The following strategies are attempted in order; the first one that
+    succeeds provides the returned workspace:
+
+    1. ``Workspace(...)`` using the subscription id, resource group and name.
+    2. ``Workspace.from_config(path=config_path)``.
+    3. ``Workspace.create(...)`` with ``create_resource_group=True`` and
+       ``exist_ok=True``.
+
+    Args:
+        config_path (str): optional directory to look for / store config.json file (defaults to current directory)
+        subscription_id (str): subscription id (falls back to the SUBSCRIPTION_ID environment variable when None)
+        resource_group (str): resource group (falls back to the RESOURCE_GROUP environment variable when None)
+        workspace_name (str): workspace name (falls back to the WORKSPACE_NAME environment variable when None)
+        workspace_region (str): region (falls back to the WORKSPACE_REGION environment variable when None)
+
+    Returns:
+        Workspace
+
+    Raises:
+        ValueError: if every strategy fails; the error raised by the last
+            attempted strategy is chained as ``__cause__``.
+    """
+
+    # use environment variables if needed
+    if subscription_id is None:
+        subscription_id = os.getenv("SUBSCRIPTION_ID")
+    if resource_group is None:
+        resource_group = os.getenv("RESOURCE_GROUP")
+    if workspace_name is None:
+        workspace_name = os.getenv("WORKSPACE_NAME")
+    if workspace_region is None:
+        workspace_region = os.getenv("WORKSPACE_REGION")
+
+    # define fallback options in order to try
+    options = [
+        (
+            Workspace,
+            dict(
+                subscription_id=subscription_id,
+                resource_group=resource_group,
+                workspace_name=workspace_name,
+            ),
+        ),
+        (Workspace.from_config, dict(path=config_path)),
+        (
+            Workspace.create,
+            dict(
+                subscription_id=subscription_id,
+                resource_group=resource_group,
+                name=workspace_name,
+                location=workspace_region,
+                create_resource_group=True,
+                exist_ok=True,
+            ),
+        ),
+    ]
+
+    # Remember why the most recent attempt failed so the final error is
+    # diagnosable instead of silently discarding every underlying exception.
+    last_error = None
+    for function, kwargs in options:
+        try:
+            ws = function(**kwargs)
+            break
+        except Exception as err:
+            # Deliberate best-effort fallback: move on to the next strategy.
+            # `err` is unbound when the except clause ends, so keep a copy.
+            last_error = err
+    else:
+        # for/else: reached only when no strategy hit `break`.
+        raise ValueError(
+            "Failed to get or create AzureML Workspace with the configuration information provided"
+        ) from last_error
+
+    # Persist the resolved workspace details so later calls can use from_config.
+    ws.write_config(path=config_path)
+    return ws