Automate deployment of Azure resources (#221)

This PR adds an Azure Resource Manager template to create the AzureML workspace and a compute cluster. The documentation has been updated to reflect that.
Also, the "location" argument in azure_config has been retired, because we assume that the workspace is already created.
This commit is contained in:
Anton Schwaighofer 2020-09-16 20:24:24 +01:00 committed by GitHub
Parent b654c23e0c
Commit 92d9d8211e
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
5 changed files: 837 additions and 63 deletions


@@ -69,7 +69,6 @@ class AzureConfig(GenericConfig):
param.String(None, doc="The name of the keyvault secret that contains the dataset storage account key.")
datasets_container: str = param.String(None, doc="The blob storage container to use to access datasets in AML jobs")
workspace_name: str = param.String(None, doc="The name of the AzureML workspace that should be used.")
workspace_region: str = param.String(None, doc="The region to create AML workspaces in")
resource_group: str = param.String(None, doc="The resource group to create AML workspaces in")
docker_shm_size: str = param.String("440g", doc="The amount of memory available to experiments")
node_count: int = param.Integer(1, bounds=(1, None), doc="Number of concurrent runs to launch")

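For readers unfamiliar with the `param` library used in the fields above, here is a minimal sketch of how such declarative config fields behave. The class below is a toy stand-in for illustration only, not InnerEye code; with `workspace_region` removed, the pre-existing workspace is identified by name and resource group alone.

```python
import param

class ToyAzureConfig(param.Parameterized):
    # Declarative fields in the same style as the diff above.
    workspace_name: str = param.String(None, doc="The name of the AzureML workspace that should be used.")
    resource_group: str = param.String(None, doc="The resource group containing the AzureML workspace.")
    node_count: int = param.Integer(1, bounds=(1, None), doc="Number of concurrent runs to launch")

config = ToyAzureConfig(workspace_name="InnerEye-DeepLearning", resource_group="MyResourceGroup")
print(config.workspace_name, config.node_count)
```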

@@ -17,7 +17,6 @@ from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.datastore import Datastore
from azureml.core.workspace import WORKSPACE_DEFAULT_BLOB_STORE_NAME
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.exceptions import WorkspaceException
from azureml.train.dnn import PyTorch
from InnerEye.Azure import azure_util
@@ -65,8 +64,8 @@ def submit_to_azureml(azure_config: AzureConfig,
for s in [signal.SIGINT, signal.SIGTERM]:
signal.signal(s, interrupt_handler)
# create an AzureML workspace
workspace = get_workspace(azure_config)
# Retrieve the AzureML workspace
workspace = azure_config.get_workspace()
# create train/test experiment
azure_run = create_and_submit_experiment(workspace, azure_config, source_config, model_config_overrides,
@@ -80,27 +79,6 @@ def submit_to_azureml(azure_config: AzureConfig,
return azure_run
def get_workspace(azure_config: AzureConfig) -> Workspace:
"""
Gets an AzureML workspace for the build user to keep track of the experiments
:param azure_config: configurations for model execution ie: name, execution mode
:return: a configured workspace to run this experiment in
"""
try:
return azure_config.get_workspace()
except WorkspaceException:
auth = azure_config.get_service_principal_auth()
return Workspace.create(name=azure_util.to_azure_friendly_string(azure_config.workspace_name),
auth=auth,
subscription_id=azure_config.subscription_id,
storage_account=azure_config.storage_account,
resource_group=azure_config.resource_group,
location=azure_config.workspace_region,
exist_ok=True,
create_resource_group=True)
def set_run_tags(run: Run, azure_config: AzureConfig, model_config_overrides: str) -> None:
"""
Set metadata for the run

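The diff above drops the `Workspace.create` fallback in favour of a plain lookup via `azure_config.get_workspace()`. A hedged sketch of what such a lookup amounts to with `azureml-core` (written as a standalone function for illustration; the real method lives on `AzureConfig` and its internals are not shown in this diff):

```python
from azureml.core import Workspace

def get_existing_workspace(azure_config) -> Workspace:
    # The workspace is assumed to exist already (e.g. created via the ARM template
    # added in this PR), so there is no Workspace.create / location fallback anymore.
    auth = azure_config.get_service_principal_auth()  # may be None -> interactive login
    return Workspace.get(name=azure_config.workspace_name,
                         auth=auth,
                         subscription_id=azure_config.subscription_id,
                         resource_group=azure_config.resource_group)
```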

@@ -11,7 +11,6 @@ variables:
docker_shm_size: '440g'
node_count: 1
workers_per_node: 1
workspace_region: 'West Europe'
workspace_name: 'InnerEye-DeepLearning'
gpu_cluster_name: 'training-nd24'
model_configs_namespace: ''


@@ -0,0 +1,780 @@
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"workspaceName": {
"type": "string",
"metadata": {
"description": "The name of Azure Machine Learning workspace. Use letters and numbers only, max 16 char"
}
},
"computeClusterName": {
"type": "string",
"defaultValue": "NC24-LowPrio",
"metadata": {
"description": "The name of the AML Compute cluster to create under Azure Machine Learning workspace."
}
},
"clusterMaxNodeCount": {
"type": "int",
"defaultValue": 10,
"metadata": {
"description": "Max number of nodes to use in the compute cluster"
}
},
"clusterMinNodeCount": {
"type": "int",
"defaultValue": 0,
"metadata": {
"description": "Min number of nodes that will always be kept running in the compute cluster"
}
},
"clusterNodeIdleTimeBeforeScaleDown": {
"type": "string",
"defaultValue": "P0Y0M0DT0H2M",
"metadata": {
"description": "Node Idle Time before scaling down AmlCompute in seconds. Format is xml duration schema https://www.w3.org/TR/xmlschema-2/#duration"
}
},
"clusterRemoteLoginPortPublicAccess": {
"type": "string",
"defaultValue": "NotSpecified",
"allowedValues": [
"Enabled",
"Disabled",
"NotSpecified"
],
"metadata": {
"description": "State of the public SSH port. Possible values are: Disabled - Indicates that the public ssh port is closed on all nodes of the cluster. Enabled - Indicates that the public ssh port is open on all nodes of the cluster. NotSpecified - Indicates that the public ssh port is closed on all nodes of the cluster if VNet is defined, else is open all public nodes. It can be default only during cluster creation time, after creation it will be either enabled or disabled."
}
},
"clusterVmPriority": {
"type": "string",
"defaultValue": "LowPriority",
"allowedValues": [
"Dedicated",
"LowPriority"
],
"metadata": {
"description": "The priority of the Virtual Machines in the cluster. Low priority is a lot cheaper."
}
},
"clusterVmSize": {
"type": "string",
"defaultValue": "Standard_NC24s_v3",
"allowedValues": [
"Standard_D1_v2",
"Standard_D2_v2",
"Standard_D3_v2",
"Standard_D4_v2",
"Standard_D11_v2",
"Standard_D12_v2",
"Standard_D13_v2",
"Standard_D14_v2",
"Standard_DS1_v2",
"Standard_DS2_v2",
"Standard_DS3_v2",
"Standard_DS4_v2",
"Standard_DS5_v2",
"Standard_DS11_v2",
"Standard_DS12_v2",
"Standard_DS13_v2",
"Standard_DS14_v2",
"Standard_M8-2ms",
"Standard_M8-4ms",
"Standard_M8ms",
"Standard_M16-4ms",
"Standard_M16-8ms",
"Standard_M16ms",
"Standard_M32-8ms",
"Standard_M32-16ms",
"Standard_M32ls",
"Standard_M32ms",
"Standard_M32ts",
"Standard_M64-16ms",
"Standard_M64-32ms",
"Standard_M64ls",
"Standard_M64ms",
"Standard_M64s",
"Standard_M128-32ms",
"Standard_M128-64ms",
"Standard_M128ms",
"Standard_M128s",
"Standard_M64",
"Standard_M64m",
"Standard_M128",
"Standard_M128m",
"Standard_D1",
"Standard_D2",
"Standard_D3",
"Standard_D4",
"Standard_D11",
"Standard_D12",
"Standard_D13",
"Standard_D14",
"Standard_DS15_v2",
"Standard_NV6",
"Standard_NV12",
"Standard_NV24",
"Standard_F2s_v2",
"Standard_F4s_v2",
"Standard_F8s_v2",
"Standard_F16s_v2",
"Standard_F32s_v2",
"Standard_F64s_v2",
"Standard_F72s_v2",
"Standard_NC6s_v3",
"Standard_NC12s_v3",
"Standard_NC24rs_v3",
"Standard_NC24s_v3",
"Standard_NC6",
"Standard_NC12",
"Standard_NC24",
"Standard_NC24r",
"Standard_ND6s",
"Standard_ND12s",
"Standard_ND24rs",
"Standard_ND24s",
"Standard_NC6s_v2",
"Standard_NC12s_v2",
"Standard_NC24rs_v2",
"Standard_NC24s_v2",
"Standard_ND40rs_v2",
"Standard_NV12s_v3",
"Standard_NV24s_v3",
"Standard_NV48s_v3"
],
"metadata": {
"description": "The type of Virtual Machines to use in the cluster."
}
},
"storageAccountOption": {
"type": "string",
"defaultValue": "new",
"allowedValues": [
"new",
"existing"
],
"metadata": {
"description": "Determines whether or not a new storage should be provisioned."
}
},
"storageAccountName": {
"type": "string",
"defaultValue": "[concat(toLower(parameters('workspaceName')), 'storage')]",
"metadata": {
"description": "Name of the storage account."
}
},
"storageAccountType": {
"type": "string",
"defaultValue": "Standard_LRS",
"allowedValues": [
"Standard_LRS",
"Standard_GRS",
"Standard_RAGRS",
"Standard_ZRS",
"Premium_LRS",
"Premium_ZRS",
"Standard_GZRS",
"Standard_RAGZRS"
]
},
"storageAccountBehindVNet": {
"type": "string",
"defaultValue": "false",
"allowedValues": [
"true",
"false"
],
"metadata": {
"description": "Determines whether or not to put the storage account behind VNet"
}
},
"storageAccountResourceGroupName": {
"type": "string",
"defaultValue": "[resourceGroup().name]"
},
"sku": {
"type": "string",
"defaultValue": "Basic",
"allowedValues": [
"Basic",
"Enterprise"
],
"metadata": {
"description": "Specifies the sku, also referred as 'edition' of the Azure Machine Learning workspace."
}
},
"keyVaultOption": {
"type": "string",
"defaultValue": "new",
"allowedValues": [
"new",
"existing"
],
"metadata": {
"description": "Determines whether or not a new key vault should be provisioned."
}
},
"keyVaultName": {
"type": "string",
"defaultValue": "[concat(toLower(parameters('workspaceName')), 'keys')]",
"metadata": {
"description": "Name of the key vault."
}
},
"keyVaultBehindVNet": {
"type": "string",
"defaultValue": "false",
"allowedValues": [
"true",
"false"
],
"metadata": {
"description": "Determines whether or not to put the storage account behind VNet"
}
},
"keyVaultResourceGroupName": {
"type": "string",
"defaultValue": "[resourceGroup().name]"
},
"applicationInsightsOption": {
"type": "string",
"defaultValue": "new",
"allowedValues": [
"new",
"existing"
],
"metadata": {
"description": "Determines whether or not new ApplicationInsights should be provisioned."
}
},
"applicationInsightsName": {
"type": "string",
"defaultValue": "[concat(parameters('workspaceName'), 'insights')]",
"metadata": {
"description": "Name of ApplicationInsights."
}
},
"applicationInsightsResourceGroupName": {
"type": "string",
"defaultValue": "[resourceGroup().name]"
},
"containerRegistryOption": {
"type": "string",
"defaultValue": "none",
"allowedValues": [
"new",
"existing",
"none"
],
"metadata": {
"description": "Determines whether or not a new container registry should be provisioned."
}
},
"containerRegistryName": {
"type": "string",
"defaultValue": "[concat('cr',uniqueString(resourceGroup().id, parameters('workspaceName')))]",
"metadata": {
"description": "The container registry bind to the workspace."
}
},
"containerRegistrySku": {
"type": "string",
"defaultValue": "Standard",
"allowedValues": [
"Basic",
"Standard",
"Premium"
]
},
"containerRegistryResourceGroupName": {
"type": "string",
"defaultValue": "[resourceGroup().name]"
},
"containerRegistryBehindVNet": {
"type": "string",
"defaultValue": "false",
"allowedValues": [
"true",
"false"
],
"metadata": {
"description": "Determines whether or not to put container registry behind VNet."
}
},
"vnetOption": {
"type": "string",
"defaultValue": "[if(equals(parameters('privateEndpointType'), 'none'), 'none', 'new')]",
"allowedValues": [
"new",
"existing",
"none"
],
"metadata": {
"description": "Determines whether or not a new VNet should be provisioned."
}
},
"vnetName": {
"type": "string",
"defaultValue": "[concat('vn',uniqueString(resourceGroup().id, parameters('workspaceName')))]",
"metadata": {
"description": "Name of the VNet"
}
},
"vnetResourceGroupName": {
"type": "string",
"defaultValue": "[resourceGroup().name]"
},
"addressPrefixes": {
"type": "array",
"defaultValue": [
"10.0.0.0/16"
],
"metadata": {
"description": "Address prefix of the virtual network"
}
},
"subnetOption": {
"type": "string",
"defaultValue": "[if(or(not(equals(parameters('privateEndpointType'), 'none')), equals(parameters('vnetOption'), 'new')), 'new', 'none')]",
"allowedValues": [
"new",
"existing",
"none"
],
"metadata": {
"description": "Determines whether or not a new subnet should be provisioned."
}
},
"subnetName": {
"type": "string",
"defaultValue": "[concat('sn',uniqueString(resourceGroup().id, parameters('workspaceName')))]",
"metadata": {
"description": "Name of the subnet"
}
},
"subnetPrefix": {
"type": "string",
"defaultValue": "10.0.0.0/24",
"metadata": {
"description": "Subnet prefix of the virtual network"
}
},
"adbWorkspace": {
"type": "string",
"defaultValue": "",
"metadata": {
"description": "Azure Databrick workspace to be linked to the workspace"
}
},
"confidential_data": {
"type": "string",
"defaultValue": "false",
"allowedValues": [
"false",
"true"
],
"metadata": {
"description": "Specifies that the Azure Machine Learning workspace holds highly confidential data."
}
},
"encryption_status": {
"type": "string",
"defaultValue": "Disabled",
"allowedValues": [
"Enabled",
"Disabled"
],
"metadata": {
"description": "Specifies if the Azure Machine Learning workspace should be encrypted with customer managed key."
}
},
"cmk_keyvault": {
"type": "string",
"defaultValue": "",
"metadata": {
"description": "Specifies the customer managed keyVault arm id."
}
},
"resource_cmk_uri": {
"type": "string",
"defaultValue": "",
"metadata": {
"description": "Specifies if the customer managed keyvault key uri."
}
},
"privateEndpointType": {
"type": "string",
"defaultValue": "none",
"allowedValues": [
"AutoApproval",
"ManualApproval",
"none"
]
},
"tagValues": {
"type": "object",
"defaultValue": {}
},
"privateEndpointName": {
"type": "string",
"defaultValue": "pe",
"metadata": {
"description": "Name of the private end point added to the workspace"
}
}
},
"variables": {
"tenantId": "[subscription().tenantId]",
"location": "[resourceGroup().location]",
"storageAccount": "[resourceId(parameters('storageAccountResourceGroupName'), 'Microsoft.Storage/storageAccounts', parameters('storageAccountName'))]",
"keyVault": "[resourceId(parameters('keyVaultResourceGroupName'), 'Microsoft.KeyVault/vaults', parameters('keyVaultName'))]",
"containerRegistry": "[resourceId(parameters('containerRegistryResourceGroupName'), 'Microsoft.ContainerRegistry/registries', parameters('containerRegistryName'))]",
"applicationInsights": "[resourceId(parameters('applicationInsightsResourceGroupName'), 'Microsoft.Insights/components', parameters('applicationInsightsName'))]",
"vnet": "[resourceId(parameters('vnetResourceGroupName'), 'Microsoft.Network/virtualNetworks', parameters('vnetName'))]",
"subnet": "[resourceId(parameters('vnetResourceGroupName'), 'Microsoft.Network/virtualNetworks/subnets', parameters('vnetName'), parameters('subnetName'))]",
"locationsPEAvailable": [
"centraluseuap",
"eastus",
"eastus2euap",
"westus2"
],
"enablePE": "[or(equals(parameters('privateEndpointType'), 'none'), contains(variables('locationsPEAvailable'), variables('location')))]",
"networkRuleSetBehindVNet": {
"defaultAction": "deny",
"virtualNetworkRules": [
{
"action": "Allow",
"id": "[variables('subnet')]"
}
]
},
"subnetPolicyForPE": {
"privateEndpointNetworkPolicies": "Disabled",
"privateLinkServiceNetworkPolicies": "Enabled"
},
"privateEndpointSettings": {
"name": "[concat(parameters('workspaceName'), '-PrivateEndpoint')]",
"properties": {
"privateLinkServiceId": "[resourceId('Microsoft.MachineLearningServices/workspaces', parameters('workspaceName'))]",
"groupIds": [
"amlworkspace"
]
}
},
"defaultPEConnections": "[array(variables('privateEndpointSettings'))]"
},
"resources": [
{
"condition": "[and(variables('enablePE'), equals(parameters('vnetOption'), 'new'))]",
"type": "Microsoft.Network/virtualNetworks",
"apiVersion": "2019-09-01",
"name": "[parameters('vnetName')]",
"location": "[variables('location')]",
"tags": "[parameters('tagValues')]",
"properties": {
"addressSpace": {
"addressPrefixes": "[parameters('addressPrefixes')]"
},
"enableDdosProtection": false,
"enableVmProtection": false
}
},
{
"condition": "[and(variables('enablePE'), equals(parameters('subnetOption'), 'new'))]",
"type": "Microsoft.Network/virtualNetworks/subnets",
"apiVersion": "2019-09-01",
"name": "[concat(parameters('vnetName'), '/', parameters('subnetName'))]",
"dependsOn": [
"[variables('vnet')]"
],
"properties": {
"addressPrefix": "[parameters('subnetPrefix')]",
"privateEndpointNetworkPolicies": "Disabled",
"privateLinkServiceNetworkPolicies": "Enabled",
"serviceEndpoints": [
{
"service": "Microsoft.Storage"
},
{
"service": "Microsoft.KeyVault"
},
{
"service": "Microsoft.ContainerRegistry"
}
]
}
},
{
"condition": "[and(equals(parameters('subnetOption'), 'existing'), not(equals(parameters('privateEndpointType'), 'none')))]",
"type": "Microsoft.Resources/deployments",
"apiVersion": "2019-10-01",
"name": "UpdateSubnetPolicy",
"dependsOn": [
"[variables('subnet')]"
],
"resourceGroup": "[parameters('vnetResourceGroupName')]",
"properties": {
"mode": "Incremental",
"template": {
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"resources": [
{
"type": "Microsoft.Network/virtualNetworks/subnets",
"apiVersion": "2019-09-01",
"name": "[concat(parameters('vnetName'), '/', parameters('subnetName'))]",
"properties": "[if(and(equals(parameters('subnetOption'), 'existing'), not(equals(parameters('privateEndpointType'), 'none'))), union(reference(variables('subnet'), '2019-09-01'), variables('subnetPolicyForPE')), json('null'))]"
}
]
}
}
},
{
"condition": "[and(variables('enablePE'), equals(parameters('storageAccountOption'), 'new'))]",
"type": "Microsoft.Storage/storageAccounts",
"apiVersion": "2019-04-01",
"name": "[parameters('storageAccountName')]",
"tags": "[parameters('tagValues')]",
"dependsOn": [
"[variables('subnet')]"
],
"location": "[variables('location')]",
"sku": {
"name": "[parameters('storageAccountType')]"
},
"kind": "StorageV2",
"properties": {
"encryption": {
"services": {
"blob": {
"enabled": true
},
"file": {
"enabled": true
}
},
"keySource": "Microsoft.Storage"
},
"supportsHttpsTrafficOnly": true,
"networkAcls": "[if(equals(parameters('storageAccountBehindVNet'), 'true'), variables('networkRuleSetBehindVNet'), json('null'))]"
}
},
{
"condition": "[and(variables('enablePE'), equals(parameters('keyVaultOption'), 'new'))]",
"type": "Microsoft.KeyVault/vaults",
"apiVersion": "2019-09-01",
"tags": "[parameters('tagValues')]",
"dependsOn": [
"[variables('subnet')]"
],
"name": "[parameters('keyVaultName')]",
"location": "[variables('location')]",
"properties": {
"tenantId": "[variables('tenantId')]",
"sku": {
"name": "standard",
"family": "A"
},
"accessPolicies": [],
"networkAcls": "[if(equals(parameters('keyVaultBehindVNet'), 'true'), variables('networkRuleSetBehindVNet'), json('null'))]"
}
},
{
"condition": "[and(variables('enablePE'), equals(parameters('containerRegistryOption'), 'new'))]",
"type": "Microsoft.ContainerRegistry/registries",
"apiVersion": "2019-05-01",
"tags": "[parameters('tagValues')]",
"name": "[parameters('containerRegistryName')]",
"dependsOn": [
"[variables('subnet')]"
],
"location": "[variables('location')]",
"sku": {
"name": "[parameters('containerRegistrySku')]"
},
"properties": {
"adminUserEnabled": true,
"networkRuleSet": "[if(equals(parameters('containerRegistryBehindVNet'), 'true'), variables('networkRuleSetBehindVNet'), json('null'))]"
}
},
{
"condition": "[and(variables('enablePE'), equals(parameters('applicationInsightsOption'), 'new'))]",
"type": "Microsoft.Insights/components",
"tags": "[parameters('tagValues')]",
"apiVersion": "2018-05-01-preview",
"name": "[parameters('applicationInsightsName')]",
"location": "[if(or(equals(variables('location'),'westcentralus'), equals(variables('location'),'eastus2euap'), equals(variables('location'),'centraluseuap')),'southcentralus',variables('location'))]",
"kind": "web",
"properties": {
"Application_Type": "web"
}
},
{
"condition": "[variables('enablePE')]",
"type": "Microsoft.MachineLearningServices/workspaces",
"apiVersion": "2020-04-01",
"tags": "[parameters('tagValues')]",
"name": "[parameters('workspaceName')]",
"location": "[variables('location')]",
"dependsOn": [
"[variables('storageAccount')]",
"[variables('keyVault')]",
"[variables('applicationInsights')]",
"[variables('containerRegistry')]"
],
"identity": {
"type": "systemAssigned"
},
"sku": {
"tier": "[parameters('sku')]",
"name": "[parameters('sku')]"
},
"properties": {
"friendlyName": "[parameters('workspaceName')]",
"storageAccount": "[variables('storageAccount')]",
"keyVault": "[variables('keyVault')]",
"applicationInsights": "[variables('applicationInsights')]",
"containerRegistry": "[if(not(equals(parameters('containerRegistryOption'), 'none')), variables('containerRegistry'), json('null'))]"
},
"resources":[{
"type": "Microsoft.MachineLearningServices/workspaces/computes",
"apiVersion": "2020-04-01",
"name": "[concat(parameters('workspaceName'), '/', parameters('computeClusterName'))]",
"location": "[variables('location')]",
"dependsOn": [
"[parameters('workspaceName')]"
],
"properties": {
"computeType": "AmlCompute",
"properties": {
"remoteLoginPortPublicAccess": "[parameters('clusterRemoteLoginPortPublicAccess')]",
"scaleSettings": {
"maxNodeCount": "[parameters('clusterMaxNodeCount')]",
"minNodeCount": "[parameters('clusterMinNodeCount')]",
"nodeIdleTimeBeforeScaleDown": "[parameters('clusterNodeIdleTimeBeforeScaleDown')]"
},
"vmPriority": "[parameters('clusterVmPriority')]",
"vmSize": "[parameters('clusterVmSize')]"
}
}
}]
},
{
"condition": "[and(variables('enablePE'), not(equals(parameters('privateEndpointType'), 'none')))]",
"apiVersion": "2020-04-01",
"name": "[parameters('privateEndpointName')]",
"type": "Microsoft.Network/privateEndpoints",
"location": "[variables('location')]",
"tags": "[parameters('tagValues')]",
"dependsOn": [
"[resourceId('Microsoft.MachineLearningServices/workspaces', parameters('workspaceName'))]",
"[variables('subnet')]"
],
"properties": {
"privateLinkServiceConnections": "[if(equals(parameters('privateEndpointType'), 'AutoApproval'), variables('defaultPEConnections'), json('null'))]",
"manualPrivateLinkServiceConnections": "[if(equals(parameters('privateEndpointType'), 'ManualApproval'), variables('defaultPEConnections'), json('null'))]",
"subnet": {
"id": "[variables('subnet')]"
}
}
},
{
"condition": "[and(variables('enablePE'), equals(parameters('privateEndpointType'), 'AutoApproval'))]",
"type": "Microsoft.Network/privateDnsZones",
"apiVersion": "2018-09-01",
"name": "privatelink.api.azureml.ms",
"tags": "[parameters('tagValues')]",
"dependsOn": [
"[resourceId('Microsoft.Network/privateEndpoints', parameters('privateEndpointName'))]"
],
"location": "global",
"properties": {}
},
{
"condition": "[and(variables('enablePE'), equals(parameters('privateEndpointType'), 'AutoApproval'))]",
"type": "Microsoft.Network/privateDnsZones",
"apiVersion": "2018-09-01",
"name": "privatelink.notebooks.azure.net",
"tags": "[parameters('tagValues')]",
"dependsOn": [
"[resourceId('Microsoft.Network/privateEndpoints', parameters('privateEndpointName'))]"
],
"location": "global",
"properties": {}
},
{
"condition": "[and(variables('enablePE'), equals(parameters('privateEndpointType'), 'AutoApproval'))]",
"type": "Microsoft.Network/privateDnsZones/virtualNetworkLinks",
"apiVersion": "2018-09-01",
"name": "[concat('privatelink.api.azureml.ms', '/', uniqueString(resourceId('Microsoft.Network/virtualNetworks', parameters('vnetName'))))]",
"location": "global",
"tags": "[parameters('tagValues')]",
"dependsOn": [
"[resourceid('Microsoft.Network/privateEndpoints', parameters('privateEndpointName'))]",
"privatelink.api.azureml.ms"
],
"properties": {
"virtualNetwork": {
"id": "[resourceId('Microsoft.Network/virtualNetworks', parameters('vnetName'))]"
},
"registrationEnabled": false
}
},
{
"condition": "[and(variables('enablePE'), equals(parameters('privateEndpointType'), 'AutoApproval'))]",
"type": "Microsoft.Network/privateDnsZones/virtualNetworkLinks",
"apiVersion": "2018-09-01",
"name": "[concat('privatelink.notebooks.azure.net', '/', uniqueString(resourceId('Microsoft.Network/virtualNetworks', parameters('vnetName'))))]",
"location": "global",
"tags": "[parameters('tagValues')]",
"dependsOn": [
"[resourceid('Microsoft.Network/privateEndpoints', parameters('privateEndpointName'))]",
"privatelink.notebooks.azure.net"
],
"properties": {
"virtualNetwork": {
"id": "[resourceId('Microsoft.Network/virtualNetworks', parameters('vnetName'))]"
},
"registrationEnabled": false
}
},
{
"condition": "[and(variables('enablePE'), equals(parameters('privateEndpointType'), 'AutoApproval'))]",
"type": "Microsoft.Network/privateEndpoints/privateDnsZoneGroups",
"apiVersion": "2020-03-01",
"name": "[concat(parameters('privateEndpointName'), '/', 'default')]",
"location": "[variables('location')]",
"dependsOn": [
"[resourceid('Microsoft.Network/privateEndpoints', parameters('privateEndpointName'))]",
"privatelink.notebooks.azure.net",
"privatelink.api.azureml.ms"
],
"properties": {
"privateDnsZoneConfigs": [
{
"name": "privatelink-api-azureml-ms",
"properties": {
"privateDnsZoneId": "[resourceid('Microsoft.Network/privateDnsZones', 'privatelink.api.azureml.ms')]"
}
},
{
"name": "privatelink-notebooks-azure-net",
"properties": {
"privateDnsZoneId": "[resourceid('Microsoft.Network/privateDnsZones', 'privatelink.notebooks.azure.net')]"
}
}
]
}
}
],
"outputs": {
"PrivateEndPointNotSupport": {
"condition": "[and(not(variables('enablePE')), not(equals(parameters('privateEndpointType'), 'none')))]",
"type": "string",
"value": "Private endpoint is not supported in the specified location."
}
}
}

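As an alternative to the portal "Deploy to Azure" button described in the documentation below, the template can also be deployed programmatically. A hedged sketch using the `azure-identity` and `azure-mgmt-resource` packages (subscription, resource group and parameter values are placeholders; the exact method name depends on the SDK version):

```python
import json

from azure.identity import DefaultAzureCredential
from azure.mgmt.resource import ResourceManagementClient

subscription_id = "<subscription-id>"          # placeholder
resource_group = "<existing-resource-group>"   # placeholder, must already exist

client = ResourceManagementClient(DefaultAzureCredential(), subscription_id)
with open("azure-pipelines/azure_deployment_template.json") as f:
    template = json.load(f)

# azure-mgmt-resource >= 15 exposes begin_create_or_update; older releases use create_or_update.
poller = client.deployments.begin_create_or_update(
    resource_group,
    "innereye-workspace-deployment",
    {
        "properties": {
            "mode": "Incremental",
            "template": template,
            # workspaceName must be letters and numbers only, max 16 characters.
            "parameters": {"workspaceName": {"value": "MyInnerEyeWS"}},
        }
    },
)
print(poller.result().properties.provisioning_state)
```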

@@ -5,34 +5,68 @@ In order to be able to train models on Azure Machine Learning (AML) you will nee
Azure Portal first. In this document we will walk you through this process step-by-step.
In short, you will need to:
* Set up an Azure Machine Learning Workspace.
* Register your application to create a Service Principal Object.
* Set up a storage account to store your data.
* Set up an Azure Machine Learning (AzureML) Workspace
* Create a compute cluster to run your experiments.
* Optional: Register your application to create a Service Principal Object.
* Optional: Set up a storage account to store your datasets.
* Update your [train_variables.yml](/InnerEye/train_variables.yml) file and KeyVault with your own credentials.
Once you're done with these steps, you will be ready for the next steps described in [Creating a dataset](https://github.com/microsoft/InnerEye-createdataset),
[Building models in Azure ML](building_models.md) and
[Sample segmentation and classification tasks](sample_tasks.md).
### Step 1: Create an AML workspace
Prerequisite: an Azure account and a corresponding Azure subscription. See the [Get started with Azure](https://azure.microsoft.com/en-us/get-started/) page
**Prerequisite**: an Azure account and a corresponding Azure subscription. See the [Get started with Azure](https://azure.microsoft.com/en-us/get-started/) page
for more information on how to set up your account and your subscription. Here are more detailed instructions on how to
[manage accounts and subscriptions with Azure](https://docs.microsoft.com/en-us/azure/cost-management-billing/manage/).
Assuming you have an Azure account and an Azure subscription, to create an AML workspace you will need to:
## Automatic Deployment
Click on this link to automatically create an AzureML workspace, an associated storage account, and a compute cluster
for training. This replaces steps 1 and 2 below.
[![Deploy To Azure](https://raw.githubusercontent.com/Azure/azure-quickstart-templates/master/1-CONTRIBUTION-GUIDE/images/deploytoazure.svg?sanitize=true)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fgithub.com%2FMicrosoft%2FInnerEye-DeepLearning%2Fblob%2Fantonsc%2Fdeploy%2Fazure-pipelines%2Fazure_deployment_template.json)
- You will be asked to create a new `Resource Group`, a logical grouping that will hold all the Azure resources that
the script will create. In doing that, you will need to choose a location where all your Azure resources live. Pick
a location that is compliant with the legal requirements of your own datasets (for example, your data may need to be
kept inside the UK).
- Then choose a name for your AzureML workspace. Use letters and numbers only, because other resources will be created
using the workspace name as a prefix.
### Step 1: Create an AzureML workspace
You can skip this if you have chosen automatic deployment above.
Assuming you have an Azure account and an Azure subscription, to create an AzureML workspace you will need to:
1. Connect to the [Azure portal](https://aka.ms/portal) with your account.
2. At the top of the home page, you will see a list of Azure services (alternatively you can also use the search bar).
You will need to select "Machine Learning" and click `+ Create`. You will then have to select your subscription,
and create a new `Resource Group`. Then, give your workspace a name, such as
`MyInnerEye-Workspace`, and choose the correct Region suitable for your location as well as the
desired `Workspace Edition`. Finish by clicking on `Review + Create` and then `Create`. You can find more details about how to set up an AML workspace in
desired `Workspace Edition`. Finish by clicking on `Review + Create` and then `Create`. You can find more details about
how to set up an AzureML workspace in
the Azure documentation [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace).
### Step 2: Create a compute cluster for your experiments
### Step 2 (Optional): Register your application to create a Service Principal Authentication object.
In order to be able to run experiments you will need to create a compute cluster attached to your AzureML workspace.
We recommend using [low priority](https://docs.microsoft.com/en-us/azure/batch/batch-low-pri-vms) clusters, since
they cost only a fraction of the price of dedicated VMs.
As a reference, the Prostate model and the Head and Neck model require VMs with 4 GPUs with at least 16GB of memory
per GPU, for example `Standard_ND24s`, `Standard_NC24s_v3` or `Standard_NC24s_v2`.
You need to ensure that your Azure subscription actually has a quota for accessing GPU machines. To see your quota,
find your newly created AzureML workspace in the [Azure portal](http://portal.azure.com), using the search bar at the
top. Then choose "Usage and Quotas" in the left hand navigation. You should see your actual core usage and your quota,
like "0/100" meaning that you are using 0 nodes out of a quota of 100. If you don't see a quota for both dedicated AND
low priority nodes, click on the "Request Quota" button at the bottom of the page to create a ticket with Azure support.
You can skip creating a compute cluster if you have chosen automatic deployment above. If you need to do this step
manually, please follow the steps described [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-set-up-training-targets#set-up-in-azure-machine-learning-studio).
Note down the name of your compute target.
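If you prefer to create the cluster from code instead of the AzureML studio UI, here is a minimal sketch with `azureml-core`. The cluster name, VM size and node counts mirror the defaults of the deployment template in this PR and are examples, not requirements:

```python
from azureml.core import Workspace
from azureml.core.compute import AmlCompute, ComputeTarget

workspace = Workspace.get(name="<workspace-name>",
                          subscription_id="<subscription-id>",
                          resource_group="<resource-group>")

# Low-priority cluster that scales between 0 and 10 nodes and
# scales down after 2 minutes of idle time.
config = AmlCompute.provisioning_configuration(vm_size="Standard_NC24s_v3",
                                               vm_priority="lowpriority",
                                               min_nodes=0,
                                               max_nodes=10,
                                               idle_seconds_before_scaledown=120)
cluster = ComputeTarget.create(workspace, "NC24-LowPrio", config)
cluster.wait_for_completion(show_output=True)
```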
### Step 3 (Optional): Register your application to create a Service Principal Authentication object.
Training runs in AzureML can be submitted either under the name of the user who started them, or as a generic identity called
"Service Principal". Using the generic identity is essential if you would like to submit training runs from code,
@@ -67,22 +101,22 @@ To register the application:
1. You will need to share this application secret with your colleagues if they also want to use Service Principal
authentication. They will also either need to set the environment variable, or create the text file with the secret.
Now that your service principal is created, you need to give permission for it to access and manage your AML workspace.
Now that your service principal is created, you need to give permission for it to access and manage your AzureML workspace.
To do so:
1. Go to your AML workspace. To find it you can type the name of your workspace in the search bar above.
1. Go to your AzureML workspace. To find it you can type the name of your workspace in the search bar above.
2. On the left of the page go to `Access control`. Then click on `+ Add` > `Add role assignment`. A pane will appear on the
right. Select `Role > Contributor` and leave `Assign access`. Finally in the `Select` field type the name
of your Service Principal and select it. Finish by clicking `Save` at the bottom of the pane.
Your Service Principal is now all set!
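With the role assignment in place, code can authenticate non-interactively. A hedged sketch using `azureml-core`; the tenant ID, application ID and the environment variable name below are placeholders (InnerEye itself reads these values from train_variables.yml and a secret, as described in the later steps):

```python
import os

from azureml.core import Workspace
from azureml.core.authentication import ServicePrincipalAuthentication

auth = ServicePrincipalAuthentication(
    tenant_id="<tenant-id>",                    # from your Azure Active Directory
    service_principal_id="<application-id>",    # the app registration's application ID
    service_principal_password=os.environ["SP_SECRET"],  # illustrative env var holding the secret
)
workspace = Workspace.get(name="<workspace-name>",
                          auth=auth,
                          subscription_id="<subscription-id>",
                          resource_group="<resource-group>")
print(workspace.name)
```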
### Step 3: Get the key of the AML storage account
When you created your AML workspace, a [storage account](https://docs.microsoft.com/en-us/azure/storage/common/storage-account-overview)
### Step 4: Get the key of the AzureML storage account
When you created your AzureML workspace, a [storage account](https://docs.microsoft.com/en-us/azure/storage/common/storage-account-overview)
was automatically created for you. This storage account will be used to save all results of your experiments that will
be displayed in the Azure Dashboard. In order to let the code write to this storage account you will need
to retrieve the access key for this account.
1. Navigate to your AML workspace (created in step 1) by typing its name in the top search bar.
1. Navigate to your AzureML workspace (created in step 1) by typing its name in the top search bar.
2. In the `Overview` pane, at the top right, you will find a field called `Storage` with a storage account linked to it.
3. Click on the storage account name to open it. For the next steps you will
need to retrieve the `storage account ID`. For this go to the `Properties` tab of the storage account. There you will find
@@ -91,9 +125,9 @@ the `Storage account resource ID`. Save this value somewhere for the next steps.
the storage account (in the left pane). You will need to temporarily save the value of the first key for the next step
in a secure location, preferably in your password manager.
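If you would rather script this step than click through the portal, the keys can also be listed with the `azure-mgmt-storage` package; a sketch under the assumption that your identity has sufficient rights on the account (all names are placeholders):

```python
from azure.identity import DefaultAzureCredential
from azure.mgmt.storage import StorageManagementClient

client = StorageManagementClient(DefaultAzureCredential(), "<subscription-id>")
keys = client.storage_accounts.list_keys("<resource-group>", "<storage-account-name>")
# Store the key in a password manager or key vault, never in source control.
print(keys.keys[0].value)
```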
### Step 4: Create a storage account for your datasets.
### Step 5: Create a storage account for your datasets.
In order to train your model in the cloud, you will need to upload your datasets to Azure. For this, you will have two options:
* Store your datasets in the storage account linked to your AML workspace (see Step 3 above).
* Store your datasets in the storage account linked to your AzureML workspace (see Step 3 above).
* Create a new storage account which you will use only for dataset storage purposes.
You will need to create a blob container called `datasets` in whichever account you choose. InnerEye will look for datasets
@@ -110,26 +144,10 @@ If you want to create a new storage account:
6. Click create.
7. Once your resource is created you can access it by typing its name in the top search bar. You will then need to retrieve the storage account ID
and the access key of this storage account following the same instructions as you did for the dataset storage account (cf. Step 3.7 above).
Be careful not to mix up the `dataset storage account` and the AML `storage account` IDs and keys.
### Step 5: Create a compute cluster for your experiments
In order to be able to run experiments you will need to create a compute cluster attached to your AML workspace. In order
to do so follow the steps described [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-set-up-training-targets#set-up-in-azure-machine-learning-studio).
Note down the name of your compute target.
We recommend using [low priority](https://docs.microsoft.com/en-us/azure/batch/batch-low-pri-vms) clusters, since
they only cost a fraction of the dedicated VMs.
As a reference, the Prostate model and the Head and Neck model require VMs with 4 GPUs with at least 16GB of memory
per GPU, for example `Standard_ND24s`, `Standard_NC24s_v3` or `Standard_NC24s_v2`.
You need to ensure that your Azure subscription actually has a quota for accessing GPU machines. To see your quota,
find your newly created AzureML workspace in the [Azure portal](http://portal.azure.com), using the search bar at the
top. Then choose "Usage and Quotas" in the left hand navigation. You should see your actual core usage and your quota,
like "0/100" meaning that you are using 0 nodes out of a quota of 100. If you don't see a quota for both dedicated AND
low priority nodes, click on the "Request Quota" button at the bottom of the page to create a ticket with Azure support.
Be careful not to mix up the `dataset storage account` and the AzureML `storage account` IDs and keys.
### Step 6: Create a datastore
You will need to create a datastore in AzureML. Go to the `Datastores` tab in AML, and then click `+ New datastore`.
You will need to create a datastore in AzureML. Go to the `Datastores` tab in AzureML, and then click `+ New datastore`.
Create a datastore called `innereyedatasets`. In the fields for storage account, type in your dataset storage account name,
and under blob container, type `datasets` (this is the blob you created in Step 4).
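The datastore can equally be registered from Python; a minimal sketch with `azureml-core`, assuming the `datasets` container and the storage account from the previous step already exist (account name and key are placeholders):

```python
from azureml.core import Datastore, Workspace

workspace = Workspace.get(name="<workspace-name>",
                          subscription_id="<subscription-id>",
                          resource_group="<resource-group>")
datastore = Datastore.register_azure_blob_container(workspace=workspace,
                                                    datastore_name="innereyedatasets",
                                                    container_name="datasets",
                                                    account_name="<dataset-storage-account>",
                                                    account_key="<storage-account-key>")
print(datastore.name)
```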
@@ -145,12 +163,12 @@ look for the subscription you are using for your workspace. Copy the value of th
field of [train_variables.yml](/InnerEye/train_variables.yml).
3. Copy the application ID of your Service Principal that you retrieved earlier (cf. Step 2.4) to the `application_id` field.
If you did not set up a Service Principal, fill that with an empty string or leave it out altogether.
4. In the `storage_account:` field copy the ID of the AML storage account (retrieved in Step 3).
4. In the `storage_account:` field copy the ID of the AzureML storage account (retrieved in Step 3).
5. Similarly in the `datasets_storage_account:` field copy the ID of the dataset storage account (retrieved in Step 4). If
you chose not to create a separate account for your dataset in Step 4, then specify the same value as in the
`storage_account` field, to tell the code to use the same storage account.
6. Update the `resource_group:` field with your resource group name (created in Step 1).
7. Update the `workspace_region:` and `workspace-name:` fields according to the values you chose in Step 1.
7. Update the `workspace_name:` field according to the value you chose in Step 1.
8. Update the `gpu_cluster_name:` field with the name of your own compute cluster (Step 5).
Leave all other fields as they are for now.