Merge pull request #21 from Microsoft/fboylu_rev

Fboylu rev
This commit is contained in:
Mat 2018-08-08 12:33:08 +01:00 коммит произвёл GitHub
Родитель 1b80c2f2c9 e7aac21448
Коммит 43c0b6943c
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
22 изменённых файлов: 1136 добавлений и 4895 удалений

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -33,7 +33,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 2,
"metadata": {},
"outputs": [
{
@ -142,7 +142,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@ -167,14 +167,14 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:model_driver:Model loading time: 45338.1 ms\n"
"INFO:model_driver:Model loading time: 42520.51 ms\n"
]
}
],
@ -184,7 +184,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@ -196,8 +196,8 @@
"DEBUG:PIL.PngImagePlugin:iCCP profile name b'ICC Profile'\n",
"DEBUG:PIL.PngImagePlugin:Compression method 0\n",
"DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 345 65536\n",
"INFO:model_driver:Predictions: {'image': [('n02127052', 'lynx', 0.9816483), ('n02128385', 'leopard', 0.0077441484), ('n02123159', 'tiger_cat', 0.003686138)]}\n",
"INFO:model_driver:Predictions took 5326.6 ms\n"
"INFO:model_driver:Predictions: {'image': [('n02127052', 'lynx', 0.9816483), ('n02128385', 'leopard', 0.0077441484), ('n02123159', 'tiger_cat', 0.0036861342)]}\n",
"INFO:model_driver:Predictions took 4221.36 ms\n"
]
}
],

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -20,6 +20,7 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from testing_utilities import to_img, img_url_to_json, plot_predictions_dict \n",
@ -27,17 +28,30 @@
"\n",
"%matplotlib inline\n",
"%load_ext autoreload\n",
"%autoreload 2"
"%autoreload 2\n",
"%load_ext dotenv\n",
"%dotenv"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"'caia/kerastf-gpu'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docker_login = 'fboylu'\n",
"image_name = docker_login + '/kerastf-gpu'"
"image_name = os.getenv('docker_login') + os.getenv('image_repo')\n",
"image_name"
]
},
{
@ -66,7 +80,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@ -83,7 +97,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@ -95,7 +109,7 @@
}
],
"source": [
"!curl 'http://0.0.0.0:80/version'"
"!curl 'http://0.0.0.0:80/version' #reports tensorflow version"
]
},
{
@ -107,7 +121,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@ -116,16 +130,16 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.image.AxesImage at 0x7f24b4bed3c8>"
"<matplotlib.image.AxesImage at 0x7fb7f0f29d30>"
]
},
"execution_count": 11,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
@ -146,7 +160,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@ -155,7 +169,7 @@
"'{\"input\": \"{\\\\\"image\\\\\": \\\\\"iVBORw0KGgoAAAANSUhEUgAAAOAAAADgCAIAAACVT/22AAABJGlDQ1BJQ0MgUHJvZmlsZQAAeJx'"
]
},
"execution_count": 12,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -167,25 +181,25 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.58 ms, sys: 2.77 ms, total: 5.35 ms\n",
"Wall time: 4.53 s\n",
"CPU times: user 4.63 ms, sys: 555 µs, total: 5.19 ms\n",
"Wall time: 4.59 s\n",
"<Response [200]>\n"
]
},
{
"data": {
"text/plain": [
"{'result': \"({'image': [('n02127052', 'lynx', 0.9816483), ('n02128385', 'leopard', 0.0077441484), ('n02123159', 'tiger_cat', 0.003686138)]}, 'Computed in 4518.73 ms')\"}"
"{'result': \"({'image': [('n02127052', 'lynx', 0.9816483), ('n02128385', 'leopard', 0.0077441484), ('n02123159', 'tiger_cat', 0.003686138)]}, 'Computed in 4578.19 ms')\"}"
]
},
"execution_count": 13,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -206,7 +220,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@ -220,7 +234,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@ -230,7 +244,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 12,
"metadata": {},
"outputs": [
{
@ -265,7 +279,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@ -274,7 +288,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@ -286,21 +300,21 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[0.07515318617224694,\n",
" 0.0732112062163651,\n",
" 0.07464460888877511,\n",
" 0.07395487250760198,\n",
" 0.07413298152387142,\n",
" 0.07386943288147449]"
"[0.074316783901304,\n",
" 0.0740386152639985,\n",
" 0.07445752304047346,\n",
" 0.0755898873321712,\n",
" 0.07515582954511046,\n",
" 0.07495034066960216]"
]
},
"execution_count": 21,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@ -311,14 +325,14 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Average time taken: 74.16 ms\n"
"Average time taken: 74.75 ms\n"
]
}
],
@ -328,14 +342,14 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"7c63df6e807d\n"
"7852c67fad43\n"
]
}
],

Просмотреть файл

@ -19,7 +19,6 @@
"- Create resource group and create AKS\n",
"- Connect to AKS\n",
"- Deploy our app\n",
"- Tear it all down\n",
"\n",
"We assume that this notebook is running on Linux and Azure CLI is installed before proceeding."
]
@ -30,8 +29,10 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"from testing_utilities import write_json_to_file"
"from testing_utilities import write_json_to_file\n",
"%load_ext dotenv"
]
},
{
@ -45,25 +46,40 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Below are the various name definitions for the resources needed to setup AKS as well as the name of the Docker image we will be using. If you wish to use the image that you previously pushed to your account, make sure you change this."
"Below are the various name definitions for the resources needed to setup AKS."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"%%writefile --append .env\n",
"# This cell is tagged `parameters`\n",
"# Please modify the values below as you see fit\n",
"\n",
"# If you have multiple subscriptions select the subscription you want to use \n",
"selected_subscription = \"YOUR_SUBSCRIPTION\"\n",
"\n",
"# Resource group, name and location for AKS cluster.\n",
"resource_group = \"RESOURCE_GROUP\" \n",
"aks_name = \"AKS_CLUSTER_NAME\"\n",
"location = \"eastus\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Please modify the below as you see fit\n",
"resource_group = \"<RESOURCE_GROUP>\" \n",
"aks_name = \"<AKS_CLUSTER_NAME>\"\n",
"location = \"eastus\"\n",
"\n",
"docker_login = '<YOUR_DOCKER_LOGIN>'\n",
"image_name = docker_login + '/kerastf-gpu' # 'fboylu/kerastf-gpu' Feel free to use this image if you want to \n",
" # skip creating your own container\n",
"selected_subscription = \"'<YOUR_SUBSCRIPTION>'\" # If you have multiple subscriptions select \n",
" # the subscription you want to use here"
"%dotenv\n",
"image_name = os.getenv('docker_login') + os.getenv('image_repo')"
]
},
{
@ -95,7 +111,7 @@
"metadata": {},
"outputs": [],
"source": [
"!az account set --subscription $selected_subscription"
"!az account set --subscription \"$selected_subscription\""
]
},
{
@ -171,72 +187,9 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K - Finished ..{\n",
" \"aadProfile\": null,\n",
" \"addonProfiles\": null,\n",
" \"agentPoolProfiles\": [\n",
" {\n",
" \"count\": 1,\n",
" \"dnsPrefix\": null,\n",
" \"fqdn\": null,\n",
" \"maxPods\": 110,\n",
" \"name\": \"nodepool1\",\n",
" \"osDiskSizeGb\": null,\n",
" \"osType\": \"Linux\",\n",
" \"ports\": null,\n",
" \"storageProfile\": \"ManagedDisks\",\n",
" \"vmSize\": \"Standard_NC6\",\n",
" \"vnetSubnetId\": null\n",
" }\n",
" ],\n",
" \"dnsPrefix\": \"fbAKSClust-fbaksrg-e984a9\",\n",
" \"enableRbac\": true,\n",
" \"fqdn\": \"fbaksclust-fbaksrg-e984a9-bf6af1df.hcp.eastus.azmk8s.io\",\n",
" \"id\": \"/subscriptions/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX/resourcegroups/fbaksrg/providers/Microsoft.ContainerService/managedClusters/fbAKSCluster\",\n",
" \"kubernetesVersion\": \"1.9.9\",\n",
" \"linuxProfile\": {\n",
" \"adminUsername\": \"azureuser\",\n",
" \"ssh\": {\n",
" \"publicKeys\": [\n",
" {\n",
" \"keyData\": \"ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDA912ocZhmAUJbpPW/mOg5anphH0ehCLNlgofEvuCCwIV/2oRBL2uC6F2fDImkOjwSLQwOdhK+IvLOBUTdBOSXwOG+Dv6kd0n+YIKrykI60UJJjx+gYw3lFqeSgkjavKQpcQO1jhnemsn4wM6B3dPJVp8fDU86C3KM7KkoJkfgaQrJ1pfnzqEE+sFYrguhAjHJvJ+tz+JY0quG5tK5ARm7VMLW7JOxR1KFGNN4Kxcax0S4r2xG1px0rUFht4hG6isASbVsvIA2+3DnfWG7wNNeX5zKITJqUFH48uuZqN/QPam+2m03oibEzEu6B/C8KSOkGnpKsJfiGNykqlahNILz\"\n",
" }\n",
" ]\n",
" }\n",
" },\n",
" \"location\": \"eastus\",\n",
" \"name\": \"fbAKSCluster\",\n",
" \"networkProfile\": {\n",
" \"dnsServiceIp\": \"10.0.0.10\",\n",
" \"dockerBridgeCidr\": \"172.17.0.1/16\",\n",
" \"networkPlugin\": \"kubenet\",\n",
" \"networkPolicy\": null,\n",
" \"podCidr\": \"10.244.0.0/16\",\n",
" \"serviceCidr\": \"10.0.0.0/16\"\n",
" },\n",
" \"nodeResourceGroup\": \"MC_fbAKSClust_fbaksrg_eastus\",\n",
" \"provisioningState\": \"Succeeded\",\n",
" \"resourceGroup\": \"fbaksrg\",\n",
" \"servicePrincipalProfile\": {\n",
" \"clientId\": \"367110e1-3cb9-4259-ad39-314778cdde89\",\n",
" \"keyVaultSecretRef\": null,\n",
" \"secret\": null\n",
" },\n",
" \"tags\": null,\n",
" \"type\": \"Microsoft.ContainerService/ManagedClusters\"\n",
"}\n",
"\u001b[0mCPU times: user 8.19 s, sys: 3.59 s, total: 11.8 s\n",
"Wall time: 13min 48s\n"
]
}
],
"outputs": [],
"source": [
"%%time\n",
"!az aks create --resource-group $resource_group --name $aks_name --node-count 1 --generate-ssh-keys -s Standard_NC6"
@ -258,14 +211,14 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mDownloading client to /usr/local/bin/kubectl from https://storage.googleapis.com/kubernetes-release/release/v1.11.0/bin/linux/amd64/kubectl\u001b[0m\n",
"\u001b[33mDownloading client to /usr/local/bin/kubectl from https://storage.googleapis.com/kubernetes-release/release/v1.11.1/bin/linux/amd64/kubectl\u001b[0m\n",
"\u001b[33mPlease ensure that /usr/local/bin is in your search PATH, so the `kubectl` command can be found.\u001b[0m\n"
]
}
@ -306,7 +259,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@ -314,7 +267,7 @@
"output_type": "stream",
"text": [
"NAME STATUS ROLES AGE VERSION\r\n",
"aks-nodepool1-28016997-0 Ready agent 55d v1.9.6\r\n"
"aks-nodepool1-28016997-0 Ready agent 60d v1.9.6\r\n"
]
}
],
@ -331,7 +284,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@ -339,14 +292,14 @@
"output_type": "stream",
"text": [
"NAMESPACE NAME READY STATUS RESTARTS AGE\r\n",
"kube-system azureproxy-79c5db744-r5ggd 1/1 Running 2 55d\r\n",
"kube-system heapster-55f855b47-4m7xr 2/2 Running 0 55d\r\n",
"kube-system kube-dns-v20-7c556f89c5-4z4z6 3/3 Running 0 55d\r\n",
"kube-system kube-dns-v20-7c556f89c5-mp5fh 3/3 Running 0 55d\r\n",
"kube-system kube-proxy-k8t2c 1/1 Running 0 55d\r\n",
"kube-system kube-svc-redirect-z6ppp 1/1 Running 8 55d\r\n",
"kube-system kubernetes-dashboard-546f987686-8krxm 1/1 Running 2 55d\r\n",
"kube-system tunnelfront-695bcbdc68-t4l8t 1/1 Running 27 55d\r\n"
"kube-system azureproxy-79c5db744-r5ggd 1/1 Running 2 60d\r\n",
"kube-system heapster-55f855b47-4m7xr 2/2 Running 0 60d\r\n",
"kube-system kube-dns-v20-7c556f89c5-4z4z6 3/3 Running 0 60d\r\n",
"kube-system kube-dns-v20-7c556f89c5-mp5fh 3/3 Running 0 60d\r\n",
"kube-system kube-proxy-k8t2c 1/1 Running 0 60d\r\n",
"kube-system kube-svc-redirect-z6ppp 1/1 Running 8 60d\r\n",
"kube-system kubernetes-dashboard-546f987686-8krxm 1/1 Running 2 60d\r\n",
"kube-system tunnelfront-695bcbdc68-t4l8t 1/1 Running 34 60d\r\n"
]
}
],
@ -370,7 +323,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@ -456,7 +409,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@ -465,7 +418,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@ -481,7 +434,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 14,
"metadata": {},
"outputs": [
{
@ -511,7 +464,7 @@
" \"value\": \"$LD_LIBRARY_PATH:/usr/local/nvidia/lib64:/opt/conda/envs/py3.5/lib\"\r\n",
" }\r\n",
" ],\r\n",
" \"image\": \"fboylu/kerastf-gpu\",\r\n",
" \"image\": \"caia/kerastf-gpu\",\r\n",
" \"name\": \"azure-dl\",\r\n",
" \"ports\": [\r\n",
" {\r\n",
@ -583,7 +536,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 15,
"metadata": {},
"outputs": [
{
@ -608,23 +561,23 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAMESPACE NAME READY STATUS RESTARTS AGE\n",
"default azure-dl-b47cf8cdd-4gg6v 1/1 Running 0 3m\n",
"kube-system azureproxy-79c5db744-r5ggd 1/1 Running 2 55d\n",
"kube-system heapster-55f855b47-4m7xr 2/2 Running 0 55d\n",
"kube-system kube-dns-v20-7c556f89c5-4z4z6 3/3 Running 0 55d\n",
"kube-system kube-dns-v20-7c556f89c5-mp5fh 3/3 Running 0 55d\n",
"kube-system kube-proxy-k8t2c 1/1 Running 0 55d\n",
"kube-system kube-svc-redirect-z6ppp 1/1 Running 8 55d\n",
"kube-system kubernetes-dashboard-546f987686-8krxm 1/1 Running 2 55d\n",
"kube-system tunnelfront-695bcbdc68-t4l8t 1/1 Running 27 55d\n"
"NAMESPACE NAME READY STATUS RESTARTS AGE\r\n",
"default azure-dl-5f6b7dfb6f-kbxz6 1/1 Running 0 6m\r\n",
"kube-system azureproxy-79c5db744-r5ggd 1/1 Running 2 60d\r\n",
"kube-system heapster-55f855b47-4m7xr 2/2 Running 0 60d\r\n",
"kube-system kube-dns-v20-7c556f89c5-4z4z6 3/3 Running 0 60d\r\n",
"kube-system kube-dns-v20-7c556f89c5-mp5fh 3/3 Running 0 60d\r\n",
"kube-system kube-proxy-k8t2c 1/1 Running 0 60d\r\n",
"kube-system kube-svc-redirect-z6ppp 1/1 Running 8 60d\r\n",
"kube-system kubernetes-dashboard-546f987686-8krxm 1/1 Running 2 60d\r\n",
"kube-system tunnelfront-695bcbdc68-t4l8t 1/1 Running 34 60d\r\n"
]
}
],
@ -641,7 +594,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 18,
"metadata": {},
"outputs": [
{
@ -649,24 +602,24 @@
"output_type": "stream",
"text": [
"LAST SEEN FIRST SEEN COUNT NAME KIND SUBOBJECT TYPE REASON SOURCE MESSAGE\r\n",
"25m 25m 1 azure-dl-66f69c4f79-mpmvh.15471bba8ebb833f Pod spec.containers{azure-dl} Normal Killing kubelet, aks-nodepool1-28016997-0 Killing container with id docker://azure-dl:Need to kill Pod\r\n",
"3m 3m 1 azure-dl-b47cf8cdd-4gg6v.15471ceb30164ee8 Pod Normal Scheduled default-scheduler Successfully assigned azure-dl-b47cf8cdd-4gg6v to aks-nodepool1-28016997-0\r\n",
"3m 3m 1 azure-dl-b47cf8cdd-4gg6v.15471ceb3ec741e6 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-28016997-0 MountVolume.SetUp succeeded for volume \"nvidia\" \r\n",
"3m 3m 1 azure-dl-b47cf8cdd-4gg6v.15471ceb3f716dfa Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-28016997-0 MountVolume.SetUp succeeded for volume \"default-token-crgnj\" \r\n",
"2m 3m 2 azure-dl-b47cf8cdd-4gg6v.15471ceb6c6b5f4e Pod spec.containers{azure-dl} Normal Pulling kubelet, aks-nodepool1-28016997-0 pulling image \"fboylu/kerastf-gpu\"\r\n",
"2m 2m 1 azure-dl-b47cf8cdd-4gg6v.15471cfc6c7c3a5a Pod spec.containers{azure-dl} Warning Failed kubelet, aks-nodepool1-28016997-0 Failed to pull image \"fboylu/kerastf-gpu\": rpc error: code = Canceled desc = context canceled\r\n",
"2m 2m 1 azure-dl-b47cf8cdd-4gg6v.15471cfc6c7ca882 Pod spec.containers{azure-dl} Warning Failed kubelet, aks-nodepool1-28016997-0 Error: ErrImagePull\r\n",
"2m 2m 1 azure-dl-b47cf8cdd-4gg6v.15471cfcaf0fec65 Pod spec.containers{azure-dl} Normal BackOff kubelet, aks-nodepool1-28016997-0 Back-off pulling image \"fboylu/kerastf-gpu\"\r\n",
"2m 2m 1 azure-dl-b47cf8cdd-4gg6v.15471cfcaf10111d Pod spec.containers{azure-dl} Warning Failed kubelet, aks-nodepool1-28016997-0 Error: ImagePullBackOff\r\n",
"59s 59s 1 azure-dl-b47cf8cdd-4gg6v.15471d110bef3520 Pod spec.containers{azure-dl} Normal Pulled kubelet, aks-nodepool1-28016997-0 Successfully pulled image \"fboylu/kerastf-gpu\"\r\n",
"59s 59s 1 azure-dl-b47cf8cdd-4gg6v.15471d111ba4d4e7 Pod spec.containers{azure-dl} Normal Created kubelet, aks-nodepool1-28016997-0 Created container\r\n",
"59s 59s 1 azure-dl-b47cf8cdd-4gg6v.15471d112535078f Pod spec.containers{azure-dl} Normal Started kubelet, aks-nodepool1-28016997-0 Started container\r\n",
"3m 3m 1 azure-dl-b47cf8cdd.15471ceb2fafffc1 ReplicaSet Normal SuccessfulCreate replicaset-controller Created pod: azure-dl-b47cf8cdd-4gg6v\r\n",
"25m 25m 1 azure-dl.15471bb7ec2f73a1 Service Normal DeletingLoadBalancer service-controller Deleting load balancer\r\n",
"20m 20m 1 azure-dl.15471bfa248744cb Service Normal DeletedLoadBalancer service-controller Deleted load balancer\r\n",
"3m 3m 1 azure-dl.15471ceb2eb34f93 Deployment Normal ScalingReplicaSet deployment-controller Scaled up replica set azure-dl-b47cf8cdd to 1\r\n",
"3m 3m 1 azure-dl.15471ceb31bb2a89 Service Normal EnsuringLoadBalancer service-controller Ensuring load balancer\r\n",
"1m 1m 1 azure-dl.15471d0fba56258f Service Normal EnsuredLoadBalancer service-controller Ensured load balancer\r\n"
"46m 46m 1 aks-nodepool1-28016997-0.1548a985ff48b23b Node Normal RegisteredNode node-controller Node aks-nodepool1-28016997-0 event: Registered Node aks-nodepool1-28016997-0 in Controller\r\n",
"35m 35m 1 aks-nodepool1-28016997-0.1548aa2258f34dc5 Node Normal RegisteredNode node-controller Node aks-nodepool1-28016997-0 event: Registered Node aks-nodepool1-28016997-0 in Controller\r\n",
"6m 6m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abbc1c661966 Pod Normal Scheduled default-scheduler Successfully assigned azure-dl-5f6b7dfb6f-kbxz6 to aks-nodepool1-28016997-0\r\n",
"6m 6m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abbc256b0973 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-28016997-0 MountVolume.SetUp succeeded for volume \"nvidia\" \r\n",
"6m 6m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abbc2754e88a Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-28016997-0 MountVolume.SetUp succeeded for volume \"default-token-crgnj\" \r\n",
"4m 6m 2 azure-dl-5f6b7dfb6f-kbxz6.1548abbc5412d897 Pod spec.containers{azure-dl} Normal Pulling kubelet, aks-nodepool1-28016997-0 pulling image \"caia/kerastf-gpu\"\r\n",
"4m 4m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abd437671289 Pod spec.containers{azure-dl} Warning Failed kubelet, aks-nodepool1-28016997-0 Failed to pull image \"caia/kerastf-gpu\": rpc error: code = Canceled desc = context canceled\r\n",
"4m 4m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abd437675041 Pod spec.containers{azure-dl} Warning Failed kubelet, aks-nodepool1-28016997-0 Error: ErrImagePull\r\n",
"4m 4m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abd479665ad8 Pod Normal SandboxChanged kubelet, aks-nodepool1-28016997-0 Pod sandbox changed, it will be killed and re-created.\r\n",
"4m 4m 3 azure-dl-5f6b7dfb6f-kbxz6.1548abd4bccc3504 Pod spec.containers{azure-dl} Normal BackOff kubelet, aks-nodepool1-28016997-0 Back-off pulling image \"caia/kerastf-gpu\"\r\n",
"4m 4m 3 azure-dl-5f6b7dfb6f-kbxz6.1548abd4bccc6574 Pod spec.containers{azure-dl} Warning Failed kubelet, aks-nodepool1-28016997-0 Error: ImagePullBackOff\r\n",
"2m 2m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abf021a8ab22 Pod spec.containers{azure-dl} Normal Pulled kubelet, aks-nodepool1-28016997-0 Successfully pulled image \"caia/kerastf-gpu\"\r\n",
"2m 2m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abf02e88d586 Pod spec.containers{azure-dl} Normal Created kubelet, aks-nodepool1-28016997-0 Created container\r\n",
"2m 2m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abf037241533 Pod spec.containers{azure-dl} Normal Started kubelet, aks-nodepool1-28016997-0 Started container\r\n",
"6m 6m 1 azure-dl-5f6b7dfb6f.1548abbc1bbcf974 ReplicaSet Normal SuccessfulCreate replicaset-controller Created pod: azure-dl-5f6b7dfb6f-kbxz6\r\n",
"6m 6m 1 azure-dl.1548abbc1aaaccda Deployment Normal ScalingReplicaSet deployment-controller Scaled up replica set azure-dl-5f6b7dfb6f to 1\r\n",
"6m 6m 1 azure-dl.1548abbc284ca303 Service Normal EnsuringLoadBalancer service-controller Ensuring load balancer\r\n",
"2m 2m 1 azure-dl.1548abeeedade8ad Service Normal EnsuredLoadBalancer service-controller Ensured load balancer\r\n"
]
}
],
@ -683,7 +636,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@ -693,80 +646,80 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2018-08-02 16:09:17,655 CRIT Supervisor running as root (no user in config file)\r\n",
"2018-08-02 16:09:17,658 INFO supervisord started with pid 1\r\n",
"2018-08-02 16:09:18,660 INFO spawned: 'program_exit' with pid 9\r\n",
"2018-08-02 16:09:18,662 INFO spawned: 'nginx' with pid 10\r\n",
"2018-08-02 16:09:18,664 INFO spawned: 'gunicorn' with pid 11\r\n",
"2018-08-02 16:09:19,695 INFO success: program_exit entered RUNNING state, process has stayed up for > than 1 seconds (startsecs)\r\n",
"2018-08-02 16:09:20.328936: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\r\n",
"2018-08-02 16:09:20.512472: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1392] Found device 0 with properties: \r\n",
"2018-08-07 17:58:41,382 CRIT Supervisor running as root (no user in config file)\r\n",
"2018-08-07 17:58:41,384 INFO supervisord started with pid 1\r\n",
"2018-08-07 17:58:42,387 INFO spawned: 'program_exit' with pid 9\r\n",
"2018-08-07 17:58:42,388 INFO spawned: 'nginx' with pid 10\r\n",
"2018-08-07 17:58:42,390 INFO spawned: 'gunicorn' with pid 11\r\n",
"2018-08-07 17:58:43,422 INFO success: program_exit entered RUNNING state, process has stayed up for > than 1 seconds (startsecs)\r\n",
"2018-08-07 17:58:44.007138: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\r\n",
"2018-08-07 17:58:44.191739: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1392] Found device 0 with properties: \r\n",
"name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235\r\n",
"pciBusID: ddde:00:00.0\r\n",
"totalMemory: 11.17GiB freeMemory: 11.10GiB\r\n",
"2018-08-02 16:09:20.512518: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1471] Adding visible gpu devices: 0\r\n",
"2018-08-02 16:09:20.877532: I tensorflow/core/common_runtime/gpu/gpu_device.cc:952] Device interconnect StreamExecutor with strength 1 edge matrix:\r\n",
"2018-08-02 16:09:20.877589: I tensorflow/core/common_runtime/gpu/gpu_device.cc:958] 0 \r\n",
"2018-08-02 16:09:20.877604: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 0: N \r\n",
"2018-08-02 16:09:20.877918: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1084] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10761 MB memory) -> physical GPU (device: 0, name: Tesla K80, pci bus id: ddde:00:00.0, compute capability: 3.7)\r\n",
"2018-08-02 16:09:23,881 INFO success: nginx entered RUNNING state, process has stayed up for > than 5 seconds (startsecs)\r\n",
"2018-08-02 16:09:38,898 INFO success: gunicorn entered RUNNING state, process has stayed up for > than 20 seconds (startsecs)\r\n",
"2018-08-07 17:58:44.191801: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1471] Adding visible gpu devices: 0\r\n",
"2018-08-07 17:58:44.533627: I tensorflow/core/common_runtime/gpu/gpu_device.cc:952] Device interconnect StreamExecutor with strength 1 edge matrix:\r\n",
"2018-08-07 17:58:44.533679: I tensorflow/core/common_runtime/gpu/gpu_device.cc:958] 0 \r\n",
"2018-08-07 17:58:44.533694: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 0: N \r\n",
"2018-08-07 17:58:44.533952: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1084] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10761 MB memory) -> physical GPU (device: 0, name: Tesla K80, pci bus id: ddde:00:00.0, compute capability: 3.7)\r\n",
"2018-08-07 17:58:47,537 INFO success: nginx entered RUNNING state, process has stayed up for > than 5 seconds (startsecs)\r\n",
"2018-08-07 17:59:02,553 INFO success: gunicorn entered RUNNING state, process has stayed up for > than 20 seconds (startsecs)\r\n",
"Downloading data from https://github.com/adamcasson/resnet152/releases/download/v0.1/resnet152_weights_tf.h5\r\n",
"\r\n",
" 8192/243179624 [..............................] - ETA: 48s\r\n",
" 417792/243179624 [..............................] - ETA: 30s\r\n",
" 2326528/243179624 [..............................] - ETA: 10s\r\n",
" 6799360/243179624 [..............................] - ETA: 5s \r\n",
" 11788288/243179624 [>.............................] - ETA: 3s\r\n",
" 16777216/243179624 [=>............................] - ETA: 3s\r\n",
" 21168128/243179624 [=>............................] - ETA: 3s\r\n",
" 26419200/243179624 [==>...........................] - ETA: 2s\r\n",
" 31776768/243179624 [==>...........................] - ETA: 2s\r\n",
" 37289984/243179624 [===>..........................] - ETA: 2s\r\n",
" 44482560/243179624 [====>.........................] - ETA: 2s\r\n",
" 52264960/243179624 [=====>........................] - ETA: 2s\r\n",
" 58564608/243179624 [======>.......................] - ETA: 1s\r\n",
" 65781760/243179624 [=======>......................] - ETA: 1s\r\n",
" 74440704/243179624 [========>.....................] - ETA: 1s\r\n",
" 81690624/243179624 [=========>....................] - ETA: 1s\r\n",
" 89055232/243179624 [=========>....................] - ETA: 1s\r\n",
" 94945280/243179624 [==========>...................] - ETA: 1s\r\n",
"102735872/243179624 [===========>..................] - ETA: 1s\r\n",
"107839488/243179624 [============>.................] - ETA: 1s\r\n",
"114810880/243179624 [=============>................] - ETA: 1s\r\n",
"120995840/243179624 [=============>................] - ETA: 1s\r\n",
"126025728/243179624 [==============>...............] - ETA: 1s\r\n",
"130998272/243179624 [===============>..............] - ETA: 1s\r\n",
"131727360/243179624 [===============>..............] - ETA: 1s\r\n",
"139862016/243179624 [================>.............] - ETA: 1s\r\n",
"146309120/243179624 [=================>............] - ETA: 0s\r\n",
"154353664/243179624 [==================>...........] - ETA: 0s\r\n",
"160325632/243179624 [==================>...........] - ETA: 0s\r\n",
"165879808/243179624 [===================>..........] - ETA: 0s\r\n",
"173940736/243179624 [====================>.........] - ETA: 0s\r\n",
"178184192/243179624 [====================>.........] - ETA: 0s\r\n",
"186138624/243179624 [=====================>........] - ETA: 0s\r\n",
"194969600/243179624 [=======================>......] - ETA: 0s\r\n",
"200613888/243179624 [=======================>......] - ETA: 0s\r\n",
"205651968/243179624 [========================>.....] - ETA: 0s\r\n",
"210649088/243179624 [========================>.....] - ETA: 0s\r\n",
"215711744/243179624 [=========================>....] - ETA: 0s\r\n",
"220684288/243179624 [==========================>...] - ETA: 0s\r\n",
"225615872/243179624 [==========================>...] - ETA: 0s\r\n",
"230686720/243179624 [===========================>..] - ETA: 0s\r\n",
"235626496/243179624 [============================>.] - ETA: 0s\r\n",
"240697344/243179624 [============================>.] - ETA: 0s\r\n",
" 8192/243179624 [..............................] - ETA: 2s\r\n",
" 2670592/243179624 [..............................] - ETA: 4s\r\n",
" 8912896/243179624 [>.............................] - ETA: 2s\r\n",
" 16171008/243179624 [>.............................] - ETA: 2s\r\n",
" 23740416/243179624 [=>............................] - ETA: 1s\r\n",
" 32129024/243179624 [==>...........................] - ETA: 1s\r\n",
" 40280064/243179624 [===>..........................] - ETA: 1s\r\n",
" 48472064/243179624 [====>.........................] - ETA: 1s\r\n",
" 56614912/243179624 [=====>........................] - ETA: 1s\r\n",
" 64569344/243179624 [======>.......................] - ETA: 1s\r\n",
" 70533120/243179624 [=======>......................] - ETA: 1s\r\n",
" 74948608/243179624 [========>.....................] - ETA: 1s\r\n",
" 77963264/243179624 [========>.....................] - ETA: 1s\r\n",
" 83402752/243179624 [=========>....................] - ETA: 1s\r\n",
" 88875008/243179624 [=========>....................] - ETA: 1s\r\n",
" 95723520/243179624 [==========>...................] - ETA: 1s\r\n",
"101130240/243179624 [===========>..................] - ETA: 1s\r\n",
"106102784/243179624 [============>.................] - ETA: 1s\r\n",
"110903296/243179624 [============>.................] - ETA: 1s\r\n",
"116129792/243179624 [=============>................] - ETA: 1s\r\n",
"121176064/243179624 [=============>................] - ETA: 1s\r\n",
"126164992/243179624 [==============>...............] - ETA: 0s\r\n",
"130932736/243179624 [===============>..............] - ETA: 1s\r\n",
"137437184/243179624 [===============>..............] - ETA: 0s\r\n",
"144523264/243179624 [================>.............] - ETA: 0s\r\n",
"152428544/243179624 [=================>............] - ETA: 0s\r\n",
"158539776/243179624 [==================>...........] - ETA: 0s\r\n",
"163553280/243179624 [===================>..........] - ETA: 0s\r\n",
"168517632/243179624 [===================>..........] - ETA: 0s\r\n",
"173539328/243179624 [====================>.........] - ETA: 0s\r\n",
"178610176/243179624 [=====================>........] - ETA: 0s\r\n",
"183623680/243179624 [=====================>........] - ETA: 0s\r\n",
"188628992/243179624 [======================>.......] - ETA: 0s\r\n",
"193658880/243179624 [======================>.......] - ETA: 0s\r\n",
"196231168/243179624 [=======================>......] - ETA: 0s\r\n",
"203907072/243179624 [========================>.....] - ETA: 0s\r\n",
"210108416/243179624 [========================>.....] - ETA: 0s\r\n",
"215138304/243179624 [=========================>....] - ETA: 0s\r\n",
"220168192/243179624 [==========================>...] - ETA: 0s\r\n",
"225148928/243179624 [==========================>...] - ETA: 0s\r\n",
"230211584/243179624 [===========================>..] - ETA: 0s\r\n",
"235200512/243179624 [============================>.] - ETA: 0s\r\n",
"239239168/243179624 [============================>.] - ETA: 0s\r\n",
"243187712/243179624 [==============================] - 2s 0us/step\r\n",
"{\"logger\": \"model_driver\", \"message\": \"Model loading time: 34560.9 ms\", \"host\": \"azure-dl-b47cf8cdd-4gg6v\", \"stack_info\": null, \"level\": \"INFO\", \"timestamp\": \"2018-08-02T16:09:54.847018Z\", \"path\": \"/code/driver.py\", \"tags\": []}\r\n",
"{\"path\": \"/code/driver.py\", \"message\": \"Model loading time: 34161.21 ms\", \"timestamp\": \"2018-08-07T17:59:18.129430Z\", \"logger\": \"model_driver\", \"host\": \"azure-dl-5f6b7dfb6f-kbxz6\", \"level\": \"INFO\", \"stack_info\": null, \"tags\": []}\r\n",
"Initialising\r\n",
"{\"logger\": \"werkzeug\", \"message\": \" * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)\", \"host\": \"azure-dl-b47cf8cdd-4gg6v\", \"stack_info\": null, \"msg\": \" * Running on %s://%s:%d/ %s\", \"level\": \"INFO\", \"timestamp\": \"2018-08-02T16:09:54.852182Z\", \"path\": \"/opt/conda/envs/py3.5/lib/python3.5/site-packages/werkzeug/_internal.py\", \"tags\": []}\r\n"
"{\"msg\": \" * Running on %s://%s:%d/ %s\", \"path\": \"/opt/conda/envs/py3.5/lib/python3.5/site-packages/werkzeug/_internal.py\", \"message\": \" * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)\", \"timestamp\": \"2018-08-07T17:59:18.134555Z\", \"logger\": \"werkzeug\", \"host\": \"azure-dl-5f6b7dfb6f-kbxz6\", \"level\": \"INFO\", \"stack_info\": null, \"tags\": []}\r\n"
]
}
],
@ -776,7 +729,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 21,
"metadata": {},
"outputs": [
{
@ -784,7 +737,7 @@
"output_type": "stream",
"text": [
"NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE\r\n",
"azure-dl 1 1 1 1 4m\r\n"
"azure-dl 1 1 1 1 6m\r\n"
]
}
],
@ -801,15 +754,15 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE\r\n",
"azure-dl LoadBalancer 10.0.9.65 40.87.54.254 80:30390/TCP 4m\r\n"
"NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE\r\n",
"azure-dl LoadBalancer 10.0.86.30 40.117.74.122 80:31341/TCP 6m\r\n"
]
}
],
@ -821,73 +774,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we will [test our web application deployed on AKS](05_TestWebApp.ipynb). Once, we are done with all the notebooks of the tutorial, below instructions can be used to delete the cluster and free resources."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tear it all down"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once you are done with your cluster you can use the following two commands to destroy it all."
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"deployment.apps \"azure-dl\" deleted\n",
"service \"azure-dl\" deleted\n"
]
}
],
"source": [
"!kubectl delete -f az-dl.json"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K\u001b[0minished .."
]
}
],
"source": [
"!az aks delete -n $aks_name -g $resource_group -y"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K\u001b[0minished .."
]
}
],
"source": [
"!az group delete --name $resource_group -y"
"Next, we will [test our web application deployed on AKS](05_TestWebApp.ipynb)."
]
}
],

Просмотреть файл

@ -25,7 +25,6 @@
"from testing_utilities import to_img, img_url_to_json, plot_predictions_dict\n",
"import requests\n",
"import json\n",
"\n",
"%matplotlib inline"
]
},
@ -123,7 +122,7 @@
{
"data": {
"text/plain": [
"<matplotlib.image.AxesImage at 0x7f18a2f27630>"
"<matplotlib.image.AxesImage at 0x7f0af0c2b898>"
]
},
"execution_count": 7,
@ -154,15 +153,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 0 ns, sys: 2.44 ms, total: 2.44 ms\n",
"Wall time: 277 ms\n",
"CPU times: user 2.79 ms, sys: 0 ns, total: 2.79 ms\n",
"Wall time: 263 ms\n",
"<Response [200]>\n"
]
},
{
"data": {
"text/plain": [
"{'result': \"({'image': [('n02127052', 'lynx', 0.9816483), ('n02128385', 'leopard', 0.0077441484), ('n02123159', 'tiger_cat', 0.0036861342)]}, 'Computed in 83.87 ms')\"}"
"{'result': \"({'image': [('n02127052', 'lynx', 0.9816483), ('n02128385', 'leopard', 0.0077441484), ('n02123159', 'tiger_cat', 0.003686138)]}, 'Computed in 91.33 ms')\"}"
]
},
"execution_count": 8,
@ -218,7 +217,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@ -260,7 +259,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@ -269,7 +268,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@ -281,21 +280,21 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[0.290354760363698,\n",
" 0.2540861517190933,\n",
" 0.2862860821187496,\n",
" 0.2921275496482849,\n",
" 0.29557833168655634,\n",
" 0.2785734226927161]"
"[0.26876416709274054,\n",
" 0.2346069375053048,\n",
" 0.2727304482832551,\n",
" 0.2763230297714472,\n",
" 0.28207750245928764,\n",
" 0.2625489979982376]"
]
},
"execution_count": 15,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@ -306,14 +305,14 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Average time taken: 282.83 ms\n"
"Average time taken: 266.18 ms\n"
]
}
],

Просмотреть файл

@ -121,7 +121,7 @@
{
"data": {
"text/plain": [
"<matplotlib.image.AxesImage at 0x7fcc63fdc048>"
"<matplotlib.image.AxesImage at 0x7fafbd8fd1d0>"
]
},
"execution_count": 7,
@ -248,15 +248,15 @@
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 100/100 [00:07<00:00, 13.35it/s]"
"100%|██████████| 100/100 [00:07<00:00, 13.27it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total Elapsed 7.4952521827071905\n",
"Avg time taken 74.95 ms\n"
"Total Elapsed 7.540950823575258\n",
"Avg time taken 75.41 ms\n"
]
},
{
@ -284,12 +284,12 @@
{
"data": {
"text/plain": [
"[({'result': \"({'image': [('n02127052', 'lynx', 0.9818038), ('n02128385', 'leopard', 0.007679795), ('n02123159', 'tiger_cat', 0.0036601948)]}, 'Computed in 99.07 ms')\"},\n",
" 0.2703360924497247),\n",
" ({'result': \"({'image': [('n02127052', 'lynx', 0.9816389), ('n02128385', 'leopard', 0.0077429586), ('n02123159', 'tiger_cat', 0.0036909534)]}, 'Computed in 97.58 ms')\"},\n",
" 0.3704738710075617),\n",
" ({'result': \"({'image': [('n02127052', 'lynx', 0.9816508), ('n02128385', 'leopard', 0.0077394205), ('n02123159', 'tiger_cat', 0.0036893478)]}, 'Computed in 89.65 ms')\"},\n",
" 0.46090409997850657)]"
"[({'result': \"({'image': [('n02127052', 'lynx', 0.9815654), ('n02128385', 'leopard', 0.0077465144), ('n02123159', 'tiger_cat', 0.0037241662)]}, 'Computed in 99.44 ms')\"},\n",
" 0.2683806661516428),\n",
" ({'result': \"({'image': [('n02127052', 'lynx', 0.9816606), ('n02128385', 'leopard', 0.0077482197), ('n02123159', 'tiger_cat', 0.003675996)]}, 'Computed in 97.08 ms')\"},\n",
" 0.3648473871871829),\n",
" ({'result': \"({'image': [('n02127052', 'lynx', 0.9816492), ('n02128385', 'leopard', 0.007743971), ('n02123159', 'tiger_cat', 0.0036858919)]}, 'Computed in 97.15 ms')\"},\n",
" 0.4713786570355296)]"
]
},
"execution_count": 15,
@ -321,17 +321,17 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"({'result': \"({'image': [('n02127052', 'lynx', 0.9818038), ('n02128385', 'leopard', 0.007679795), ('n02123159', 'tiger_cat', 0.0036601948)]}, 'Computed in 99.07 ms')\"},\n",
" 0.2703360924497247)"
"({'result': \"({'image': [('n02127052', 'lynx', 0.9815654), ('n02128385', 'leopard', 0.0077465144), ('n02123159', 'tiger_cat', 0.0037241662)]}, 'Computed in 99.44 ms')\"},\n",
" 0.2683806661516428)"
]
},
"execution_count": 18,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
},
@ -356,7 +356,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"To tear down the cluster and all related resources go to the last section of [deploy on AKS notebook](04_DeployOnAKS.ipynb)."
"To tear down the cluster and all related resources go to the [tear down the cluster](07_TearDown.ipynb) notebook."
]
}
],

Просмотреть файл

@ -0,0 +1,100 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tear it all down\n",
"Once you are done with your cluster you can use the following two commands to destroy it all."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext dotenv\n",
"%dotenv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
    "First, delete the application."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"deployment.apps \"azure-dl\" deleted\n",
"service \"azure-dl\" deleted\n"
]
}
],
"source": [
"!kubectl delete -f az-dl.json"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, you delete the AKS cluster. This step may take a few minutes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az aks delete -n $aks_name -g $resource_group -y"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, you should delete the resource group. This also deletes the AKS cluster and can be used instead of the above command if the resource group is only used for this purpose."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az group delete --name $resource_group -y"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:AKSDeployment]",
"language": "python",
"name": "conda-env-AKSDeployment-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Просмотреть файл

@ -3,7 +3,7 @@
In this folder are the tutorials for deploying a Keras model (with Tensorflow backend) on a Kubernetes cluster.
The tutorial is made up of seven notebooks:
The tutorial is made up of the following notebooks:
* [Model development](00_DevelopModel.ipynb) where we load the pretrained model and test it by using it to score images
* [Developing the interface](01_DevelopModelDriver.ipynb) our Flask app will use to load and call the model
* [Building the Docker Image](02_BuildImage.ipynb) with our Flask REST API and model
@ -11,10 +11,5 @@ The tutorial is made up of seven notebooks:
* [Creating our Kubernetes cluster](04_DeployOnAKS.ipynb) and deploying our application to it
* [Testing the deployed model](05_TestWebApp.ipynb)
* [Testing the throughput](06_SpeedTestWebApp.ipynb) of our model
* [Cleaning the resources](07_TearDown.ipynb) used
Before you start any notebooks, execute the below command in your terminal to create a conda environment.
```bash
conda env create -f environment.yml
```
When you start your notebooks, make sure to use the kernel corresponding to this environment.

Просмотреть файл

@ -1,15 +1,19 @@
name: AKSDeploymentKeras
channels:
- conda-forge
dependencies:
- python=3.5
- nb_conda==2.2.0
- tornado==4.5.3
- pip:
- ipykernel==4.8.2
- papermill==0.14.1
- python-dotenv==0.9.0
- Pillow==5.2.0
- wget==3.2
- matplotlib==2.2.2
- aiohttp==3.3.2
- toolz==0.9.0
- tqdm==4.23.4
- tornado==4.5.3
- azure-cli==2.0.41
- tensorflow-gpu==1.9.0
- keras==2.2.0

Просмотреть файл

@ -1,6 +1,7 @@
### Authors: Mathew Salvaris and Fidan Boylu Uz
# Deploy Deep Learning CNN on Kubernetes Cluster with GPUs
## Overview
In this repository there are a number of tutorials in Jupyter notebooks that have step-by-step instructions on how to deploy a pretrained deep learning model on a GPU enabled Kubernetes cluster. The tutorials cover how to deploy models from the following deep learning frameworks:
* [TensorFlow](Tensorflow)
* [Keras (TensorFlow backend)](Keras_Tensorflow)
@ -8,7 +9,7 @@ In this repository there are a number of tutorials in Jupyter notebooks that hav
![alt text](static/example.png "Example Classification")
For each framework we go through 7 steps:
For each framework, we go through the following steps:
* Model development where we load the pretrained model and test it by using it to score images
* Developing the interface our Flask app will use to load and call the model
* Building the Docker Image with our Flask REST API and model
@ -16,11 +17,35 @@ In this repository there are a number of tutorials in Jupyter notebooks that hav
* Creating our Kubernetes cluster and deploying our application to it
* Testing the deployed model
* Testing the throughput of our model
* Cleaning up resources
The application we will develop is a simple image classification service, where we will submit an image and get back what class the image belongs to.
If you already have a Docker image that you would like to deploy or you simply want to use the image we built you can skip the first four notebooks.
If you already have a Docker image that you would like to deploy you can skip the first four notebooks.
## Setting Up
1. Clone the repo:
```bash
git clone <repo web URL>
```
2. Login to Docker with your username and password.
```bash
docker login
```
3. Go to the framework folder you would like to run the notebooks for.
4. Create a conda environment:
```bash
conda env create -f environment.yml
```
5. Activate the environment:
```bash
source activate <environment name>
```
6. Run:
```bash
jupyter notebook
```
7. Start the first notebook and make sure the kernel corresponding to the above environment is selected.
# Contributing

Просмотреть файл

@ -10,7 +10,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@ -21,6 +21,23 @@
"from tensorflow.contrib.slim.nets import resnet_v1"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.9.0\n"
]
}
],
"source": [
"print(tf.__version__)"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -30,7 +47,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {
"scrolled": true
},
@ -39,16 +56,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"--2018-07-13 17:35:18-- http://download.tensorflow.org/models/resnet_v1_152_2016_08_28.tar.gz\n",
"Resolving download.tensorflow.org... 216.58.193.80, 2607:f8b0:400a:800::2010\n",
"Connecting to download.tensorflow.org|216.58.193.80|:80... connected.\n",
"--2018-08-03 15:15:13-- http://download.tensorflow.org/models/resnet_v1_152_2016_08_28.tar.gz\n",
"Resolving download.tensorflow.org... 216.58.218.176, 2607:f8b0:4000:80a::2010\n",
"Connecting to download.tensorflow.org|216.58.218.176|:80... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 224342140 (214M) [application/x-tar]\n",
"Saving to: resnet_v1_152_2016_08_28.tar.gz\n",
"\n",
"resnet_v1_152_2016_ 100%[===================>] 213.95M 71.0MB/s in 3.0s \n",
"resnet_v1_152_2016_ 100%[===================>] 213.95M 90.4MB/s in 2.4s \n",
"\n",
"2018-07-13 17:35:22 (71.0 MB/s) - resnet_v1_152_2016_08_28.tar.gz saved [224342140/224342140]\n",
"2018-08-03 15:15:16 (90.4 MB/s) - resnet_v1_152_2016_08_28.tar.gz saved [224342140/224342140]\n",
"\n"
]
}
@ -59,7 +76,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@ -83,23 +100,23 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2018-07-13 17:35:27-- http://data.dmlc.ml/mxnet/models/imagenet/synset.txt\n",
"--2018-08-03 15:15:44-- http://data.dmlc.ml/mxnet/models/imagenet/synset.txt\n",
"Resolving data.dmlc.ml... 54.208.175.7\n",
"Connecting to data.dmlc.ml|54.208.175.7|:80... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 31675 (31K) [text/plain]\n",
"Saving to: synset.txt\n",
"\n",
"synset.txt 100%[===================>] 30.93K --.-KB/s in 0.09s \n",
"synset.txt 100%[===================>] 30.93K --.-KB/s in 0.03s \n",
"\n",
"2018-07-13 17:35:31 (342 KB/s) - synset.txt saved [31675/31675]\n",
"2018-08-03 15:15:45 (894 KB/s) - synset.txt saved [31675/31675]\n",
"\n"
]
}
@ -117,7 +134,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@ -127,7 +144,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@ -147,7 +164,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@ -163,7 +180,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@ -189,23 +206,23 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2018-07-13 17:36:35-- https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Lynx_lynx_poing.jpg/220px-Lynx_lynx_poing.jpg\n",
"Resolving upload.wikimedia.org... 198.35.26.112, 2620:0:863:ed1a::2:b\n",
"Connecting to upload.wikimedia.org|198.35.26.112|:443... connected.\n",
"--2018-08-03 15:18:18-- https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Lynx_lynx_poing.jpg/220px-Lynx_lynx_poing.jpg\n",
"Resolving upload.wikimedia.org... 208.80.153.240, 2620:0:860:ed1a::2:b\n",
"Connecting to upload.wikimedia.org|208.80.153.240|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 27183 (27K) [image/jpeg]\n",
"Saving to: 220px-Lynx_lynx_poing.jpg\n",
"\n",
"220px-Lynx_lynx_poi 100%[===================>] 26.55K --.-KB/s in 0.02s \n",
"220px-Lynx_lynx_poi 100%[===================>] 26.55K --.-KB/s in 0.01s \n",
"\n",
"2018-07-13 17:36:35 (1.12 MB/s) - 220px-Lynx_lynx_poing.jpg saved [27183/27183]\n",
"2018-08-03 15:18:18 (2.49 MB/s) - 220px-Lynx_lynx_poing.jpg saved [27183/27183]\n",
"\n"
]
}
@ -216,7 +233,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"outputs": [
{
@ -226,7 +243,7 @@
"<IPython.core.display.Image object>"
]
},
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@ -244,7 +261,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@ -262,7 +279,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@ -278,7 +295,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@ -292,7 +309,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@ -301,7 +318,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@ -310,11 +327,11 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"labels=label_lookup(*np.flip(np.argsort(pred_proba.squeeze()), 0)[:3])"
"labels = label_lookup(*np.flip(np.argsort(pred_proba.squeeze()), 0)[:3])"
]
},
{
@ -326,7 +343,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"outputs": [
{
@ -337,7 +354,7 @@
" 'n02127052 lynx, catamount': 0.9979286}"
]
},
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -15,44 +15,62 @@
"* [Create resource group and create AKS](#section3)\n",
"* [Connect to AKS](#section4)\n",
"* [Deploy our app](#section5)\n",
"* [Tear it all down](#section6)\n",
"\n",
    "This guide is designed to be run on Linux and requires that the Azure CLI is installed."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"from testing_utilities import write_json_to_file\n",
"%load_ext dotenv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='section1'></a>\n",
"## Setup\n",
"Below are the various name definitions for the resources needed to setup ACS as well as the name of the Docker image we will be using."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Some outputs (and inputs) below have been hidden/masked for confidentiality**"
"Below are the various name definitions for the resources needed to setup AKS."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {
"collapsed": true
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Please modify the below as you see fit\n",
"resource_group = \"<RESOURCE_GROUP>\" \n",
"aks_name = \"<AKS_CLUSTER_NAME>\"\n",
"location = \"eastus\"\n",
"%%writefile --append .env\n",
"# This cell is tagged `parameters`\n",
"# Please modify the values below as you see fit\n",
"\n",
"image_name = '<YOUR_DOCKER_IMAGE>' # 'masalvar/tfresnet-gpu' Feel free to use this Image if you want to \n",
" # skip creating your own container\n",
"selected_subscription = \"'<YOUR SUBSCRIPTION>'\" # If you have multiple subscriptions select \n",
" # the subscription you want to use here"
"# If you have multiple subscriptions select the subscription you want to use \n",
"selected_subscription = \"YOUR_SUBSCRIPTION\"\n",
"\n",
"# Resource group, name and location for AKS cluster.\n",
"resource_group = \"RESOURCE_GROUP\" \n",
"aks_name = \"AKS_CLUSTER_NAME\"\n",
"location = \"eastus\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"%dotenv\n",
"image_name = os.getenv('docker_login') + os.getenv('image_repo')"
]
},
{
@ -77,21 +95,17 @@
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az account set --subscription $selected_subscription"
"!az account set --subscription \"$selected_subscription\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"!az account show"
@ -99,21 +113,22 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mRegistering is still on-going. You can monitor using 'az provider show -n Microsoft.ContainerService'\u001b[0m\r\n"
]
}
],
"outputs": [],
"source": [
"!az provider register -n Microsoft.ContainerService"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az provider show -n Microsoft.ContainerService"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -132,97 +147,25 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\r\n",
" \"id\": \"/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/msaksrg\",\r\n",
" \"location\": \"eastus\",\r\n",
" \"managedBy\": null,\r\n",
" \"name\": \"msaksrg\",\r\n",
" \"properties\": {\r\n",
" \"provisioningState\": \"Succeeded\"\r\n",
" },\r\n",
" \"tags\": null\r\n",
"}\r\n"
]
}
],
"outputs": [],
"source": [
"!az group create --name $resource_group --location $location"
    "!az group create --name $resource_group --location $location"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below, we create the AKS cluster in the resource group we created earlier."
"Below, we create the AKS cluster in the resource group we created earlier. This can take up to 15 minutes."
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K{- Finished ..\n",
" \"additionalProperties\": {},\n",
" \"agentPoolProfiles\": [\n",
" {\n",
" \"additionalProperties\": {},\n",
" \"count\": 1,\n",
" \"dnsPrefix\": null,\n",
" \"fqdn\": null,\n",
" \"name\": \"nodepool1\",\n",
" \"osDiskSizeGb\": null,\n",
" \"osType\": \"Linux\",\n",
" \"ports\": null,\n",
" \"storageProfile\": \"ManagedDisks\",\n",
" \"vmSize\": \"Standard_NC6\",\n",
" \"vnetSubnetId\": null\n",
" }\n",
" ],\n",
" \"dnsPrefix\": \"msAKSTFClu-msaksrg-edf507\",\n",
" \"fqdn\": \"msakstfclu-msaksrg-edf507-9dc6365c.hcp.eastus.azmk8s.io\",\n",
" \"id\": \"/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourcegroups/msaksrg/providers/Microsoft.ContainerService/managedClusters/msAKSTFCluster\",\n",
" \"kubernetesVersion\": \"1.8.10\",\n",
" \"linuxProfile\": {\n",
" \"additionalProperties\": {},\n",
" \"adminUsername\": \"azureuser\",\n",
" \"ssh\": {\n",
" \"additionalProperties\": {},\n",
" \"publicKeys\": [\n",
" {\n",
" \"additionalProperties\": {},\n",
" \"keyData\": \"ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDVfKBWPBKS84wluD3DJ0t3hepO2F13pz1VI5d4c7Tn4d80rSKJkF2L2HtAf3w9R7TM5TYcSlMqv+OFtB5iwfMk1k8sarGqmB1aLuEYBD60cqtdWD34DPWz8Y4eQ7x8eQ2joVRgMFpv+SfEuPBaQdTM7QtFiWRA1ZioXElyniL2Snhsd/ICcq5SIcZSPj3z9/eUcKGz/eImLkOYU28l8fLpVg48x70rGOtpfmDmZJ3KT/LImDWbPFF4VIRuiki4qVaMvDvwlEB7BmqM5D8qO7tOM3ncZ3TqUhrSQj9NbeC65xvB83+BiZts63VXAsMLnu+0wbAXnA4W66ly/5UyjC//\"\n",
" }\n",
" ]\n",
" }\n",
" },\n",
" \"location\": \"eastus\",\n",
" \"name\": \"msAKSTFCluster\",\n",
" \"provisioningState\": \"Succeeded\",\n",
" \"resourceGroup\": \"msaksrg\",\n",
" \"servicePrincipalProfile\": {\n",
" \"additionalProperties\": {},\n",
" \"clientId\": \"44ba57c3-4386-4788-a761-8d72faa493e2\",\n",
" \"keyVaultSecretRef\": null,\n",
" \"secret\": null\n",
" },\n",
" \"tags\": null,\n",
" \"type\": \"Microsoft.ContainerService/ManagedClusters\"\n",
"}\n",
"\u001b[0m"
]
}
],
"outputs": [],
"source": [
"!az aks create --resource-group $resource_group --name $aks_name --node-count 1 --generate-ssh-keys -s Standard_NC6"
]
@ -238,14 +181,14 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mDownloading client to /usr/local/bin/kubectl from https://storage.googleapis.com/kubernetes-release/release/v1.10.0/bin/linux/amd64/kubectl\u001b[0m\n",
"\u001b[33mDownloading client to /usr/local/bin/kubectl from https://storage.googleapis.com/kubernetes-release/release/v1.11.1/bin/linux/amd64/kubectl\u001b[0m\n",
"\u001b[33mPlease ensure that /usr/local/bin is in your search PATH, so the `kubectl` command can be found.\u001b[0m\n"
]
}
@ -266,17 +209,9 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Merged \"msAKSTFCluster\" as current context in /home/mat/.kube/config\r\n"
]
}
],
"outputs": [],
"source": [
"!az aks get-credentials --resource-group $resource_group --name $aks_name"
]
@ -290,7 +225,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 13,
"metadata": {},
"outputs": [
{
@ -298,7 +233,7 @@
"output_type": "stream",
"text": [
"NAME STATUS ROLES AGE VERSION\r\n",
"aks-nodepool1-27496346-0 Ready agent 3m v1.8.10\r\n"
"aks-nodepool1-28016997-0 Ready agent 59d v1.9.6\r\n"
]
}
],
@ -315,7 +250,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 16,
"metadata": {},
"outputs": [
{
@ -323,13 +258,14 @@
"output_type": "stream",
"text": [
"NAMESPACE NAME READY STATUS RESTARTS AGE\r\n",
"kube-system heapster-75f8df9884-vlt25 2/2 Running 0 1m\r\n",
"kube-system kube-dns-v20-5bf84586f4-9jd9r 3/3 Running 0 1m\r\n",
"kube-system kube-dns-v20-5bf84586f4-f8nsn 3/3 Running 0 1m\r\n",
"kube-system kube-proxy-x64jp 1/1 Running 0 1m\r\n",
"kube-system kube-svc-redirect-mkwss 1/1 Running 0 1m\r\n",
"kube-system kubernetes-dashboard-665f768455-npsfh 1/1 Running 0 1m\r\n",
"kube-system tunnelfront-5c48644fb8-4c6dt 1/1 Running 0 1m\r\n"
"kube-system azureproxy-79c5db744-r5ggd 1/1 Running 2 59d\r\n",
"kube-system heapster-55f855b47-4m7xr 2/2 Running 0 59d\r\n",
"kube-system kube-dns-v20-7c556f89c5-4z4z6 3/3 Running 0 59d\r\n",
"kube-system kube-dns-v20-7c556f89c5-mp5fh 3/3 Running 0 59d\r\n",
"kube-system kube-proxy-k8t2c 1/1 Running 0 59d\r\n",
"kube-system kube-svc-redirect-z6ppp 1/1 Running 8 59d\r\n",
"kube-system kubernetes-dashboard-546f987686-8krxm 1/1 Running 2 59d\r\n",
"kube-system tunnelfront-695bcbdc68-t4l8t 1/1 Running 28 59d\r\n"
]
}
],
@ -349,10 +285,8 @@
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"app_template = {\n",
@ -437,24 +371,8 @@
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def write_json_to_file(json_dict, filename, mode='w'):\n",
" with open(filename, mode) as outfile:\n",
" json.dump(json_dict, outfile, indent=4,sort_keys=True)\n",
" outfile.write('\\n\\n')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"write_json_to_file(app_template, 'az-dl.json') # We write the service template to the json file"
@ -462,10 +380,8 @@
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
    "write_json_to_file(service_temp, 'az-dl.json', mode='a') # We add the load balancer template to the json file"
@ -480,7 +396,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 20,
"metadata": {},
"outputs": [
{
@ -510,7 +426,7 @@
" \"value\": \"$LD_LIBRARY_PATH:/usr/local/nvidia/lib64:/opt/conda/envs/py3.6/lib\"\r\n",
" }\r\n",
" ],\r\n",
" \"image\": \"masalvar/tfresnet-gpu\",\r\n",
" \"image\": \"caia/tfresnet-gpu\",\r\n",
" \"name\": \"azure-dl\",\r\n",
" \"ports\": [\r\n",
" {\r\n",
@ -582,15 +498,15 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"deployment.apps \"azure-dl\" created\n",
"service \"azure-dl\" created\n"
"deployment.apps/azure-dl created\n",
"service/azure-dl created\n"
]
}
],
@ -607,22 +523,23 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAMESPACE NAME READY STATUS RESTARTS AGE\r\n",
"default azure-dl-9db45b4f7-bqq5g 0/1 ImagePullBackOff 0 2m\r\n",
"kube-system heapster-75f8df9884-vlt25 2/2 Running 0 4m\r\n",
"kube-system kube-dns-v20-5bf84586f4-9jd9r 3/3 Running 0 5m\r\n",
"kube-system kube-dns-v20-5bf84586f4-f8nsn 3/3 Running 0 5m\r\n",
"kube-system kube-proxy-x64jp 1/1 Running 0 5m\r\n",
"kube-system kube-svc-redirect-mkwss 1/1 Running 0 5m\r\n",
"kube-system kubernetes-dashboard-665f768455-npsfh 1/1 Running 0 5m\r\n",
"kube-system tunnelfront-5c48644fb8-4c6dt 1/1 Running 0 5m\r\n"
"NAMESPACE NAME READY STATUS RESTARTS AGE\r\n",
"default azure-dl-c6b866d47-mn8pr 1/1 Running 0 10m\r\n",
"kube-system azureproxy-79c5db744-r5ggd 1/1 Running 2 59d\r\n",
"kube-system heapster-55f855b47-4m7xr 2/2 Running 0 59d\r\n",
"kube-system kube-dns-v20-7c556f89c5-4z4z6 3/3 Running 0 59d\r\n",
"kube-system kube-dns-v20-7c556f89c5-mp5fh 3/3 Running 0 59d\r\n",
"kube-system kube-proxy-k8t2c 1/1 Running 0 59d\r\n",
"kube-system kube-svc-redirect-z6ppp 1/1 Running 8 59d\r\n",
"kube-system kubernetes-dashboard-546f987686-8krxm 1/1 Running 2 59d\r\n",
"kube-system tunnelfront-695bcbdc68-t4l8t 1/1 Running 28 59d\r\n"
]
}
],
@ -639,35 +556,28 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LAST SEEN FIRST SEEN COUNT NAME KIND SUBOBJECT TYPE REASON SOURCE MESSAGE\r\n",
"13m 13m 1 aks-nodepool1-27496346-0.152457f2d4c9a0c1 Node Normal Starting kubelet, aks-nodepool1-27496346-0 Starting kubelet.\r\n",
"11m 13m 8 aks-nodepool1-27496346-0.152457f2d68e13e7 Node Normal NodeHasSufficientDisk kubelet, aks-nodepool1-27496346-0 Node aks-nodepool1-27496346-0 status is now: NodeHasSufficientDisk\r\n",
"11m 13m 8 aks-nodepool1-27496346-0.152457f2d68e3bbf Node Normal NodeHasSufficientMemory kubelet, aks-nodepool1-27496346-0 Node aks-nodepool1-27496346-0 status is now: NodeHasSufficientMemory\r\n",
"12m 13m 7 aks-nodepool1-27496346-0.152457f2d68ed1bf Node Normal NodeHasNoDiskPressure kubelet, aks-nodepool1-27496346-0 Node aks-nodepool1-27496346-0 status is now: NodeHasNoDiskPressure\r\n",
"13m 13m 1 aks-nodepool1-27496346-0.152457f2d6a0f2ce Node Normal NodeAllocatableEnforced kubelet, aks-nodepool1-27496346-0 Updated Node Allocatable limit across pods\r\n",
"9m 9m 1 aks-nodepool1-27496346-0.152458224f43e592 Node Normal RegisteredNode controllermanager Node aks-nodepool1-27496346-0 event: Registered Node aks-nodepool1-27496346-0 in Controller\r\n",
"9m 9m 1 aks-nodepool1-27496346-0.15245824056578a2 Node Normal Starting kube-proxy, aks-nodepool1-27496346-0 Starting kube-proxy.\r\n",
"6m 6m 1 azure-dl-9db45b4f7-bqq5g.152458484af7e9af Pod Normal Scheduled default-scheduler Successfully assigned azure-dl-9db45b4f7-bqq5g to aks-nodepool1-27496346-0\r\n",
"6m 6m 1 azure-dl-9db45b4f7-bqq5g.1524584856327c4a Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"nvidia\" \r\n",
"6m 6m 1 azure-dl-9db45b4f7-bqq5g.1524584857145997 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"default-token-bb2wg\" \r\n",
"4m 6m 2 azure-dl-9db45b4f7-bqq5g.1524584884dda263 Pod spec.containers{azure-dl} Normal Pulling kubelet, aks-nodepool1-27496346-0 pulling image \"masalvar/tfresnet-gpu\"\r\n",
"5m 5m 1 azure-dl-9db45b4f7-bqq5g.152458607ea65616 Pod spec.containers{azure-dl} Warning Failed kubelet, aks-nodepool1-27496346-0 Failed to pull image \"masalvar/tfresnet-gpu\": rpc error: code = Canceled desc = context canceled\r\n",
"5m 5m 2 azure-dl-9db45b4f7-bqq5g.152458607ea7d5af Pod Warning FailedSync kubelet, aks-nodepool1-27496346-0 Error syncing pod\r\n",
"5m 5m 1 azure-dl-9db45b4f7-bqq5g.15245860c6aafe6f Pod spec.containers{azure-dl} Normal BackOff kubelet, aks-nodepool1-27496346-0 Back-off pulling image \"masalvar/tfresnet-gpu\"\r\n",
"1m 1m 1 azure-dl-9db45b4f7-bqq5g.152458908b326961 Pod spec.containers{azure-dl} Normal Pulled kubelet, aks-nodepool1-27496346-0 Successfully pulled image \"masalvar/tfresnet-gpu\"\r\n",
"1m 1m 1 azure-dl-9db45b4f7-bqq5g.152458909644c422 Pod spec.containers{azure-dl} Normal Created kubelet, aks-nodepool1-27496346-0 Created container\r\n",
"1m 1m 1 azure-dl-9db45b4f7-bqq5g.152458909ed20818 Pod spec.containers{azure-dl} Normal Started kubelet, aks-nodepool1-27496346-0 Started container\r\n",
"6m 6m 1 azure-dl-9db45b4f7.152458484a8f532a ReplicaSet Normal SuccessfulCreate replicaset-controller Created pod: azure-dl-9db45b4f7-bqq5g\r\n",
"6m 6m 1 azure-dl.1524584848faa7c7 Deployment Normal ScalingReplicaSet deployment-controller Scaled up replica set azure-dl-9db45b4f7 to 1\r\n",
"6m 6m 1 azure-dl.152458484c36803c Service Normal EnsuringLoadBalancer service-controller Ensuring load balancer\r\n",
"3m 3m 1 azure-dl.152458762827d674 Service Normal EnsuredLoadBalancer service-controller Ensured load balancer\r\n"
"LAST SEEN FIRST SEEN COUNT NAME KIND SUBOBJECT TYPE REASON SOURCE MESSAGE\r\n",
"12m 12m 1 azure-dl-b47cf8cdd-4gg6v.15485effd8145de4 Pod spec.containers{azure-dl} Normal Killing kubelet, aks-nodepool1-28016997-0 Killing container with id docker://azure-dl:Need to kill Pod\r\n",
"10m 10m 1 azure-dl-c6b866d47-mn8pr.15485f14253c78d6 Pod Normal Scheduled default-scheduler Successfully assigned azure-dl-c6b866d47-mn8pr to aks-nodepool1-28016997-0\r\n",
"10m 10m 1 azure-dl-c6b866d47-mn8pr.15485f1432f1e899 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-28016997-0 MountVolume.SetUp succeeded for volume \"nvidia\" \r\n",
"10m 10m 1 azure-dl-c6b866d47-mn8pr.15485f1434ed3ff5 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-28016997-0 MountVolume.SetUp succeeded for volume \"default-token-crgnj\" \r\n",
"10m 10m 1 azure-dl-c6b866d47-mn8pr.15485f145faeeb7b Pod spec.containers{azure-dl} Normal Pulling kubelet, aks-nodepool1-28016997-0 pulling image \"caia/tfresnet-gpu\"\r\n",
"7m 7m 1 azure-dl-c6b866d47-mn8pr.15485f3ec9c43c59 Pod spec.containers{azure-dl} Normal Pulled kubelet, aks-nodepool1-28016997-0 Successfully pulled image \"caia/tfresnet-gpu\"\r\n",
"7m 7m 1 azure-dl-c6b866d47-mn8pr.15485f3ed4d030d6 Pod spec.containers{azure-dl} Normal Created kubelet, aks-nodepool1-28016997-0 Created container\r\n",
"7m 7m 1 azure-dl-c6b866d47-mn8pr.15485f3edc58897b Pod spec.containers{azure-dl} Normal Started kubelet, aks-nodepool1-28016997-0 Started container\r\n",
"10m 10m 1 azure-dl-c6b866d47.15485f1424ed5001 ReplicaSet Normal SuccessfulCreate replicaset-controller Created pod: azure-dl-c6b866d47-mn8pr\r\n",
"12m 12m 1 azure-dl.15485efd312f727a Service Normal DeletingLoadBalancer service-controller Deleting load balancer\r\n",
"10m 10m 1 azure-dl.15485f14238f49da Deployment Normal ScalingReplicaSet deployment-controller Scaled up replica set azure-dl-c6b866d47 to 1\r\n",
"9m 9m 1 azure-dl.15485f1eb448e622 Service Normal DeletedLoadBalancer service-controller Deleted load balancer\r\n",
"9m 9m 1 azure-dl.15485f1eb449d728 Service Normal EnsuringLoadBalancer service-controller Ensuring load balancer\r\n",
"6m 6m 1 azure-dl.15485f4a6f913ac5 Service Normal EnsuredLoadBalancer service-controller Ensured load balancer\r\n"
]
}
],
@ -677,32 +587,35 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2018-04-11 09:45:52,173 CRIT Supervisor running as root (no user in config file)\r\n",
"2018-04-11 09:45:52,175 INFO supervisord started with pid 1\r\n",
"2018-04-11 09:45:53,178 INFO spawned: 'program_exit' with pid 9\r\n",
"2018-04-11 09:45:53,179 INFO spawned: 'nginx' with pid 10\r\n",
"2018-04-11 09:45:53,180 INFO spawned: 'gunicorn' with pid 11\r\n",
"2018-04-11 09:45:54,211 INFO success: program_exit entered RUNNING state, process has stayed up for > than 1 seconds (startsecs)\r\n",
"2018-04-11 09:45:54.734234: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA\r\n",
"2018-04-11 09:45:58,739 INFO success: nginx entered RUNNING state, process has stayed up for > than 5 seconds (startsecs)\r\n",
"2018-04-11 09:46:01.556833: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: \r\n",
"2018-08-06 18:33:16,788 CRIT Supervisor running as root (no user in config file)\r\n",
"2018-08-06 18:33:16,790 INFO supervisord started with pid 1\r\n",
"2018-08-06 18:33:17,792 INFO spawned: 'program_exit' with pid 9\r\n",
"2018-08-06 18:33:17,794 INFO spawned: 'nginx' with pid 10\r\n",
"2018-08-06 18:33:17,796 INFO spawned: 'gunicorn' with pid 11\r\n",
"2018-08-06 18:33:18,828 INFO success: program_exit entered RUNNING state, process has stayed up for > than 1 seconds (startsecs)\r\n",
"2018-08-06 18:33:22.752422: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\r\n",
"2018-08-06 18:33:22.947363: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1392] Found device 0 with properties: \r\n",
"name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235\r\n",
"pciBusID: 1705:00:00.0\r\n",
"pciBusID: ddde:00:00.0\r\n",
"totalMemory: 11.17GiB freeMemory: 11.10GiB\r\n",
"2018-04-11 09:46:01.556879: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla K80, pci bus id: 1705:00:00.0, compute capability: 3.7)\r\n",
"INFO:tensorflow:Restoring parameters from resnet_v1_152.ckpt\r\n",
"{\"level\": \"INFO\", \"host\": \"azure-dl-9db45b4f7-bqq5g\", \"timestamp\": \"2018-04-11T09:46:07.076248Z\", \"path\": \"/opt/conda/envs/py3.5/lib/python3.5/site-packages/tensorflow/python/platform/tf_logging.py\", \"msg\": \"Restoring parameters from %s\", \"logger\": \"tensorflow\", \"message\": \"Restoring parameters from resnet_v1_152.ckpt\", \"stack_info\": null, \"tags\": []}\r\n",
"{\"level\": \"INFO\", \"host\": \"azure-dl-9db45b4f7-bqq5g\", \"timestamp\": \"2018-04-11T09:46:08.969746Z\", \"path\": \"/code/driver.py\", \"logger\": \"model_driver\", \"message\": \"Model loading time: 14236.73 ms\", \"stack_info\": null, \"tags\": []}\r\n",
"2018-08-06 18:33:22.947410: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1471] Adding visible gpu devices: 0\r\n",
"2018-08-06 18:33:22,947 INFO success: nginx entered RUNNING state, process has stayed up for > than 5 seconds (startsecs)\r\n",
"2018-08-06 18:33:23.298234: I tensorflow/core/common_runtime/gpu/gpu_device.cc:952] Device interconnect StreamExecutor with strength 1 edge matrix:\r\n",
"2018-08-06 18:33:23.298552: I tensorflow/core/common_runtime/gpu/gpu_device.cc:958] 0 \r\n",
"2018-08-06 18:33:23.298694: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 0: N \r\n",
"2018-08-06 18:33:23.299099: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1084] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10761 MB memory) -> physical GPU (device: 0, name: Tesla K80, pci bus id: ddde:00:00.0, compute capability: 3.7)\r\n",
"{\"message\": \"Restoring parameters from resnet_v1_152.ckpt\", \"logger\": \"tensorflow\", \"path\": \"/opt/conda/envs/py3.5/lib/python3.5/site-packages/tensorflow/python/platform/tf_logging.py\", \"host\": \"azure-dl-c6b866d47-mn8pr\", \"level\": \"INFO\", \"stack_info\": null, \"tags\": [], \"timestamp\": \"2018-08-06T18:33:30.336488Z\", \"msg\": \"Restoring parameters from %s\"}\r\n",
"{\"message\": \"Model loading time: 8247.39 ms\", \"logger\": \"model_driver\", \"path\": \"/code/driver.py\", \"host\": \"azure-dl-c6b866d47-mn8pr\", \"level\": \"INFO\", \"stack_info\": null, \"tags\": [], \"timestamp\": \"2018-08-06T18:33:30.998172Z\"}\r\n",
"Initialising\r\n",
"{\"level\": \"INFO\", \"host\": \"azure-dl-9db45b4f7-bqq5g\", \"timestamp\": \"2018-04-11T09:46:08.974735Z\", \"path\": \"/opt/conda/envs/py3.5/lib/python3.5/site-packages/werkzeug/_internal.py\", \"msg\": \" * Running on %s://%s:%d/ %s\", \"logger\": \"werkzeug\", \"message\": \" * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)\", \"stack_info\": null, \"tags\": []}\r\n",
"2018-04-11 09:46:13,980 INFO success: gunicorn entered RUNNING state, process has stayed up for > than 20 seconds (startsecs)\r\n"
"{\"message\": \" * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)\", \"logger\": \"werkzeug\", \"path\": \"/opt/conda/envs/py3.5/lib/python3.5/site-packages/werkzeug/_internal.py\", \"host\": \"azure-dl-c6b866d47-mn8pr\", \"level\": \"INFO\", \"stack_info\": null, \"tags\": [], \"timestamp\": \"2018-08-06T18:33:31.003895Z\", \"msg\": \" * Running on %s://%s:%d/ %s\"}\r\n",
"2018-08-06 18:33:38,015 INFO success: gunicorn entered RUNNING state, process has stayed up for > than 20 seconds (startsecs)\r\n"
]
}
],
@ -721,15 +634,15 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE\r\n",
"azure-dl LoadBalancer 10.0.63.93 13.82.95.158 80:31941/TCP 7m\r\n"
"NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE\r\n",
"azure-dl LoadBalancer 10.0.153.149 40.121.110.33 80:32087/TCP 11m\r\n"
]
}
],
@ -741,69 +654,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Now that we have our deployed service we can move onto [testing it](05_TestWebApp.ipynb) \n",
"Below are the instructions to tear everything down once we are done with the cluster"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='section6'></a>\n",
"## Tear it all down \n",
"Once you are done with your cluster you can use the following two commands to destroy it all."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"deployment.apps \"azure-dl\" deleted\n",
"service \"azure-dl\" deleted\n"
]
}
],
"source": [
"!kubectl delete -f az-dl.json"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K\u001b[0minished .."
]
}
],
"source": [
"!az aks delete -n $aks_name -g $resource_group -y"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K\u001b[0minished .."
]
}
],
"source": [
"!az group delete --name $resource_group -y"
"Next, we will [test our web application](05_TestWebApp.ipynb) deployed on AKS. "
]
}
],

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,100 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tear it all down\n",
"Once you are done with your cluster you can use the following two commands to destroy it all."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext dotenv\n",
"%dotenv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once you are done with your cluster you can use the following two commands to destroy it all. First, delete the application."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"deployment.apps \"azure-dl\" deleted\n",
"service \"azure-dl\" deleted\n"
]
}
],
"source": [
"!kubectl delete -f az-dl.json"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, you delete the AKS cluster. This step may take a few minutes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az aks delete -n $aks_name -g $resource_group -y"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, you should delete the resource group. This also deletes the AKS cluster and can be used instead of the above command if the resource group is only used for this purpose."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az group delete --name $resource_group -y"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:AKSDeployment]",
"language": "python",
"name": "conda-env-AKSDeployment-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Просмотреть файл

@ -1,10 +1,11 @@
# Deploy ResNet 152 Tensorflow model on GPU enabled Kubernetes cluster
In this folder are the tutorials for deploying a Tensorflow model on a Kubernetes cluster.
The tutorial is made up of seven notebooks:
The tutorial is made up of the following notebooks:
* [Model development](00_DevelopModel.ipynb) where we load the pretrained model and test it by using it to score images
* [Developing the interface](01_DevelopModelDriver.ipynb) our Flask app will use to load and call the model
* [Building the Docker Image](02_BuildImage.ipynb) with our Flask REST API and model
* [Testing our Docker image](03_TestLocally.ipynb) before deployment
* [Creating our Kubernetes cluster](04_DeployOnAKS.ipynb) and deploying our application to it
* [Testing the deployed model](05_TestWebApp.ipynb)
* [Testing the throughput](06_SpeedTestWebApp.ipynb) of our model
* [Testing the throughput](06_SpeedTestWebApp.ipynb) of our model
* [Cleaning the resources](07_TearDown.ipynb) used

Просмотреть файл

@ -1,14 +1,18 @@
name: AKSDeployment
channels:
- conda-forge
dependencies:
- python=3.5
- nb_conda==2.2.0
- tornado==4.5.3
- pip:
- ipykernel==4.8.2
- papermill==0.14.1
- python-dotenv==0.9.0
- Pillow==5.2.0
- wget==3.2
- matplotlib==2.2.2
- aiohttp==3.3.2
- toolz==0.9.0
- tqdm==4.23.4
- tornado==4.5.3
- azure-cli==2.0.41
- tensorflow-gpu==1.9.0

Просмотреть файл

@ -7,7 +7,7 @@ import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import toolz
from PIL import Image, ImageOps
import random
def read_image_from(url):
return toolz.pipe(url,
@ -89,4 +89,22 @@ def plot_predictions(images, classification_results):
ax = fig.add_subplot(gg2[3, 1:9])
_plot_prediction_bar(ax, r)
def write_json_to_file(json_dict, filename, mode='w'):
    """Serialize *json_dict* to *filename* as pretty-printed JSON.

    The data is written with 4-space indentation and alphabetically sorted
    keys, followed by a blank line so successive dumps appended to the same
    file stay visually separated.
    """
    with open(filename, mode) as fh:
        fh.write(json.dumps(json_dict, indent=4, sort_keys=True))
        fh.write('\n\n')
def gen_variations_of_one_image(IMAGEURL, num, label='image'):
    """Create *num* near-duplicate JSON payloads of the image at *IMAGEURL*.

    Each variation differs from the source image by exactly one randomly
    chosen pixel whose colour channels are reversed, so every payload is
    unique (e.g. to defeat response caching during load testing).  Returns a
    list of JSON strings of the form {'input': {label: '"<base64>"'}}.
    """
    base_img = to_img(IMAGEURL).convert('RGB')
    width, height = base_img.size
    payloads = []
    for _ in range(num):
        variant = base_img.copy()
        # Flip the channel order of one random pixel to make the image differ.
        pixel = (random.randint(0, width - 1),
                 random.randint(0, height - 1))
        variant.putpixel(pixel, variant.getpixel(pixel)[::-1])
        encoded = to_base64(variant)
        payloads.append(json.dumps({'input': {label: '"{0}"'.format(encoded)}}))
    return payloads