From b6ee4eb541e7b5ec71137ed3af08818f76d07ad9 Mon Sep 17 00:00:00 2001 From: Mathew Salvaris Date: Thu, 22 Mar 2018 15:50:24 +0000 Subject: [PATCH] Adds notebook that deploys our model and docker container to AKS on GPU --- 04_DeployOnAKS.ipynb | 884 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 884 insertions(+) create mode 100644 04_DeployOnAKS.ipynb diff --git a/04_DeployOnAKS.ipynb b/04_DeployOnAKS.ipynb new file mode 100644 index 0000000..9a033d4 --- /dev/null +++ b/04_DeployOnAKS.ipynb @@ -0,0 +1,884 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "### Deploy Web App on Azure Container Services (AKS)\n", + "In this notebook we will set up an Azure Container Service which will be managed by Kubernetes. We will then take the Docker image we created earlier that contains our app and deploy it to the AKS cluster. Then we will check everything is working by sending an image to it and getting it scored.\n", + "\n", + "The process is split into the following steps:\n", + "* [Define our resource names](#section1)\n", + "* [Login to Azure](#section2)\n", + "* [Create the AKS cluster](#section3)\n", + "* [Create a tunnel to the head node](#section4)\n", + "* [Create a JSON schema of our APP and push it to the cluster](#section5)\n", + "* [Test our app](TestWebApp.ipynb)\n", + "* [Tear it all down](#section7)\n", + "\n", + "This guide is designed to be run on Linux and requires that the Azure CLI is installed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "Below are the various name definitions for the resources needed to set up AKS as well as the name of the Docker image we will be using." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Some outputs (and inputs) below have been hidden/masked for confidentiality**" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "resource_group = \"msaksrg\" # Feel free to modify these\n", + "aks_name = \"msAKSTFCluster\"\n", + "location = \"eastus\"\n", + "\n", + "image_name = 'masalvar/tfresnet-gpu' \n", + "selected_subscription = \"'Team Danielle Internal'\" # If you have multiple subscriptions select \n", + " # the subscription you want to use here" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Azure account login\n", + "The command below will initiate a login to your Azure account. It will print a URL to open in your browser, where you enter a one-off code and log into your Azure account." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mTo sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code FNGYXPRU3 to authenticate.\u001b[0m\n", + "CloudName Name State TenantId IsDefault\n", + "----------- ----------------------------- ------- ------------------------------------ -----------\n", + "AzureCloud Boston DS Dev Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n", + "AzureCloud Azure Internal - London Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n", + "AzureCloud Team Danielle Internal Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47 True\n", + "AzureCloud Visual Studio Enterprise Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n", + "AzureCloud Boston Engineering Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n", + "AzureCloud ADLTrainingMS Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n", + "AzureCloud PhillyExt Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n", + "AzureCloud Ads Eng Big Data Subscription Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n", + "AzureCloud Data Wrangling Preview Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n", + "AzureCloud Data Wrangling development Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n", + "AzureCloud AzureML Client PROD Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n", + "AzureCloud R portal - Production Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n", + "AzureCloud PhillyInt Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n", + "AzureCloud Solution Template Testing Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n", + "AzureCloud Team Ilan Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n", + "AzureCloud Marketing Automation Enabled 72f988bf-86f1-41af-91ab-2d7cd011db47\n" + ] + } + ], + "source": [ + "!az login -o table" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + 
"!az account set --subscription $selected_subscription" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\r\n", + " \"environmentName\": \"AzureCloud\",\r\n", + " \"id\": \"edf507a2-6235-46c5-b560-fd463ba2e771\",\r\n", + " \"isDefault\": true,\r\n", + " \"name\": \"Team Danielle Internal\",\r\n", + " \"state\": \"Enabled\",\r\n", + " \"tenantId\": \"72f988bf-86f1-41af-91ab-2d7cd011db47\",\r\n", + " \"user\": {\r\n", + " \"name\": \"masalvar@microsoft.com\",\r\n", + " \"type\": \"user\"\r\n", + " }\r\n", + "}\r\n" + ] + } + ], + "source": [ + "!az account show" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mRegistering is still on-going. You can monitor using 'az provider show -n Microsoft.ContainerService'\u001b[0m\r\n" + ] + } + ], + "source": [ + "!az provider register -n Microsoft.ContainerService" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create resources and dependencies" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create resource group\n", + "Azure encourages the use of groups to organise all the Azure components you deploy. That way it is easier to find them, and we can also delete a number of resources simply by deleting the group." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\r\n", + " \"id\": \"/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/msaksrg\",\r\n", + " \"location\": \"eastus\",\r\n", + " \"managedBy\": null,\r\n", + " \"name\": \"msaksrg\",\r\n", + " \"properties\": {\r\n", + " \"provisioningState\": \"Succeeded\"\r\n", + " },\r\n", + " \"tags\": null\r\n", + "}\r\n" + ] + } + ], + "source": [ + "!az group create --name $resource_group --location $location" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[K{- Finished ..\n", + " \"additionalProperties\": {},\n", + " \"agentPoolProfiles\": [\n", + " {\n", + " \"additionalProperties\": {},\n", + " \"count\": 1,\n", + " \"dnsPrefix\": null,\n", + " \"fqdn\": null,\n", + " \"name\": \"nodepool1\",\n", + " \"osDiskSizeGb\": null,\n", + " \"osType\": \"Linux\",\n", + " \"ports\": null,\n", + " \"storageProfile\": \"ManagedDisks\",\n", + " \"vmSize\": \"Standard_NC6\",\n", + " \"vnetSubnetId\": null\n", + " }\n", + " ],\n", + " \"dnsPrefix\": \"msAKSTFClu-msaksrg-edf507\",\n", + " \"fqdn\": \"msakstfclu-msaksrg-edf507-26f4c0b4.hcp.eastus.azmk8s.io\",\n", + " \"id\": \"/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourcegroups/msaksrg/providers/Microsoft.ContainerService/managedClusters/msAKSTFCluster\",\n", + " \"kubernetesVersion\": \"1.7.9\",\n", + " \"linuxProfile\": {\n", + " \"additionalProperties\": {},\n", + " \"adminUsername\": \"azureuser\",\n", + " \"ssh\": {\n", + " \"additionalProperties\": {},\n", + " \"publicKeys\": [\n", + " {\n", + " \"additionalProperties\": {},\n", + " \"keyData\": \"ssh-rsa 
AAAAB3NzaC1yc2EAAAADAQABAAABAQDVfKBWPBKS84wluD3DJ0t3hepO2F13pz1VI5d4c7Tn4d80rSKJkF2L2HtAf3w9R7TM5TYcSlMqv+OFtB5iwfMk1k8sarGqmB1aLuEYBD60cqtdWD34DPWz8Y4eQ7x8eQ2joVRgMFpv+SfEuPBaQdTM7QtFiWRA1ZioXElyniL2Snhsd/ICcq5SIcZSPj3z9/eUcKGz/eImLkOYU28l8fLpVg48x70rGOtpfmDmZJ3KT/LImDWbPFF4VIRuiki4qVaMvDvwlEB7BmqM5D8qO7tOM3ncZ3TqUhrSQj9NbeC65xvB83+BiZts63VXAsMLnu+0wbAXnA4W66ly/5UyjC//\"\n", + " }\n", + " ]\n", + " }\n", + " },\n", + " \"location\": \"eastus\",\n", + " \"name\": \"msAKSTFCluster\",\n", + " \"provisioningState\": \"Succeeded\",\n", + " \"resourceGroup\": \"msaksrg\",\n", + " \"servicePrincipalProfile\": {\n", + " \"additionalProperties\": {},\n", + " \"clientId\": \"44ba57c3-4386-4788-a761-8d72faa493e2\",\n", + " \"keyVaultSecretRef\": null,\n", + " \"secret\": null\n", + " },\n", + " \"tags\": null,\n", + " \"type\": \"Microsoft.ContainerService/ManagedClusters\"\n", + "}\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!az aks create --resource-group $resource_group --name $aks_name --node-count 1 --generate-ssh-keys -s Standard_NC6" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the Kubernetes command-line client (`kubectl`) from the command prompt" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mDownloading client to /usr/local/bin/kubectl from https://storage.googleapis.com/kubernetes-release/release/v1.9.4/bin/linux/amd64/kubectl\u001b[0m\n", + "\u001b[33mPlease ensure that /usr/local/bin is in your search PATH, so the `kubectl` command can be found.\u001b[0m\n" + ] + } + ], + "source": [ + "!sudo az aks install-cli" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "app_template = {\n", + " \"apiVersion\": \"apps/v1beta1\",\n", + " \"kind\": \"Deployment\",\n", + " \"metadata\": {\n", + " \"name\": \"azure-dl\"\n", + " },\n", + " \"spec\":{\n", + " \"replicas\":1,\n", 
+ " \"template\":{\n", + " \"metadata\":{\n", + " \"labels\":{\n", + " \"app\":\"azure-dl\"\n", + " }\n", + " },\n", + " \"spec\":{\n", + " \"containers\":[\n", + " {\n", + " \"name\": \"azure-dl\",\n", + " \"image\": image_name, # Docker image chosen in the Setup cell\n", + " \"env\":[\n", + " {\n", + " \"name\": \"LD_LIBRARY_PATH\",\n", + " \"value\": \"$LD_LIBRARY_PATH:/usr/local/nvidia/lib64:/opt/conda/envs/py3.6/lib\"\n", + " }\n", + " ],\n", + " \"ports\":[\n", + " {\n", + " \"containerPort\":80,\n", + " \"name\":\"model\"\n", + " }\n", + " ],\n", + " \"volumeMounts\":[\n", + " {\n", + " \"name\": \"bin\",\n", + " \"mountPath\":\"/usr/local/nvidia/bin\" \n", + " },\n", + " {\n", + " \"name\": \"lib\",\n", + " \"mountPath\":\"/usr/local/nvidia/lib64\" \n", + " },\n", + " {\n", + " \"name\": \"libcuda\",\n", + " \"mountPath\":\"/usr/lib/x86_64-linux-gnu/libcuda.so.1\" \n", + " },\n", + " ],\n", + " \"resources\":{\n", + " \"requests\":{\n", + " \"alpha.kubernetes.io/nvidia-gpu\": 1\n", + " },\n", + " \"limits\":{\n", + " \"alpha.kubernetes.io/nvidia-gpu\": 1\n", + " }\n", + " } \n", + " }\n", + " ],\n", + " \"volumes\":[\n", + " {\n", + " \"name\": \"bin\",\n", + " \"hostPath\":{\n", + " \"path\":\"/usr/lib/nvidia-384/bin\"\n", + " },\n", + " },\n", + " {\n", + " \"name\": \"lib\",\n", + " \"hostPath\":{\n", + " \"path\":\"/usr/lib/nvidia-384\"\n", + " },\n", + " },\n", + " {\n", + " \"name\": \"libcuda\",\n", + " \"hostPath\":{\n", + " \"path\":\"/usr/lib/x86_64-linux-gnu/libcuda.so.1\"\n", + " },\n", + " },\n", + " ]\n", + " }\n", + " }\n", + " }\n", + "}\n", + "\n", + "service_temp = {\n", + " \"apiVersion\": \"v1\",\n", + " \"kind\": \"Service\",\n", + " \"metadata\": {\n", + " \"name\": \"azure-dl\"\n", + " },\n", + " \"spec\":{\n", + " \"type\": \"LoadBalancer\",\n", + " \"ports\":[\n", + " {\n", + " \"port\":80\n", + " }\n", + " ],\n", + " \"selector\":{\n", + " \"app\":\"azure-dl\"\n", + " }\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 61, +
"metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def write_json_to_file(json_dict, filename, mode='w'):\n", + "    \"\"\"Dump json_dict to filename as indented, key-sorted JSON followed by a blank line.\n", + "\n", + "    Args:\n", + "        json_dict: JSON-serialisable object to write.\n", + "        filename: path of the file to open.\n", + "        mode: mode passed to open(); use 'a' to append a further JSON document.\n", + "    \"\"\"\n", + "    import json  # local import keeps this cell runnable on a fresh kernel\n", + "\n", + "    with open(filename, mode) as outfile:\n", + "        json.dump(json_dict, outfile, indent=4, sort_keys=True)\n", + "        # Blank line separates consecutive JSON documents in the same file\n", + "        outfile.write('\\n\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "write_json_to_file(app_template, 'az-dl.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "write_json_to_file(service_temp, 'az-dl.json', mode='a')" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\r\n", + " \"apiVersion\": \"apps/v1beta1\",\r\n", + " \"kind\": \"Deployment\",\r\n", + " \"metadata\": {\r\n", + " \"name\": \"azure-dl\"\r\n", + " },\r\n", + " \"spec\": {\r\n", + " \"replicas\": 1,\r\n", + " \"template\": {\r\n", + " \"metadata\": {\r\n", + " \"labels\": {\r\n", + " \"app\": \"azure-dl\"\r\n", + " }\r\n", + " },\r\n", + " \"spec\": {\r\n", + " \"containers\": [\r\n", + " {\r\n", + " \"env\": [\r\n", + " {\r\n", + " \"name\": \"LD_LIBRARY_PATH\",\r\n", + " \"value\": \"$LD_LIBRARY_PATH:/usr/local/nvidia/lib64:/opt/conda/envs/py3.6/lib\"\r\n", + " }\r\n", + " ],\r\n", + " \"image\": \"masalvar/tfresnet-gpu\",\r\n", + " \"name\": \"azure-dl\",\r\n", + " \"ports\": [\r\n", + " {\r\n", + " \"containerPort\": 80,\r\n", + " \"name\": \"model\"\r\n", + " }\r\n", + " ],\r\n", + " \"resources\": {\r\n", + " \"limits\": {\r\n", + " \"alpha.kubernetes.io/nvidia-gpu\": 1\r\n", + " },\r\n", + " \"requests\": {\r\n", + " \"alpha.kubernetes.io/nvidia-gpu\": 1\r\n", + " }\r\n", + " },\r\n", + " \"volumeMounts\": [\r\n", + " {\r\n", + " \"mountPath\": \"/usr/local/nvidia/bin\",\r\n", + " \"name\": \"bin\"\r\n", + " },\r\n", + " {\r\n", + " \"mountPath\": 
\"/usr/local/nvidia/lib64\",\r\n", + " \"name\": \"lib\"\r\n", + " },\r\n", + " {\r\n", + " \"mountPath\": \"/usr/lib/x86_64-linux-gnu/libcuda.so.1\",\r\n", + " \"name\": \"libcuda\"\r\n", + " }\r\n", + " ]\r\n", + " }\r\n", + " ],\r\n", + " \"volumes\": [\r\n", + " {\r\n", + " \"hostPath\": {\r\n", + " \"path\": \"/usr/lib/nvidia-384/bin\"\r\n", + " },\r\n", + " \"name\": \"bin\"\r\n", + " },\r\n", + " {\r\n", + " \"hostPath\": {\r\n", + " \"path\": \"/usr/lib/nvidia-384\"\r\n", + " },\r\n", + " \"name\": \"lib\"\r\n", + " },\r\n", + " {\r\n", + " \"hostPath\": {\r\n", + " \"path\": \"/usr/lib/x86_64-linux-gnu/libcuda.so.1\"\r\n", + " },\r\n", + " \"name\": \"libcuda\"\r\n", + " }\r\n", + " ]\r\n", + " }\r\n", + " }\r\n", + " }\r\n", + "}\r\n", + "\r\n", + "{\r\n", + " \"apiVersion\": \"v1\",\r\n", + " \"kind\": \"Service\",\r\n", + " \"metadata\": {\r\n", + " \"name\": \"azure-dl\"\r\n", + " },\r\n", + " \"spec\": {\r\n", + " \"ports\": [\r\n", + " {\r\n", + " \"port\": 80\r\n", + " }\r\n", + " ],\r\n", + " \"selector\": {\r\n", + " \"app\": \"azure-dl\"\r\n", + " },\r\n", + " \"type\": \"LoadBalancer\"\r\n", + " }\r\n", + "}\r\n", + "\r\n" + ] + } + ], + "source": [ + "!cat az-dl.json" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Merged \"msAKSTFCluster\" as current context in /home/mat/.kube/config\r\n" + ] + } + ], + "source": [ + "!az aks get-credentials --resource-group=$resource_group --name=$aks_name" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NAME STATUS ROLES AGE VERSION\r\n", + "aks-nodepool1-27496346-0 Ready agent 44m v1.7.9\r\n" + ] + } + ], + "source": [ + "!kubectl get nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "NAMESPACE NAME READY STATUS RESTARTS AGE\r\n", + "kube-system heapster-2574232661-1dx42 2/2 Running 0 41m\r\n", + "kube-system kube-dns-v20-2253765213-3kb0s 3/3 Running 0 42m\r\n", + "kube-system kube-dns-v20-2253765213-p80ng 3/3 Running 0 42m\r\n", + "kube-system kube-proxy-9zd4s 1/1 Running 0 42m\r\n", + "kube-system kube-svc-redirect-c8klv 1/1 Running 0 42m\r\n", + "kube-system kubernetes-dashboard-2898242510-9l409 1/1 Running 0 42m\r\n", + "kube-system tunnelfront-180102643-hn69h 1/1 Running 0 42m\r\n" + ] + } + ], + "source": [ + "!kubectl get pods --all-namespaces" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "deployment \"azure-dl\" created\n", + "service \"azure-dl\" created\n" + ] + } + ], + "source": [ + "!kubectl create -f az-dl.json" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NAMESPACE NAME READY STATUS RESTARTS AGE\r\n", + "default azure-dl-3880299103-jsn4n 1/1 Running 0 11m\r\n", + "kube-system heapster-2574232661-1dx42 2/2 Running 0 53m\r\n", + "kube-system kube-dns-v20-2253765213-3kb0s 3/3 Running 0 54m\r\n", + "kube-system kube-dns-v20-2253765213-p80ng 3/3 Running 0 54m\r\n", + "kube-system kube-proxy-9zd4s 1/1 Running 0 54m\r\n", + "kube-system kube-svc-redirect-c8klv 1/1 Running 0 54m\r\n", + "kube-system kubernetes-dashboard-2898242510-9l409 1/1 Running 0 54m\r\n", + "kube-system tunnelfront-180102643-hn69h 1/1 Running 0 54m\r\n" + ] + } + ], + "source": [ + "!kubectl get pods --all-namespaces" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LAST SEEN FIRST SEEN COUNT NAME KIND SUBOBJECT TYPE REASON SOURCE MESSAGE\r\n", + "56m 1h 7 aks-nodepool1-27496346-0.151e4321d9812de1 Node Normal 
NodeHasSufficientDisk kubelet, aks-nodepool1-27496346-0 Node aks-nodepool1-27496346-0 status is now: NodeHasSufficientDisk\r\n", + "56m 1h 7 aks-nodepool1-27496346-0.151e4321d9818fef Node Normal NodeHasSufficientMemory kubelet, aks-nodepool1-27496346-0 Node aks-nodepool1-27496346-0 status is now: NodeHasSufficientMemory\r\n", + "56m 1h 7 aks-nodepool1-27496346-0.151e4321d981b123 Node Normal NodeHasNoDiskPressure kubelet, aks-nodepool1-27496346-0 Node aks-nodepool1-27496346-0 status is now: NodeHasNoDiskPressure\r\n", + "5s 1h 62 aks-nodepool1-27496346-0.151e4321d998eade Node Warning FailedNodeAllocatableEnforcement kubelet, aks-nodepool1-27496346-0 Failed to update Node Allocatable Limits \"\": failed to set supported cgroup subsystems for cgroup : Failed to set config for supported subsystems : failed to write 59076296704 to memory.limit_in_bytes: write /var/lib/docker/overlay2/5ee5687fca21ea5e2ffbdbbf82a839179a687ca508be84758a090e41fcb3ecf2/merged/sys/fs/cgroup/memory/memory.limit_in_bytes: invalid argument\r\n", + "54m 54m 1 aks-nodepool1-27496346-0.151e437c603ba0d3 Node Normal RegisteredNode controllermanager Node aks-nodepool1-27496346-0 event: Registered Node aks-nodepool1-27496346-0 in NodeController\r\n", + "54m 54m 1 aks-nodepool1-27496346-0.151e437e41f27985 Node Normal Starting kube-proxy, aks-nodepool1-27496346-0 Starting kube-proxy.\r\n", + "54m 54m 1 aks-nodepool1-27496346-0.151e438133293444 Node Normal NodeReady kubelet, aks-nodepool1-27496346-0 Node aks-nodepool1-27496346-0 status is now: NodeReady\r\n", + "31m 31m 1 azure-dl-2914933029-stvh2.151e44c4846b082f Pod Normal Scheduled default-scheduler Successfully assigned azure-dl-2914933029-stvh2 to aks-nodepool1-27496346-0\r\n", + "31m 31m 1 azure-dl-2914933029-stvh2.151e44c49605c701 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"libcuda\" \r\n", + "31m 31m 1 azure-dl-2914933029-stvh2.151e44c49606ceae Pod Normal SuccessfulMountVolume 
kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"lib\" \r\n", + "31m 31m 1 azure-dl-2914933029-stvh2.151e44c496093aff Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"bin\" \r\n", + "31m 31m 1 azure-dl-2914933029-stvh2.151e44c49675d85e Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"default-token-hnhd0\" \r\n", + "29m 31m 2 azure-dl-2914933029-stvh2.151e44c4c33344c2 Pod spec.containers{azure-dl} Normal Pulling kubelet, aks-nodepool1-27496346-0 pulling image \"masalvar/cntkresnet-gpu\"\r\n", + "29m 29m 1 azure-dl-2914933029-stvh2.151e44d7c373b7b1 Pod spec.containers{azure-dl} Warning Failed kubelet, aks-nodepool1-27496346-0 Failed to pull image \"masalvar/cntkresnet-gpu\": rpc error: code = 2 desc = net/http: request canceled\r\n", + "26m 29m 3 azure-dl-2914933029-stvh2.151e44d7c37615f3 Pod Warning FailedSync kubelet, aks-nodepool1-27496346-0 Error syncing pod\r\n", + "29m 29m 1 azure-dl-2914933029-stvh2.151e44d80064f440 Pod spec.containers{azure-dl} Normal BackOff kubelet, aks-nodepool1-27496346-0 Back-off pulling image \"masalvar/cntkresnet-gpu\"\r\n", + "26m 26m 1 azure-dl-2914933029-stvh2.151e4508233ca91d Pod spec.containers{azure-dl} Normal Pulled kubelet, aks-nodepool1-27496346-0 Successfully pulled image \"masalvar/cntkresnet-gpu\"\r\n", + "26m 26m 1 azure-dl-2914933029-stvh2.151e450823842f64 Pod spec.containers{azure-dl} Warning Failed kubelet, aks-nodepool1-27496346-0 Error: Error response from daemon: {\"message\":\"No such container: 620a683f81da9738cee9242ffc83fd7dc71f76493efaac48bb263196ad117836\"}\r\n", + "26m 26m 1 azure-dl-2914933029-tlnmp.151e450845ebce8c Pod Normal Scheduled default-scheduler Successfully assigned azure-dl-2914933029-tlnmp to aks-nodepool1-27496346-0\r\n", + "26m 26m 1 azure-dl-2914933029-tlnmp.151e45085074d34a Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 
MountVolume.SetUp succeeded for volume \"lib\" \r\n", + "26m 26m 1 azure-dl-2914933029-tlnmp.151e450850a6f5c0 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"bin\" \r\n", + "26m 26m 1 azure-dl-2914933029-tlnmp.151e450850b5de92 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"libcuda\" \r\n", + "26m 26m 1 azure-dl-2914933029-tlnmp.151e4508510e2849 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"default-token-hnhd0\" \r\n", + "25m 25m 1 azure-dl-2914933029-tlnmp.151e450fa7fbbb30 Pod spec.containers{azure-dl} Normal Pulling kubelet, aks-nodepool1-27496346-0 pulling image \"masalvar/cntkresnet-gpu\"\r\n", + "25m 25m 1 azure-dl-2914933029-tlnmp.151e45100e809afd Pod spec.containers{azure-dl} Normal Pulled kubelet, aks-nodepool1-27496346-0 Successfully pulled image \"masalvar/cntkresnet-gpu\"\r\n", + "25m 25m 1 azure-dl-2914933029-tlnmp.151e4510e3e2cf7b Pod spec.containers{azure-dl} Normal Created kubelet, aks-nodepool1-27496346-0 Created container\r\n", + "25m 25m 1 azure-dl-2914933029-tlnmp.151e4510eda5194d Pod spec.containers{azure-dl} Normal Started kubelet, aks-nodepool1-27496346-0 Started container\r\n", + "12m 12m 1 azure-dl-2914933029-tlnmp.151e45c888d4874e Pod spec.containers{azure-dl} Normal Killing kubelet, aks-nodepool1-27496346-0 Killing container with id docker://azure-dl:Need to kill Pod\r\n", + "31m 31m 1 azure-dl-2914933029.151e44c4828da01b ReplicaSet Normal SuccessfulCreate replicaset-controller Created pod: azure-dl-2914933029-stvh2\r\n", + "27m 27m 1 azure-dl-2914933029.151e44f0433ccc88 ReplicaSet Normal SuccessfulDelete replicaset-controller Deleted pod: azure-dl-2914933029-stvh2\r\n", + "26m 26m 1 azure-dl-2914933029.151e450845a59cd5 ReplicaSet Normal SuccessfulCreate replicaset-controller Created pod: azure-dl-2914933029-tlnmp\r\n", + "12m 12m 1 
azure-dl-2914933029.151e45c61b7fd5a6 ReplicaSet Normal SuccessfulDelete replicaset-controller Deleted pod: azure-dl-2914933029-tlnmp\r\n", + "11m 11m 1 azure-dl-3880299103-jsn4n.151e45d4627d7eaf Pod Normal Scheduled default-scheduler Successfully assigned azure-dl-3880299103-jsn4n to aks-nodepool1-27496346-0\r\n", + "11m 11m 1 azure-dl-3880299103-jsn4n.151e45d469b24e70 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"bin\" \r\n", + "11m 11m 1 azure-dl-3880299103-jsn4n.151e45d469b4f3ca Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"lib\" \r\n", + "11m 11m 1 azure-dl-3880299103-jsn4n.151e45d469cd4be1 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"libcuda\" \r\n", + "11m 11m 1 azure-dl-3880299103-jsn4n.151e45d46a16cf32 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"default-token-hnhd0\" \r\n", + "11m 11m 1 azure-dl-3880299103-jsn4n.151e45d4bceb6016 Pod spec.containers{azure-dl} Normal Pulling kubelet, aks-nodepool1-27496346-0 pulling image \"masalvar/tfresnet-gpu\"\r\n", + "9m 9m 1 azure-dl-3880299103-jsn4n.151e45f3d0873cbd Pod spec.containers{azure-dl} Normal Pulled kubelet, aks-nodepool1-27496346-0 Successfully pulled image \"masalvar/tfresnet-gpu\"\r\n", + "9m 9m 1 azure-dl-3880299103-jsn4n.151e45f3de5e2664 Pod spec.containers{azure-dl} Normal Created kubelet, aks-nodepool1-27496346-0 Created container\r\n", + "9m 9m 1 azure-dl-3880299103-jsn4n.151e45f3e7cf4868 Pod spec.containers{azure-dl} Normal Started kubelet, aks-nodepool1-27496346-0 Started container\r\n", + "11m 11m 1 azure-dl-3880299103.151e45d461b5dd14 ReplicaSet Normal SuccessfulCreate replicaset-controller Created pod: azure-dl-3880299103-jsn4n\r\n", + "31m 31m 1 azure-dl.151e44c4809ef854 Deployment Normal ScalingReplicaSet deployment-controller Scaled up replica set 
azure-dl-2914933029 to 1\r\n", + "31m 31m 1 azure-dl.151e44c4846c4ff9 Service Normal CreatingLoadBalancer service-controller Creating load balancer\r\n", + "28m 28m 1 azure-dl.151e44e49cf53e12 Service Normal CreatedLoadBalancer service-controller Created load balancer\r\n", + "27m 27m 1 azure-dl.151e44f042cb9ca1 Deployment Normal ScalingReplicaSet deployment-controller Scaled down replica set azure-dl-2914933029 to 0\r\n", + "27m 27m 1 azure-dl.151e44f10ce12363 Service Normal DeletingLoadBalancer service-controller Deleting load balancer\r\n", + "26m 26m 1 azure-dl.151e450843800b94 Deployment Normal ScalingReplicaSet deployment-controller Scaled up replica set azure-dl-2914933029 to 1\r\n", + "25m 25m 1 azure-dl.151e450fe1af53b3 Service Normal DeletedLoadBalancer service-controller Deleted load balancer\r\n", + "25m 25m 1 azure-dl.151e450fe1afa42b Service Normal CreatingLoadBalancer service-controller Creating load balancer\r\n", + "22m 22m 1 azure-dl.151e45376d5add13 Service Normal CreatedLoadBalancer service-controller Created load balancer\r\n", + "12m 12m 1 azure-dl.151e45c61a9bc760 Deployment Normal ScalingReplicaSet deployment-controller Scaled down replica set azure-dl-2914933029 to 0\r\n", + "12m 12m 1 azure-dl.151e45c6e4b8a1b9 Service Normal DeletingLoadBalancer service-controller Deleting load balancer\r\n", + "11m 11m 1 azure-dl.151e45d4604dbd4a Deployment Normal ScalingReplicaSet deployment-controller Scaled up replica set azure-dl-3880299103 to 1\r\n", + "9m 9m 1 azure-dl.151e45ef396adac1 Service Normal DeletedLoadBalancer service-controller Deleted load balancer\r\n", + "9m 9m 1 azure-dl.151e45ef396b3881 Service Normal CreatingLoadBalancer service-controller Creating load balancer\r\n", + "7m 7m 1 azure-dl.151e460b74c777ad Service Normal CreatedLoadBalancer service-controller Created load balancer\r\n" + ] + } + ], + "source": [ + "!kubectl get events" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "2018-03-22 14:41:03,137 CRIT Supervisor running as root (no user in config file)\r\n", + "2018-03-22 14:41:03,139 INFO supervisord started with pid 7\r\n", + "2018-03-22 14:41:04,141 INFO spawned: 'program_exit' with pid 17\r\n", + "2018-03-22 14:41:04,143 INFO spawned: 'nginx' with pid 18\r\n", + "2018-03-22 14:41:04,144 INFO spawned: 'gunicorn' with pid 19\r\n", + "2018-03-22 14:41:05,174 INFO success: program_exit entered RUNNING state, process has stayed up for > than 1 seconds (startsecs)\r\n", + "2018-03-22 14:41:09,192 INFO success: nginx entered RUNNING state, process has stayed up for > than 5 seconds (startsecs)\r\n", + "Selected GPU[0] Tesla K80 as the process wide default device.\r\n", + "Initialising\r\n", + "Model loading time: 13501.91 ms\r\n", + "{\"timestamp\": \"2018-03-22T14:41:18.246347Z\", \"message\": \"Model loading time: 13501.91 ms\", \"host\": \"azure-dl-2914933029-tlnmp\", \"path\": \"/code/driver.py\", \"tags\": [], \"level\": \"INFO\", \"logger\": \"cntk_svc_logger\", \"stack_info\": null}\r\n", + "{\"timestamp\": \"2018-03-22T14:41:18.250653Z\", \"message\": \" * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)\", \"host\": \"azure-dl-2914933029-tlnmp\", \"path\": \"/opt/conda/envs/py3.6/lib/python3.6/site-packages/werkzeug/_internal.py\", \"tags\": [], \"level\": \"INFO\", \"logger\": \"werkzeug\", \"msg\": \" * Running on %s://%s:%d/ %s\", \"stack_info\": null}\r\n", + "2018-03-22 14:41:24,257 INFO success: gunicorn entered RUNNING state, process has stayed up for > than 20 seconds (startsecs)\r\n" + ] + } + ], + "source": [ + "!kubectl logs azure-dl-2914933029-tlnmp" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE\r\n", + "azure-dl LoadBalancer 10.0.155.14 13.82.238.75 80:30532/TCP 11m\r\n" + ] + } + ], + 
"source": [ + "!kubectl get service azure-dl" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tear it all down \n", + "Once you are done with your cluster you can use the following two commands to destroy it all." + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "deployment \"azure-dl\" deleted\n", + "service \"azure-dl\" deleted\n" + ] + } + ], + "source": [ + "!kubectl delete -f az-dl.json" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[K\u001b[0minished .." + ] + } + ], + "source": [ + "!az aks delete -n $aks_name -g $resource_group -y" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[K\u001b[0minished .." + ] + } + ], + "source": [ + "!az group delete --name $resource_group -y" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3.5", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}