Updates model development and Docker notebooks

This commit is contained in:
msalvaris 2018-10-08 08:46:23 +00:00
Parent 286a9e753e
Commit f3ea8e9a20
5 changed files: 1892 additions and 1199 deletions

File diffs are hidden because one or more lines are too long

View file

@@ -46,75 +46,116 @@
],
"source": [
"%%writefile driver.py\n",
"\n",
"import tensorflow as tf\n",
"from resnet152 import ResNet152\n",
"from keras.preprocessing import image\n",
"from keras.applications.imagenet_utils import preprocess_input, decode_predictions\n",
"\n",
"import numpy as np\n",
"import timeit as t\n",
"import base64\n",
"import json\n",
"from PIL import Image, ImageOps\n",
"from io import BytesIO\n",
"import logging\n",
"import os\n",
"import timeit as t\n",
"from io import BytesIO\n",
"from pprint import pprint\n",
"import numpy as np\n",
"import torch\n",
"import torch.nn as nn\n",
"import numpy as np\n",
"import torchvision\n",
"from torchvision import datasets, models, transforms\n",
"import PIL\n",
"from PIL import Image, ImageOps\n",
"\n",
"number_results = 3\n",
"logger = logging.getLogger(\"model_driver\")\n",
"_LABEL_FILE = os.getenv('LABEL_FILE', \"synset.txt\")\n",
"_NUMBER_RESULTS = 3\n",
"\n",
"def _base64img_to_numpy(base64_img_string):\n",
" decoded_img = base64.b64decode(base64_img_string)\n",
" img_buffer = BytesIO(decoded_img)\n",
" imageData = Image.open(img_buffer).convert(\"RGB\")\n",
" img = ImageOps.fit(imageData, (224, 224), Image.ANTIALIAS)\n",
" img = image.img_to_array(img)\n",
" return img\n",
"\n",
"def create_scoring_func():\n",
" \"\"\" Initialize ResNet 152 Model \n",
" \"\"\" \n",
" start = t.default_timer()\n",
" model = ResNet152(weights='imagenet')\n",
" end = t.default_timer()\n",
"def _create_label_lookup(label_path):\n",
" with open(label_path, 'r') as f:\n",
" label_list = [l.rstrip() for l in f]\n",
" \n",
" def _label_lookup(*label_locks):\n",
" return [label_list[l] for l in label_locks]\n",
" \n",
" return _label_lookup\n",
"\n",
"\n",
"def _load_model():\n",
" # Load the model\n",
" model = models.resnet152(pretrained=True)\n",
" model = model.cuda()\n",
" softmax = nn.Softmax(dim=1).cuda()\n",
" model = model.eval()\n",
" \n",
" preprocess_input = transforms.Compose([\n",
" torchvision.transforms.Resize((224, 224), interpolation=PIL.Image.BICUBIC),\n",
" transforms.ToTensor(),\n",
" transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n",
" ])\n",
" \n",
" def predict_for(image):\n",
" image = preprocess_input(image)\n",
" with torch.no_grad():\n",
" image = image.expand(1,3,224,224)\n",
" image_gpu = image.type(torch.float).cuda()\n",
" outputs = model(image_gpu)\n",
" pred_proba = softmax(outputs)\n",
" return pred_proba.cpu().numpy().squeeze()\n",
" \n",
" return predict_for\n",
"\n",
"\n",
"def _base64img_to_pil_image(base64_img_string):\n",
" if base64_img_string.startswith('b\\''):\n",
" base64_img_string = base64_img_string[2:-1]\n",
" base64Img = base64_img_string.encode('utf-8')\n",
"\n",
" # Preprocess the input data \n",
" startPreprocess = t.default_timer()\n",
" decoded_img = base64.b64decode(base64Img)\n",
" img_buffer = BytesIO(decoded_img)\n",
"\n",
" # Load image with PIL (RGB)\n",
" pil_img = Image.open(img_buffer).convert('RGB')\n",
" return pil_img\n",
"\n",
"\n",
"def create_scoring_func(label_path=_LABEL_FILE):\n",
" logger = logging.getLogger(\"model_driver\")\n",
" \n",
" start = t.default_timer()\n",
" labels_for = _create_label_lookup(label_path)\n",
" predict_for = _load_model()\n",
" end = t.default_timer()\n",
"\n",
" loadTimeMsg = \"Model loading time: {0} ms\".format(round((end-start)*1000, 2))\n",
" logger.info(loadTimeMsg)\n",
" \n",
" def call_model(img_array):\n",
" img_array = np.expand_dims(img_array, axis=0)\n",
" img_array = preprocess_input(img_array)\n",
" preds = model.predict(img_array)\n",
" preds = decode_predictions(preds, top=number_results)[0] \n",
" return preds\n",
" \n",
" return call_model \n",
" def call_model(image, number_results=_NUMBER_RESULTS):\n",
" pred_proba = predict_for(image).squeeze()\n",
" selected_results = np.flip(np.argsort(pred_proba), 0)[:number_results]\n",
" labels = labels_for(*selected_results)\n",
" return list(zip(labels, pred_proba[selected_results].astype(np.float64)))\n",
" return call_model\n",
"\n",
"\n",
"def get_model_api():\n",
" logger = logging.getLogger(\"model_driver\")\n",
" scoring_func = create_scoring_func()\n",
" \n",
" def process_and_score(inputString):\n",
" \"\"\" Classify the input using the loaded model\n",
" \"\"\"\n",
" def process_and_score(images_dict, number_results=_NUMBER_RESULTS):\n",
" start = t.default_timer()\n",
"\n",
" base64Dict = json.loads(inputString) \n",
" for k, v in base64Dict.items():\n",
" img_file_name, base64Img = k, v \n",
" img_array = _base64img_to_numpy(base64Img)\n",
" preds = scoring_func(img_array)\n",
" responses = {img_file_name: preds}\n",
"\n",
" end = t.default_timer()\n",
" results = {}\n",
" for key, base64_img_string in images_dict.items():\n",
" rgb_image = _base64img_to_pil_image(base64_img_string)\n",
" results[key]=scoring_func(rgb_image, number_results=_NUMBER_RESULTS)\n",
" \n",
" logger.info(\"Predictions: {0}\".format(responses))\n",
" end = t.default_timer()\n",
"\n",
" logger.info(\"Predictions: {0}\".format(results))\n",
" logger.info(\"Predictions took {0} ms\".format(round((end-start)*1000, 2)))\n",
" return (responses, \"Computed in {0} ms\".format(round((end-start)*1000, 2)))\n",
" return (results, 'Computed in {0} ms'.format(round((end-start)*1000, 2)))\n",
" return process_and_score\n",
"\n",
"def version():\n",
" return tf.__version__"
" return torch.__version__"
]
},
{
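A minimal usage sketch of the updated driver may help here; it mirrors the calls made later in the notebook. The image file name and the to_base64_string helper are illustrative (not part of the commit), driver.py is assumed to be importable from the working directory, and a CUDA device is required since _load_model moves the network to the GPU.

import base64

from driver import get_model_api  # driver.py as written by the %%writefile cell above

def to_base64_string(path):
    # Read an image file and return the plain base64 string that
    # _base64img_to_pil_image expects.
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

predict_for = get_model_api()  # loads ResNet-152 onto the GPU once
body = {"image": to_base64_string("220px-Lynx_lynx_poing.jpg")}
results, timing = predict_for(body)
print(results["image"])  # top-3 (label, probability) pairs
print(timing)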
@@ -142,7 +183,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -158,34 +199,34 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"IMAGEURL = \"https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Lynx_lynx_poing.jpg/220px-Lynx_lynx_poing.jpg\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:model_driver:Model loading time: 3960.87 ms\n"
]
}
],
"source": [
"predict_for = get_model_api()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:model_driver:Model loading time: 42520.51 ms\n"
]
}
],
"source": [
"predict_for = get_model_api()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
@@ -196,8 +237,8 @@
"DEBUG:PIL.PngImagePlugin:iCCP profile name b'ICC Profile'\n",
"DEBUG:PIL.PngImagePlugin:Compression method 0\n",
"DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 345 65536\n",
"INFO:model_driver:Predictions: {'image': [('n02127052', 'lynx', 0.9816483), ('n02128385', 'leopard', 0.0077441484), ('n02123159', 'tiger_cat', 0.0036861342)]}\n",
"INFO:model_driver:Predictions took 4221.36 ms\n"
"INFO:model_driver:Predictions: {'image': [('n02127052 lynx, catamount', 0.9965722560882568), ('n02128757 snow leopard, ounce, Panthera uncia', 0.0013256857637315989), ('n02128385 leopard, Panthera pardus', 0.0009192737634293735)]}\n",
"INFO:model_driver:Predictions took 85.09 ms\n"
]
}
],
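The top-N selection in call_model leans on np.argsort returning ascending order: flipping the result and slicing keeps the indices of the largest probabilities. A tiny self-contained illustration (the probabilities below are made up):

import numpy as np

pred_proba = np.array([0.05, 0.7, 0.1, 0.15])
number_results = 3
selected_results = np.flip(np.argsort(pred_proba), 0)[:number_results]
print(selected_results)              # [1 3 2] -- indices of the three largest values
print(pred_proba[selected_results])  # [0.7  0.15 0.1 ]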
@@ -208,6 +249,26 @@
"resp = predict_for(body)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'image': [('n02127052 lynx, catamount', 0.9965722560882568),\n",
" ('n02128757 snow leopard, ounce, Panthera uncia',\n",
" 0.0013256857637315989),\n",
" ('n02128385 leopard, Panthera pardus', 0.0009192737634293735)]}\n"
]
}
],
"source": [
"pprint(resp[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -218,9 +279,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:AKSDeploymentKeras]",
"display_name": "Python [conda env:AKSDeploymentPytorch]",
"language": "python",
"name": "conda-env-AKSDeploymentKeras-py"
"name": "conda-env-AKSDeploymentPytorch-py"
},
"language_info": {
"codemirror_mode": {
@@ -232,7 +293,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.5"
"version": "3.6.6"
}
},
"nbformat": 4,

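The label file consumed by _create_label_lookup is the ImageNet synset listing, one "wnid description" entry per line. A small sketch of that closure in isolation, using a hypothetical two-line stand-in for synset.txt:

def _create_label_lookup(label_path):
    with open(label_path, 'r') as f:
        label_list = [l.rstrip() for l in f]

    def _label_lookup(*label_locks):
        return [label_list[l] for l in label_locks]

    return _label_lookup

# Write a two-entry sample file (contents copied from the prediction output above).
with open("synset_sample.txt", "w") as f:
    f.write("n02127052 lynx, catamount\n")
    f.write("n02128385 leopard, Panthera pardus\n")

labels_for = _create_label_lookup("synset_sample.txt")
print(labels_for(1, 0))
# ['n02128385 leopard, Panthera pardus', 'n02127052 lynx, catamount']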
File diffs are hidden because one or more lines are too long

File diffs are hidden because one or more lines are too long

View file

@@ -1,947 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Deploy Web App on Azure Container Services (AKS)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook, we will set up an Azure Container Service which will be managed by Kubernetes. We will then take the Docker image we created earlier that contains our app and deploy it to the AKS cluster. Then, we will check everything is working by sending an image to it and getting it scored. \n",
"\n",
"The process is split into the following steps:\n",
"- Define our resource names\n",
"- Login to Azure\n",
"- Create resource group and create AKS\n",
"- Connect to AKS\n",
"- Deploy our app\n",
"\n",
"We assume that this notebook is running on Linux and Azure CLI is installed before proceeding."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"from testing_utilities import write_json_to_file\n",
"%load_ext dotenv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below are the various name definitions for the resources needed to setup AKS."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"%%writefile --append .env\n",
"# This cell is tagged `parameters`\n",
"# Please modify the values below as you see fit\n",
"\n",
"# If you have multiple subscriptions select the subscription you want to use \n",
"selected_subscription = \"YOUR_SUBSCRIPTION\"\n",
"\n",
"# Resource group, name and location for AKS cluster.\n",
"resource_group = \"RESOURCE_GROUP\" \n",
"aks_name = \"AKS_CLUSTER_NAME\"\n",
"location = \"eastus\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"%dotenv\n",
"image_name = os.getenv('docker_login') + os.getenv('image_repo')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Azure account login"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you are not already logged in to an Azure account, the command below will initiate a login. This will pop up a browser where you can select your login."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"list=`az account list -o table`\n",
"if [ \"$list\" == '[]' ] || [ \"$list\" == '' ]; then \n",
" az login -o table\n",
"else\n",
" az account list -o table \n",
"fi"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az account set --subscription \"$selected_subscription\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az account show"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You will also need to register the container service resources on your subscription if you haven't already done so."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az provider register -n Microsoft.ContainerService"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az provider show -n Microsoft.ContainerService"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create resources and dependencies"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create resource group and AKS cluster"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Azure encourages the use of groups to organize all the Azure components you deploy. That way it is easier to find them but also we can delete a number of resources simply by deleting the group."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az group create --name $resource_group --location $location"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below, we create the AKS cluster in the resource group we created earlier. This could take up to 15 minutes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"!az aks create --resource-group $resource_group --name $aks_name --node-count 1 --generate-ssh-keys -s Standard_NC6"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Install kubectl CLI"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To connect to the Kubernetes cluster, we will use kubectl, the Kubernetes command-line client. To install, run the following:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mDownloading client to /usr/local/bin/kubectl from https://storage.googleapis.com/kubernetes-release/release/v1.11.1/bin/linux/amd64/kubectl\u001b[0m\n",
"\u001b[33mPlease ensure that /usr/local/bin is in your search PATH, so the `kubectl` command can be found.\u001b[0m\n"
]
}
],
"source": [
"!sudo env \"PATH=$PATH\" az aks install-cli"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Connect to AKS cluster"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To configure kubectl to connect to the Kubernetes cluster, run the following command:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az aks get-credentials --resource-group $resource_group --name $aks_name"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's verify connection by listing the nodes."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAME STATUS ROLES AGE VERSION\r\n",
"aks-nodepool1-28016997-0 Ready agent 60d v1.9.6\r\n"
]
}
],
"source": [
"!kubectl get nodes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check the pods on our cluster."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAMESPACE NAME READY STATUS RESTARTS AGE\r\n",
"kube-system azureproxy-79c5db744-r5ggd 1/1 Running 2 60d\r\n",
"kube-system heapster-55f855b47-4m7xr 2/2 Running 0 60d\r\n",
"kube-system kube-dns-v20-7c556f89c5-4z4z6 3/3 Running 0 60d\r\n",
"kube-system kube-dns-v20-7c556f89c5-mp5fh 3/3 Running 0 60d\r\n",
"kube-system kube-proxy-k8t2c 1/1 Running 0 60d\r\n",
"kube-system kube-svc-redirect-z6ppp 1/1 Running 8 60d\r\n",
"kube-system kubernetes-dashboard-546f987686-8krxm 1/1 Running 2 60d\r\n",
"kube-system tunnelfront-695bcbdc68-t4l8t 1/1 Running 34 60d\r\n"
]
}
],
"source": [
"!kubectl get pods --all-namespaces"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy application"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below we define our Kubernetes manifest file for our service and load balancer. Note that we have to specify the volume mounts to the drivers that are located on the node."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"app_template = {\n",
" \"apiVersion\": \"apps/v1beta1\",\n",
" \"kind\": \"Deployment\",\n",
" \"metadata\": {\n",
" \"name\": \"azure-dl\"\n",
" },\n",
" \"spec\":{\n",
" \"replicas\":1,\n",
" \"template\":{\n",
" \"metadata\":{\n",
" \"labels\":{\n",
" \"app\":\"azure-dl\"\n",
" }\n",
" },\n",
" \"spec\":{\n",
" \"containers\":[\n",
" {\n",
" \"name\": \"azure-dl\",\n",
" \"image\": image_name,\n",
" \"env\":[\n",
" {\n",
" \"name\": \"LD_LIBRARY_PATH\",\n",
" \"value\": \"$LD_LIBRARY_PATH:/usr/local/nvidia/lib64:/opt/conda/envs/py3.5/lib\"\n",
" }\n",
" ],\n",
" \"ports\":[\n",
" {\n",
" \"containerPort\":80,\n",
" \"name\":\"model\"\n",
" }\n",
" ],\n",
" \"volumeMounts\":[\n",
" {\n",
" \"mountPath\":\"/usr/local/nvidia\",\n",
" \"name\": \"nvidia\",\n",
" }\n",
" ],\n",
" \"resources\":{\n",
" \"requests\":{\n",
" \"alpha.kubernetes.io/nvidia-gpu\": 1\n",
" },\n",
" \"limits\":{\n",
" \"alpha.kubernetes.io/nvidia-gpu\": 1\n",
" }\n",
" } \n",
" }\n",
" ],\n",
" \"volumes\":[\n",
" {\n",
" \"name\": \"nvidia\",\n",
" \"hostPath\":{\n",
" \"path\":\"/usr/local/nvidia\"\n",
" },\n",
" },\n",
" ]\n",
" }\n",
" }\n",
" }\n",
"}\n",
"\n",
"service_temp = {\n",
" \"apiVersion\": \"v1\",\n",
" \"kind\": \"Service\",\n",
" \"metadata\": {\n",
" \"name\": \"azure-dl\"\n",
" },\n",
" \"spec\":{\n",
" \"type\": \"LoadBalancer\",\n",
" \"ports\":[\n",
" {\n",
" \"port\":80\n",
" }\n",
" ],\n",
" \"selector\":{\n",
" \"app\":\"azure-dl\"\n",
" }\n",
" }\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"write_json_to_file(app_template, 'az-dl.json')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"write_json_to_file(service_temp, 'az-dl.json', mode='a')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check the manifest created."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\r\n",
" \"apiVersion\": \"apps/v1beta1\",\r\n",
" \"kind\": \"Deployment\",\r\n",
" \"metadata\": {\r\n",
" \"name\": \"azure-dl\"\r\n",
" },\r\n",
" \"spec\": {\r\n",
" \"replicas\": 1,\r\n",
" \"template\": {\r\n",
" \"metadata\": {\r\n",
" \"labels\": {\r\n",
" \"app\": \"azure-dl\"\r\n",
" }\r\n",
" },\r\n",
" \"spec\": {\r\n",
" \"containers\": [\r\n",
" {\r\n",
" \"env\": [\r\n",
" {\r\n",
" \"name\": \"LD_LIBRARY_PATH\",\r\n",
" \"value\": \"$LD_LIBRARY_PATH:/usr/local/nvidia/lib64:/opt/conda/envs/py3.5/lib\"\r\n",
" }\r\n",
" ],\r\n",
" \"image\": \"caia/kerastf-gpu\",\r\n",
" \"name\": \"azure-dl\",\r\n",
" \"ports\": [\r\n",
" {\r\n",
" \"containerPort\": 80,\r\n",
" \"name\": \"model\"\r\n",
" }\r\n",
" ],\r\n",
" \"resources\": {\r\n",
" \"limits\": {\r\n",
" \"alpha.kubernetes.io/nvidia-gpu\": 1\r\n",
" },\r\n",
" \"requests\": {\r\n",
" \"alpha.kubernetes.io/nvidia-gpu\": 1\r\n",
" }\r\n",
" },\r\n",
" \"volumeMounts\": [\r\n",
" {\r\n",
" \"mountPath\": \"/usr/local/nvidia\",\r\n",
" \"name\": \"nvidia\"\r\n",
" }\r\n",
" ]\r\n",
" }\r\n",
" ],\r\n",
" \"volumes\": [\r\n",
" {\r\n",
" \"hostPath\": {\r\n",
" \"path\": \"/usr/local/nvidia\"\r\n",
" },\r\n",
" \"name\": \"nvidia\"\r\n",
" }\r\n",
" ]\r\n",
" }\r\n",
" }\r\n",
" }\r\n",
"}\r\n",
"\r\n",
"{\r\n",
" \"apiVersion\": \"v1\",\r\n",
" \"kind\": \"Service\",\r\n",
" \"metadata\": {\r\n",
" \"name\": \"azure-dl\"\r\n",
" },\r\n",
" \"spec\": {\r\n",
" \"ports\": [\r\n",
" {\r\n",
" \"port\": 80\r\n",
" }\r\n",
" ],\r\n",
" \"selector\": {\r\n",
" \"app\": \"azure-dl\"\r\n",
" },\r\n",
" \"type\": \"LoadBalancer\"\r\n",
" }\r\n",
"}\r\n",
"\r\n"
]
}
],
"source": [
"!cat az-dl.json"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we will use kubectl create command to deploy our application."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"deployment.apps/azure-dl created\n",
"service/azure-dl created\n"
]
}
],
"source": [
"!kubectl create -f az-dl.json"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check if the pod is deployed. It may take as many as 10 minutes for the container to be ready."
]
},
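The cell below lists the pods once. As a convenience, a polling loop such as this sketch can wait until the azure-dl pod reports Running; it assumes kubectl is already configured for this cluster, and the app=azure-dl label comes from the manifest above.

import subprocess
import time

def wait_for_pod(label="app=azure-dl", timeout_s=600, interval_s=15):
    # Poll the pod phase until it is Running or the timeout expires.
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        phase = subprocess.run(
            ["kubectl", "get", "pods", "-l", label,
             "-o", "jsonpath={.items[0].status.phase}"],
            stdout=subprocess.PIPE, universal_newlines=True,
        ).stdout
        print("pod phase:", phase or "<not scheduled yet>")
        if phase == "Running":
            return True
        time.sleep(interval_s)
    return False

wait_for_pod()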
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAMESPACE NAME READY STATUS RESTARTS AGE\r\n",
"default azure-dl-5f6b7dfb6f-kbxz6 1/1 Running 0 6m\r\n",
"kube-system azureproxy-79c5db744-r5ggd 1/1 Running 2 60d\r\n",
"kube-system heapster-55f855b47-4m7xr 2/2 Running 0 60d\r\n",
"kube-system kube-dns-v20-7c556f89c5-4z4z6 3/3 Running 0 60d\r\n",
"kube-system kube-dns-v20-7c556f89c5-mp5fh 3/3 Running 0 60d\r\n",
"kube-system kube-proxy-k8t2c 1/1 Running 0 60d\r\n",
"kube-system kube-svc-redirect-z6ppp 1/1 Running 8 60d\r\n",
"kube-system kubernetes-dashboard-546f987686-8krxm 1/1 Running 2 60d\r\n",
"kube-system tunnelfront-695bcbdc68-t4l8t 1/1 Running 34 60d\r\n"
]
}
],
"source": [
"!kubectl get pods --all-namespaces"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If anything goes wrong you can use the commands below to observe the events on the node as well as review the logs."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LAST SEEN FIRST SEEN COUNT NAME KIND SUBOBJECT TYPE REASON SOURCE MESSAGE\r\n",
"46m 46m 1 aks-nodepool1-28016997-0.1548a985ff48b23b Node Normal RegisteredNode node-controller Node aks-nodepool1-28016997-0 event: Registered Node aks-nodepool1-28016997-0 in Controller\r\n",
"35m 35m 1 aks-nodepool1-28016997-0.1548aa2258f34dc5 Node Normal RegisteredNode node-controller Node aks-nodepool1-28016997-0 event: Registered Node aks-nodepool1-28016997-0 in Controller\r\n",
"6m 6m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abbc1c661966 Pod Normal Scheduled default-scheduler Successfully assigned azure-dl-5f6b7dfb6f-kbxz6 to aks-nodepool1-28016997-0\r\n",
"6m 6m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abbc256b0973 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-28016997-0 MountVolume.SetUp succeeded for volume \"nvidia\" \r\n",
"6m 6m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abbc2754e88a Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-28016997-0 MountVolume.SetUp succeeded for volume \"default-token-crgnj\" \r\n",
"4m 6m 2 azure-dl-5f6b7dfb6f-kbxz6.1548abbc5412d897 Pod spec.containers{azure-dl} Normal Pulling kubelet, aks-nodepool1-28016997-0 pulling image \"caia/kerastf-gpu\"\r\n",
"4m 4m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abd437671289 Pod spec.containers{azure-dl} Warning Failed kubelet, aks-nodepool1-28016997-0 Failed to pull image \"caia/kerastf-gpu\": rpc error: code = Canceled desc = context canceled\r\n",
"4m 4m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abd437675041 Pod spec.containers{azure-dl} Warning Failed kubelet, aks-nodepool1-28016997-0 Error: ErrImagePull\r\n",
"4m 4m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abd479665ad8 Pod Normal SandboxChanged kubelet, aks-nodepool1-28016997-0 Pod sandbox changed, it will be killed and re-created.\r\n",
"4m 4m 3 azure-dl-5f6b7dfb6f-kbxz6.1548abd4bccc3504 Pod spec.containers{azure-dl} Normal BackOff kubelet, aks-nodepool1-28016997-0 Back-off pulling image \"caia/kerastf-gpu\"\r\n",
"4m 4m 3 azure-dl-5f6b7dfb6f-kbxz6.1548abd4bccc6574 Pod spec.containers{azure-dl} Warning Failed kubelet, aks-nodepool1-28016997-0 Error: ImagePullBackOff\r\n",
"2m 2m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abf021a8ab22 Pod spec.containers{azure-dl} Normal Pulled kubelet, aks-nodepool1-28016997-0 Successfully pulled image \"caia/kerastf-gpu\"\r\n",
"2m 2m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abf02e88d586 Pod spec.containers{azure-dl} Normal Created kubelet, aks-nodepool1-28016997-0 Created container\r\n",
"2m 2m 1 azure-dl-5f6b7dfb6f-kbxz6.1548abf037241533 Pod spec.containers{azure-dl} Normal Started kubelet, aks-nodepool1-28016997-0 Started container\r\n",
"6m 6m 1 azure-dl-5f6b7dfb6f.1548abbc1bbcf974 ReplicaSet Normal SuccessfulCreate replicaset-controller Created pod: azure-dl-5f6b7dfb6f-kbxz6\r\n",
"6m 6m 1 azure-dl.1548abbc1aaaccda Deployment Normal ScalingReplicaSet deployment-controller Scaled up replica set azure-dl-5f6b7dfb6f to 1\r\n",
"6m 6m 1 azure-dl.1548abbc284ca303 Service Normal EnsuringLoadBalancer service-controller Ensuring load balancer\r\n",
"2m 2m 1 azure-dl.1548abeeedade8ad Service Normal EnsuredLoadBalancer service-controller Ensured load balancer\r\n"
]
}
],
"source": [
"!kubectl get events"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check the logs for the application pod."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"pod_json = !kubectl get pods -o json\n",
"pod_dict = json.loads(''.join(pod_json))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2018-08-07 17:58:41,382 CRIT Supervisor running as root (no user in config file)\r\n",
"2018-08-07 17:58:41,384 INFO supervisord started with pid 1\r\n",
"2018-08-07 17:58:42,387 INFO spawned: 'program_exit' with pid 9\r\n",
"2018-08-07 17:58:42,388 INFO spawned: 'nginx' with pid 10\r\n",
"2018-08-07 17:58:42,390 INFO spawned: 'gunicorn' with pid 11\r\n",
"2018-08-07 17:58:43,422 INFO success: program_exit entered RUNNING state, process has stayed up for > than 1 seconds (startsecs)\r\n",
"2018-08-07 17:58:44.007138: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\r\n",
"2018-08-07 17:58:44.191739: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1392] Found device 0 with properties: \r\n",
"name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235\r\n",
"pciBusID: ddde:00:00.0\r\n",
"totalMemory: 11.17GiB freeMemory: 11.10GiB\r\n",
"2018-08-07 17:58:44.191801: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1471] Adding visible gpu devices: 0\r\n",
"2018-08-07 17:58:44.533627: I tensorflow/core/common_runtime/gpu/gpu_device.cc:952] Device interconnect StreamExecutor with strength 1 edge matrix:\r\n",
"2018-08-07 17:58:44.533679: I tensorflow/core/common_runtime/gpu/gpu_device.cc:958] 0 \r\n",
"2018-08-07 17:58:44.533694: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 0: N \r\n",
"2018-08-07 17:58:44.533952: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1084] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 10761 MB memory) -> physical GPU (device: 0, name: Tesla K80, pci bus id: ddde:00:00.0, compute capability: 3.7)\r\n",
"2018-08-07 17:58:47,537 INFO success: nginx entered RUNNING state, process has stayed up for > than 5 seconds (startsecs)\r\n",
"2018-08-07 17:59:02,553 INFO success: gunicorn entered RUNNING state, process has stayed up for > than 20 seconds (startsecs)\r\n",
"Downloading data from https://github.com/adamcasson/resnet152/releases/download/v0.1/resnet152_weights_tf.h5\r\n",
"\r\n",
" 8192/243179624 [..............................] - ETA: 2s\r\n",
" 2670592/243179624 [..............................] - ETA: 4s\r\n",
" 8912896/243179624 [>.............................] - ETA: 2s\r\n",
" 16171008/243179624 [>.............................] - ETA: 2s\r\n",
" 23740416/243179624 [=>............................] - ETA: 1s\r\n",
" 32129024/243179624 [==>...........................] - ETA: 1s\r\n",
" 40280064/243179624 [===>..........................] - ETA: 1s\r\n",
" 48472064/243179624 [====>.........................] - ETA: 1s\r\n",
" 56614912/243179624 [=====>........................] - ETA: 1s\r\n",
" 64569344/243179624 [======>.......................] - ETA: 1s\r\n",
" 70533120/243179624 [=======>......................] - ETA: 1s\r\n",
" 74948608/243179624 [========>.....................] - ETA: 1s\r\n",
" 77963264/243179624 [========>.....................] - ETA: 1s\r\n",
" 83402752/243179624 [=========>....................] - ETA: 1s\r\n",
" 88875008/243179624 [=========>....................] - ETA: 1s\r\n",
" 95723520/243179624 [==========>...................] - ETA: 1s\r\n",
"101130240/243179624 [===========>..................] - ETA: 1s\r\n",
"106102784/243179624 [============>.................] - ETA: 1s\r\n",
"110903296/243179624 [============>.................] - ETA: 1s\r\n",
"116129792/243179624 [=============>................] - ETA: 1s\r\n",
"121176064/243179624 [=============>................] - ETA: 1s\r\n",
"126164992/243179624 [==============>...............] - ETA: 0s\r\n",
"130932736/243179624 [===============>..............] - ETA: 1s\r\n",
"137437184/243179624 [===============>..............] - ETA: 0s\r\n",
"144523264/243179624 [================>.............] - ETA: 0s\r\n",
"152428544/243179624 [=================>............] - ETA: 0s\r\n",
"158539776/243179624 [==================>...........] - ETA: 0s\r\n",
"163553280/243179624 [===================>..........] - ETA: 0s\r\n",
"168517632/243179624 [===================>..........] - ETA: 0s\r\n",
"173539328/243179624 [====================>.........] - ETA: 0s\r\n",
"178610176/243179624 [=====================>........] - ETA: 0s\r\n",
"183623680/243179624 [=====================>........] - ETA: 0s\r\n",
"188628992/243179624 [======================>.......] - ETA: 0s\r\n",
"193658880/243179624 [======================>.......] - ETA: 0s\r\n",
"196231168/243179624 [=======================>......] - ETA: 0s\r\n",
"203907072/243179624 [========================>.....] - ETA: 0s\r\n",
"210108416/243179624 [========================>.....] - ETA: 0s\r\n",
"215138304/243179624 [=========================>....] - ETA: 0s\r\n",
"220168192/243179624 [==========================>...] - ETA: 0s\r\n",
"225148928/243179624 [==========================>...] - ETA: 0s\r\n",
"230211584/243179624 [===========================>..] - ETA: 0s\r\n",
"235200512/243179624 [============================>.] - ETA: 0s\r\n",
"239239168/243179624 [============================>.] - ETA: 0s\r\n",
"243187712/243179624 [==============================] - 2s 0us/step\r\n",
"{\"path\": \"/code/driver.py\", \"message\": \"Model loading time: 34161.21 ms\", \"timestamp\": \"2018-08-07T17:59:18.129430Z\", \"logger\": \"model_driver\", \"host\": \"azure-dl-5f6b7dfb6f-kbxz6\", \"level\": \"INFO\", \"stack_info\": null, \"tags\": []}\r\n",
"Initialising\r\n",
"{\"msg\": \" * Running on %s://%s:%d/ %s\", \"path\": \"/opt/conda/envs/py3.5/lib/python3.5/site-packages/werkzeug/_internal.py\", \"message\": \" * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)\", \"timestamp\": \"2018-08-07T17:59:18.134555Z\", \"logger\": \"werkzeug\", \"host\": \"azure-dl-5f6b7dfb6f-kbxz6\", \"level\": \"INFO\", \"stack_info\": null, \"tags\": []}\r\n"
]
}
],
"source": [
"!kubectl logs {pod_dict['items'][0]['metadata']['name']}"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE\r\n",
"azure-dl 1 1 1 1 6m\r\n"
]
}
],
"source": [
"!kubectl get deployment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It can take a few minutes for the service to populate the EXTERNAL-IP field below. This will be the IP you use to call the service. You can also specify an IP to use, please see the AKS documentation for further details."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE\r\n",
"azure-dl LoadBalancer 10.0.86.30 40.117.74.122 80:31341/TCP 6m\r\n"
]
}
],
"source": [
"!kubectl get service azure-dl"
]
},
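To script against the service once the address appears, a polling helper along these lines can be used; this is a sketch that assumes kubectl is configured for the cluster and that the service is named azure-dl as in the manifest above.

import subprocess
import time

def get_external_ip(service="azure-dl", retries=40, interval_s=15):
    # Query the load balancer ingress until Azure assigns the public IP.
    for _ in range(retries):
        ip = subprocess.run(
            ["kubectl", "get", "service", service,
             "-o", "jsonpath={.status.loadBalancer.ingress[0].ip}"],
            stdout=subprocess.PIPE, universal_newlines=True,
        ).stdout
        if ip:
            return ip
        time.sleep(interval_s)
    raise RuntimeError("External IP was not assigned in time")

print("http://{}".format(get_external_ip()))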
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we will [test our web application deployed on AKS](05_TestWebApp.ipynb). Once we are done with all the notebooks in the tutorial, the instructions below can be used to delete the cluster and free up resources."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tear it all down"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once you are done with your cluster you can use the following two commands to destroy it all."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kubectl delete -f az-dl.json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az aks delete -n $aks_name -g $resource_group -y"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az group delete --name $resource_group -y"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:AKSDeploymentKeras]",
"language": "python",
"name": "conda-env-AKSDeploymentKeras-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}