Merge pull request #2 from Microsoft/mat_tf_gpu_dev

Merge Tensorflow Tutorial
This commit is contained in:
Mat 2018-03-31 21:59:22 +01:00 committed by GitHub
parent b1b219f699 b5f81cf170
commit fc3cf6110c
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 4347 additions and 27 deletions

View file

@@ -1,14 +1,34 @@
### Authors: Mathew Salvaris and Fidan Boylu Uz
# Deploy Deep Learning CNN on Kubernetes Cluster with GPUs
This repository contains a number of Jupyter notebook tutorials with step-by-step instructions on how to deploy a pretrained deep learning model on a GPU-enabled Kubernetes cluster. The tutorials cover how to deploy models from the following deep learning frameworks:
* [TensorFlow](Tensorflow)
* Keras (TensorFlow backend)
* PyTorch
![alt text](static/example.png "Example Classification")
For each framework we go through 7 steps:
* Model development where we load the pretrained model and test it by using it to score images
* Developing the interface our Flask app will use to load and call the model
* Building the Docker Image with our Flask REST API and model
* Testing our Docker image before deployment
* Creating our Kubernetes cluster and deploying our application to it
* Testing the deployed model
* Testing the throughput of our model
The application we will develop is a simple image classification service, where we will submit an image and get back what class the image belongs to.
If you already have a Docker image that you would like to deploy or you simply want to use the image we built you can skip the first four notebooks.
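
Once the service is deployed, scoring is a single HTTP POST. The sketch below is illustrative only: the external IP comes from your own cluster, the `/score` route is an assumption (the exact endpoint is defined in the Flask app notebooks), and `img_url_to_json` is the helper included in this repository.

```python
import requests
from testing_utilities import img_url_to_json

payload = img_url_to_json("https://example.com/cat.jpg")  # JSON with the image as a base64 string
r = requests.post("http://<EXTERNAL-IP>/score",           # hypothetical route; check the Flask app
                  data=payload, headers={"Content-Type": "application/json"})
print(r.json())
```
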
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com.
When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

File diff suppressed because one or more lines are too long

View file

@@ -0,0 +1,324 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Develop Model Driver\n",
"In this notebook we will develop the API that will call our model. We need it to initialise the model and transform the input from the Flask app so that it is in the appropriate format to call the model. We expect the input to be JSON that will have the image encoded as a base64 string. The code below uses the writefile magic to write the contents of the cell to the file driver.py"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting driver.py\n"
]
}
],
"source": [
"%%writefile driver.py\n",
"import base64\n",
"import json\n",
"import logging\n",
"import os\n",
"import timeit as t\n",
"from io import BytesIO\n",
"\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from PIL import Image, ImageOps\n",
"from tensorflow.contrib.slim.nets import resnet_v1\n",
"\n",
"_MODEL_FILE = os.getenv('MODEL_FILE', \"resnet_v1_152.ckpt\")\n",
"_LABEL_FILE = os.getenv('LABEL_FILE', \"synset.txt\")\n",
"_NUMBER_RESULTS = 3\n",
"\n",
"\n",
"def _create_label_lookup(label_path):\n",
" with open(label_path, 'r') as f:\n",
" label_list = [l.rstrip() for l in f]\n",
" \n",
" def _label_lookup(*label_locks):\n",
" return [label_list[l] for l in label_locks]\n",
" \n",
" return _label_lookup\n",
"\n",
"\n",
"def _load_tf_model(checkpoint_file):\n",
" # Placeholder\n",
" input_tensor = tf.placeholder(tf.float32, shape=(None,224,224,3), name='input_image')\n",
" \n",
" # Load the model\n",
" sess = tf.Session()\n",
" arg_scope = resnet_v1.resnet_arg_scope()\n",
" with tf.contrib.slim.arg_scope(arg_scope):\n",
" logits, _ = resnet_v1.resnet_v1_152(input_tensor, num_classes=1000, is_training=False)\n",
" probabilities = tf.nn.softmax(logits)\n",
" \n",
" saver = tf.train.Saver()\n",
" saver.restore(sess, checkpoint_file)\n",
" \n",
" def predict_for(image):\n",
" pred, pred_proba = sess.run([logits,probabilities], feed_dict={input_tensor: image})\n",
" return pred_proba\n",
" \n",
" return predict_for\n",
"\n",
"\n",
"def _base64img_to_numpy(base64_img_string):\n",
" if base64_img_string.startswith('b\\''):\n",
" base64_img_string = base64_img_string[2:-1]\n",
" base64Img = base64_img_string.encode('utf-8')\n",
"\n",
" # Preprocess the input data \n",
" startPreprocess = t.default_timer()\n",
" decoded_img = base64.b64decode(base64Img)\n",
" img_buffer = BytesIO(decoded_img)\n",
"\n",
" # Load image with PIL (RGB)\n",
" pil_img = Image.open(img_buffer).convert('RGB')\n",
" pil_img = ImageOps.fit(pil_img, (224, 224), Image.ANTIALIAS)\n",
" return np.array(pil_img, dtype=np.float32)\n",
"\n",
"\n",
"def create_scoring_func(model_path=_MODEL_FILE, label_path=_LABEL_FILE):\n",
" logger = logging.getLogger(\"model_driver\")\n",
" \n",
" start = t.default_timer()\n",
" labels_for = _create_label_lookup(label_path)\n",
" predict_for = _load_tf_model(model_path)\n",
" end = t.default_timer()\n",
"\n",
" loadTimeMsg = \"Model loading time: {0} ms\".format(round((end-start)*1000, 2))\n",
" logger.info(loadTimeMsg)\n",
" \n",
" def call_model(image_array, number_results=_NUMBER_RESULTS):\n",
" pred_proba = predict_for(image_array).squeeze()\n",
" selected_results = np.flip(np.argsort(pred_proba), 0)[:number_results]\n",
" labels = labels_for(*selected_results)\n",
" return list(zip(labels, pred_proba[selected_results].astype(np.float64)))\n",
" return call_model\n",
"\n",
"\n",
"def get_model_api():\n",
" logger = logging.getLogger(\"model_driver\")\n",
" scoring_func = create_scoring_func()\n",
" \n",
" def process_and_score(inputString, number_results=_NUMBER_RESULTS):\n",
" start = t.default_timer()\n",
"\n",
" images = json.loads(inputString)\n",
" result = []\n",
" totalPreprocessTime = 0\n",
" totalEvalTime = 0\n",
" totalResultPrepTime = 0\n",
"\n",
" for base64_img_string in images:\n",
" rgb_image = _base64img_to_numpy(base64_img_string)\n",
" batch_image = np.expand_dims(rgb_image, 0)\n",
" result = scoring_func(batch_image, number_results=_NUMBER_RESULTS)\n",
" \n",
" end = t.default_timer()\n",
"\n",
" logger.info(\"Predictions: {0}\".format(result))\n",
" logger.info(\"Predictions took {0} ms\".format(round((end-start)*1000, 2)))\n",
" return (result, 'Computed in {0} ms'.format(round((end-start)*1000, 2)))\n",
" return process_and_score\n",
"\n",
"def version():\n",
" return tf.__version__\n",
" "
]
},
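{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sketch of the payload the driver expects (a hand-built example with placeholder bytes, not a real image): the Flask app hands `process_and_score` the value of the `input` field, which is itself a JSON-encoded list of base64 image strings, exactly as produced by `img_url_to_json` in testing_utilities.py."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import base64\n",
"import json\n",
"\n",
"# Placeholder bytes stand in for a real image; this only illustrates the shape of the payload\n",
"fake_image = base64.b64encode(b'...image bytes...').decode('utf-8')\n",
"example_input = json.dumps([fake_image])  # a JSON-encoded list of base64 strings\n",
"print(example_input)"
]
},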
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import logging"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"logging.basicConfig(level=logging.DEBUG)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We run the file driver.py which will be everything into the context of the notebook."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%run driver.py"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will use the same Lynx image we used ealier to check that our driver works as expected."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from testing_utilities import img_url_to_json"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"IMAGEURL = \"https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Lynx_lynx_poing.jpg/220px-Lynx_lynx_poing.jpg\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"jsonimg = img_url_to_json(IMAGEURL)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"json_lod= json.loads(jsonimg)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"INFO:tensorflow:Restoring parameters from resnet_v1_152.ckpt\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:tensorflow:Restoring parameters from resnet_v1_152.ckpt\n",
"INFO:model_driver:Model loading time: 12204.41 ms\n"
]
}
],
"source": [
"predict_for = get_model_api()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"DEBUG:PIL.PngImagePlugin:STREAM b'IHDR' 16 13\n",
"DEBUG:PIL.PngImagePlugin:STREAM b'iCCP' 41 292\n",
"DEBUG:PIL.PngImagePlugin:iCCP profile name b'ICC Profile'\n",
"DEBUG:PIL.PngImagePlugin:Compression method 0\n",
"DEBUG:PIL.PngImagePlugin:STREAM b'IDAT' 345 65536\n",
"DEBUG:model_driver:********DEBUG************\n",
"INFO:model_driver:Predictions: [('n02127052 lynx, catamount', 0.99745172262191772), ('n02128385 leopard, Panthera pardus', 0.0015076899435371161), ('n02128757 snow leopard, ounce, Panthera uncia', 0.00051647447980940342)]\n",
"INFO:model_driver:Predictions took 925.01 ms\n"
]
}
],
"source": [
"output = predict_for(json_lod['input'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The output of our prediction function is JSON that will be returned to our Flask app. It looks like our model predicted Lynx with over 99% probability."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"json.dumps(output)"
]
},
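{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sketch of what gets serialised (values will vary): the result is a two-element structure holding the top-N `(label, probability)` pairs and a timing message, e.g. `[[[\"n02127052 lynx, catamount\", 0.997], ...], \"Computed in 925.01 ms\"]`."
]
},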
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can move onto [building our docker image](02_BuildImage.ipynb)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.5",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@@ -0,0 +1,856 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### Deploy Web App on Azure Container Services (AKS)\n",
"In this notebook we will set up an Azure Container Service which will be managed by Kubernetes. We will then take the Docker image we created earlier that contains our app and deploy it to the ACS cluster. Then we will check everything is working by sending an image to it and getting it scored.\n",
"\n",
"The process is split into the following steps:\n",
"* [Define our resource names](#section1)\n",
"* [Login to Azure](#section2)\n",
"* [Create the ACS](#section3)\n",
"* [Create a tunnel to the head node](#section4)\n",
"* [Create a JSON schema of our APP and push it to the cluster](#section5)\n",
"* [Test our app](TestWebApp.ipynb)\n",
"* [Tear it all down](#section7)\n",
"\n",
"This guide assumes is designed to be run on linux and requires that the Azure CLI is installed."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='section1'></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"Below are the various name definitions for the resources needed to setup ACS as well as the name of the Docker image we will be using."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Some outputs (and inputs) below have been hidden/masked for confidentiality**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Please modify the below as you see fit\n",
"resource_group = \"<RESOURCE_GROUP>\" \n",
"aks_name = \"<AKS_CLUSTER_NAME>\"\n",
"location = \"eastus\"\n",
"\n",
"image_name = '<YOUR_DOCKER_IMAGE>' # 'masalvar/tfresnet-gpu' Feel free to use this Image if you want to \n",
" # skip creating your own container\n",
"selected_subscription = \"'<YOUR SUBSCRIPTION>'\" # If you have multiple subscriptions select \n",
" # the subscription you want to use here"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"<a id='section2'></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Azure account login\n",
"The command below will initiate a login to your Azure account. It will pop up with an url to go to where you will enter a one off code and log into your Azure account using your browser."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"!az login -o table"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"!az account set --subscription $selected_subscription"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az account show"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mRegistering is still on-going. You can monitor using 'az provider show -n Microsoft.ContainerService'\u001b[0m\r\n"
]
}
],
"source": [
"!az provider register -n Microsoft.ContainerService"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='section3'></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create resources and dependencies"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create resource group\n",
"Azure encourages the use of groups to organise all the Azure components you deploy. That way it is easier to find them but also we can deleted a number of resources simply by deleting the group."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\r\n",
" \"id\": \"/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/msaksrg\",\r\n",
" \"location\": \"eastus\",\r\n",
" \"managedBy\": null,\r\n",
" \"name\": \"msaksrg\",\r\n",
" \"properties\": {\r\n",
" \"provisioningState\": \"Succeeded\"\r\n",
" },\r\n",
" \"tags\": null\r\n",
"}\r\n"
]
}
],
"source": [
"!az group create --name $resource_group --location $location"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K{- Finished ..\n",
" \"additionalProperties\": {},\n",
" \"agentPoolProfiles\": [\n",
" {\n",
" \"additionalProperties\": {},\n",
" \"count\": 1,\n",
" \"dnsPrefix\": null,\n",
" \"fqdn\": null,\n",
" \"name\": \"nodepool1\",\n",
" \"osDiskSizeGb\": null,\n",
" \"osType\": \"Linux\",\n",
" \"ports\": null,\n",
" \"storageProfile\": \"ManagedDisks\",\n",
" \"vmSize\": \"Standard_NC6\",\n",
" \"vnetSubnetId\": null\n",
" }\n",
" ],\n",
" \"dnsPrefix\": \"msAKSTFClu-msaksrg-edf507\",\n",
" \"fqdn\": \"msakstfclu-msaksrg-edf507-1f197d36.hcp.eastus.azmk8s.io\",\n",
" \"id\": \"/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourcegroups/msaksrg/providers/Microsoft.ContainerService/managedClusters/msAKSTFCluster\",\n",
" \"kubernetesVersion\": \"1.7.9\",\n",
" \"linuxProfile\": {\n",
" \"additionalProperties\": {},\n",
" \"adminUsername\": \"azureuser\",\n",
" \"ssh\": {\n",
" \"additionalProperties\": {},\n",
" \"publicKeys\": [\n",
" {\n",
" \"additionalProperties\": {},\n",
" \"keyData\": \"ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDVfKBWPBKS84wluD3DJ0t3hepO2F13pz1VI5d4c7Tn4d80rSKJkF2L2HtAf3w9R7TM5TYcSlMqv+OFtB5iwfMk1k8sarGqmB1aLuEYBD60cqtdWD34DPWz8Y4eQ7x8eQ2joVRgMFpv+SfEuPBaQdTM7QtFiWRA1ZioXElyniL2Snhsd/ICcq5SIcZSPj3z9/eUcKGz/eImLkOYU28l8fLpVg48x70rGOtpfmDmZJ3KT/LImDWbPFF4VIRuiki4qVaMvDvwlEB7BmqM5D8qO7tOM3ncZ3TqUhrSQj9NbeC65xvB83+BiZts63VXAsMLnu+0wbAXnA4W66ly/5UyjC//\"\n",
" }\n",
" ]\n",
" }\n",
" },\n",
" \"location\": \"eastus\",\n",
" \"name\": \"msAKSTFCluster\",\n",
" \"provisioningState\": \"Succeeded\",\n",
" \"resourceGroup\": \"msaksrg\",\n",
" \"servicePrincipalProfile\": {\n",
" \"additionalProperties\": {},\n",
" \"clientId\": \"44ba57c3-4386-4788-a761-8d72faa493e2\",\n",
" \"keyVaultSecretRef\": null,\n",
" \"secret\": null\n",
" },\n",
" \"tags\": null,\n",
" \"type\": \"Microsoft.ContainerService/ManagedClusters\"\n",
"}\n",
"\u001b[0m"
]
}
],
"source": [
"!az aks create --resource-group $resource_group --name $aks_name --node-count 1 --generate-ssh-keys -s Standard_NC6"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Install cli from comamnd prompt"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mDownloading client to /usr/local/bin/kubectl from https://storage.googleapis.com/kubernetes-release/release/v1.10.0/bin/linux/amd64/kubectl\u001b[0m\n",
"\u001b[33mPlease ensure that /usr/local/bin is in your search PATH, so the `kubectl` command can be found.\u001b[0m\n"
]
}
],
"source": [
"!sudo az aks install-cli"
]
},
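{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional sanity check (a suggestion, not part of the original walkthrough), confirm that kubectl is now on your PATH:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"!kubectl version --client"
]
},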
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below we define our manifest file for our service and load balancer. Note that we have to specify the volume mounts to the drivers that are located on the node."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"app_template = {\n",
" \"apiVersion\": \"apps/v1beta1\",\n",
" \"kind\": \"Deployment\",\n",
" \"metadata\": {\n",
" \"name\": \"azure-dl\"\n",
" },\n",
" \"spec\":{\n",
" \"replicas\":1,\n",
" \"template\":{\n",
" \"metadata\":{\n",
" \"labels\":{\n",
" \"app\":\"azure-dl\"\n",
" }\n",
" },\n",
" \"spec\":{\n",
" \"containers\":[\n",
" {\n",
" \"name\": \"azure-dl\",\n",
" \"image\": \"masalvar/tfresnet-gpu\",\n",
" \"env\":[\n",
" {\n",
" \"name\": \"LD_LIBRARY_PATH\",\n",
" \"value\": \"$LD_LIBRARY_PATH:/usr/local/nvidia/lib64:/opt/conda/envs/py3.6/lib\"\n",
" }\n",
" ],\n",
" \"ports\":[\n",
" {\n",
" \"containerPort\":80,\n",
" \"name\":\"model\"\n",
" }\n",
" ],\n",
" \"volumeMounts\":[\n",
" {\n",
" \"name\": \"bin\",\n",
" \"mountPath\":\"/usr/local/nvidia/bin\" \n",
" },\n",
" {\n",
" \"name\": \"lib\",\n",
" \"mountPath\":\"/usr/local/nvidia/lib64\" \n",
" },\n",
" {\n",
" \"name\": \"libcuda\",\n",
" \"mountPath\":\"/usr/lib/x86_64-linux-gnu/libcuda.so.1\" \n",
" },\n",
" ],\n",
" \"resources\":{\n",
" \"requests\":{\n",
" \"alpha.kubernetes.io/nvidia-gpu\": 1\n",
" },\n",
" \"limits\":{\n",
" \"alpha.kubernetes.io/nvidia-gpu\": 1\n",
" }\n",
" } \n",
" }\n",
" ],\n",
" \"volumes\":[\n",
" {\n",
" \"name\": \"bin\",\n",
" \"hostPath\":{\n",
" \"path\":\"/usr/lib/nvidia-384/bin\"\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"lib\",\n",
" \"hostPath\":{\n",
" \"path\":\"/usr/lib/nvidia-384\"\n",
" },\n",
" },\n",
" {\n",
" \"name\": \"libcuda\",\n",
" \"hostPath\":{\n",
" \"path\":\"/usr/lib/x86_64-linux-gnu/libcuda.so.1\"\n",
" },\n",
" },\n",
" ]\n",
" }\n",
" }\n",
" }\n",
"}\n",
"\n",
"service_temp = {\n",
" \"apiVersion\": \"v1\",\n",
" \"kind\": \"Service\",\n",
" \"metadata\": {\n",
" \"name\": \"azure-dl\"\n",
" },\n",
" \"spec\":{\n",
" \"type\": \"LoadBalancer\",\n",
" \"ports\":[\n",
" {\n",
" \"port\":80\n",
" }\n",
" ],\n",
" \"selector\":{\n",
" \"app\":\"azure-dl\"\n",
" }\n",
" }\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def write_json_to_file(json_dict, filename, mode='w'):\n",
" with open(filename, mode) as outfile:\n",
" json.dump(json_dict, outfile, indent=4,sort_keys=True)\n",
" outfile.write('\\n\\n')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"write_json_to_file(app_template, 'az-dl.json') # We write the service template to the json file"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"write_json_to_file(service_temp, 'az-dl.json', mode='a') # We add the loadbelanacer template to the json file"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\r\n",
" \"apiVersion\": \"apps/v1beta1\",\r\n",
" \"kind\": \"Deployment\",\r\n",
" \"metadata\": {\r\n",
" \"name\": \"azure-dl\"\r\n",
" },\r\n",
" \"spec\": {\r\n",
" \"replicas\": 1,\r\n",
" \"template\": {\r\n",
" \"metadata\": {\r\n",
" \"labels\": {\r\n",
" \"app\": \"azure-dl\"\r\n",
" }\r\n",
" },\r\n",
" \"spec\": {\r\n",
" \"containers\": [\r\n",
" {\r\n",
" \"env\": [\r\n",
" {\r\n",
" \"name\": \"LD_LIBRARY_PATH\",\r\n",
" \"value\": \"$LD_LIBRARY_PATH:/usr/local/nvidia/lib64:/opt/conda/envs/py3.6/lib\"\r\n",
" }\r\n",
" ],\r\n",
" \"image\": \"masalvar/tfresnet-gpu\",\r\n",
" \"name\": \"azure-dl\",\r\n",
" \"ports\": [\r\n",
" {\r\n",
" \"containerPort\": 80,\r\n",
" \"name\": \"model\"\r\n",
" }\r\n",
" ],\r\n",
" \"resources\": {\r\n",
" \"limits\": {\r\n",
" \"alpha.kubernetes.io/nvidia-gpu\": 1\r\n",
" },\r\n",
" \"requests\": {\r\n",
" \"alpha.kubernetes.io/nvidia-gpu\": 1\r\n",
" }\r\n",
" },\r\n",
" \"volumeMounts\": [\r\n",
" {\r\n",
" \"mountPath\": \"/usr/local/nvidia/bin\",\r\n",
" \"name\": \"bin\"\r\n",
" },\r\n",
" {\r\n",
" \"mountPath\": \"/usr/local/nvidia/lib64\",\r\n",
" \"name\": \"lib\"\r\n",
" },\r\n",
" {\r\n",
" \"mountPath\": \"/usr/lib/x86_64-linux-gnu/libcuda.so.1\",\r\n",
" \"name\": \"libcuda\"\r\n",
" }\r\n",
" ]\r\n",
" }\r\n",
" ],\r\n",
" \"volumes\": [\r\n",
" {\r\n",
" \"hostPath\": {\r\n",
" \"path\": \"/usr/lib/nvidia-384/bin\"\r\n",
" },\r\n",
" \"name\": \"bin\"\r\n",
" },\r\n",
" {\r\n",
" \"hostPath\": {\r\n",
" \"path\": \"/usr/lib/nvidia-384\"\r\n",
" },\r\n",
" \"name\": \"lib\"\r\n",
" },\r\n",
" {\r\n",
" \"hostPath\": {\r\n",
" \"path\": \"/usr/lib/x86_64-linux-gnu/libcuda.so.1\"\r\n",
" },\r\n",
" \"name\": \"libcuda\"\r\n",
" }\r\n",
" ]\r\n",
" }\r\n",
" }\r\n",
" }\r\n",
"}\r\n",
"\r\n",
"{\r\n",
" \"apiVersion\": \"v1\",\r\n",
" \"kind\": \"Service\",\r\n",
" \"metadata\": {\r\n",
" \"name\": \"azure-dl\"\r\n",
" },\r\n",
" \"spec\": {\r\n",
" \"ports\": [\r\n",
" {\r\n",
" \"port\": 80\r\n",
" }\r\n",
" ],\r\n",
" \"selector\": {\r\n",
" \"app\": \"azure-dl\"\r\n",
" },\r\n",
" \"type\": \"LoadBalancer\"\r\n",
" }\r\n",
"}\r\n",
"\r\n"
]
}
],
"source": [
"!cat az-dl.json"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Merged \"msAKSTFCluster\" as current context in /home/mat/.kube/config\r\n"
]
}
],
"source": [
"!az aks get-credentials --resource-group=$resource_group --name=$aks_name"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAME STATUS ROLES AGE VERSION\r\n",
"aks-nodepool1-27496346-0 Ready agent 2m v1.7.9\r\n"
]
}
],
"source": [
"!kubectl get nodes"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAMESPACE NAME READY STATUS RESTARTS AGE\r\n",
"kube-system heapster-2574232661-07lzh 2/2 Running 0 1m\r\n",
"kube-system kube-dns-v20-2253765213-730n6 3/3 Running 0 2m\r\n",
"kube-system kube-dns-v20-2253765213-m9d9q 3/3 Running 0 2m\r\n",
"kube-system kube-proxy-3d25d 1/1 Running 0 2m\r\n",
"kube-system kube-svc-redirect-psp3n 1/1 Running 0 2m\r\n",
"kube-system kubernetes-dashboard-2898242510-7h28r 1/1 Running 0 2m\r\n",
"kube-system tunnelfront-527646831-lj63z 1/1 Running 0 2m\r\n"
]
}
],
"source": [
"!kubectl get pods --all-namespaces"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This command will create everything we specified in the az-dl.json manifest file."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"deployment.apps \"azure-dl\" created\n",
"service \"azure-dl\" created\n"
]
}
],
"source": [
"!kubectl create -f az-dl.json"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"After a few seconds you should see the pod start running on the cluster."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAMESPACE NAME READY STATUS RESTARTS AGE\r\n",
"default azure-dl-3880299103-v5mb7 1/1 Running 0 4m\r\n",
"kube-system heapster-2574232661-07lzh 2/2 Running 0 5m\r\n",
"kube-system kube-dns-v20-2253765213-730n6 3/3 Running 0 6m\r\n",
"kube-system kube-dns-v20-2253765213-m9d9q 3/3 Running 0 6m\r\n",
"kube-system kube-proxy-3d25d 1/1 Running 0 6m\r\n",
"kube-system kube-svc-redirect-psp3n 1/1 Running 0 6m\r\n",
"kube-system kubernetes-dashboard-2898242510-7h28r 1/1 Running 0 6m\r\n",
"kube-system tunnelfront-527646831-lj63z 1/1 Running 0 6m\r\n"
]
}
],
"source": [
"!kubectl get pods --all-namespaces"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If anything goes wrong you can use the commands below to observe the events on the node as well as review the logs."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"LAST SEEN FIRST SEEN COUNT NAME KIND SUBOBJECT TYPE REASON SOURCE MESSAGE\r\n",
"9m 9m 1 aks-nodepool1-27496346-0.1520fb005972710f Node Normal Starting kubelet, aks-nodepool1-27496346-0 Starting kubelet.\r\n",
"7m 8m 3 aks-nodepool1-27496346-0.1520fb08e765af3d Node Normal NodeHasSufficientDisk kubelet, aks-nodepool1-27496346-0 Node aks-nodepool1-27496346-0 status is now: NodeHasSufficientDisk\r\n",
"7m 8m 3 aks-nodepool1-27496346-0.1520fb08e7663219 Node Normal NodeHasSufficientMemory kubelet, aks-nodepool1-27496346-0 Node aks-nodepool1-27496346-0 status is now: NodeHasSufficientMemory\r\n",
"7m 8m 3 aks-nodepool1-27496346-0.1520fb08e7665b1e Node Normal NodeHasNoDiskPressure kubelet, aks-nodepool1-27496346-0 Node aks-nodepool1-27496346-0 status is now: NodeHasNoDiskPressure\r\n",
"55s 8m 9 aks-nodepool1-27496346-0.1520fb08e780a4eb Node Warning FailedNodeAllocatableEnforcement kubelet, aks-nodepool1-27496346-0 Failed to update Node Allocatable Limits \"\": failed to set supported cgroup subsystems for cgroup : Failed to set config for supported subsystems : failed to write 59076296704 to memory.limit_in_bytes: write /var/lib/docker/overlay2/daad1bc683430e39749de19537b2702c53db1f36ba866537b8f76687375c368f/merged/sys/fs/cgroup/memory/memory.limit_in_bytes: invalid argument\r\n",
"6m 6m 1 aks-nodepool1-27496346-0.1520fb2740f627b6 Node Normal RegisteredNode controllermanager Node aks-nodepool1-27496346-0 event: Registered Node aks-nodepool1-27496346-0 in NodeController\r\n",
"6m 6m 1 aks-nodepool1-27496346-0.1520fb29877c82d1 Node Normal Starting kube-proxy, aks-nodepool1-27496346-0 Starting kube-proxy.\r\n",
"6m 6m 1 aks-nodepool1-27496346-0.1520fb2d38a1c12c Node Normal NodeReady kubelet, aks-nodepool1-27496346-0 Node aks-nodepool1-27496346-0 status is now: NodeReady\r\n",
"4m 4m 1 azure-dl-3880299103-v5mb7.1520fb46d4fdc9fa Pod Normal Scheduled default-scheduler Successfully assigned azure-dl-3880299103-v5mb7 to aks-nodepool1-27496346-0\r\n",
"4m 4m 1 azure-dl-3880299103-v5mb7.1520fb46e1cd117d Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"bin\" \r\n",
"4m 4m 1 azure-dl-3880299103-v5mb7.1520fb46e1cf3b05 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"libcuda\" \r\n",
"4m 4m 1 azure-dl-3880299103-v5mb7.1520fb46e1cf86ce Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"lib\" \r\n",
"4m 4m 1 azure-dl-3880299103-v5mb7.1520fb46e2516335 Pod Normal SuccessfulMountVolume kubelet, aks-nodepool1-27496346-0 MountVolume.SetUp succeeded for volume \"default-token-thxzk\" \r\n",
"4m 4m 1 azure-dl-3880299103-v5mb7.1520fb47102b3c32 Pod spec.containers{azure-dl} Normal Pulling kubelet, aks-nodepool1-27496346-0 pulling image \"masalvar/tfresnet-gpu\"\r\n",
"1m 1m 1 azure-dl-3880299103-v5mb7.1520fb73ea97742a Pod spec.containers{azure-dl} Normal Pulled kubelet, aks-nodepool1-27496346-0 Successfully pulled image \"masalvar/tfresnet-gpu\"\r\n",
"1m 1m 1 azure-dl-3880299103-v5mb7.1520fb75bccdb1f5 Pod spec.containers{azure-dl} Normal Created kubelet, aks-nodepool1-27496346-0 Created container\r\n",
"1m 1m 1 azure-dl-3880299103-v5mb7.1520fb76711f3dd5 Pod spec.containers{azure-dl} Normal Started kubelet, aks-nodepool1-27496346-0 Started container\r\n",
"4m 4m 1 azure-dl-3880299103.1520fb46d46b36d8 ReplicaSet Normal SuccessfulCreate replicaset-controller Created pod: azure-dl-3880299103-v5mb7\r\n",
"4m 4m 1 azure-dl.1520fb46d294f3d3 Deployment Normal ScalingReplicaSet deployment-controller Scaled up replica set azure-dl-3880299103 to 1\r\n",
"4m 4m 1 azure-dl.1520fb46d8ebdb8a Service Normal CreatingLoadBalancer service-controller Creating load balancer\r\n",
"2m 2m 1 azure-dl.1520fb66b2965ba7 Service Normal CreatedLoadBalancer service-controller Created load balancer\r\n"
]
}
],
"source": [
"!kubectl get events"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2018-03-31 10:45:57,344 CRIT Supervisor running as root (no user in config file)\r\n",
"2018-03-31 10:45:57,346 INFO supervisord started with pid 7\r\n",
"2018-03-31 10:45:58,348 INFO spawned: 'program_exit' with pid 15\r\n",
"2018-03-31 10:45:58,349 INFO spawned: 'nginx' with pid 16\r\n",
"2018-03-31 10:45:58,351 INFO spawned: 'gunicorn' with pid 17\r\n",
"2018-03-31 10:45:59,380 INFO success: program_exit entered RUNNING state, process has stayed up for > than 1 seconds (startsecs)\r\n",
"2018-03-31 10:45:59.971916: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA\r\n",
"2018-03-31 10:46:03,977 INFO success: nginx entered RUNNING state, process has stayed up for > than 5 seconds (startsecs)\r\n",
"2018-03-31 10:46:11.453255: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: \r\n",
"name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235\r\n",
"pciBusID: cff2:00:00.0\r\n",
"totalMemory: 11.17GiB freeMemory: 11.10GiB\r\n",
"2018-03-31 10:46:11.453299: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: Tesla K80, pci bus id: cff2:00:00.0, compute capability: 3.7)\r\n",
"INFO:tensorflow:Restoring parameters from resnet_v1_152.ckpt\r\n",
"{\"timestamp\": \"2018-03-31T10:46:17.203847Z\", \"level\": \"INFO\", \"stack_info\": null, \"host\": \"azure-dl-3880299103-v5mb7\", \"message\": \"Restoring parameters from resnet_v1_152.ckpt\", \"logger\": \"tensorflow\", \"msg\": \"Restoring parameters from %s\", \"tags\": [], \"path\": \"/opt/conda/envs/py3.5/lib/python3.5/site-packages/tensorflow/python/platform/tf_logging.py\"}\r\n",
"{\"timestamp\": \"2018-03-31T10:46:19.060001Z\", \"level\": \"INFO\", \"stack_info\": null, \"host\": \"azure-dl-3880299103-v5mb7\", \"message\": \"Model loading time: 19089.38 ms\", \"logger\": \"model_driver\", \"tags\": [], \"path\": \"/code/driver.py\"}\r\n",
"2018-03-31 10:46:19,060 INFO success: gunicorn entered RUNNING state, process has stayed up for > than 20 seconds (startsecs)\r\n",
"Initialising\r\n",
"{\"timestamp\": \"2018-03-31T10:46:19.065300Z\", \"level\": \"INFO\", \"stack_info\": null, \"host\": \"azure-dl-3880299103-v5mb7\", \"message\": \" * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)\", \"logger\": \"werkzeug\", \"msg\": \" * Running on %s://%s:%d/ %s\", \"tags\": [], \"path\": \"/opt/conda/envs/py3.5/lib/python3.5/site-packages/werkzeug/_internal.py\"}\r\n"
]
}
],
"source": [
"pod_json = !kubectl get pods -o json\n",
"pod_dict = json.loads(''.join(pod_json))\n",
"!kubectl logs {pod_dict['items'][0]['metadata']['name']}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It can take a few minutes for the service to populate the EXTERNAL-IP field. This will be the IP you use to call the service. You can also specify an IP to use please see the AKS documentation for further details."
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE\r\n",
"azure-dl LoadBalancer 10.0.204.221 40.71.172.160 80:32567/TCP 11m\r\n"
]
}
],
"source": [
"!kubectl get service azure-dl"
]
},
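{
"cell_type": "markdown",
"metadata": {},
"source": [
"With the EXTERNAL-IP in hand we can already send the service a quick request. The cell below is a sketch only: replace the IP with your own, and note that the scoring route shown here is an assumption (the exact route is defined in the Flask app); `img_url_to_json` comes from testing_utilities.py."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import requests\n",
"from testing_utilities import img_url_to_json\n",
"\n",
"scoring_url = 'http://40.71.172.160/score'  # hypothetical route on the external IP above\n",
"jsonimg = img_url_to_json('https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Lynx_lynx_poing.jpg/220px-Lynx_lynx_poing.jpg')\n",
"r = requests.post(scoring_url, data=jsonimg, headers={'Content-Type': 'application/json'})\n",
"print(r.json())"
]
},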
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now that we have our deployed service we can move onto [testing it](05_TestWebApp.ipynb) \n",
"Below are the instructions to tear everything down once we are done with the cluster"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"<a id='section7'></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tear it all down \n",
"Once you are done with your cluster you can use the following two commands to destroy it all."
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"deployment \"azure-dl\" deleted\n",
"service \"azure-dl\" deleted\n"
]
}
],
"source": [
"!kubectl delete -f az-dl.json"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K\u001b[0minished .."
]
}
],
"source": [
"!az aks delete -n $aks_name -g $resource_group -y"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K\u001b[0minished .."
]
}
],
"source": [
"!az group delete --name $resource_group -y"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python 3.5",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Tensorflow/README.md Normal file
View file

@@ -0,0 +1 @@
# Deploy ResNet 152 TensorFlow model on a GPU-enabled Kubernetes cluster

View file

@@ -0,0 +1,92 @@
import base64
import json
import urllib.request
from io import BytesIO

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import toolz
from PIL import Image, ImageOps


def read_image_from(url):
    # Download the image and return it as an in-memory bytes buffer
    return toolz.pipe(url,
                      urllib.request.urlopen,
                      lambda x: x.read(),
                      BytesIO)


def to_rgb(img_bytes):
    return Image.open(img_bytes).convert('RGB')


@toolz.curry
def resize(img_file, new_size=(100, 100)):
    return ImageOps.fit(img_file, new_size, Image.ANTIALIAS)


def to_base64(img):
    imgio = BytesIO()
    img.save(imgio, 'PNG')
    imgio.seek(0)
    dataimg = base64.b64encode(imgio.read())
    return dataimg.decode('utf-8')


def to_img(img_url):
    return toolz.pipe(img_url,
                      read_image_from,
                      to_rgb,
                      resize(new_size=(224, 224)))


def img_url_to_json(url):
    # Fetch, resize and base64-encode the image, then wrap it in the JSON
    # payload the scoring service expects
    img_data = toolz.pipe(url,
                          to_img,
                          to_base64)
    return json.dumps({'input': '[\"{0}\"]'.format(img_data)})


def _plot_image(ax, img):
    ax.imshow(to_img(img))
    ax.tick_params(axis='both',
                   which='both',
                   bottom='off',
                   top='off',
                   left='off',
                   right='off',
                   labelleft='off',
                   labelbottom='off')
    return ax


def _plot_prediction_bar(ax, r):
    perf = list(c[1] for c in r.json()['result'][0])
    ax.barh(range(3, 0, -1), perf, align='center', color='#55DD55')
    ax.tick_params(axis='both',
                   which='both',
                   bottom='off',
                   top='off',
                   left='off',
                   right='off',
                   labelbottom='off')
    tick_labels = reversed(list(' '.join(c[0].split()[1:]).split(',')[0] for c in r.json()['result'][0]))
    ax.yaxis.set_ticks([1, 2, 3])
    ax.yaxis.set_ticklabels(tick_labels, position=(0.5, 0), minor=False, horizontalalignment='center')


def plot_predictions(images, classification_results):
    # Plot a 2x3 grid of images, each with a bar chart of its top-3 predictions
    if len(images) != 6:
        raise Exception('This method is only designed for 6 images')
    gs = gridspec.GridSpec(2, 3)
    fig = plt.figure(figsize=(12, 9))
    gs.update(hspace=0.1, wspace=0.001)
    for gg, r, img in zip(gs, classification_results, images):
        gg2 = gridspec.GridSpecFromSubplotSpec(4, 10, subplot_spec=gg)
        ax = fig.add_subplot(gg2[0:3, :])
        _plot_image(ax, img)
        ax = fig.add_subplot(gg2[3, 1:9])
        _plot_prediction_bar(ax, r)

Binary data
static/example.png Normal file

Binary file not shown. (Size: 410 KiB)