This commit is contained in:
Dan Ciborowski 2019-08-27 02:51:05 -04:00
Parent d6ca051133
Commit bb840121be
756 changed files: 202518 additions and 0 deletions

9
.gitmodules vendored
View file

@ -25,3 +25,12 @@
[submodule "DeploySparkMLModelDatabricks"]
path = DeploySparkMLModelDatabricks
url = https://github.com/Azure/BatchSparkScoringPredictiveMaintenance
[submodule "recommenders"]
path = recommenders
url = https://github.com/microsoft/recommenders
[submodule "nlp"]
path = nlp
url = https://github.com/microsoft/nlp
[submodule "ComputerVision"]
path = ComputerVision
url = https://github.com/microsoft/ComputerVision
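After cloning, the newly added submodules can be fetched with the standard git command:

```bash
git submodule update --init --recursive
```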

1
ComputerVision Submodule

@ -0,0 +1 @@
Subproject commit 3eab18177620cb15f18afd8e9cb02c63da1deba1

View file

@ -0,0 +1,29 @@
{
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"name": {
"type": "String"
},
"location": {
"type": "String"
},
"sku": {
"type": "String"
}
},
"variables": {},
"resources": [
{
"type": "Microsoft.CognitiveServices/accounts",
"apiVersion": "2016-02-01-preview",
"name": "[parameters('name')]",
"location": "[parameters('location')]",
"sku": {
"name": "[parameters('sku')]"
},
"kind": "TextAnalytics",
"properties": {}
}
]
}
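This template can be deployed with the Azure CLI; a minimal sketch, assuming it is saved as `template.json` and the resource group already exists (all parameter values here are illustrative):

```bash
az group deployment create \
  --resource-group my-resource-group \
  --template-file template.json \
  --parameters name=my-text-analytics location=eastus sku=S0
```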

344
archectures/Python-Keras-RealTimeServing/.gitignore vendored Normal file
View file

@ -0,0 +1,344 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
**/Properties/launchSettings.json
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# CodeRush
.cr/
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
.ipynb_checkpoints
# env files
.env
.dev_env
# Binaries
*.jpg
*.h5
# AML files
aml_config
azureml-models

View file

@ -0,0 +1,98 @@
### Authors: Yan Zhang, Mathew Salvaris, and Fidan Boylu Uz
[![Build Status](https://dev.azure.com/customai/AKSDeploymentTutorialAML/_apis/build/status/Microsoft.AKSDeploymentTutorialAML?branchName=master)](https://dev.azure.com/customai/AKSDeploymentTutorialAML/_build/latest?definitionId=11&branchName=master)
# Deploy Deep Learning CNN using Azure Machine Learning
## Overview
In this repository there are a number of tutorials in Jupyter notebooks with step-by-step instructions on how to deploy a pretrained deep learning model on a GPU-enabled Kubernetes cluster through Azure Machine Learning (AzureML). The tutorials cover how to deploy models from the following deep learning frameworks to specific deployment targets:
* Keras (TensorFlow backend)
- [Azure Kubernetes Service (AKS) Cluster with GPUs](./{{cookiecutter.project_name}}/Keras_Tensorflow/aks)
- [Azure IoT Edge](./{{cookiecutter.project_name}}/Keras_Tensorflow/iotedge)
* [Pytorch](./{{cookiecutter.project_name}}/Pytorch) (coming soon)
![alt text](https://happypathspublic.blob.core.windows.net/aksdeploymenttutorialaml/example.png "Example Classification")
For each framework, we go through the following steps:
* Create an AzureML Workspace
* Model development where we load the pretrained model and test it by using it to score images
* Develop the API that will call our model
* Building the Docker Image with our REST API and model and testing the image
* AKS option
* Creating our Kubernetes cluster and deploying our application to it
* Testing the deployed model
* Testing the throughput of our model
* Cleaning up resources
* IoT Edge option
* Creating an IoT Hub and an IoT Edge device identity, configuring the physical IoT Edge device, and deploying our application to it
* Cleaning up resources
## Design
As described on the associated [Azure Reference Architecture page](https://docs.microsoft.com/en-us/azure/architecture/reference-architectures/ai/realtime-scoring-python), the application we will develop is a simple image classification service, where we will submit an image and get back what class the image belongs to. The application flow for the deep learning model is as follows:
1) Deep learning model is registered to AzureML model registry.
2) AzureML creates a docker image including the model and scoring script.
3) AzureML deploys the scoring image on the chosen deployment compute target (AKS or IoT Edge) as a web service.
4) The client sends an HTTP POST request with the encoded image data (an example request is sketched after this list).
5) The web service created by AzureML preprocesses the image data and sends it to the model for scoring.
6) The predicted categories with their scores are then returned to the client.
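For illustration, the client request in step 4 might look like the following; the endpoint URL and form-field name are placeholders that depend on your deployment:

```bash
curl -X POST -F "image=@220px-Lynx_lynx_poing.jpg" http://<scoring-endpoint>/score
```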
**NOTE**: The tutorial goes step by step through how to deploy a deep learning model on Azure; it **does not** include enterprise best practices such as securing the endpoints, setting up remote logging, etc.
**Deploying with GPUs:** For a detailed comparison of the deployments of various deep learning models, see the blog post [here](https://azure.microsoft.com/en-us/blog/gpus-vs-cpus-for-deployment-of-deep-learning-models/), which provides evidence that, at least in the scenarios tested, GPUs provide better throughput and stability at a lower cost.
# Getting Started
To get started with the tutorial, please proceed with the following steps **in sequential order**.
* [Prerequisites](#prerequisites)
* [Setup](#setup)
<a id='prerequisites'></a>
## Prerequisites
1. A Linux (x64) machine with a GPU.
2. [Anaconda Python](https://www.anaconda.com/download)
3. [Docker](https://docs.docker.com/v17.12/install/linux/docker-ee/ubuntu) installed.
4. [Azure account](https://azure.microsoft.com).
The tutorial was developed on an [Azure Ubuntu
DSVM](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro),
which addresses the first three prerequisites.
<a id='setup'></a>
## Setup
To set up your environment to run these notebooks, please follow these steps.
1. Create a _Linux_ Ubuntu DSVM (NC6 or above to use GPU).
2. Install [cookiecutter](https://cookiecutter.readthedocs.io/en/latest/installation.html), a tool that creates projects from project templates.
```bash
pip install cookiecutter
```
3. Clone this repository and choose a specific framework and deployment option. You will obtain a repository tailored to your choice of framework and deployment compute target.
```bash
cookiecutter https://github.com/Microsoft/AKSDeploymentTutorialAML.git
```
You will be asked to choose or enter information such as *framework*, *project name*, *subscription id*, *resource group*, etc. in an interactive way. If a default value is provided, you can press *Enter* to accept it and continue, or enter a value of your choice. For example, if you want to learn how to deploy a deep learning model on an AKS cluster using Keras, choose "keras" for the variable *framework* and "aks" for the variable *deployment_type*. If you instead want to learn how to deploy a deep learning model on IoT Edge, select "iotedge" for *deployment_type*.
You must provide a value for "subscription_id"; otherwise the project will fail with the error "ERROR: The subscription id is missing, please enter a valid subscription id" after all the questions are asked. The full list of questions can be found in the [cookiecutter.json](./cookiecutter.json) file.
Please make sure all entered information is correct, as it is used to customize the content of your repo.
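For illustration, the interactive session looks roughly like this (the defaults shown come from [cookiecutter.json](./cookiecutter.json); list-valued fields become choice prompts):

```bash
$ cookiecutter https://github.com/Microsoft/AKSDeploymentTutorialAML.git
framework [keras]:
project_name [dlmodeldeploy]:
subscription_id []: <your-subscription-id>
resource_group [aksdeploykerasrg]:
Select deployment_type:
1 - aks
2 - iotedge
Choose from 1, 2 [1]:
```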
4. When the project is generated, custom readmes are created based on [aks-keras](./{{cookiecutter.project_name}}/Keras_Tensorflow/aks/README.md) or [iotedge-keras](./{{cookiecutter.project_name}}/Keras_Tensorflow/iotedge/README.md). Find the README.md file in your project directory and proceed with the instructions specified in it.
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com.
When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

View file

@ -0,0 +1,17 @@
{
"framework":"keras",
"project_name":"dlmodeldeploy",
"subscription_id": "",
"resource_group": "aksdeploykerasrg",
"workspace_name": "workspace",
"workspace_region": [
"eastus",
"eastus2"
],
"image_name": "kerasimage",
"deployment_type": [
"aks",
"iotedge"
]
}

View file

@ -0,0 +1,36 @@
import os
import shutil

PROJECT_DIRECTORY = os.path.realpath(os.path.curdir)

def remove_file(filepath):
    os.remove(os.path.join(PROJECT_DIRECTORY, filepath))

def remove_dir(dirpath):
    shutil.rmtree(os.path.join(PROJECT_DIRECTORY, dirpath))

def move_files(parentdir, subdir):
    root = os.path.join(PROJECT_DIRECTORY, parentdir)
    for filename in os.listdir(os.path.join(root, subdir)):
        shutil.move(os.path.join(root, subdir, filename), os.path.join(root, filename))
    os.rmdir(os.path.join(root, subdir))

if __name__ == "__main__":
    if "{{ cookiecutter.framework }}" == "keras":
        remove_dir("./Pytorch")
        if "{{ cookiecutter.deployment_type }}" == "aks":
            remove_dir("./Keras_Tensorflow/iotedge")
            move_files("./Keras_Tensorflow", "./aks")
        if "{{ cookiecutter.deployment_type }}" == "iotedge":
            remove_dir("./Keras_Tensorflow/aks")
            move_files("./Keras_Tensorflow", "./iotedge")
    else:
        remove_dir("./Keras_Tensorflow")
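A quick, self-contained sketch of what the move_files helper does (the temp directory and file names here are hypothetical, not part of the template):

```python
import os
import shutil
import tempfile

# Set up a root directory containing a subdirectory with one file.
root = tempfile.mkdtemp()
os.makedirs(os.path.join(root, "aks"))
open(os.path.join(root, "aks", "README.md"), "w").close()

# Same logic as move_files: hoist every file out of the subdirectory,
for filename in os.listdir(os.path.join(root, "aks")):
    shutil.move(os.path.join(root, "aks", filename), os.path.join(root, filename))
# then remove the now-empty subdirectory.
os.rmdir(os.path.join(root, "aks"))

print(os.listdir(root))  # ['README.md'] -- the file now sits at the top level
```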

View file

@ -0,0 +1,51 @@
import re
import sys

MODULE_REGEX = r"^[_a-zA-Z][_a-zA-Z0-9]+$"

def check_module(module_name):
    if not re.match(MODULE_REGEX, module_name):
        print(
            "ERROR: The project slug {} is not a valid Python module name. Please do not use a - and use _ instead".format(
                module_name
            )
        )
        # Exit to cancel project
        sys.exit(1)

def check_sub_id(sub_id):
    if len(sub_id) == 0:
        print(
            "ERROR: The subscription id is missing, please enter a valid subscription id slug"
        )
        # Exit to cancel project
        sys.exit(1)

def check_image_name(image_name):
    if "_" in image_name:
        print(
            "ERROR: The image name must not have underscores in it {}".format(
                image_name
            )
        )
        # Exit to cancel project
        sys.exit(1)

if __name__ == "__main__":
    check_module("{{cookiecutter.project_name}}")
    check_sub_id("{{cookiecutter.subscription_id}}")
    check_image_name("{{cookiecutter.image_name}}")
    print("All checks passed")
    if "{{ cookiecutter.deployment_type }}" == "aks":
        print("Creating AKS project...")
    if "{{ cookiecutter.deployment_type }}" == "iotedge":
        print("Creating IOT Edge project...")

View file

@ -0,0 +1,237 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Installation and configuration¶\n",
"\n",
"This notebook configures the notebooks in this tutorial to connect to an Azure Machine Learning (AML) Workspace. You can use an existing workspace or create a new one."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import azureml.core\n",
"from azureml.core import Workspace\n",
"from dotenv import set_key, get_key, find_dotenv\n",
"from pathlib import Path\n",
"from testing_utilities import get_auth"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you have already completed the prerequisites, you can execute following command to ensure you are using correct conda environment. The output of this command should contain \"tutorial_env\" in the path, e.g. `/anaconda/envs/tutorial_env/bin/python`"
]
},
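{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One way to check the active interpreter (illustrative; any equivalent command works):\n",
"!which python"
]
},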
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The AML Python SDK is already installed. Let's check the AML SDK version."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SDK Version: 1.0.21\n"
]
}
],
"source": [
"print(\"SDK Version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set up your Azure Machine Learning workspace"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To create or access an Azure ML Workspace, you will need the following information:\n",
"\n",
"* An Azure subscription id\n",
"* A resource group name\n",
"* A name for your workspace\n",
"* A region for your workspace\n",
"\n",
"We also require you to provide variable names that will be used to create images, deployment computes, etc. in later notebooks.\n",
"\n",
"**Note**: As with other Azure services, there are limits on certain resources like cluster size associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Replace the values in the following cell with your information. If you would like to use service principal authentication as described [here](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/manage-azureml-service/authentication-in-azureml/authentication-in-azure-ml.ipynb) make sure you provide the optional values as well."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Azure resources\n",
"subscription_id = \"{{cookiecutter.subscription_id}}\"\n",
"resource_group = \"{{cookiecutter.resource_group}}\" \n",
"workspace_name = \"{{cookiecutter.workspace_name}}\" \n",
"workspace_region = \"{{cookiecutter.workspace_region}}\" # e.g. workspace_region = \"{{cookiecutter.workspace_region}}\"\n",
"\n",
"# Docker image and Azure Kubernetes Service (AKS) Cluster - deployment compute\n",
"image_name = (\n",
" \"{{cookiecutter.image_name}}\"\n",
") # e.g. image_name = \"{{cookiecutter.image_name}} (avoid underscore in names)\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create and initialize a dotenv file for storing parameters used in multiple notebooks."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv()\n",
"if env_path == \"\":\n",
" Path(\".env\").touch()\n",
" env_path = find_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"set_key(env_path, \"subscription_id\", subscription_id)\n",
"set_key(env_path, \"resource_group\", resource_group)\n",
"set_key(env_path, \"workspace_name\", workspace_name)\n",
"set_key(env_path, \"workspace_region\", workspace_region)\n",
"\n",
"set_key(env_path, \"image_name\", image_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create the workspace\n",
"This cell will create an AML workspace for you in a subscription, provided you have the correct permissions.\n",
"This will fail when:\n",
"\n",
"1. You do not have permission to create a workspace in the resource group\n",
"2. You do not have permission to create a resource group if it's non-existing.\n",
"3. You are not a subscription owner or contributor and no Azure ML workspaces have ever been created in this subscription\n",
"\n",
"If workspace creation fails, please work with your IT admin to provide you with the appropriate permissions or to provision the required resources. If this cell succeeds, you're done configuring AML!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import the Workspace class and check the azureml SDK version\n",
"# from azureml.core import Workspace\n",
"\n",
"ws = Workspace.create(\n",
" name=workspace_name,\n",
" subscription_id=subscription_id,\n",
" resource_group=resource_group,\n",
" location=workspace_region,\n",
" create_resource_group=True,\n",
" auth=get_auth(),\n",
" exist_ok=True,\n",
")\n",
"# persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n",
"ws.write_config()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below we will reload it just to make sure that everything is working."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load workspace configuratio from ./aml_config/config.json file.\n",
"ws = Workspace.from_config(auth=get_auth())\n",
"ws.get_details()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook, we created a \".env\" file to save and reuse the variables needed cross all the notebooks. We also created a new Azure resource group with name <YOUR\\_RESOURCE\\_GROUP>, where an AML workspace and a few other Azure resources are created. We can now move on to the next notebook [developing the model](01_DevelopModel.ipynb)."
]
}
],
"metadata": {
"celltoolbar": "Tags",
"jupytext": {
"formats": "ipynb"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,707 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Preparation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook, we use a subset of [Stack Exchange network](https://archive.org/details/stackexchange) question data which includes original questions tagged as 'JavaScript', their duplicate questions and their answers. Here, we provide the steps to prepare the data to use in model development for training a model that will match a new question with an existing original question. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"from utilities import read_csv_gz, clean_text, round_sample_strat, random_merge"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below, we define some parameters that will be used in the data cleaning as well as train and test set preparation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The size of the test set\n",
"test_size = 0.21\n",
"# The minimum length of clean text\n",
"min_text = 150\n",
"# The minimum number of duplicates per question\n",
"min_dupes = 12\n",
"# The maximum number of duplicate matches\n",
"match = 20\n",
"# The output files path\n",
"outputs_path = os.path.join('.', 'data_folder')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data cleaning"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we download the questions, duplicate questions and answers and load the datasets into pandas dataframes using the helper functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# URLs to original questions, duplicate questions, and answers.\n",
"data_url = \"https://bostondata.blob.core.windows.net/stackoverflow/{}\"\n",
"questions_url = data_url.format(\"orig-q.tsv.gz\")\n",
"dupes_url = data_url.format(\"dup-q.tsv.gz\")\n",
"answers_url = data_url.format(\"ans.tsv.gz\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load datasets.\n",
"questions = read_csv_gz(questions_url, names=('Id', 'AnswerId', 'Text0', 'CreationDate'))\n",
"dupes = read_csv_gz(dupes_url, names=('Id', 'AnswerId', 'Text0', 'CreationDate'))\n",
"answers = read_csv_gz(answers_url, names=('Id', 'Text0'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's now check the dataframes. Notice that questions and duplicates have \"AnswerID\" column that would help match with the index of answers dataframe."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"questions.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dupes.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"answers.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check the first original question's text."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"questions.iloc[0,1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's now check the duplicates for that question."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dupes[dupes.AnswerId == questions.iloc[0, 0]]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is the answer to the original question."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"answers.at[questions.iloc[0,0],'Text0']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we use the helper functions to clean questions, duplicates and answers from unwanted text such as code, html tags and links. Notice that we add a new column 'Text' to each dataframe for clean text in lowercase."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Clean up all text, and keep only data with some clean text.\n",
"for df in (questions, dupes, answers):\n",
" df[\"Text\"] = df.Text0.apply(clean_text).str.lower()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"questions = questions[questions.Text.str.len() > 0]\n",
"answers = answers[answers.Text.str.len() > 0]\n",
"dupes = dupes[dupes.Text.str.len() > 0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's compare the first original question and cleaned version as an example."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Original question.\n",
"questions.iloc[0,1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# After cleaning.\n",
"questions.iloc[0,3]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"it turns out that some duplicate questions were also in original questions. Also, some original questions and some duplicate questions were duplicated in the datasets. In the following, we remove them from the dataframes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# First, remove dupes that are questions, then remove duplicated questions and dupes.\n",
"dupes = dupes[~dupes.index.isin(questions.index)]\n",
"questions = questions[~questions.index.duplicated(keep='first')]\n",
"dupes = dupes[~dupes.index.duplicated(keep='first')]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We also make sure we keep questions with answers and duplicates."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only questions with answers and dupes, answers to questions, and dupes of questions.\n",
"questions = questions[\n",
" questions.AnswerId.isin(answers.index) & questions.AnswerId.isin(dupes.AnswerId)\n",
"]\n",
"answers = answers[answers.index.isin(questions.AnswerId)]\n",
"dupes = dupes[dupes.AnswerId.isin(questions.AnswerId)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Verify data integrity.\n",
"assert questions.AnswerId.isin(answers.index).all()\n",
"assert answers.index.isin(questions.AnswerId).all()\n",
"assert questions.AnswerId.isin(dupes.AnswerId).all()\n",
"assert dupes.AnswerId.isin(questions.AnswerId).all()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below are some statistics on the data. Notice that some questions have very low number of duplicates while others may have a large number. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Report on the data.\n",
"print(\"Text statistics:\")\n",
"print(\n",
" pd.DataFrame(\n",
" [\n",
" questions.Text.str.len().describe().rename(\"questions\"),\n",
" answers.Text.str.len().describe().rename(\"answers\"),\n",
" dupes.Text.str.len().describe().rename(\"dupes\"),\n",
" ]\n",
" )\n",
")\n",
"print(\"\\nDuplication statistics:\")\n",
"print(pd.DataFrame([dupes.AnswerId.value_counts().describe().rename(\"duplications\")]))\n",
"print(\n",
" \"\\nLargest class: {:.2%}\".format(\n",
" dupes.AnswerId.value_counts().max() / dupes.shape[0]\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, we reset all indexes to use them as columns in the rest of the steps."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reset each dataframe's index.\n",
"questions.reset_index(inplace=True)\n",
"answers.reset_index(inplace=True)\n",
"dupes.reset_index(inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We filter the questions and duplicates to have at least min_text number of characters."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Apply the minimum text length to questions and dupes.\n",
"questions = questions[questions.Text.str.len() >= min_text]\n",
"dupes = dupes[dupes.Text.str.len() >= min_text]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only questions with dupes, and dupes of questions.\n",
"label_column = \"AnswerId\"\n",
"questions = questions[questions[label_column].isin(dupes[label_column])]\n",
"dupes = dupes[dupes[label_column].isin(questions[label_column])]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, we remove questions and their duplicates that are less than min_dupes parameter."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Restrict the questions to those with a minimum number of dupes.\n",
"answerid_count = dupes.groupby(label_column)[label_column].count()\n",
"answerid_min = answerid_count.index[answerid_count >= min_dupes]\n",
"questions = questions[questions[label_column].isin(answerid_min)]\n",
"dupes = dupes[dupes[label_column].isin(answerid_min)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
" # Verify data integrity.\n",
"assert questions[label_column].isin(dupes[label_column]).all()\n",
"assert dupes[label_column].isin(questions[label_column]).all()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here are some statistics on the resulting dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Report on the data.\n",
"print(\"Restrictions: min_text={}, min_dupes={}\".format(min_text, min_dupes))\n",
"print(\"Restricted text statistics:\")\n",
"print(\n",
" pd.DataFrame(\n",
" [\n",
" questions.Text.str.len().describe().rename(\"questions\"),\n",
" dupes.Text.str.len().describe().rename(\"dupes\"),\n",
" ]\n",
" )\n",
")\n",
"print(\"\\nRestricted duplication statistics:\")\n",
"print(\n",
" pd.DataFrame([dupes[label_column].value_counts().describe().rename(\"duplications\")])\n",
")\n",
"print(\n",
" \"\\nRestricted largest class: {:.2%}\".format(\n",
" dupes[label_column].value_counts().max() / dupes.shape[0]\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare train and test sets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this part, we prepare train and test sets. For training a binary classification model, we will need to construct match and non-match pairs from duplicates and their questions. Finding matching pairs can be accomplished by joining each duplicate with its question. However, non-match examples need to be constructed randomly. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a first step, to make sure we train and test the performance of the model on each question, we will need to have examples of match and non-match pairs for each question both in train and test sets. In order to achieve that, we split the duplicates in a stratified manner into train and test sets making sure at least 1 or more duplicates per question is in the test set depending on test_size parameter and number of duplicates per each question."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split dupes into train and test ensuring at least one of each label class is in test.\n",
"dupes_test = round_sample_strat(dupes, dupes[label_column], frac=test_size)\n",
"dupes_train = dupes[~dupes.Id.isin(dupes_test.Id)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert (dupes_test[label_column].unique().shape[0] == dupes[label_column].unique().shape[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The relevant columns for text pairs data.\n",
"balanced_pairs_columns = ['Id_x', 'AnswerId_x', 'Text_x', 'Id_y', 'Text_y', 'AnswerId_y', 'Label', 'n']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we pair each training duplicate in train set with its matching question and N-1 random questions using the helper function."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use AnswerId to pair each training dupe with its matching question and also with N-1 questions not its match.\n",
"%time balanced_pairs_train = random_merge(dupes_train, questions, N=match)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Labeling is done such that matching pairs are labeled as 1 and non-match pairs are labeled as 0."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Label records by matching AnswerIds.\n",
"balanced_pairs_train[\"Label\"] = (\n",
" balanced_pairs_train.AnswerId_x == balanced_pairs_train.AnswerId_y\n",
").astype(int)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the relevant data.\n",
"balanced_pairs_train = balanced_pairs_train[balanced_pairs_columns]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"balanced_pairs_train.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sort the data by dupe ID and Label.\n",
"balanced_pairs_train.sort_values(by=['Id_x', 'Label'], ascending=[True, False], inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In testing set, we match each duplicate with all the original questions and label them same way as training set."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use AnswerId to pair each testing dupe with all questions.\n",
"%time balanced_pairs_test = random_merge(dupes_test, questions, N=questions.shape[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Label records by matching AnswerIds.\n",
"balanced_pairs_test[\"Label\"] = (\n",
" balanced_pairs_test.AnswerId_x == balanced_pairs_test.AnswerId_y\n",
").astype(int)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the relevant data.\n",
"balanced_pairs_test = balanced_pairs_test[balanced_pairs_columns]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"balanced_pairs_test.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sort the data by dupe ID and Label.\n",
"balanced_pairs_test.sort_values(\n",
" by=[\"Id_x\", \"Label\"], ascending=[True, False], inplace=True\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, we report the final train and test sets and save as text files to be used by modeling."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Report on the datasets.\n",
"print(\n",
" \"balanced_pairs_train: {:,} rows with {:.2%} matches\".format(\n",
" balanced_pairs_train.shape[0], balanced_pairs_train.Label.mean()\n",
" )\n",
")\n",
"print(\n",
" \"balanced_pairs_test: {:,} rows with {:.2%} matches\".format(\n",
" balanced_pairs_test.shape[0], balanced_pairs_test.Label.mean()\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"os.makedirs(outputs_path, exist_ok=True)\n",
"\n",
"# Save the data.\n",
"balanced_pairs_train_path = os.path.join(outputs_path, \"balanced_pairs_train.tsv\")\n",
"print(\n",
" \"Writing {:,} to {}\".format(\n",
" balanced_pairs_train.shape[0], balanced_pairs_train_path\n",
" )\n",
")\n",
"balanced_pairs_train.to_csv(\n",
" balanced_pairs_train_path, sep=\"\\t\", header=True, index=False\n",
")\n",
"\n",
"balanced_pairs_test_path = os.path.join(outputs_path, \"balanced_pairs_test.tsv\")\n",
"print(\n",
" \"Writing {:,} to {}\".format(balanced_pairs_test.shape[0], balanced_pairs_test_path)\n",
")\n",
"balanced_pairs_test.to_csv(balanced_pairs_test_path, sep=\"\\t\", header=True, index=False)\n",
"\n",
"# Save original questions to be used for scoring later.\n",
"questions_path = os.path.join(outputs_path, \"questions.tsv\")\n",
"print(\"Writing {:,} to {}\".format(questions.shape[0], questions_path))\n",
"questions.to_csv(questions_path, sep=\"\\t\", header=True, index=False)\n",
"\n",
"# Save the test duplicate questions to be used with the scoring function.\n",
"dupes_test_path = os.path.join(outputs_path, \"dupes_test.tsv\")\n",
"print(\"Writing {:,} to {}\".format(dupes_test.shape[0], dupes_test_path))\n",
"dupes_test.to_csv(dupes_test_path, sep=\"\\t\", header=True, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can now move on to [train on local](02_TrainOnLocal.ipynb) notebook to train our model using Azure Machine Learning."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:MLAKSDeployAML]",
"language": "python",
"name": "conda-env-MLAKSDeployAML-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,246 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Develop Model\n",
"In this noteook, we will go through the steps to load the ResNet152 model, pre-process the images to the required format and call the model to find the top predictions.\n",
"\n",
" Note: Always make sure you don't have any lingering notebooks running (Shutdown previous notebooks). Otherwise it may cause GPU memory issue."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from PIL import Image, ImageOps\n",
"import wget\n",
"from resnet152 import ResNet152\n",
"from keras.applications.imagenet_utils import preprocess_input, decode_predictions\n",
"from azureml.core.workspace import Workspace\n",
"from dotenv import set_key, find_dotenv\n",
"from testing_utilities import get_auth\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If you see error msg \"InternalError: Dst tensor is not initialized.\", it indicates there are not enough memory.\n",
"model = ResNet152(weights=\"imagenet\")\n",
"print(\"model loaded\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"wget.download(\n",
" \"https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Lynx_lynx_poing.jpg/220px-Lynx_lynx_poing.jpg\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"img_path = \"220px-Lynx_lynx_poing.jpg\"\n",
"print(Image.open(img_path).size)\n",
"Image.open(img_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below, we load the image by resizing to (224, 224) and then preprocessing using the methods from keras preprocessing and imagenet utilities."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Evaluate the model using the input data\n",
"img = Image.open(img_path).convert(\"RGB\")\n",
"img = ImageOps.fit(img, (224, 224), Image.ANTIALIAS)\n",
"img = np.array(img) # shape: (224, 224, 3)\n",
"img = np.expand_dims(img, axis=0)\n",
"img = preprocess_input(img)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, let's call the model on our image to predict the top 3 labels. This will take a few seconds."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"preds = model.predict(img)\n",
"decoded_predictions = decode_predictions(preds, top=3)\n",
"print(\"Predicted:\", decoded_predictions)\n",
"resp = {img_path: str(decoded_predictions)}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Register the model\n",
"Register an existing trained model, add descirption and tags."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Get workspace\n",
"# Load existing workspace from the config file info.\n",
"\n",
"ws = Workspace.from_config(auth=get_auth())\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.save_weights(\"model_resnet_weights.h5\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Register the model\n",
"from azureml.core.model import Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = Model.register(\n",
" model_path=\"model_resnet_weights.h5\", # this points to a local file\n",
" model_name=\"resnet_model\", # this is the name the model is registered as\n",
" tags={\"model\": \"dl\", \"framework\": \"resnet\"},\n",
" description=\"resnet 152 model\",\n",
" workspace=ws,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(model.name, model.description, model.version)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"set_key(env_path, \"model_version\", str(model.version))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Clear GPU memory\n",
"from keras import backend as K"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"K.clear_session()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We have registred the trained ResNet152 model in Azure ML. We can now move on to [developing the model api for our model](02_DevelopModelDriver.ipynb)."
]
}
],
"metadata": {
"celltoolbar": "Tags",
"jupytext": {
"formats": "ipynb"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,327 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Develop Model Driver\n",
"\n",
"In this notebook, we will develop the API that will call our model. This module initializes the model, transforms the input so that it is in the appropriate format and defines the scoring method that will produce the predictions. The API will expect the input to be passed as an image. Once a request is received, the API will convert load the image preprocess it and pass it to the model. There are two main functions in the API: init() and run(). The init() function loads the model and returns a scoring function. The run() function processes the images and uses the first function to score them.\n",
"\n",
" Note: Always make sure you don't have any lingering notebooks running (Shutdown previous notebooks). Otherwise it may cause GPU memory issue."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"from azureml.core.model import Model\n",
"from dotenv import set_key, find_dotenv\n",
"import logging\n",
"from testing_utilities import get_auth"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import keras\n",
"import tensorflow"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Keras: 2.2.0\n",
"Tensorflow: 1.10.0\n"
]
}
],
"source": [
"print(\"Keras: \", keras.__version__)\n",
"print(\"Tensorflow: \", tensorflow.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Write and save driver script"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting driver.py\n"
]
}
],
"source": [
"%%writefile driver.py\n",
"\n",
"from resnet152 import ResNet152\n",
"from keras.preprocessing import image\n",
"from keras.applications.imagenet_utils import preprocess_input, decode_predictions\n",
"from azureml.contrib.services.aml_request import rawhttp\n",
"from azureml.core.model import Model\n",
"from toolz import compose\n",
"import numpy as np\n",
"import timeit as t\n",
"from PIL import Image, ImageOps\n",
"import logging\n",
"\n",
"_NUMBER_RESULTS = 3\n",
"\n",
"\n",
"def _image_ref_to_pil_image(image_ref):\n",
" \"\"\" Load image with PIL (RGB)\n",
" \"\"\"\n",
" return Image.open(image_ref).convert(\"RGB\")\n",
"\n",
"\n",
"def _pil_to_numpy(pil_image):\n",
" img = ImageOps.fit(pil_image, (224, 224), Image.ANTIALIAS)\n",
" img = image.img_to_array(img)\n",
" return img\n",
"\n",
"\n",
"def _create_scoring_func():\n",
" \"\"\" Initialize ResNet 152 Model\n",
" \"\"\"\n",
" logger = logging.getLogger(\"model_driver\")\n",
" start = t.default_timer()\n",
" model_name = \"resnet_model\"\n",
" model_path = Model.get_model_path(model_name)\n",
" model = ResNet152()\n",
" model.load_weights(model_path)\n",
" end = t.default_timer()\n",
"\n",
" loadTimeMsg = \"Model loading time: {0} ms\".format(round((end - start) * 1000, 2))\n",
" logger.info(loadTimeMsg)\n",
"\n",
" def call_model(img_array_list):\n",
" img_array = np.stack(img_array_list)\n",
" img_array = preprocess_input(img_array)\n",
" preds = model.predict(img_array)\n",
" # Converting predictions to float64 since we are able to serialize float64 but not float32\n",
" preds = decode_predictions(preds.astype(np.float64), top=_NUMBER_RESULTS)\n",
" return preds\n",
"\n",
" return call_model\n",
"\n",
"\n",
"def get_model_api():\n",
" logger = logging.getLogger(\"model_driver\")\n",
" scoring_func = _create_scoring_func()\n",
"\n",
" def process_and_score(images_dict):\n",
" \"\"\" Classify the input using the loaded model\n",
" \"\"\"\n",
" start = t.default_timer()\n",
" logger.info(\"Scoring {} images\".format(len(images_dict)))\n",
" transform_input = compose(_pil_to_numpy, _image_ref_to_pil_image)\n",
" transformed_dict = {\n",
" key: transform_input(img_ref) for key, img_ref in images_dict.items()\n",
" }\n",
" preds = scoring_func(list(transformed_dict.values()))\n",
" preds = dict(zip(transformed_dict.keys(), preds))\n",
" end = t.default_timer()\n",
"\n",
" logger.info(\"Predictions: {0}\".format(preds))\n",
" logger.info(\"Predictions took {0} ms\".format(round((end - start) * 1000, 2)))\n",
" return (preds, \"Computed in {0} ms\".format(round((end - start) * 1000, 2)))\n",
"\n",
" return process_and_score\n",
"\n",
"\n",
"def init():\n",
" \"\"\" Initialise the model and scoring function\n",
" \"\"\"\n",
" global process_and_score\n",
" process_and_score = get_model_api()\n",
"\n",
"\n",
"@rawhttp\n",
"def run(request):\n",
" \"\"\" Make a prediction based on the data passed in using the preloaded model\n",
" \"\"\"\n",
" return process_and_score(request.files)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test the driver¶\n",
"We test the driver by passing data."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"logging.basicConfig(level=logging.DEBUG)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"%run driver.py"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's load the workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"ws = Workspace.from_config(auth=get_auth())\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep=\"\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the model and score against an example image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_path = Model.get_model_path(\"resnet_model\", _workspace=ws)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"IMAGEURL = \"https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Lynx_lynx_poing.jpg/220px-Lynx_lynx_poing.jpg\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Always make sure you don't have any lingering notebooks running. Otherwise it may cause GPU memory issue.\n",
"process_and_score = get_model_api()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:model_driver:Scoring 1 images\n",
"INFO:model_driver:Predictions: {'lynx': [('n02127052', 'lynx', 0.981648325920105), ('n02128385', 'leopard', 0.007744148373603821), ('n02123159', 'tiger_cat', 0.003686134237796068)]}\n",
"INFO:model_driver:Predictions took 3837.34 ms\n"
]
}
],
"source": [
"resp = process_and_score({\"lynx\": open(\"220px-Lynx_lynx_poing.jpg\", \"rb\")})"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Clear GPU memory\n",
"from keras import backend as K"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"K.clear_session()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we will [build a docker image with this modle driver and other supporting files](03_BuildImage.ipynb)."
]
}
],
"metadata": {
"jupytext": {
"formats": "ipynb"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,460 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Build Image\n",
"\n",
"In this notebook, we show the following steps for deploying a web service using AML:\n",
"\n",
"- Create an image\n",
"- Test image locally\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import docker\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import requests\n",
"from azure.mgmt.containerregistry import ContainerRegistryManagementClient\n",
"from azureml._model_management._util import (get_docker_client, pull_docker_image)\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"from azureml.core.image import ContainerImage\n",
"from dotenv import get_key, find_dotenv\n",
"from testing_utilities import to_img, plot_predictions, get_auth, wait_until_ready\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"resource_group = get_key(env_path, 'resource_group')\n",
"model_name = 'resnet_model'\n",
"image_name = get_key(env_path, 'image_name')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get workspace\n",
"Load existing workspace from the config file info."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.workspace import Workspace\n",
"\n",
"ws = Workspace.from_config(auth=get_auth())\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create yml file to be used in the image\n",
"conda_pack = [\"tensorflow-gpu==1.10.0\"]\n",
"requirements = [\"keras==2.2.0\",\"Pillow==5.2.0\", \"azureml-defaults==1.0.21\", \"toolz==0.9.0\"]\n",
"\n",
"imgenv = CondaDependencies.create(conda_packages=conda_pack,pip_packages=requirements)\n",
"with open(\"img_env.yml\", \"w\") as f:\n",
" f.write(imgenv.serialize_to_string())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"lines_to_next_cell": 2
},
"outputs": [],
"source": [
"\n",
"image_config = ContainerImage.image_configuration(execution_script = \"driver.py\",\n",
" runtime = \"python\",\n",
" conda_file = \"img_env.yml\",\n",
" description = \"Image for AKS Deployment Tutorial\",\n",
" tags = {\"name\":\"AKS\",\"project\":\"AML\"}, \n",
" dependencies = [\"resnet152.py\"],\n",
" enable_gpu = True\n",
" )\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create image. It may take upto 15-20 minutes. \n",
"image = ContainerImage.create(name = image_name,\n",
" # this is the model object\n",
" models = [ws.models[model_name]], \n",
" image_config = image_config,\n",
" workspace = ws)\n",
"\n",
"image.wait_for_creation(show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# You can find the logs of image creation\n",
"# image.image_build_log_uri\n",
"\n",
"# You can get the image object when not creating a new image\n",
"# image = ws.images['image1']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test image locally\n",
"- Pull the image from ACR registry to local host \n",
"- Start a container\n",
"- Test API call"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Getting your container details\n",
"container_reg = ws.get_details()[\"containerRegistry\"]\n",
"reg_name=container_reg.split(\"/\")[-1]\n",
"container_url = \"\\\"\" + image.image_location + \"\\\",\"\n",
"subscription_id = ws.subscription_id\n",
"\n",
"client = ContainerRegistryManagementClient(ws._auth,subscription_id)\n",
"result= client.registries.list_credentials(resource_group, reg_name, custom_headers=None, raw=False)\n",
"username = result.username\n",
"password = result.passwords[0].value\n",
"print('ContainerURL:{}'.format(image.image_location))\n",
"print('Servername: {}'.format(reg_name))\n",
"print('Username: {}'.format(username))\n",
"print('Password: {}'.format(password))"
]
},
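{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Note that `list_credentials` assumes the admin user is enabled on the workspace's container registry. If the call fails, a sketch of how to enable it from a shell (`<registry-name>` is the value of `reg_name` above):\n",
  "\n",
  "```bash\n",
  "az acr update --name <registry-name> --admin-enabled true\n",
  "```"
 ]
},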
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dc = get_docker_client(username, \n",
" password, \n",
" image.image_location.split(\"/\")[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pull_docker_image(dc, image.image_location, username, password)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# make sure port 80 is not occupied\n",
"container_labels = {'containerName': 'kerasgpu'}\n",
"container = dc.containers.run(image.image_location, \n",
" detach=True, \n",
" ports={'5001/tcp': 80},\n",
" labels=container_labels,\n",
" runtime='nvidia' )"
]
},
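{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "The `runtime='nvidia'` option assumes the [NVIDIA container runtime](https://github.com/NVIDIA/nvidia-docker) is installed on the host. If the container fails to start, a quick sanity check is to run a CUDA base image directly (the image tag here is illustrative):\n",
  "\n",
  "```bash\n",
  "docker run --runtime=nvidia --rm nvidia/cuda:9.0-base nvidia-smi\n",
  "```"
 ]
},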
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for log_msg in container.logs(stream=True):\n",
" str_msg = log_msg.decode('UTF8')\n",
" print(str_msg)\n",
" if \"Model loading time:\" in str_msg:\n",
" print('Model loaded and container ready')\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"client = docker.APIClient()\n",
"details = client.inspect_container(container.id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"service_ip = details['NetworkSettings']['Ports']['5001/tcp'][0]['HostIp']\n",
"service_port = details['NetworkSettings']['Ports']['5001/tcp'][0]['HostPort']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Wait a few seconds for the application to spin up and then check that everything works."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('Checking service on {} port {}'.format(service_ip, service_port))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"endpoint=\"http://__service_ip:__service_port\"\n",
"endpoint = endpoint.replace('__service_ip', service_ip)\n",
"endpoint = endpoint.replace('__service_port', service_port)\n",
"\n",
"max_attempts = 50\n",
"output_str = wait_until_ready(endpoint, max_attempts)\n",
"print(output_str)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!curl 'http://{service_ip}:{service_port}/'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"IMAGEURL = \"https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Lynx_lynx_poing.jpg/220px-Lynx_lynx_poing.jpg\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.imshow(to_img(IMAGEURL))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open('220px-Lynx_lynx_poing.jpg', 'rb') as f:\n",
" img_data = f.read()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%time r = requests.post('http://0.0.0.0:80/score', files={'image': img_data})\n",
"print(r)\n",
"r.json()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"images = (\n",
" \"https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Lynx_lynx_poing.jpg/220px-Lynx_lynx_poing.jpg\",\n",
" \"https://upload.wikimedia.org/wikipedia/commons/3/3a/Roadster_2.5_windmills_trimmed.jpg\",\n",
" \"https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Harmony_of_the_Seas_%28ship%2C_2016%29_001.jpg/1920px-Harmony_of_the_Seas_%28ship%2C_2016%29_001.jpg\",\n",
" \"http://yourshot.nationalgeographic.com/u/ss/fQYSUbVfts-T7pS2VP2wnKyN8wxywmXtY0-FwsgxpiZv_E9ZfPsNV5B0ER8-bOdruvNfMD5EbP4SznWz4PYn/\",\n",
" \"https://cdn.arstechnica.net/wp-content/uploads/2012/04/bohol_tarsier_wiki-4f88309-intro.jpg\",\n",
" \"http://i.telegraph.co.uk/multimedia/archive/03233/BIRDS-ROBIN_3233998b.jpg\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from testing_utilities import read_image_from"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"url = \"http://0.0.0.0:80/score\"\n",
"results = [\n",
" requests.post(url, files={'image': read_image_from(img).read()}) for img in images\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plot_predictions(images, results)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image_data = list(map(lambda img: read_image_from(img).read(), images)) # Retrieve the images and data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"timer_results = list()\n",
"for img in image_data:\n",
" res=%timeit -r 1 -o -q requests.post(url, files={'image': img})\n",
" timer_results.append(res.best)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"timer_results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Average time taken: {0:4.2f} ms\".format(10 ** 3 * np.mean(timer_results)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"container.stop()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# remove stopped container\n",
"!docker system prune -f"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can now move on to [Create kubenetes cluster and deploy web service](04_DeployOnAKS.ipynb) with the image we just built."
]
}
],
"metadata": {
"celltoolbar": "Tags",
"jupytext": {
"formats": "ipynb"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,184 @@
.ONESHELL:
SHELL=/bin/bash
define PROJECT_HELP_MSG
Makefile for testing notebooks
Make sure you have edited the dev_env_template file and renamed it to .dev_env
All the variables loaded in this makefile must come from the .dev_env file
Usage:
make test run all notebooks
make clean delete env and remove files
endef
export PROJECT_HELP_MSG
env_location=.dev_env
PWD:=$(shell pwd)
include ${env_location}
help:
echo "$$PROJECT_HELP_MSG" | less
test: setup test-notebook1 test-notebook2 test-notebook3 test-notebook4 test-notebook5 test-notebook6 test-notebook7 \
test-notebook-iot1 test-notebook8 test-notebook-iot2
@echo All Notebooks Passed
setup:
conda env create -f environment.yml
ifndef TENANT_ID
@echo starting interactive login
az login -o table
else
@echo using service principal login
az login -t ${TENANT_ID} --service-principal -u ${SP_USERNAME} --password ${SP_PASSWORD}
endif
test-notebook1:
source activate deployment_aml
@echo Testing 00_AMLSetup.ipynb
papermill 00_AMLSetup.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3 \
-p subscription_id ${SUBSCRIPTION_ID} \
-p resource_group ${RESOURCE_GROUP} \
-p workspace_name ${WORKSPACE_NAME} \
-p workspace_region ${WORKSPACE_REGION} \
-p image_name ${IMAGE_NAME} \
-p aks_name ${AKS_NAME} \
-p aks_location ${WORKSPACE_REGION} \
-p aks_service_name ${AKS_SERVICE_NAME}
test-notebook2:
source activate deployment_aml
@echo Testing 01_DevelopModel.ipynb
papermill 01_DevelopModel.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-notebook3:
source activate deployment_aml
@echo Testing 02_DevelopModelDriver.ipynb
papermill 02_DevelopModelDriver.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-notebook4:
source activate deployment_aml
@echo Testing 03_BuildImage.ipynb
papermill 03_BuildImage.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-notebook5:
source activate deployment_aml
@echo Testing 04_DeployOnAKS.ipynb
papermill aks/04_DeployOnAKS.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3 \
-p aks_name ${AKS_NAME} \
-p aks_location ${WORKSPACE_REGION} \
-p aks_service_name ${AKS_SERVICE_NAME}
test-notebook6:
source activate deployment_aml
@echo Testing 05_TestWebApp.ipynb
papermill aks/05_TestWebApp.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3 \
-p aks_name ${AKS_NAME} \
-p aks_location ${WORKSPACE_REGION} \
-p aks_service_name ${AKS_SERVICE_NAME}
test-notebook7:
source activate deployment_aml
@echo Testing 06_SpeedTestWebApp.ipynb
papermill aks/06_SpeedTestWebApp.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-notebook-iot1:
source activate deployment_aml
@echo Testing 04_DeployOnIOTedge.ipynb
export PYTHONPATH=${PWD}:${PYTHONPATH}
cd iotedge
papermill 04_DeployOnIOTedge.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3 \
-p iot_hub_name fstlstnameiothub \
-p device_id mygpudevice \
-p module_name mygpumodule
test-notebook8:
source activate deployment_aml
@echo Testing 07_TearDown.ipynb
papermill aks/07_TearDown.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-notebook-iot2:
source activate deployment_aml
@echo Testing 05_TearDown.ipynb
export PYTHONPATH=${PWD}:${PYTHONPATH}
papermill iotedge/05_TearDown.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-cookiecutter-aks:
cookiecutter --no-input https://github.com/Microsoft/AKSDeploymentTutorialAML.git \
subscription_id="${SUBSCRIPTION_ID}" \
workspace_region=${WORKSPACE_REGION} \
deployment_type="aks"
test-cookiecutter-iot:
cookiecutter --no-input https://github.com/Microsoft/AKSDeploymentTutorialAML.git \
subscription_id=${SUBSCRIPTION_ID} \
workspace_region=${WORKSPACE_REGION} \
deployment_type="iotedge"
remove-notebook:
rm -f test.ipynb
clean: remove-notebook
conda remove --name deployment_aml -y --all
rm -rf aml_config
rm -rf __pycache__
rm -rf .ipynb_checkpoints
rm *.jpg
rm -rf azureml-models
rm driver.py img_env.yml model_resnet_weights.h5
notebook:
source activate deployment_aml
jupyter notebook --port 9999 --ip 0.0.0.0 --no-browser
install-jupytext:
source activate deployment_aml
conda install -c conda-forge jupytext
convert-to-py:
jupytext --set-formats ipynb,py_scripts//py --sync *.ipynb
sync:
jupytext --sync *.ipynb
convert-to-ipynb:
jupytext --set-formats ipynb *.ipynb
remove-py:
rm -r py_scripts
.PHONY: help test setup clean remove-notebook test-notebook1 test-notebook2 test-notebook3 test-notebook4 \
test-notebook5 test-notebook6 test-notebook7 test-notebook8 test-notebook-iot1 test-notebook-iot2

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,111 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tear it all down\n",
"Once you are done with your cluster you can use the following two commands to destroy it all."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace\n",
"from azureml.core.compute import AksCompute, ComputeTarget\n",
"from dotenv import set_key, get_key, find_dotenv\n",
"from testing_utilities import get_auth"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config(auth=get_auth())\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)\n",
"resource_group = get_key(env_path, 'resource_group')\n",
"aks_name = get_key(env_path, 'aks_name')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once you are done with your cluster you can use the following command to delete the AKS cluster. This step may take a few minutes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_target = AksCompute(name=aks_name,workspace=ws)\n",
"aks_aml_name = aks_target.cluster_resource_id.rsplit(\"/\")[-1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az aks delete -n $aks_aml_name -g $resource_group -y"
]
},
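{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Deleting the cluster with `az aks delete` removes the Azure resource, but the workspace may still list the compute target. If so, detaching it is a reasonable follow-up; a sketch using the SDK object created above:\n",
  "\n",
  "```python\n",
  "aks_target.detach()\n",
  "```"
 ]
},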
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, you should delete the resource group. This also deletes the AKS cluster and can be used instead of the above command if the resource group is only used for this purpose."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az group delete --name $resource_group -y"
]
}
],
"metadata": {
"jupytext": {
"formats": "ipynb"
},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,72 @@
# Deploy Deep Learning CNN on Kubernetes Cluster with GPUs - Keras
To get started with the tutorial, please proceed with the following steps **in sequential order**.
* [Prerequisites](#prerequisites)
* [Steps](#steps)
* [Cleaning up](#cleanup)
<a id='prerequisites'></a>
## Prerequisites
1. Linux (x64) with GPU enabled.
2. [Anaconda Python](https://www.anaconda.com/download)
3. [Docker](https://docs.docker.com/v17.12/install/linux/docker-ee/ubuntu) installed.
4. [Azure account](https://azure.microsoft.com).
The tutorial was developed on an [Azure Ubuntu
DSVM](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro),
which addresses the first three prerequisites.
<a id='steps'></a>
## Steps
Please follow these steps to set up your environment and run the notebooks. They set up the notebooks to use Docker and Azure seamlessly.
1. Add your user to the docker group:
```
sudo usermod -aG docker $USER
newgrp docker
```
To verify that your configuration is correct, try executing the `docker ps` command. You should not get any `permission denied` errors.
2. Navigate to the directory of the framework you have chosen (e.g. Keras_Tensorflow).
3. Create the Python virtual environment using the environment.yml:
```
conda env create -f environment.yml
```
4. Activate the virtual environment:
```
source activate deployment_aml
```
5. Login to Azure:
```
az login
```
6. If you have more than one Azure subscription, select the one you want to use:
```
az account set --subscription <Your Azure Subscription>
```
7. Start the Jupyter notebook server in the virtual environment:
```
jupyter notebook
```
8. Select the correct kernel: set the kernel to be `Python [conda env: deployment_aml]` (or `Python 3` if that option does not show).
9. After following the setup instructions above, run the Jupyter notebooks in order starting with the first notebook [00_AMLSetup.ipynb](./00_AMLSetup.ipynb).
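Before running the notebooks it is also worth confirming that TensorFlow inside the `deployment_aml` environment can see the GPU. A minimal check (a sketch, assuming the environment is activated):
```
python -c "import tensorflow as tf; print(tf.test.is_gpu_available())"
```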
<a id='cleanup'></a>
## Cleaning up
To remove the conda environment created see [here](https://conda.io/projects/continuumio-conda/en/latest/commands/remove.html). The [last Jupyter notebook](./07_TearDown.ipynb) also gives details on deleting Azure resources associated with this repository.
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.microsoft.com.
When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repositories using our CLA.
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

View file

@ -0,0 +1,12 @@
# Fill in the fields below and rename to .dev_env
# TENANT_ID, SP_USERNAME and SP_PASSWORD are optional. If not supplied Azure cli will default to interactive login
TENANT_ID=
SP_USERNAME=
SP_PASSWORD=
SUBSCRIPTION_ID=
RESOURCE_GROUP="deploykerasrg"
WORKSPACE_NAME="workspace"
WORKSPACE_REGION="eastus"
IMAGE_NAME="deploykerasimg"
AKS_NAME="deploykerasaks"
AKS_SERVICE_NAME="deploykerasservice"

View file

@ -0,0 +1,25 @@
name: deployment_aml
dependencies:
# The python interpreter version.
# Currently Azure ML only supports 3.5.2 and later.
- python=3.6
- nb_conda
- tornado
- cudatoolkit==9.0
- tensorflow-gpu==1.10.0
- urllib3
- pip:
# Required packages for AzureML execution, history, and data preparation.
- papermill==1.0.1
- python-dotenv==0.10.3
- Pillow==6.1.0
- wget==3.2
- matplotlib==3.1.1
- toolz==0.9.0
- tqdm==4.32.2
- azure-cli==2.0.63
- keras==2.2.0
- azureml-sdk[notebooks, contrib]==1.0.45
- locustio==0.11.0
- prompt-toolkit==2.0.9

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,122 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tear it all down\n",
"Once you are done with your task you can use the following commands to clean up resources."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from dotenv import set_key, get_key, find_dotenv\n",
"from testing_utilities import get_auth"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)\n",
"resource_group = get_key(env_path, 'resource_group')\n",
"iot_hub_name = get_key(env_path, 'iot_hub_name')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!sudo systemctl stop iotedge"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!sudo apt-get remove -y iotedge"
]
},
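{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "Note that `apt-get remove` keeps the daemon's configuration files; if you also want `/etc/iotedge/config.yaml` removed, use `sudo apt-get purge -y iotedge` instead."
 ]
},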
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Delete the IoT hub. This step may take a few minutes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Delete IoT hub\n",
"cmd_results = !az iot hub show -n $iot_hub_name -g $resource_group\n",
"if 'Not Found' not in cmd_results[0]:\n",
" !az iot hub delete --name $iot_hub_name --resource-group $resource_group"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, you should delete the resource group. This also deletes the IoT hub and can be used instead of the above command if the resource group is only used for this purpose."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cmd_results = !az group show -n $resource_group -o tsv\n",
"if \"not be found\" not in cmd_results[0]:\n",
" !az group delete --name $resource_group -y"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!docker stop $(docker ps -qa)\n",
"!docker rm $(docker ps -qa)"
]
}
],
"metadata": {
"jupytext": {
"formats": "ipynb"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,67 @@
{
"modulesContent": {
"$edgeAgent": {
"properties.desired": {
"schemaVersion": "1.0",
"runtime": {
"type": "docker",
"settings": {
"loggingOptions": "",
"minDockerVersion": "v1.25",
"registryCredentials": {
"amlregistry": {
"address": "__REGISTRY_NAME.azurecr.io",
"password": "__REGISTRY_PASSWORD",
"username": "__REGISTRY_USER_NAME"
}
}
}
},
"systemModules": {
"edgeAgent": {
"type": "docker",
"settings": {
"image": "mcr.microsoft.com/azureiotedge-agent:1.0",
"createOptions": ""
}
},
"edgeHub": {
"type": "docker",
"settings": {
"image": "mcr.microsoft.com/azureiotedge-hub:1.0",
"createOptions": "{\"HostConfig\":{\"PortBindings\":{\"8883/tcp\":[{\"HostPort\":\"8883\"}],\"443/tcp\":[{\"HostPort\":\"443\"}],\"5671/tcp\":[{\"HostPort\":\"5671\"}]}}}"
},
"status": "running",
"restartPolicy": "always"
}
},
"modules": {
"__MODULE_NAME": {
"type": "docker",
"settings": {
"image": "__REGISTRY_IMAGE_LOCATION",
"createOptions": "{\"HostConfig\":{\"Runtime\":\"nvidia\",\"PortBindings\":{\"5001/tcp\":[{\"HostPort\":\"5001\"}]}}}"
},
"version": "1.0",
"status": "running",
"restartPolicy": "always"
}
}
}
},
"$edgeHub": {
"properties.desired": {
"schemaVersion": "1.0",
"routes": {
"route": "FROM /messages/* INTO $upstream"
},
"storeAndForwardConfiguration": {
"timeToLiveSecs": 7200
}
}
},
"__MODULE_NAME": {
"properties.desired": {}
}
}
}

View file

@ -0,0 +1,90 @@
# Deploy Deep Learning CNN on IoT Edge - Keras
In this tutorial, we introduce how to deploy an ML/DL (machine learning/deep learning) module through [Azure IoT Edge](https://docs.microsoft.com/en-us/azure/iot-edge/how-iot-edge-works).
Azure IoT Edge is an Internet of Things (IoT) service that builds on top of Azure IoT Hub. It is a hybrid solution combining the benefits of the two scenarios: *IoT in the Cloud* and *IoT on the Edge*. This service is meant for customers who want to analyze data on devices, a.k.a. "at the edge", instead of in the cloud. By moving parts of your workload to the edge, your devices can spend less time sending messages to the cloud and react more quickly to changes in status. Meanwhile, Azure IoT Hub provides a centralized way to manage Azure IoT Edge devices and makes it easy to train ML models in the cloud and deploy the trained models on the edge devices.
In this example, we deploy a trained Keras (Tensorflow) CNN model to the edge device. When image data is generated from a process pipeline and fed into the edge device, the deployed model can make predictions right on the edge device without accessing the cloud. The following diagram shows the major components of an Azure IoT Edge device. Source code and full documentation are linked below.
<p align="center">
<img src="https://happypathspublic.blob.core.windows.net/aksdeploymenttutorialaml/azureiotedgeruntime.png" alt="logo" width="90%"/>
</p>
We perform the following steps for the deployment.
- Step 1: Build the trained ML/DL model into a Docker image. This image will be used to create a Docker container running on the edge device.
- Step 2: Provision and Configure IoT Edge Device
- Step 3: Deploy ML/DL Module on IoT Edge Device
- Step 4: Test ML/DL Module
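For orientation, Step 3 amounts to filling in the placeholders in the deployment template in this folder and applying it to the device through IoT Hub. With the Azure CLI IoT extension installed, the equivalent manual command would look roughly like this (hub and device names are illustrative):
```
az iot edge set-modules --hub-name <your-iot-hub> --device-id <your-edge-device> --content deployment.json
```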
To get started with the tutorial, please proceed with the following steps **in sequential order**.
* [Prerequisites](#prerequisites)
* [Steps](#steps)
* [Cleaning up](#cleanup)
<a id='prerequisites'></a>
## Prerequisites
1. Linux (x64) with GPU enabled.
2. [Anaconda Python](https://www.anaconda.com/download)
3. [Docker](https://docs.docker.com/v17.12/install/linux/docker-ee/ubuntu) installed.
4. [Azure account](https://azure.microsoft.com).
The tutorial was developed on an [Azure Ubuntu
DSVM](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro),
which addresses the first three prerequisites.
<a id='steps'></a>
## Steps
Please follow these steps to set up your environment and run the notebooks. They set up the notebooks to use Docker and Azure seamlessly.
1. Add your user to the docker group:
```
sudo usermod -aG docker $USER
newgrp docker
```
To verify that your configuration is correct, try executing the `docker ps` command. You should not get any `permission denied` errors.
2. Navigate to the directory of the framework you have chosen (e.g. Keras_Tensorflow).
3. Create the Python virtual environment using the environment.yml:
```
conda env create -f environment.yml
```
4. Activate the virtual environment:
```
source activate deployment_aml
```
5. Login to Azure:
```
az login
```
6. If you have more than one Azure subscription, select the one you want to use:
```
az account set --subscription <Your Azure Subscription>
```
7. Start the Jupyter notebook server in the virtual environment:
```
jupyter notebook
```
8. Select the correct kernel: set the kernel to be `Python [conda env: deployment_aml]` (or `Python 3` if that option does not show).
9. After following the setup instructions above, run the Jupyter notebooks in order starting with the first notebook [00_AMLSetup.ipynb](./00_AMLSetup.ipynb).
<a id='cleanup'></a>
## Cleaning up
To remove the conda environment created see [here](https://conda.io/projects/continuumio-conda/en/latest/commands/remove.html). The [last Jupyter notebook](./05_TearDown.ipynb) also gives details on deleting Azure resources associated with this repository.
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.microsoft.com.
When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repositories using our CLA.
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

View file

@ -0,0 +1,48 @@
#!/usr/bin/env bash
# Install the repository configuration. Replace __release with 16.04 or 18.04 as appropriate for your release of Ubuntu
curl https://packages.microsoft.com/config/ubuntu/__release/prod.list > ./microsoft-prod.list
# Copy the generated list
sudo cp ./microsoft-prod.list /etc/apt/sources.list.d/
#Install Microsoft GPG public key
curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg
sudo cp ./microsoft.gpg /etc/apt/trusted.gpg.d/
#################################################################################
#Install the container runtime. It can be skipped if Docker is already installed
# Update the apt package index
#sudo apt-get update
# Install the Moby engine.
#sudo apt-get install moby-engine
################################################################################
# Install the Azure IoT Edge Security Daemon
# Perform apt update
sudo apt-get update
# Install the Moby command-line interface (CLI). The CLI is useful for development but optional for production deployments.
sudo apt-get install moby-cli
# Install the security daemon. The package is installed at /etc/iotedge/.
sudo apt-get install iotedge -y --no-install-recommends
################################################################################
# Configure the Azure IoT Edge security daemon
# Manually provision the IoT Edge device (__device_connection_string is a placeholder for your device connection string)
sudo sed -i "s#\(device_connection_string: \).*#\1\"__device_connection_string\"#g" /etc/iotedge/config.yaml
sudo systemctl restart iotedge
###########################################
# Verify successful installation
# check the status of the IoT Edge Daemon
systemctl status iotedge
# Examine daemon logs
journalctl -u iotedge --no-pager --no-full

View file

@ -0,0 +1,372 @@
# -*- coding: utf-8 -*-
"""ResNet152 model for Keras.
# Reference:
- [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)
Adaptation of code from flyyufelix, mvoelk, BigMoyan, fchollet at https://github.com/adamcasson/resnet152
"""
import numpy as np
import warnings
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Activation
from keras.layers import Flatten
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import GlobalMaxPooling2D
from keras.layers import ZeroPadding2D
from keras.layers import AveragePooling2D
from keras.layers import GlobalAveragePooling2D
from keras.layers import BatchNormalization
from keras.layers import add
from keras.models import Model
import keras.backend as K
from keras.engine.topology import get_source_inputs
from keras.utils import layer_utils
from keras import initializers
from keras.engine import Layer, InputSpec
from keras.preprocessing import image
from keras.utils.data_utils import get_file
from keras.applications.imagenet_utils import decode_predictions
from keras.applications.imagenet_utils import preprocess_input
from keras.applications.imagenet_utils import _obtain_input_shape
import sys
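# Raised as a precaution: building and (de)serializing this very deep graph
# can otherwise exceed Python's default recursion limit.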
sys.setrecursionlimit(3000)
WEIGHTS_PATH = 'https://github.com/adamcasson/resnet152/releases/download/v0.1/resnet152_weights_tf.h5'
WEIGHTS_PATH_NO_TOP = 'https://github.com/adamcasson/resnet152/releases/download/v0.1/resnet152_weights_tf_notop.h5'
class Scale(Layer):
"""Custom Layer for ResNet used for BatchNormalization.
Learns a set of weights and biases used for scaling the input data.
    The output is simply an element-wise affine transform of the input:
        out = in * gamma + beta,
    where 'gamma' and 'beta' are the learned weights and biases.
Keyword arguments:
axis -- integer, axis along which to normalize in mode 0. For instance,
if your input tensor has shape (samples, channels, rows, cols),
set axis to 1 to normalize per feature map (channels axis).
momentum -- momentum in the computation of the exponential average
of the mean and standard deviation of the data, for
feature-wise normalization.
weights -- Initialization weights.
List of 2 Numpy arrays, with shapes:
`[(input_shape,), (input_shape,)]`
beta_init -- name of initialization function for shift parameter
(see [initializers](../initializers.md)), or alternatively,
Theano/TensorFlow function to use for weights initialization.
This parameter is only relevant if you don't pass a `weights` argument.
gamma_init -- name of initialization function for scale parameter (see
[initializers](../initializers.md)), or alternatively,
Theano/TensorFlow function to use for weights initialization.
This parameter is only relevant if you don't pass a `weights` argument.
"""
def __init__(self, weights=None, axis=-1, momentum = 0.9, beta_init='zero', gamma_init='one', **kwargs):
self.momentum = momentum
self.axis = axis
self.beta_init = initializers.get(beta_init)
self.gamma_init = initializers.get(gamma_init)
self.initial_weights = weights
super(Scale, self).__init__(**kwargs)
def build(self, input_shape):
self.input_spec = [InputSpec(shape=input_shape)]
shape = (int(input_shape[self.axis]),)
self.gamma = K.variable(self.gamma_init(shape), name='%s_gamma'%self.name)
self.beta = K.variable(self.beta_init(shape), name='%s_beta'%self.name)
self.trainable_weights = [self.gamma, self.beta]
if self.initial_weights is not None:
self.set_weights(self.initial_weights)
del self.initial_weights
def call(self, x, mask=None):
input_shape = self.input_spec[0].shape
broadcast_shape = [1] * len(input_shape)
broadcast_shape[self.axis] = input_shape[self.axis]
out = K.reshape(self.gamma, broadcast_shape) * x + K.reshape(self.beta, broadcast_shape)
return out
def get_config(self):
config = {"momentum": self.momentum, "axis": self.axis}
base_config = super(Scale, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
def identity_block(input_tensor, kernel_size, filters, stage, block):
"""The identity_block is the block that has no conv layer at shortcut
    Keyword arguments:
    input_tensor -- input tensor
    kernel_size -- default 3, the kernel size of middle conv layer at main path
filters -- list of integers, the nb_filters of 3 conv layer at main path
stage -- integer, current stage label, used for generating layer names
block -- 'a','b'..., current block label, used for generating layer names
"""
eps = 1.1e-5
if K.image_dim_ordering() == 'tf':
bn_axis = 3
else:
bn_axis = 1
nb_filter1, nb_filter2, nb_filter3 = filters
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
scale_name_base = 'scale' + str(stage) + block + '_branch'
x = Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a', use_bias=False)(input_tensor)
x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x)
x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x)
x = Activation('relu', name=conv_name_base + '2a_relu')(x)
x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x)
x = Conv2D(nb_filter2, (kernel_size, kernel_size), name=conv_name_base + '2b', use_bias=False)(x)
x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x)
x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x)
x = Activation('relu', name=conv_name_base + '2b_relu')(x)
x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=False)(x)
x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x)
x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x)
x = add([x, input_tensor], name='res' + str(stage) + block)
x = Activation('relu', name='res' + str(stage) + block + '_relu')(x)
return x
def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)):
"""conv_block is the block that has a conv layer at shortcut
Keyword arguments:
input_tensor -- input tensor
    kernel_size -- default 3, the kernel size of middle conv layer at main path
filters -- list of integers, the nb_filters of 3 conv layer at main path
stage -- integer, current stage label, used for generating layer names
block -- 'a','b'..., current block label, used for generating layer names
    Note that from stage 3, the first conv layer in the main path uses strides=(2, 2),
    and the shortcut uses strides=(2, 2) as well.
"""
eps = 1.1e-5
if K.image_dim_ordering() == 'tf':
bn_axis = 3
else:
bn_axis = 1
nb_filter1, nb_filter2, nb_filter3 = filters
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
scale_name_base = 'scale' + str(stage) + block + '_branch'
x = Conv2D(nb_filter1, (1, 1), strides=strides, name=conv_name_base + '2a', use_bias=False)(input_tensor)
x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x)
x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x)
x = Activation('relu', name=conv_name_base + '2a_relu')(x)
x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x)
x = Conv2D(nb_filter2, (kernel_size, kernel_size),
name=conv_name_base + '2b', use_bias=False)(x)
x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x)
x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x)
x = Activation('relu', name=conv_name_base + '2b_relu')(x)
x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=False)(x)
x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x)
x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x)
shortcut = Conv2D(nb_filter3, (1, 1), strides=strides,
name=conv_name_base + '1', use_bias=False)(input_tensor)
shortcut = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '1')(shortcut)
shortcut = Scale(axis=bn_axis, name=scale_name_base + '1')(shortcut)
x = add([x, shortcut], name='res' + str(stage) + block)
x = Activation('relu', name='res' + str(stage) + block + '_relu')(x)
return x
def ResNet152(include_top=True, weights=None,
input_tensor=None, input_shape=None,
large_input=False, pooling=None,
classes=1000):
"""Instantiate the ResNet152 architecture.
Keyword arguments:
include_top -- whether to include the fully-connected layer at the
top of the network. (default True)
weights -- one of `None` (random initialization) or "imagenet"
(pre-training on ImageNet). (default None)
input_tensor -- optional Keras tensor (i.e. output of `layers.Input()`)
    to use as image input for the model. (default None)
input_shape -- optional shape tuple, only to be specified if
`include_top` is False (otherwise the input shape has to be
`(224, 224, 3)` (with `channels_last` data format) or
`(3, 224, 224)` (with `channels_first` data format). It should
    have exactly 3 input channels, and width and height should be
no smaller than 197. E.g. `(200, 200, 3)` would be one valid value.
(default None)
large_input -- if True, then the input shape expected will be
`(448, 448, 3)` (with `channels_last` data format) or
`(3, 448, 448)` (with `channels_first` data format). (default False)
pooling -- Optional pooling mode for feature extraction when
`include_top` is `False`.
- `None` means that the output of the model will be the 4D
tensor output of the last convolutional layer.
- `avg` means that global average pooling will be applied to
the output of the last convolutional layer, and thus
the output of the model will be a 2D tensor.
- `max` means that global max pooling will be applied.
(default None)
classes -- optional number of classes to classify image into, only
to be specified if `include_top` is True, and if no `weights`
argument is specified. (default 1000)
Returns:
A Keras model instance.
Raises:
ValueError: in case of invalid argument for `weights`,
or invalid input shape.
"""
if weights not in {'imagenet', None}:
raise ValueError('The `weights` argument should be either '
'`None` (random initialization) or `imagenet` '
'(pre-training on ImageNet).')
if weights == 'imagenet' and include_top and classes != 1000:
raise ValueError('If using `weights` as imagenet with `include_top`'
' as true, `classes` should be 1000')
eps = 1.1e-5
if large_input:
img_size = 448
else:
img_size = 224
# Determine proper input shape
input_shape = _obtain_input_shape(input_shape,
default_size=img_size,
min_size=197,
data_format=K.image_data_format(),
require_flatten=include_top)
if input_tensor is None:
img_input = Input(shape=input_shape)
else:
if not K.is_keras_tensor(input_tensor):
img_input = Input(tensor=input_tensor, shape=input_shape)
else:
img_input = input_tensor
# handle dimension ordering for different backends
if K.image_dim_ordering() == 'tf':
bn_axis = 3
else:
bn_axis = 1
x = ZeroPadding2D((3, 3), name='conv1_zeropadding')(img_input)
x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=False)(x)
x = BatchNormalization(epsilon=eps, axis=bn_axis, name='bn_conv1')(x)
x = Scale(axis=bn_axis, name='scale_conv1')(x)
x = Activation('relu', name='conv1_relu')(x)
x = MaxPooling2D((3, 3), strides=(2, 2), name='pool1')(x)
x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1))
x = identity_block(x, 3, [64, 64, 256], stage=2, block='b')
x = identity_block(x, 3, [64, 64, 256], stage=2, block='c')
x = conv_block(x, 3, [128, 128, 512], stage=3, block='a')
for i in range(1,8):
x = identity_block(x, 3, [128, 128, 512], stage=3, block='b'+str(i))
x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a')
for i in range(1,36):
x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b'+str(i))
x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a')
x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b')
x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c')
if large_input:
x = AveragePooling2D((14, 14), name='avg_pool')(x)
else:
x = AveragePooling2D((7, 7), name='avg_pool')(x)
# include classification layer by default, not included for feature extraction
if include_top:
x = Flatten()(x)
x = Dense(classes, activation='softmax', name='fc1000')(x)
else:
if pooling == 'avg':
x = GlobalAveragePooling2D()(x)
elif pooling == 'max':
x = GlobalMaxPooling2D()(x)
# Ensure that the model takes into account
# any potential predecessors of `input_tensor`.
if input_tensor is not None:
inputs = get_source_inputs(input_tensor)
else:
inputs = img_input
# Create model.
model = Model(inputs, x, name='resnet152')
# load weights
if weights == 'imagenet':
if include_top:
weights_path = get_file('resnet152_weights_tf.h5',
WEIGHTS_PATH,
cache_subdir='models',
md5_hash='cdb18a2158b88e392c0905d47dcef965')
else:
weights_path = get_file('resnet152_weights_tf_notop.h5',
WEIGHTS_PATH_NO_TOP,
cache_subdir='models',
md5_hash='4a90dcdafacbd17d772af1fb44fc2660')
model.load_weights(weights_path, by_name=True)
if K.backend() == 'theano':
layer_utils.convert_all_kernels_in_model(model)
if include_top:
maxpool = model.get_layer(name='avg_pool')
shape = maxpool.output_shape[1:]
dense = model.get_layer(name='fc1000')
layer_utils.convert_dense_weights_data_format(dense, shape, 'channels_first')
if K.image_data_format() == 'channels_first' and K.backend() == 'tensorflow':
warnings.warn('You are using the TensorFlow backend, yet you '
'are using the Theano '
'image data format convention '
'(`image_data_format="channels_first"`). '
'For best performance, set '
'`image_data_format="channels_last"` in '
'your Keras config '
'at ~/.keras/keras.json.')
return model
if __name__ == '__main__':
model = ResNet152(include_top=True, weights='imagenet')
img_path = 'elephant.jpg'
img = image.load_img(img_path, target_size=(224,224))
x = image.img_to_array(img)
x = np.expand_dims(x, axis=0)
x = preprocess_input(x)
print('Input image shape:', x.shape)
preds = model.predict(x)
print('Predicted:', decode_predictions(preds))

View file

@ -0,0 +1,146 @@
import json
import logging
import random
import time
import urllib
from io import BytesIO
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import toolz
from PIL import Image, ImageOps
from azureml.core.authentication import AuthenticationException, AzureCliAuthentication, InteractiveLoginAuthentication
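# Shared helpers for the deployment notebooks: image download/resize utilities,
# prediction plotting, Azure authentication, and a simple endpoint readiness poll.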
def read_image_from(url):
return toolz.pipe(url, urllib.request.urlopen, lambda x: x.read(), BytesIO)
def to_rgb(img_bytes):
return Image.open(img_bytes).convert("RGB")
@toolz.curry
def resize(img_file, new_size=(100, 100)):
return ImageOps.fit(img_file, new_size, Image.ANTIALIAS)
def to_bytes(img, encoding="JPEG"):
imgio = BytesIO()
img.save(imgio, encoding)
imgio.seek(0)
return imgio.read()
def to_img(img_url):
return toolz.pipe(img_url, read_image_from, to_rgb, resize(new_size=(224, 224)))
def _plot_image(ax, img):
ax.imshow(to_img(img))
ax.tick_params(
axis="both",
which="both",
bottom=False,
top=False,
left=False,
right=False,
labelleft=False,
labelbottom=False,
)
return ax
def _plot_prediction_bar(ax, r):
perf = [float(c[2]) for c in r.json()[0]["image"]]
ax.barh(range(3, 0, -1), perf, align="center", color="#55DD55")
ax.tick_params(
axis="both",
which="both",
bottom=False,
top=False,
left=False,
right=False,
labelbottom=False,
)
tick_labels = reversed([c[1] for c in r.json()[0]["image"]])
ax.yaxis.set_ticks([1, 2, 3])
ax.yaxis.set_ticklabels(
tick_labels, position=(0.5, 0), minor=False, horizontalalignment="center"
)
def plot_predictions(images, classification_results):
if len(images) != 6:
raise Exception("This method is only designed for 6 images")
gs = gridspec.GridSpec(2, 3)
fig = plt.figure(figsize=(12, 9))
gs.update(hspace=0.1, wspace=0.001)
for gg, r, img in zip(gs, classification_results, images):
gg2 = gridspec.GridSpecFromSubplotSpec(4, 10, subplot_spec=gg)
ax = fig.add_subplot(gg2[0:3, :])
_plot_image(ax, img)
ax = fig.add_subplot(gg2[3, 1:9])
_plot_prediction_bar(ax, r)
def write_json_to_file(json_dict, filename, mode="w"):
with open(filename, mode) as outfile:
json.dump(json_dict, outfile, indent=4, sort_keys=True)
outfile.write("\n\n")
def gen_variations_of_one_image(IMAGEURL, num):
out_images = []
img = to_img(IMAGEURL).convert("RGB")
    # Create a slightly "different" image each iteration by flipping the
    # colour channels of one randomly chosen pixel
for i in range(num):
diff_img = img.copy()
rndm_pixel_x_y = (
random.randint(0, diff_img.size[0] - 1),
random.randint(0, diff_img.size[1] - 1),
)
current_color = diff_img.getpixel(rndm_pixel_x_y)
diff_img.putpixel(rndm_pixel_x_y, current_color[::-1])
out_images.append(to_bytes(diff_img))
return out_images
def get_auth():
logger = logging.getLogger(__name__)
logger.debug("Trying to create Workspace with CLI Authentication")
try:
auth = AzureCliAuthentication()
auth.get_authentication_header()
except AuthenticationException:
logger.debug("Trying to create Workspace with Interactive login")
auth = InteractiveLoginAuthentication()
return auth
def wait_until_ready(endpoint, max_attempts):
code = 0
attempts = 0
while code != 200:
attempts += 1
if attempts == max_attempts:
print("Unable to connect to endpoint, quitting")
raise Exception(
"Endpoint unavailable in " + str(max_attempts) + " attempts."
)
try:
code = urllib.request.urlopen(endpoint).getcode()
except Exception as error:
print(
"Exception caught opening endpoint :" + str(endpoint) + " " + str(error)
)
if code != 200:
print("Endpoint unavailable, waiting")
time.sleep(10)
output_str = "We are all done with code " + str(code)
return output_str

141
archectures/Python-Keras-Scoring/.gitignore vendored Normal file
View file

@ -0,0 +1,141 @@
# aml stuff
aml_config
*.swp
*.vscode
*.ipynb_checkpoints
*__pycache__
# fuse connection
fuse_connection.cfg
# mounted disk
data/
# dont commit images or videos
*.jpg
*.png
*.mov
*.mp4
*.mp3
*.aac
*.avi
!sample*.jpg
!sample*.png
!sample*.mov
!sample*.mp4
# never commit config (as it contains secrets)
.env
# do not commit venv folder
pyenv
# ignore .tmp folder
.tmp
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
.dev_env
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/

View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) Microsoft Corporation. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE

View file

@ -0,0 +1,99 @@
.ONESHELL:
SHELL=/bin/bash
define PROJECT_HELP_MSG
Makefile for testing notebooks
Make sure you have edited the dev_env_template file and renamed it to .dev_env
All the variables loaded in this makefile must come from the .dev_env file
Usage:
make test run all notebooks
make clean delete env and remove files
endef
export PROJECT_HELP_MSG
include .dev_env
help:
echo "$$PROJECT_HELP_MSG" | less
test: setup test-notebook1 test-notebook2 test-notebook3 test-notebook4 test-notebook5
@echo All Notebooks Passed
setup:
conda env create -f environment.yml
ifndef TENANT_ID
@echo starting interactive login
az login -o table
else
@echo using service principal login
az login -t ${TENANT_ID} --service-principal -u ${SP_USERNAME} --password ${SP_PASSWORD}
endif
test-notebook1:
source activate batchscoringdl_aml
cd notebooks
@echo Testing 01_local_testing.ipynb
papermill 01_local_testing.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-notebook2:
source activate batchscoringdl_aml
cd notebooks
@echo Testing 02_setup_aml.ipynb
papermill 02_setup_aml.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3 \
-p subscription_id ${SUBSCRIPTION_ID} \
-p resource_group ${RESOURCE_GROUP} \
-p workspace_name ${WORKSPACE_NAME} \
-p workspace_region ${WORKSPACE_REGION} \
-p storage_account_name ${STORAGE_ACCOUNT_NAME}
test-notebook3:
source activate batchscoringdl_aml
cd notebooks
@echo Testing 03_develop_pipeline.ipynb
papermill 03_develop_pipeline.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-notebook4:
source activate batchscoringdl_aml
cd notebooks
@echo Testing 04_deploy_logic_apps.ipynb
papermill 04_deploy_logic_apps.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-notebook5:
source activate batchscoringdl_aml
cd notebooks
@echo Testing 05_clean_up.ipynb
papermill 05_clean_up.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
remove-notebook:
rm -f notebooks/test.ipynb
clean: remove-notebook
conda remove --name batchscoringdl_aml -y --all
cd notebooks
./clean_up.sh
rm -rf aml_config
rm -rf aml_test_orangutan
rm -rf __pycache__
rm -rf .ipynb_checkpoints
.PHONY: help test setup clean remove-notebook test-notebook1 test-notebook2 test-notebook3 test-notebook4 test-notebook5

View file

@ -0,0 +1,73 @@
[![Build Status](https://dev.azure.com/customai/BatchScoringDeepLearningModelsWithAMLPipeline/_apis/build/status/Azure.Batch-Scoring-Deep-Learning-Models-With-AML?branchName=master)](https://dev.azure.com/customai/BatchScoringDeepLearningModelsWithAMLPipeline/_build/latest?definitionId=9&branchName=master)
# Batch Scoring Deep Learning Models With Azure Machine Learning
## Overview
As described in the associated page on the [Azure Reference Architecture center](https://docs.microsoft.com/en-us/azure/architecture/reference-architectures/ai/batch-scoring-deep-learning), in this repository we use the scenario of applying style transfer to a video (a collection of frames). This architecture can be generalized for any batch scoring with deep learning scenario. An alternative solution using Azure Kubernetes Service can be found [here](https://github.com/Azure/Batch-Scoring-Deep-Learning-Models-With-AKS).
## Design
![Reference Architecture Diagram](https://happypathspublic.blob.core.windows.net/assets/batch_scoring_for_dl/batchscoringdl-aml-architecture-diagram.jpg)
The above architecture works as follows:
1. Upload a video file to storage.
2. The video file triggers a Logic App, which sends a request to the published AML pipeline endpoint.
3. The pipeline will then process the video, apply style transfer with MPI, and postprocess the video.
4. The output will be saved back to blob storage once the pipeline is completed.
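For orientation, step 2 boils down to an authenticated HTTP POST against the pipeline's published REST endpoint. Below is a minimal Python sketch of that call; the endpoint URL and experiment name are illustrative, so substitute the values from your own published pipeline:
```
import requests
from azureml.core.authentication import AzureCliAuthentication

# Illustrative endpoint; use the REST endpoint of your published pipeline.
endpoint = "https://<region>.api.azureml.ms/<your-published-pipeline-endpoint>"
headers = AzureCliAuthentication().get_authentication_header()

resp = requests.post(endpoint, headers=headers, json={"ExperimentName": "styletransfer"})
resp.raise_for_status()
print(resp.json())  # contains the Id of the submitted pipeline run
```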
### What is Neural Style Transfer
| Style image: | Input/content video: | Output video: |
|--------|--------|---------|
| <img src="https://happypathspublic.blob.core.windows.net/assets/batch_scoring_for_dl/style_image.jpg" width="300"> | [<img src="https://happypathspublic.blob.core.windows.net/assets/batch_scoring_for_dl/input_video_image_0.jpg" width="300" height="300">](https://happypathspublic.blob.core.windows.net/assets/batch_scoring_for_dl/input_video.mp4 "Input Video") *click to view video* | [<img src="https://happypathspublic.blob.core.windows.net/assets/batch_scoring_for_dl/output_video_image_0.jpg" width="300" height="300">](https://happypathspublic.blob.core.windows.net/assets/batch_scoring_for_dl/output_video.mp4 "Output Video") *click to view* |
## Prerequisites
Local/Working Machine:
- Ubuntu >=16.04LTS (not tested on Mac or Windows)
- (Optional) [NVIDIA Drivers on GPU enabled machine](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-18-04-bionic-beaver-linux) [Additional ref: [https://github.com/NVIDIA/nvidia-docker](https://github.com/NVIDIA/nvidia-docker)]
- [Conda >=4.5.4](https://conda.io/docs/)
- [AzCopy >=7.0.0](https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux?toc=%2fazure%2fstorage%2ffiles%2ftoc.json)
- [Azure CLI >=2.0](https://docs.microsoft.com/en-us/cli/azure/?view=azure-cli-latest)
Accounts:
- [Azure Subscription](https://azure.microsoft.com/en-us/free/)
- (Optional) A [quota](https://docs.microsoft.com/en-us/azure/azure-supportability/resource-manager-core-quotas-request) for GPU-enabled VMs
While it is not required, it is also useful to use the [Azure Storage Explorer](https://azure.microsoft.com/en-us/features/storage-explorer/) to inspect your storage account.
## Setup
1. Clone the repo `git clone https://github.com/Azure/Batch-Scoring-Deep-Learning-Models-With-AML`
2. `cd` into the repo
3. Set up your conda env using the _environment.yml_ file: `conda env create -f environment.yml` - this will create a conda environment called __batchscoringdl_aml__
4. Activate your environment `conda activate batchscoringdl_aml`
5. Log in to Azure using the __az cli__ `az login`
## Steps
Run through the following notebooks:
1. [Test the scripts](notebooks/01_local_testing.ipynb)
2. [Setup AML](notebooks/02_setup_aml.ipynb).
3. [Develop & publish AML pipeline](notebooks/03_develop_pipeline.ipynb)
4. [Deploy Logic Apps](notebooks/04_deploy_logic_apps.ipynb)
5. [Clean up](notebooks/05_clean_up.ipynb)
## Clean up
To clean up your working directory, you can run the `clean_up.sh` script that comes with this repo. This will remove all temporary directories that were generated as well as any configuration (such as Dockerfiles) that were created during the tutorials. This script will _not_ remove the `.env` file.
To clean up your Azure resources, you can simply delete the resource group that all your resources were deployed into. This can be done in the `az cli` using the command `az group delete --name <name-of-your-resource-group>`, or in the portal. If you want to keep certain resources, you can also use the `az cli` or the Azure portal to cherry pick the ones you want to deprovision. Finally, you should also delete the service principal using the `az ad sp delete` command.
All the steps above are covered in the final [notebook](notebooks/05_clean_up.ipynb).
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.microsoft.com.
When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repos using our CLA.
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

View file

@ -0,0 +1,106 @@
# BatchScoringDeepLearningModelsWithAML Pipeline
trigger:
batch: true
branches:
include:
- master
variables:
- group: AzureKeyVault
jobs:
- job: BatchScoringDeepLearningModelsWithAMLJob
timeoutInMinutes: 300
cancelTimeoutInMinutes: 2
pool:
vmImage: 'Ubuntu-16.04'
steps:
- bash: |
source /usr/share/miniconda/etc/profile.d/conda.sh
which conda
conda env create -f environment.yml
conda activate batchscoringdl_aml
conda env list
echo Login Azure Account
az login -t $(sptenent) --service-principal -u $(spidentity) --password $(spsecret)
displayName: 'Initial Step'
- bash: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate batchscoringdl_aml
conda env list
cd notebooks
echo Execute 01_local_testing.ipynb
papermill 01_local_testing.ipynb 01_local_testing_output.ipynb --log-output --no-progress-bar -k python3
displayName: '01_local_testing.ipynb'
- bash: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate batchscoringdl_aml
conda env list
cd notebooks
echo Execute 02_setup_aml.ipynb
papermill 02_setup_aml.ipynb 02_setup_aml_output.ipynb --log-output --no-progress-bar -k python3 \
-p subscription_id $(subscriptionid) \
-p resource_group $(azurergname) \
-p workspace_name $(workspacename) \
-p workspace_region $(azureregion) \
-p storage_account_name $(azurestorage)
displayName: '02_setup_aml.ipynb'
- bash: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate batchscoringdl_aml
conda env list
cd notebooks
echo Execute 03_develop_pipeline.ipynb
papermill 03_develop_pipeline.ipynb 03_develop_pipeline_output.ipynb --log-output --no-progress-bar -k python3
displayName: '03_develop_pipeline.ipynb'
- bash: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate batchscoringdl_aml
conda env list
cd notebooks
echo Execute 04_deploy_logic_apps.ipynb
papermill 04_deploy_logic_apps.ipynb 04_deploy_logic_apps_output.ipynb --log-output --no-progress-bar -k python3
displayName: '04_deploy_logic_apps.ipynb'
- bash: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate batchscoringdl_aml
conda env list
cd notebooks
echo Execute 05_clean_up.ipynb
papermill 05_clean_up.ipynb 05_clean_up_output.ipynb --log-output --no-progress-bar -k python3
displayName: '05_clean_up.ipynb'
- bash: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate batchscoringdl_aml
echo Execute Resource Group Delete
existResponse=$(az group exists -n $(azurergname))
if [ "$existResponse" == "true" ]; then
echo Deleting project resource group
az group delete --name $(azurergname) --yes
else
echo Project resource group did not exist
fi
echo Done Cleanup
displayName: 'Backup Cleanup'
condition: or(canceled(),failed())
- task: CreateWorkItem@1
inputs:
workItemType: 'Issue'
title: $(System.TeamProject) - Build $(Build.BuildNumber) Failed
assignedTo: 'JS <jiata@microsoft.com>'
associate: true
teamProject: $(System.TeamProject)
fieldMappings: |
Description=Branch: Branch $(Build.SourceBranch) failed to build. Go to Boards>WorkItems and tag the failure type.
displayName: 'Create work item on failure'
condition: failed()


@ -0,0 +1,10 @@
# Fill in the fields below and rename to .dev_env
# TENANT_ID, SP_USERNAME and SP_PASSWORD are optional. If not supplied, the Azure CLI will default to interactive login
TENANT_ID=
SP_USERNAME=
SP_PASSWORD=
SUBSCRIPTION_ID=
RESOURCE_GROUP="batchscorerg"
WORKSPACE_NAME="workspace"
WORKSPACE_REGION="eastus"
STORAGE_ACCOUNT_NAME="batchscorest"


@ -0,0 +1,26 @@
name: batchscoringdl_aml
channels:
- conda-forge
- defaults
dependencies:
- ffmpeg=4.0.2
- nb_conda=2.2.1
- nb_conda_kernels=2.1.1
- tornado<6
- nbconvert=5.3.1
- pip=10.0.1
- python=3.6.7
- prompt_toolkit=1.0.15
- pip:
- azure
- azureml-sdk
- azure-cli
- azure.mgmt.common
- azure-mgmt-storage
- black==18.9b0
- python-dotenv==0.9.1
- torch==0.4.1
- torchvision==0.2.1
- ipykernel==4.10.0
- papermill==0.14.1
- urllib3<1.25

Diff not shown because of its size.

@ -0,0 +1,441 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup AML"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook, we'll setup our Azure Machine Learning workspace as well as another storage account."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### AML Installation and configuration\n",
"This notebook configures the notebooks in this tutorial to connect to an Azure Machine Learning (AML) Workspace. You can use an existing workspace or create a new one."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import azureml.core\n",
"from azureml.core import Workspace\n",
"from dotenv import set_key, get_key, find_dotenv, load_dotenv\n",
"from pathlib import Path\n",
"import json\n",
"import os\n",
"from utilities import get_auth"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"tags": [
"install"
]
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AML SDK Version: 1.0.17\n"
]
}
],
"source": [
"print(\"AML SDK Version:\", azureml.core.VERSION) # 1.0.2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set up your Azure Machine Learning workspace"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To create or access an Azure ML Workspace, you will need the following information:\n",
"\n",
"* Your subscription id\n",
"* A resource group name\n",
"* A name for your workspace\n",
"* A region for your workspace\n",
"\n",
"**Note**: As with other Azure services, there are limits on certain resources like cluster size associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you have a workspace created already, you need to get your subscription and workspace information. You can find the values for those by visiting your workspace in the [Azure portal](http://portal.azure.com). If you don't have a workspace, the create workspace command in the next section will create a resource group and a workspace using the names you provide."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" Replace the values in the following cell with your information.\n",
" \n",
" - you can navigate to the portal to find your subscription id or enter `az account list -o table` into the cli\n",
" - you can use `az account list-locations -o table` to find the available locations (use the values in the field 'name')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"subscription_id = \"<YOUR-SUBSCRIPTION>\"\n",
"resource_group = \"<YOUR-RESOURCE-GROUP-NAME>\"\n",
"workspace_name = \"<YOUR-WORKSPACE-NAME>\"\n",
"workspace_region = \"<YOUR-REGION>\"\n",
"storage_account_name = \"<YOUR-STORAGE-ACCOUNT-NAME>\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create and initialize a dotenv file for storing parameters used in multiple notebooks."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv()\n",
"if env_path == \"\":\n",
" Path(\".env\").touch()\n",
" env_path = find_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [
{
"data": {
"text/plain": [
"(True, 'REGION', 'eastus')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"set_key(env_path, \"SUBSCRIPTION_ID\", subscription_id) \n",
"set_key(env_path, \"RESOURCE_GROUP\", resource_group)\n",
"set_key(env_path, \"WORKSPACE_NAME\", workspace_name)\n",
"set_key(env_path, \"REGION\", workspace_region)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create the workspace\n",
"This cell will create an AML workspace for you in a subscription, provided you have the correct permissions.\n",
"\n",
"This will fail when:\n",
"1. You do not have permission to create a workspace in the resource group\n",
"2. You do not have permission to create a resource group if it's non-existing.\n",
"2. You are not a subscription owner or contributor and no Azure ML workspaces have ever been created in this subscription\n",
"\n",
"If workspace creation fails, please work with your IT admin to provide you with the appropriate permissions or to provision the required resources. If this cell succeeds, you're done configuring AML! "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"create workspace"
]
},
"outputs": [],
"source": [
"# import the Workspace class\n",
"ws = Workspace.create(\n",
" name=workspace_name,\n",
" subscription_id=subscription_id,\n",
" resource_group=resource_group,\n",
" location=workspace_region,\n",
" create_resource_group=True,\n",
" exist_ok=True,\n",
" auth=get_auth()\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check the details of the workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": [
"ws.get_details()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's write the workspace configuration for the rest of the notebooks to connect to the workspace."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wrote the config file config.json to: /home/mat/repos/Batch-Scoring-Deep-Learning-Models-With-AML/notebooks/aml_config/config.json\n"
]
}
],
"source": [
"ws.write_config()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"At this point, you can inspect your resource group to see your active Azure Machine Learning workspace. Inside your resource group, you'll find various resources that have been generated to back the Machine Learning workspace. These resources will have a generated resource name, such as `<workspace-name>keyvaultpuvogqzk`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setup a Storage Account"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One of the resources that will have been generated when created the workspace is a storage account. This storage account is used as the default datastore for Azure Machine Learning. However, to keep things seperate, we'll create another storage account for holding input and output data. This will also let us walk through how to connect our workspace to other storage locations."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"storage_container_name = \"aml\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use the __az cli__ to create the account"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K\"Succeeded\" ..\n",
"\u001b[0m"
]
}
],
"source": [
"!az storage account create \\\n",
" -n {storage_account_name} \\\n",
" -g {resource_group} \\\n",
" --query 'provisioningState'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use the __az cli__ to grab the keys of the storage account that was just created. The `--quote '[0].value'` part of the command simply means to select the _value_ of the _zero-th indexed_ of the set of keys."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"key = !az storage account keys list \\\n",
" --account-name {storage_account_name} \\\n",
" -g {resource_group} \\\n",
" --query '[0].value'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The stdout from the command above is stored in a string array of 1. Select the element in the array and ttrip opening and closing quotation marks."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"storage_account_key = str(key[0][1:-1]) # this is used to strip opening and closing quotation marks"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use the __az cli__ to create the container in the storage account"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\r\n",
" \"created\": true\r\n",
"}\r\n"
]
}
],
"source": [
"!az storage container create \\\n",
" --account-name {storage_account_name} \\\n",
" --account-key {storage_account_key} \\\n",
" --name {storage_container_name}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set storage account key to dotenv file"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [
{
"data": {
"text/plain": [
"(True, 'STORAGE_CONTAINER_NAME', 'aml')"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"set_key(env_path, \"STORAGE_ACCOUNT_NAME\", storage_account_name)\n",
"set_key(env_path, \"STORAGE_ACCOUNT_KEY\", storage_account_key) # generated\n",
"set_key(env_path, \"STORAGE_CONTAINER_NAME\", storage_container_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You are now ready to move on to the [next notebook](03_develop_pipeline.ipynb)."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:batchscoringdl_aml]",
"language": "python",
"name": "conda-env-batchscoringdl_aml-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -0,0 +1,946 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Develop our pipeline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook, we will develop our Azure Machine Learning Pipeline. The Azure Machine learning pipeline will string together the steps of preprocessing the video, applying style transfer, and postprocessing the video into a single execution graph. \n",
"\n",
"To setup the pipeline, we'll need to make sure we have the necessary compute and storage available. To do so, we'll need to create our compute platform using AmlCompute and register the storage account that we created in the previous notebook.\n",
"\n",
"The last step of this notebook is to publish the pipeline. Once it's published as a public endpoint, we'll test it to make sure that it runs as expected."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Import package and load .env"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from dotenv import set_key, get_key, find_dotenv, load_dotenv\n",
"from pathlib import Path\n",
"from azureml.core import Workspace, Run, Experiment\n",
"from azureml.core.compute import AmlCompute, ComputeTarget\n",
"from azureml.core.datastore import Datastore\n",
"from azureml.data.data_reference import DataReference\n",
"from azureml.pipeline.core import Pipeline, PipelineData\n",
"from azureml.pipeline.steps import PythonScriptStep, MpiStep\n",
"from azureml.core.runconfig import CondaDependencies, RunConfiguration\n",
"from azureml.core.runconfig import DEFAULT_CPU_IMAGE #, DEFAULT_GPU_IMAGE\n",
"from IPython.core.display import display, HTML\n",
"from azureml.data.datapath import DataPath, DataPathComputeBinding\n",
"from azureml.pipeline.core.graph import PipelineParameter\n",
"from azureml.core.authentication import AzureCliAuthentication\n",
"import subprocess\n",
"import requests\n",
"import json\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)\n",
"load_dotenv(env_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setup the workspace in AML"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get our workspace from the config file."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')\n",
"\n",
"# Also create a Project and attach to Workspace\n",
"project_folder = \"scripts\"\n",
"run_history_name = project_folder\n",
"\n",
"if not os.path.isdir(project_folder):\n",
" os.mkdir(project_folder)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setup the compute"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create our compute using `AmlCompute`. We'll need one node for the video pre/post processing. And the remaining nodes for performing the style transfer. Since we'll be using the MPI Step, all nodes must be active before the MPI step will execute. Thus, we should set max nodes to equal min nodes, as there is no point autoscaling the cluster."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set the number of nodes we want for each cluster."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"style_transfer_node_count = 4\n",
"ffmpeg_node_count = 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Verify that the subscription in use has enough cores. We need to check for two vm types since we'll be using NCSv3 for style transfer and DSv2 for ffmpeg processes. If you do not have quota for the NCSv3 family, you can use another GPU family instead."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"vm_dict = {\n",
" \"NCSv3\": {\n",
" \"size\": \"STANDARD_NC6s_v3\",\n",
" \"cores\": 6\n",
" },\n",
" \"DSv2\": {\n",
" \"size\": \"STANDARD_DS3_V2\",\n",
" \"cores\": 4\n",
" }\n",
"}\n",
"\n",
"def check_quota(vm_family):\n",
" \"\"\"\n",
" returns quota object\n",
" \"\"\"\n",
" results = subprocess.run([\n",
" \"az\", \"vm\", \"list-usage\", \n",
" \"--location\", get_key(env_path, \"REGION\"), \n",
" \"--query\", \"[?contains(localName, '%s')].{max:limit, current:currentValue}\" % (vm_family)\n",
" ], stdout=subprocess.PIPE)\n",
" quota = json.loads(''.join(results.stdout.decode('utf-8')))\n",
" return int(quota[0]['max']) - int(quota[0]['current'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check that we have enough DSv2 quota."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking quota for family size DSv2...\n",
"There are enough cores, you may continue...\n"
]
}
],
"source": [
"print(\"Checking quota for family size DSv2...\")\n",
"vm_family = \"DSv2\"\n",
"requested_cores = ffmpeg_node_count * vm_dict[vm_family][\"cores\"]\n",
"\n",
"diff = check_quota(vm_family)\n",
"if diff <= requested_cores:\n",
" print(\"Not enough cores of DSv2 in region, asking for {} but have {}\".format(requested_cores, diff))\n",
"else: \n",
" print(\"There are enough cores, you may continue...\")\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create our non-gpu DSv2 cluster"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Creating ffmpeg-cluster\n",
"Creating\n",
"Succeeded..............\n",
"AmlCompute wait for completion finished\n",
"Minimum number of nodes requested have been provisioned\n"
]
}
],
"source": [
"# CPU compute\n",
"cpu_cluster_name = \"ffmpeg-cluster\"\n",
"try:\n",
" cpu_cluster = AmlCompute(ws, cpu_cluster_name)\n",
" print(\"Found existing cluster.\")\n",
"except:\n",
" print(\"Creating {}\".format(cpu_cluster_name))\n",
" provisioning_config = AmlCompute.provisioning_configuration(\n",
" vm_size=vm_dict[\"DSv2\"][\"size\"], \n",
" min_nodes=ffmpeg_node_count, \n",
" max_nodes=ffmpeg_node_count\n",
" )\n",
"\n",
" # create the cluster\n",
" cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, provisioning_config)\n",
" cpu_cluster.wait_for_completion(show_output=True)\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check that we have enough NCSv3 quota."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Checking quota for family size NCSv3...\n",
"There are enough cores, you may continue...\n"
]
}
],
"source": [
"print(\"Checking quota for family size NCSv3...\")\n",
"vm_family = \"NCSv3\"\n",
"requested_cores = style_transfer_node_count * vm_dict[vm_family][\"cores\"]\n",
"\n",
"diff = check_quota(vm_family)\n",
"if diff <= requested_cores:\n",
" print(\"Not enough cores of NCSv3 in region, asking for {} but have {}\".format(requested_cores, diff))\n",
"else:\n",
" print(\"There are enough cores, you may continue...\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create our NCSv3 cluster."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Creating style-cluster\n",
"Creating\n",
"Succeeded.......................\n",
"AmlCompute wait for completion finished\n",
"Minimum number of nodes requested have been provisioned\n"
]
}
],
"source": [
"# GPU compute\n",
"gpu_cluster_name = \"style-cluster\"\n",
"try:\n",
" gpu_cluster = AmlCompute(ws, gpu_cluster_name)\n",
" print(\"Found existing cluster.\")\n",
"except:\n",
" print(\"Creating {}\".format(gpu_cluster_name))\n",
" provisioning_config = AmlCompute.provisioning_configuration(\n",
" vm_size=vm_dict[\"NCSv3\"][\"size\"], \n",
" min_nodes=style_transfer_node_count, \n",
" max_nodes=style_transfer_node_count\n",
" )\n",
"\n",
" # create the cluster\n",
" gpu_cluster = ComputeTarget.create(ws, gpu_cluster_name, provisioning_config)\n",
" gpu_cluster.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setup data references"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a datastore based on the storage account we created earlier. We'll use that storage account to hold our input and output data."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": [
"my_datastore_name = \"datastore\"\n",
"set_key(env_path, \"AML_DATASTORE_NAME\", my_datastore_name)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# datastore\n",
"my_datastore = Datastore.register_azure_blob_container(\n",
" workspace=ws, \n",
" datastore_name=my_datastore_name, \n",
" container_name=get_key(env_path, \"STORAGE_CONTAINER_NAME\"), \n",
" account_name=get_key(env_path, \"STORAGE_ACCOUNT_NAME\"), \n",
" account_key=get_key(env_path, \"STORAGE_ACCOUNT_KEY\"),\n",
" overwrite=True\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Upload the `models` folder (from out local directory) and the `orangutan.mp4` video to the datastore."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"$AZUREML_DATAREFERENCE_datastore"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Upload files in models folder to a directory called models\n",
"my_datastore.upload_files(\n",
" [\"./models/model.pth\"],\n",
" target_path=\"models\", \n",
" overwrite=True\n",
")\n",
"\n",
"# Upload orangutan.mp4 video\n",
"my_datastore.upload_files(\n",
" [\"./orangutan.mp4\"],\n",
" overwrite=True\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set the `models` dir we uploaded as data references to be used by the pipeline steps later on."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"model_dir = DataReference(\n",
" data_reference_name=\"model_dir\", \n",
" datastore=my_datastore, \n",
" path_on_datastore=\"models\", \n",
" mode=\"download\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set the output video to be saved in the same datastore."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"output_video = PipelineData(name=\"output_video\", datastore=my_datastore)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get a reference to the datastore that was generated when the AML workspace was created. We'll use this datastore to hold temporary pipeline data."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"default_datastore = ws.get_default_datastore() "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Save all temporary data files (PipelineData) to the default datastore."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"ffmpeg_audio = PipelineData(name=\"ffmpeg_audio\", datastore=default_datastore)\n",
"ffmpeg_images = PipelineData(name=\"ffmpeg_images\", datastore=default_datastore)\n",
"processed_images = PipelineData(name=\"processed_images\", datastore=default_datastore)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setup cluster environments"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Config for ffmpeg cluster"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"ffmpeg_cd = CondaDependencies()\n",
"ffmpeg_cd.add_channel(\"conda-forge\")\n",
"ffmpeg_cd.add_conda_package(\"ffmpeg\")\n",
"\n",
"ffmpeg_run_config = RunConfiguration(conda_dependencies=ffmpeg_cd)\n",
"ffmpeg_run_config.environment.docker.enabled = True\n",
"ffmpeg_run_config.environment.docker.gpu_support = False\n",
"ffmpeg_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE\n",
"ffmpeg_run_config.environment.spark.precache_packages = False"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Config for style transfer cluster"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"style_transfer_cd = CondaDependencies()\n",
"style_transfer_cd.add_channel(\"pytorch\")\n",
"style_transfer_cd.add_conda_package(\"pytorch\")\n",
"\n",
"style_transfer_run_config = RunConfiguration(conda_dependencies=style_transfer_cd)\n",
"style_transfer_run_config.environment.docker.enabled = True\n",
"style_transfer_run_config.environment.docker.gpu_support = True\n",
"style_transfer_run_config.environment.docker.base_image = \"pytorch/pytorch\"\n",
"style_transfer_run_config.environment.spark.precache_packages = False"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set up pipeline steps"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"When setting up the pipelines, we'll need to create a `video_path_param` that can be modified when the pipeline is published."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"video_path_default = DataPath(datastore=my_datastore, path_on_datastore=\"orangutan.mp4\")\n",
"video_path_param = (PipelineParameter(name=\"video_path\", default_value=video_path_default), DataPathComputeBinding())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create the 3-step pipeline using PythonScriptSteps and the MpiStep. In the MPI step, you'll notice that we use the `style_transfer_mpi.py` script instead of the `style_transfer.py` script. This is because the MPI expects that the script is modified to use MPI code.\n",
"\n",
"Both scripts do the exact same thing, except that the `style_transfer_mpi.py` script is set up to use MPI to run process the frames in a distributed way. \n",
"\n",
"Feel free to inspect the differences under the `scripts` folder."
]
},
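{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough sketch (not a copy of the actual script), an MPI-enabled version partitions the frames across ranks along these lines, assuming `mpi4py` and a `filenames` list of the extracted frames:\n",
"\n",
"```python\n",
"from mpi4py import MPI\n",
"\n",
"comm = MPI.COMM_WORLD\n",
"rank, size = comm.Get_rank(), comm.Get_size()\n",
"\n",
"# each rank handles every size-th frame, so the work splits evenly across processes\n",
"my_frames = sorted(filenames)[rank::size]\n",
"```"
]
},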
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"preprocess_video_step = PythonScriptStep(\n",
" name=\"preprocess video\",\n",
" script_name=\"preprocess_video.py\",\n",
" arguments=[\"--input-video\", video_path_param,\n",
" \"--output-audio\", ffmpeg_audio,\n",
" \"--output-images\", ffmpeg_images,\n",
" ],\n",
" compute_target=cpu_cluster,\n",
" inputs=[video_path_param],\n",
" outputs=[ffmpeg_images, ffmpeg_audio],\n",
" runconfig=ffmpeg_run_config,\n",
" source_directory=project_folder,\n",
" allow_reuse=False\n",
")\n",
"\n",
"distributed_style_transfer_step = MpiStep(\n",
" name=\"mpi style transfer\",\n",
" script_name=\"style_transfer_mpi.py\",\n",
" arguments=[\"--content-dir\", ffmpeg_images,\n",
" \"--output-dir\", processed_images,\n",
" \"--model-dir\", model_dir,\n",
" \"--cuda\", 1\n",
" ],\n",
" compute_target=gpu_cluster,\n",
" node_count=4, \n",
" process_count_per_node=1,\n",
" inputs=[model_dir, ffmpeg_images],\n",
" outputs=[processed_images],\n",
" pip_packages=[\"image\", \"mpi4py\", \"torch\", \"torchvision\"],\n",
" runconfig=style_transfer_run_config,\n",
" use_gpu=True,\n",
" source_directory=project_folder,\n",
" allow_reuse=False\n",
")\n",
"\n",
"postprocess_video_step = PythonScriptStep(\n",
" name=\"postprocess video\",\n",
" script_name=\"postprocess_video.py\",\n",
" arguments=[\"--images-dir\", processed_images, \n",
" \"--input-audio\", ffmpeg_audio, \n",
" \"--output-dir\", output_video],\n",
" compute_target=cpu_cluster,\n",
" inputs=[processed_images, ffmpeg_audio],\n",
" outputs=[output_video],\n",
" runconfig=ffmpeg_run_config,\n",
" source_directory=project_folder,\n",
" allow_reuse=False\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run the pipeline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run the pipeline, passing in the video path variable."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Created step postprocess video [7cc17697][f9be7548-48d6-4df0-99f6-99a0c0377c20], (This step will run and generate new outputs)\n",
"Created step mpi style transfer [e7ba080e][3017c382-f3cf-4055-8f20-409ac9305ffc], (This step will run and generate new outputs)\n",
"Created step preprocess video [2bb7ac9c][31623158-d45d-4c8a-8fcc-c12d4a73f4b6], (This step will run and generate new outputs)\n",
"Created data reference model_dir for StepId [416f19e6][88eeea78-4063-4fc8-9930-5480186dd516], (Consumers of this data will generate new runs.)\n",
"Created data reference datastore_975052cf_ad24f844 for StepId [17a8ad15][4c81af2a-45ec-47b5-8678-e08556c3738a], (Consumers of this data will generate new runs.)\n",
"Submitted pipeline run: 304146d1-ea9c-445c-978a-b8c55800759a\n"
]
}
],
"source": [
"steps = [postprocess_video_step]\n",
"pipeline = Pipeline(workspace=ws, steps=steps)\n",
"pipeline_run = Experiment(ws, 'style_transfer_mpi').submit(\n",
" pipeline, \n",
" pipeline_params={'video_path': DataPath(datastore=my_datastore, path_on_datastore=\"orangutan.mp4\")}\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style=\"width:100%\"><tr><th>Experiment</th><th>Id</th><th>Type</th><th>Status</th><th>Details Page</th><th>Docs Page</th></tr><tr><td>style_transfer_mpi</td><td>304146d1-ea9c-445c-978a-b8c55800759a</td><td>azureml.PipelineRun</td><td>NotStarted</td><td><a href=\"https://mlworkspace.azure.ai/portal/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/jiataamltest02/providers/Microsoft.MachineLearningServices/workspaces/jiataamltest02/experiments/style_transfer_mpi/runs/304146d1-ea9c-445c-978a-b8c55800759a\" target=\"_blank\" rel=\"noopener\">Link to Azure Portal</a></td><td><a href=\"https://docs.microsoft.com/en-us/python/api/overview/azure/ml/intro?view=azure-ml-py\" target=\"_blank\" rel=\"noopener\">Link to Documentation</a></td></tr></table>"
],
"text/plain": [
"Run(Experiment: style_transfer_mpi,\n",
"Id: 304146d1-ea9c-445c-978a-b8c55800759a,\n",
"Type: azureml.PipelineRun,\n",
"Status: NotStarted)"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipeline_run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Wait until the pipeline completes before proceeding..."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"status:Running\n",
"...............................................................................................................................................................................................................................................................\n",
"status:Finished\n"
]
},
{
"data": {
"text/plain": [
"'Finished'"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipeline_run.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download the output video"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the step id of the postprocessing step"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"step_id = pipeline_run.find_step_run(\"postprocess video\")[0].id"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Download the output files from the postprocessing step"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"my_datastore.download(\n",
" target_path=\"aml_test_orangutan\", \n",
" prefix=step_id, \n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Display the generated output video that we just downloaded"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <video width=\"320\" height=\"240\" controls>\n",
" <source src=\"aml_test_orangutan/304146d1ea9c445c978ab8c55800759a_7cc17697_1-30-2019_07-39-13_AM/output_video/video_processed.mp4\" type=\"video/mp4\">\n",
" </video>\n"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(HTML(\"\"\"\n",
" <video width=\"320\" height=\"240\" controls>\n",
" <source src=\"aml_test_orangutan/{}/output_video/video_processed.mp4\" type=\"video/mp4\">\n",
" </video>\n",
"\"\"\".format(step_id)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Publish the pipeline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The last step is to publish the pipeline so that the pipeline can be triggered on an http endpoint. We'll use Logic Apps in the next notebook to consume this endpoint."
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"published_pipeline = pipeline.publish(\n",
" name=\"style transfer\", \n",
" description=\"some description\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": [
"published_pipeline_id = published_pipeline.id\n",
"set_key(env_path, \"AML_PUBLISHED_PIPELINE_ID\", published_pipeline_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test the published pipeline"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": [
"cli_auth = AzureCliAuthentication()\n",
"aad_token = cli_auth.get_authentication_header()\n",
"\n",
"response = requests.post(\n",
" published_pipeline.endpoint, \n",
" headers=aad_token, \n",
" json={\n",
" \"ExperimentName\": \"My_Pipeline\",\n",
" \"DataPathAssignments\": {\n",
" \"video_path\": {\"DataStoreName\": my_datastore_name,\n",
" \"RelativePath\": \"orangutan.mp4\"}\n",
" }\n",
" }\n",
")\n",
"\n",
"run_id = response.json()[\"Id\"]\n",
"print(run_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You are now ready to move on to the [next notebook](04_deploy_logic_apps.ipynb)."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:batchscoringdl_aml]",
"language": "python",
"name": "conda-env-batchscoringdl_aml-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -0,0 +1,474 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Deploy Logic Apps"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To operationalize our batch scoring workflow, we need a way to trigger our pipeline. Since we're applying style transfer to video data, lets trigger the pipeline everytime a new video is uploaded and detected. To do this, we'll need a mechanism that can detect the appearance of new video data. \n",
"\n",
"Logic Apps can solve this problem for us. In this notebook, we'll deploy a pre-built logic app that will look for new videos that appear in a specified storage location. When a new video is detected, the logic app will send an http request to the published pipeline (which we deployed in the previous notebook). "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Import the packages we need."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import Workspace, Run, Experiment\n",
"from azureml.pipeline.core import PublishedPipeline\n",
"from azureml.core.datastore import Datastore\n",
"from dotenv import set_key, get_key, find_dotenv, load_dotenv\n",
"from azureml.core.authentication import AzureCliAuthentication\n",
"from pathlib import Path\n",
"import re\n",
"import json\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)\n",
"load_dotenv(env_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load our workspace from the config file."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": [
"ws = Workspace.from_config()\n",
"print('Workspace name: ' + ws.name, \n",
" 'Azure region: ' + ws.location, \n",
" 'Subscription id: ' + ws.subscription_id, \n",
" 'Resource group: ' + ws.resource_group, sep = '\\n')\n",
"\n",
"# Also create a Project and attach to Workspace\n",
"project_folder = \"scripts\"\n",
"run_history_name = project_folder\n",
"\n",
"if not os.path.isdir(project_folder):\n",
" os.mkdir(project_folder)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get authentication information about our published pipeline so that we can use it during the deployment."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"published_pipeline = PublishedPipeline.get(ws, id=get_key(env_path, \"AML_PUBLISHED_PIPELINE_ID\"))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"cli_auth = AzureCliAuthentication()\n",
"aad_token = cli_auth.get_authentication_header()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Deploy Logic App"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Logic Apps](https://happypathspublic.blob.core.windows.net/assets/batch_scoring_for_dl/azure_logic_app.jpg)\n",
"\n",
"The *logic* behind the Logic App deployment is shown above:\n",
"1. When a blob is added, begin the workflow.\n",
"2. Check the blob name. \n",
" - if the blob name ends with `.mp4`:\n",
" - make a request to the AKS endpoint\n",
" - otherwise:\n",
" - terminate in cancellation\n"
]
},
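{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a sketch of what the Logic App's HTTP action effectively sends (the same request contract we tested manually at the end of the previous notebook; the blob name below is a placeholder):\n",
"\n",
"```python\n",
"import requests\n",
"\n",
"payload = {\n",
"    \"ExperimentName\": \"logic_app_experiment\",\n",
"    \"DataPathAssignments\": {\n",
"        \"video_path\": {\n",
"            \"DataStoreName\": \"datastore\",  # AML_DATASTORE_NAME\n",
"            \"RelativePath\": \"<new-video>.mp4\"  # the blob that triggered the flow\n",
"        }\n",
"    }\n",
"}\n",
"# aad_token is the Authorization header from AzureCliAuthentication, as above\n",
"requests.post(published_pipeline.endpoint, headers=aad_token, json=payload)\n",
"```"
]
},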
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create the deployment for the Azure blob storage connector.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K{- Finished ..\n",
" \"id\": \"/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/jiataamltest02/providers/Microsoft.Resources/deployments/blob_connector\",\n",
" \"location\": null,\n",
" \"name\": \"blob_connector\",\n",
" \"properties\": {\n",
" \"correlationId\": \"1a6e909e-adb2-4390-9498-e2a37d6274ac\",\n",
" \"debugSetting\": null,\n",
" \"dependencies\": [],\n",
" \"duration\": \"PT4.7165887S\",\n",
" \"mode\": \"Incremental\",\n",
" \"onErrorDeployment\": null,\n",
" \"outputResources\": [\n",
" {\n",
" \"id\": \"/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/jiataamltest02/providers/Microsoft.Web/connections/azureblob\",\n",
" \"resourceGroup\": \"jiataamltest02\"\n",
" }\n",
" ],\n",
" \"outputs\": null,\n",
" \"parameters\": {\n",
" \"location\": {\n",
" \"type\": \"String\",\n",
" \"value\": \"eastus\"\n",
" },\n",
" \"storage_account_key\": {\n",
" \"type\": \"String\",\n",
" \"value\": \"uOpI4HeCmvKRzHSbo8qQcQe9z/LvxnIdA+f8oh5R3LlqTW6g3EhOWsn7BKgXq3otHWgdHtrf52QrBrVv+es62A==\"\n",
" },\n",
" \"storage_account_name\": {\n",
" \"type\": \"String\",\n",
" \"value\": \"jiataamltest02sa\"\n",
" },\n",
" \"subscription_id\": {\n",
" \"type\": \"String\",\n",
" \"value\": \"edf507a2-6235-46c5-b560-fd463ba2e771\"\n",
" }\n",
" },\n",
" \"parametersLink\": null,\n",
" \"providers\": [\n",
" {\n",
" \"id\": null,\n",
" \"namespace\": \"Microsoft.Web\",\n",
" \"registrationState\": null,\n",
" \"resourceTypes\": [\n",
" {\n",
" \"aliases\": null,\n",
" \"apiVersions\": null,\n",
" \"locations\": [\n",
" \"eastus\"\n",
" ],\n",
" \"properties\": null,\n",
" \"resourceType\": \"connections\"\n",
" }\n",
" ]\n",
" }\n",
" ],\n",
" \"provisioningState\": \"Succeeded\",\n",
" \"template\": null,\n",
" \"templateHash\": \"4552859792189313276\",\n",
" \"templateLink\": null,\n",
" \"timestamp\": \"2019-01-30T07:57:26.515212+00:00\"\n",
" },\n",
" \"resourceGroup\": \"jiataamltest02\",\n",
" \"type\": null\n",
"}\n",
"\u001b[0m"
]
}
],
"source": [
"!az group deployment create \\\n",
" --name blob_connector \\\n",
" --resource-group {get_key(env_path, \"RESOURCE_GROUP\")} \\\n",
" --template-file template.blob_connector.json \\\n",
" --parameters \\\n",
" location={get_key(env_path, \"REGION\")} \\\n",
" subscription_id={get_key(env_path, \"SUBSCRIPTION_ID\")} \\\n",
" storage_account_name={get_key(env_path, \"STORAGE_ACCOUNT_NAME\")} \\\n",
" storage_account_key={get_key(env_path, \"STORAGE_ACCOUNT_KEY\")}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create the deployment for the Logic App."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[K{- Finished ..\n",
" \"id\": \"/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/jiataamltest02/providers/Microsoft.Resources/deployments/logic_app\",\n",
" \"location\": null,\n",
" \"name\": \"logic_app\",\n",
" \"properties\": {\n",
" \"correlationId\": \"ab0c3b91-1235-4afe-a68f-0fa70f8f0a31\",\n",
" \"debugSetting\": null,\n",
" \"dependencies\": [],\n",
" \"duration\": \"PT6.961205S\",\n",
" \"mode\": \"Incremental\",\n",
" \"onErrorDeployment\": null,\n",
" \"outputResources\": [\n",
" {\n",
" \"id\": \"/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/jiataamltest02/providers/Microsoft.Logic/workflows/logic_app\",\n",
" \"resourceGroup\": \"jiataamltest02\"\n",
" }\n",
" ],\n",
" \"outputs\": null,\n",
" \"parameters\": {\n",
" \"aad_token\": {\n",
" \"type\": \"String\",\n",
" \"value\": \"Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiIsIng1dCI6Im5iQ3dXMTF3M1hrQi14VWFYd0tSU0xqTUhHUSIsImtpZCI6Im5iQ3dXMTF3M1hrQi14VWFYd0tSU0xqTUhHUSJ9.eyJhdWQiOiJodHRwczovL21hbmFnZW1lbnQuY29yZS53aW5kb3dzLm5ldC8iLCJpc3MiOiJodHRwczovL3N0cy53aW5kb3dzLm5ldC83MmY5ODhiZi04NmYxLTQxYWYtOTFhYi0yZDdjZDAxMWRiNDcvIiwiaWF0IjoxNTQ4ODMyMzUyLCJuYmYiOjE1NDg4MzIzNTIsImV4cCI6MTU0ODgzNjI1MiwiX2NsYWltX25hbWVzIjp7Imdyb3VwcyI6InNyYzEifSwiX2NsYWltX3NvdXJjZXMiOnsic3JjMSI6eyJlbmRwb2ludCI6Imh0dHBzOi8vZ3JhcGgud2luZG93cy5uZXQvNzJmOTg4YmYtODZmMS00MWFmLTkxYWItMmQ3Y2QwMTFkYjQ3L3VzZXJzLzIzYzljMWEyLWU3MjctNDE4OC1hYTI1LWFmMTA4MmM1ODI0NC9nZXRNZW1iZXJPYmplY3RzIn19LCJhY3IiOiIxIiwiYWlvIjoiQVZRQXEvOEtBQUFBcFdDY3lIb1h5M1J4S3lnbG5YWEpuOWN3Tm5PbnJ3dkhGaXdCdWw3UVNKUGYxVkh0TmxRdGd5cUtuN1FqYTVqTHhVSXV5T1NaSVg5V3ZlWHIxcTVuRS9ONTR3TlNmZGlrTDVXd0E3Um1MQkE9IiwiYW1yIjpbIndpYSIsIm1mYSJdLCJhcHBpZCI6IjA0YjA3Nzk1LThkZGItNDYxYS1iYmVlLTAyZjllMWJmN2I0NiIsImFwcGlkYWNyIjoiMCIsImZhbWlseV9uYW1lIjoiVGFuIiwiZ2l2ZW5fbmFtZSI6IkpTIiwiaXBhZGRyIjoiNDAuMTIxLjE5LjE4OCIsIm5hbWUiOiJKUyBUYW4iLCJvaWQiOiIyM2M5YzFhMi1lNzI3LTQxODgtYWEyNS1hZjEwODJjNTgyNDQiLCJvbnByZW1fc2lkIjoiUy0xLTUtMjEtMjEyNzUyMTE4NC0xNjA0MDEyOTIwLTE4ODc5Mjc1MjctMTgxOTA5NDIiLCJwdWlkIjoiMTAwMzAwMDA5M0Q1QUE2RiIsInNjcCI6InVzZXJfaW1wZXJzb25hdGlvbiIsInN1YiI6IktlRWRuOGJSZjNGYzcwX3lUbUxqRHBocDg4eVpRemFCWlJHRDFNWEFBTDgiLCJ0aWQiOiI3MmY5ODhiZi04NmYxLTQxYWYtOTFhYi0yZDdjZDAxMWRiNDciLCJ1bmlxdWVfbmFtZSI6ImppYXRhQG1pY3Jvc29mdC5jb20iLCJ1cG4iOiJqaWF0YUBtaWNyb3NvZnQuY29tIiwidXRpIjoiMlIyRnA4Z2R1ME9ERUhsNUY0ZEJBQSIsInZlciI6IjEuMCJ9.lCMuYCOiG8kpOJTBkbARhlKTGSMLZ4dfgSzgpZBrLU6mC5OhevF8Rk3r2wGADf-hTSQoxCuViCUBdch3fbs6THIqW9LGSVaWHoy3NekSbGbpJMBSqm_X_hBNj_wefz2tGUvUXv0ZB0n7nerio4uGh-yZwd7S1tfCf2oifrjusdp2MtYwTWH_Hz0_wrR8b_olaCQ953pP4W9Hp8MaCRevmAng_NPKcEruQXV8R2RCSu12eF3JLAmxvh0x6Fv6L1wO0oZy1FrgvJZdO9TxhImiTVqcsMlW9auDWOkdBwvcSNKWxWj9dw1LraTx_UwbT4hvntA-_B6qiRw6ieRe9_xG1A\"\n",
" },\n",
" \"datastore_name\": {\n",
" \"type\": \"String\",\n",
" \"value\": \"datastore\"\n",
" },\n",
" \"experiment_name\": {\n",
" \"type\": \"String\",\n",
" \"value\": \"logic_app_experiment\"\n",
" },\n",
" \"location\": {\n",
" \"type\": \"String\",\n",
" \"value\": \"eastus\"\n",
" },\n",
" \"name\": {\n",
" \"type\": \"String\",\n",
" \"value\": \"logic_app\"\n",
" },\n",
" \"resource_group\": {\n",
" \"type\": \"String\",\n",
" \"value\": \"jiataamltest02\"\n",
" },\n",
" \"storage_container_name\": {\n",
" \"type\": \"String\",\n",
" \"value\": \"aml\"\n",
" },\n",
" \"subscription_id\": {\n",
" \"type\": \"String\",\n",
" \"value\": \"edf507a2-6235-46c5-b560-fd463ba2e771\"\n",
" },\n",
" \"url_endpoint\": {\n",
" \"type\": \"String\",\n",
" \"value\": \"https://eastus.aether.ms/api/v1.0/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/jiataamltest02/providers/Microsoft.MachineLearningServices/workspaces/jiataamltest02/PipelineRuns/PipelineSubmit/fe4d7b79-1835-4725-add4-7d6fd8ae6d47\"\n",
" }\n",
" },\n",
" \"parametersLink\": null,\n",
" \"providers\": [\n",
" {\n",
" \"id\": null,\n",
" \"namespace\": \"Microsoft.Logic\",\n",
" \"registrationState\": null,\n",
" \"resourceTypes\": [\n",
" {\n",
" \"aliases\": null,\n",
" \"apiVersions\": null,\n",
" \"locations\": [\n",
" \"eastus\"\n",
" ],\n",
" \"properties\": null,\n",
" \"resourceType\": \"workflows\"\n",
" }\n",
" ]\n",
" }\n",
" ],\n",
" \"provisioningState\": \"Succeeded\",\n",
" \"template\": null,\n",
" \"templateHash\": \"12356772322557656100\",\n",
" \"templateLink\": null,\n",
" \"timestamp\": \"2019-01-30T07:58:56.794250+00:00\"\n",
" },\n",
" \"resourceGroup\": \"jiataamltest02\",\n",
" \"type\": null\n",
"}\n",
"\u001b[0m"
]
}
],
"source": [
"!az group deployment create \\\n",
" --name logic_app \\\n",
" --resource-group {get_key(env_path, \"RESOURCE_GROUP\")} \\\n",
" --template-file template.logic_app.json \\\n",
" --parameters \\\n",
" name=\"logic_app\" \\\n",
" location={get_key(env_path, \"REGION\")} \\\n",
" resource_group={get_key(env_path, \"RESOURCE_GROUP\")} \\\n",
" subscription_id={get_key(env_path, \"SUBSCRIPTION_ID\")} \\\n",
" storage_container_name={get_key(env_path, \"STORAGE_CONTAINER_NAME\")} \\\n",
" url_endpoint={published_pipeline.endpoint} \\\n",
" aad_token='{aad_token[\"Authorization\"]}' \\\n",
" datastore_name={get_key(env_path, \"AML_DATASTORE_NAME\")} \\\n",
" experiment_name=\"logic_app_experiment\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Trigger logic app by adding a new video to the Azure blob container"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before testing the logic app by added a new video the blob container, check in the portal or cli that the logic app deployment has completed and that it looks correct."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"!cp orangutan.mp4 trigger_test_orangutan.mp4"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"my_datastore = Datastore.register_azure_blob_container(\n",
" workspace=ws, \n",
" datastore_name=get_key(env_path, \"AML_DATASTORE_NAME\"), \n",
" container_name=get_key(env_path, \"STORAGE_CONTAINER_NAME\"), \n",
" account_name=get_key(env_path, \"STORAGE_ACCOUNT_NAME\"), \n",
" account_key=get_key(env_path, \"STORAGE_ACCOUNT_KEY\"),\n",
" overwrite=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"$AZUREML_DATAREFERENCE_datastore"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Upload new trigger file video\n",
"my_datastore.upload_files(\n",
" [\"./trigger_test_orangutan.mp4\"],\n",
" overwrite=True\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The appearance of the new `trigger_test_orangutan.mp4` video will trigger the Logic App flow. Inspect your logic app in the portal to see the progress."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You are now ready to move on to the [next notebook](05_clean_up.ipynb)."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:batchscoringdl_aml]",
"language": "python",
"name": "conda-env-batchscoringdl_aml-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -0,0 +1,124 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Clean up\n",
"\n",
"To clean up our resources, we'll do the following:\n",
"1. Delete all resources in Azure - we can do this simply by removing our resource group where we've stored all our resources in\n",
"2. Clean up our local repository\n",
"3. Delete the service principle\n",
"5. Optionally delete the dotenv file too"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from dotenv import get_key, find_dotenv, load_dotenv\n",
"env_path = find_dotenv(raise_error_if_not_found=True)\n",
"load_dotenv(env_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Clean up Azure resources. The `--no-wait` flag means that it will run the command without waiting (async). `--yes` will bypass prompts for confirmation."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"!az group delete --resource-group {get_key(env_path, \"RESOURCE_GROUP\")} --no-wait --yes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Clean up the local repository: Run the `clean_up.sh` script. This will remove all temporary directories that were generated as well as any configuration that were created during the tutorials. This script will _not_ remove the `.env` file."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rm: cannot remove '*.mp4.*': No such file or directory\r\n",
"rm: cannot remove '*.mp3': No such file or directory\r\n"
]
}
],
"source": [
"!./clean_up.sh"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"(optionally) Delete the `.env` file. You will want to save your `.env` file if you plan on recreating this setup."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"!rm .env"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:batchscoringdl_aml]",
"language": "python",
"name": "conda-env-batchscoringdl_aml-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -0,0 +1,16 @@
#!/bin/bash
# remove the local test output directory
rm -rf local_test_orangutan
# remove temporary working directories
rm -rf output_dir
rm -rf content_dir
# remove downloaded video and audio files (-f suppresses errors when none exist)
rm -f *.mp4
rm -f *.mp4.*
rm -f *.mp3
# remove journal files left behind by azcopy
rm -rf *.jnl

Binary data: archectures/Python-Keras-Scoring/notebooks/models/model.pth (executable file)
Binary file not shown.


@ -0,0 +1,43 @@
import argparse
import os
import subprocess
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process input video")
parser.add_argument(
'--video',
help="Name of the output video (excluding ext)"
)
parser.add_argument(
'--images-dir',
help="The input image directory of frames to stitch together.",
required=True
)
parser.add_argument(
'--input-audio',
help="The input audio directory containing the audio file.",
required=True
)
parser.add_argument(
'--output-dir',
help="The output directory to save the stitched-together video into.",
required=True
)
args = parser.parse_args()
os.makedirs(args.output_dir, exist_ok=True)
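# stitch the numbered frames back together into an H.264 video (audio is added in the next step)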
subprocess.run("ffmpeg -framerate 30 -i {}/%05d_video.jpg -c:v libx264 -profile:v high -crf 20 -pix_fmt yuv420p "
"-y {}/video_without_audio.mp4"
.format(args.images_dir, args.output_dir),
shell=True, check=True
)
video_name = args.video or 'video'
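# mux the original audio track back into the stitched video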
subprocess.run("ffmpeg -i {}/video_without_audio.mp4 -i {}/audio.aac -map 0:0 -map 1:0 -vcodec "
"copy -acodec copy -y {}/{}_processed.mp4"
.format(args.output_dir, args.input_audio, args.output_dir, video_name),
shell=True, check=True
)


@ -0,0 +1,37 @@
import argparse
import glob
import os
import subprocess
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Process input video")
parser.add_argument(
'--input-video',
help="Path to the input video file (include ext)",
required=True
)
parser.add_argument(
'--output-audio',
help="The name of the output folder to store the audio clip in.",
required=True
)
parser.add_argument(
'--output-images',
help="The name of the output image folder to store the output frames in.",
required=True
)
args = parser.parse_args()
os.makedirs(args.output_audio, exist_ok=True)
os.makedirs(args.output_images, exist_ok=True)
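# extract the audio track so it can be re-attached after style transfer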
subprocess.run("ffmpeg -i {} {}/audio.aac"
.format(args.input_video, args.output_audio),
shell=True, check=True
)
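# split the video into individual numbered jpg frames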
subprocess.run("ffmpeg -i {} {}/%05d_video.jpg -hide_banner"
.format(args.input_video, args.output_images),
shell=True, check=True
)


@ -0,0 +1,183 @@
# Original source: https://github.com/pytorch/examples/blob/master/fast_neural_style/neural_style/neural_style.py
import argparse
import os
import sys
import re
from PIL import Image
import torch
from torchvision import transforms
def load_image(filename, size=None, scale=None):
img = Image.open(filename)
if size is not None:
img = img.resize((size, size), Image.ANTIALIAS)
elif scale is not None:
img = img.resize((int(img.size[0] / scale), int(img.size[1] / scale)), Image.ANTIALIAS)
return img
def save_image(filename, data):
img = data.clone().clamp(0, 255).numpy()
img = img.transpose(1, 2, 0).astype("uint8")
img = Image.fromarray(img)
img.save(filename)
class TransformerNet(torch.nn.Module):
def __init__(self):
super(TransformerNet, self).__init__()
# Initial convolution layers
self.conv1 = ConvLayer(3, 32, kernel_size=9, stride=1)
self.in1 = torch.nn.InstanceNorm2d(32, affine=True)
self.conv2 = ConvLayer(32, 64, kernel_size=3, stride=2)
self.in2 = torch.nn.InstanceNorm2d(64, affine=True)
self.conv3 = ConvLayer(64, 128, kernel_size=3, stride=2)
self.in3 = torch.nn.InstanceNorm2d(128, affine=True)
# Residual layers
self.res1 = ResidualBlock(128)
self.res2 = ResidualBlock(128)
self.res3 = ResidualBlock(128)
self.res4 = ResidualBlock(128)
self.res5 = ResidualBlock(128)
# Upsampling Layers
self.deconv1 = UpsampleConvLayer(128, 64, kernel_size=3, stride=1, upsample=2)
self.in4 = torch.nn.InstanceNorm2d(64, affine=True)
self.deconv2 = UpsampleConvLayer(64, 32, kernel_size=3, stride=1, upsample=2)
self.in5 = torch.nn.InstanceNorm2d(32, affine=True)
self.deconv3 = ConvLayer(32, 3, kernel_size=9, stride=1)
# Non-linearities
self.relu = torch.nn.ReLU()
def forward(self, X):
y = self.relu(self.in1(self.conv1(X)))
y = self.relu(self.in2(self.conv2(y)))
y = self.relu(self.in3(self.conv3(y)))
y = self.res1(y)
y = self.res2(y)
y = self.res3(y)
y = self.res4(y)
y = self.res5(y)
y = self.relu(self.in4(self.deconv1(y)))
y = self.relu(self.in5(self.deconv2(y)))
y = self.deconv3(y)
return y
class ConvLayer(torch.nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride):
super(ConvLayer, self).__init__()
reflection_padding = kernel_size // 2
self.reflection_pad = torch.nn.ReflectionPad2d(reflection_padding)
self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride)
def forward(self, x):
out = self.reflection_pad(x)
out = self.conv2d(out)
return out
class ResidualBlock(torch.nn.Module):
"""ResidualBlock
introduced in: https://arxiv.org/abs/1512.03385
recommended architecture: http://torch.ch/blog/2016/02/04/resnets.html
"""
def __init__(self, channels):
super(ResidualBlock, self).__init__()
self.conv1 = ConvLayer(channels, channels, kernel_size=3, stride=1)
self.in1 = torch.nn.InstanceNorm2d(channels, affine=True)
self.conv2 = ConvLayer(channels, channels, kernel_size=3, stride=1)
self.in2 = torch.nn.InstanceNorm2d(channels, affine=True)
self.relu = torch.nn.ReLU()
def forward(self, x):
residual = x
out = self.relu(self.in1(self.conv1(x)))
out = self.in2(self.conv2(out))
out = out + residual
return out
class UpsampleConvLayer(torch.nn.Module):
"""UpsampleConvLayer
Upsamples the input and then does a convolution. This method gives better results
compared to ConvTranspose2d.
ref: http://distill.pub/2016/deconv-checkerboard/
"""
def __init__(self, in_channels, out_channels, kernel_size, stride, upsample=None):
super(UpsampleConvLayer, self).__init__()
self.upsample = upsample
if upsample:
self.upsample_layer = torch.nn.Upsample(mode='nearest', scale_factor=upsample)
reflection_padding = kernel_size // 2
self.reflection_pad = torch.nn.ReflectionPad2d(reflection_padding)
self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride)
def forward(self, x):
x_in = x
if self.upsample:
x_in = self.upsample_layer(x_in)
out = self.reflection_pad(x_in)
out = self.conv2d(out)
return out
def stylize(args):
device = torch.device("cuda" if args.cuda else "cpu")
with torch.no_grad():
style_model = TransformerNet()
state_dict = torch.load(os.path.join(args.model_dir, "model.pth"))
# remove saved deprecated running_* keys in InstanceNorm from the checkpoint
for k in list(state_dict.keys()):
if re.search(r'in\d+\.running_(mean|var)$', k):
del state_dict[k]
style_model.load_state_dict(state_dict)
style_model.to(device)
filenames = os.listdir(args.content_dir)
for filename in filenames:
print("Processing {}".format(filename))
full_path = os.path.join(args.content_dir, filename)
content_image = load_image(full_path, scale=args.content_scale)
content_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Lambda(lambda x: x.mul(255))
])
content_image = content_transform(content_image)
content_image = content_image.unsqueeze(0).to(device)
output = style_model(content_image).cpu()
output_path = os.path.join(args.output_dir, filename)
save_image(output_path, output[0])
def main():
arg_parser = argparse.ArgumentParser(description="parser for fast-neural-style")
arg_parser.add_argument("--content-scale", type=float, default=None,
help="factor for scaling down the content image")
arg_parser.add_argument("--model-dir", type=str, required=True,
help="saved model to be used for stylizing the image.")
arg_parser.add_argument("--cuda", type=int, required=True,
help="set it to 1 for running on GPU, 0 for CPU")
arg_parser.add_argument("--content-dir", type=str, required=True,
help="directory holding the images")
arg_parser.add_argument("--output-dir", type=str, required=True,
help="directory holding the output images")
args = arg_parser.parse_args()
if args.cuda and not torch.cuda.is_available():
print("ERROR: cuda is not available, try running on CPU")
sys.exit(1)
os.makedirs(args.output_dir, exist_ok=True)
stylize(args)
if __name__ == "__main__":
main()
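A minimal invocation sketch for the stylizer above; the filename style_transfer.py is a placeholder, and --cuda 0 forces CPU execution.
import subprocess
subprocess.run(
    ["python", "style_transfer.py",
     "--model-dir", "models",
     "--cuda", "0",
     "--content-dir", "content_dir",
     "--output-dir", "output_dir"],
    check=True,
)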

View file

@ -0,0 +1,207 @@
# Original source: https://github.com/pytorch/examples/blob/master/fast_neural_style/neural_style/neural_style.py
import argparse
import os
import sys
import re
from PIL import Image
import torch
from torchvision import transforms
from mpi4py import MPI
def load_image(filename, size=None, scale=None):
img = Image.open(filename)
if size is not None:
img = img.resize((size, size), Image.ANTIALIAS)
elif scale is not None:
img = img.resize((int(img.size[0] / scale), int(img.size[1] / scale)), Image.ANTIALIAS)
return img
def save_image(filename, data):
img = data.clone().clamp(0, 255).numpy()
img = img.transpose(1, 2, 0).astype("uint8")
img = Image.fromarray(img)
img.save(filename)
class TransformerNet(torch.nn.Module):
def __init__(self):
super(TransformerNet, self).__init__()
# Initial convolution layers
self.conv1 = ConvLayer(3, 32, kernel_size=9, stride=1)
self.in1 = torch.nn.InstanceNorm2d(32, affine=True)
self.conv2 = ConvLayer(32, 64, kernel_size=3, stride=2)
self.in2 = torch.nn.InstanceNorm2d(64, affine=True)
self.conv3 = ConvLayer(64, 128, kernel_size=3, stride=2)
self.in3 = torch.nn.InstanceNorm2d(128, affine=True)
# Residual layers
self.res1 = ResidualBlock(128)
self.res2 = ResidualBlock(128)
self.res3 = ResidualBlock(128)
self.res4 = ResidualBlock(128)
self.res5 = ResidualBlock(128)
# Upsampling Layers
self.deconv1 = UpsampleConvLayer(128, 64, kernel_size=3, stride=1, upsample=2)
self.in4 = torch.nn.InstanceNorm2d(64, affine=True)
self.deconv2 = UpsampleConvLayer(64, 32, kernel_size=3, stride=1, upsample=2)
self.in5 = torch.nn.InstanceNorm2d(32, affine=True)
self.deconv3 = ConvLayer(32, 3, kernel_size=9, stride=1)
# Non-linearities
self.relu = torch.nn.ReLU()
def forward(self, X):
y = self.relu(self.in1(self.conv1(X)))
y = self.relu(self.in2(self.conv2(y)))
y = self.relu(self.in3(self.conv3(y)))
y = self.res1(y)
y = self.res2(y)
y = self.res3(y)
y = self.res4(y)
y = self.res5(y)
y = self.relu(self.in4(self.deconv1(y)))
y = self.relu(self.in5(self.deconv2(y)))
y = self.deconv3(y)
return y
class ConvLayer(torch.nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride):
super(ConvLayer, self).__init__()
reflection_padding = kernel_size // 2
self.reflection_pad = torch.nn.ReflectionPad2d(reflection_padding)
self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride)
def forward(self, x):
out = self.reflection_pad(x)
out = self.conv2d(out)
return out
class ResidualBlock(torch.nn.Module):
"""ResidualBlock
introduced in: https://arxiv.org/abs/1512.03385
recommended architecture: http://torch.ch/blog/2016/02/04/resnets.html
"""
def __init__(self, channels):
super(ResidualBlock, self).__init__()
self.conv1 = ConvLayer(channels, channels, kernel_size=3, stride=1)
self.in1 = torch.nn.InstanceNorm2d(channels, affine=True)
self.conv2 = ConvLayer(channels, channels, kernel_size=3, stride=1)
self.in2 = torch.nn.InstanceNorm2d(channels, affine=True)
self.relu = torch.nn.ReLU()
def forward(self, x):
residual = x
out = self.relu(self.in1(self.conv1(x)))
out = self.in2(self.conv2(out))
out = out + residual
return out
class UpsampleConvLayer(torch.nn.Module):
"""UpsampleConvLayer
Upsamples the input and then does a convolution. This method gives better results
compared to ConvTranspose2d.
ref: http://distill.pub/2016/deconv-checkerboard/
"""
def __init__(self, in_channels, out_channels, kernel_size, stride, upsample=None):
super(UpsampleConvLayer, self).__init__()
self.upsample = upsample
if upsample:
self.upsample_layer = torch.nn.Upsample(mode='nearest', scale_factor=upsample)
reflection_padding = kernel_size // 2
self.reflection_pad = torch.nn.ReflectionPad2d(reflection_padding)
self.conv2d = torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride)
def forward(self, x):
x_in = x
if self.upsample:
x_in = self.upsample_layer(x_in)
out = self.reflection_pad(x_in)
out = self.conv2d(out)
return out
def stylize(args, comm):
rank = comm.Get_rank()
size = comm.Get_size()
device = torch.device("cuda" if args.cuda else "cpu")
with torch.no_grad():
style_model = TransformerNet()
state_dict = torch.load(os.path.join(args.model_dir, "model.pth"))
# remove saved deprecated running_* keys in InstanceNorm from the checkpoint
for k in list(state_dict.keys()):
if re.search(r'in\d+\.running_(mean|var)$', k):
del state_dict[k]
style_model.load_state_dict(state_dict)
style_model.to(device)
filenames = os.listdir(args.content_dir)
filenames = sorted(filenames)
partition_size = len(filenames) // size
# give the final rank any remainder so no frame is dropped when the count is not divisible
start = rank * partition_size
end = len(filenames) if rank == size - 1 else start + partition_size
partitioned_filenames = filenames[start:end]
print("RANK {} - is processing {} images out of the total {}".format(rank, len(partitioned_filenames), len(filenames)))
output_paths = []
for filename in partitioned_filenames:
# print("Processing {}".format(filename))
full_path = os.path.join(args.content_dir, filename)
content_image = load_image(full_path, scale=args.content_scale)
content_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Lambda(lambda x: x.mul(255))
])
content_image = content_transform(content_image)
content_image = content_image.unsqueeze(0).to(device)
output = style_model(content_image).cpu()
output_path = os.path.join(args.output_dir, filename)
save_image(output_path, output[0])
output_paths.append(output_path)
print("RANK {} - number of pre-aggregated output files {}".format(rank, len(output_paths)))
output_paths_list = comm.gather(output_paths, root=0)
if rank == 0:
print("RANK {} - number of aggregated output files {}".format(rank, len(output_paths_list)))
print("RANK {} - end".format(rank))
def main():
arg_parser = argparse.ArgumentParser(description="parser for fast-neural-style")
arg_parser.add_argument("--content-scale", type=float, default=None,
help="factor for scaling down the content image")
arg_parser.add_argument("--model-dir", type=str, required=True,
help="saved model to be used for stylizing the image.")
arg_parser.add_argument("--cuda", type=int, required=True,
help="set it to 1 for running on GPU, 0 for CPU")
arg_parser.add_argument("--content-dir", type=str, required=True,
help="directory holding the images")
arg_parser.add_argument("--output-dir", type=str, required=True,
help="directory holding the output images")
args = arg_parser.parse_args()
comm = MPI.COMM_WORLD
if args.cuda and not torch.cuda.is_available():
print("ERROR: cuda is not available, try running on CPU")
sys.exit(1)
os.makedirs(args.output_dir, exist_ok=True)
stylize(args, comm)
if __name__ == "__main__":
main()
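To make the rank-based partitioning above concrete, here is a tiny standalone sketch (save it as, say, partition_demo.py and run with mpirun -np 4 python partition_demo.py; the filenames are synthetic).
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()

filenames = sorted("{:05d}_video.jpg".format(i) for i in range(10))
chunk = len(filenames) // size
start = rank * chunk
end = len(filenames) if rank == size - 1 else start + chunk  # last rank takes the remainder
print("RANK {} handles {}".format(rank, filenames[start:end]))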

View file

@ -0,0 +1,36 @@
{
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"location": {
"type": "string"
},
"subscription_id": {"type": "string"},
"storage_account_name": {
"type": "string"
},
"storage_account_key": {
"type": "string"
}
},
"resources": [
{
"type": "Microsoft.Web/connections",
"name": "azureblob",
"apiVersion": "2016-06-01",
"location": "eastus",
"scale": null,
"properties": {
"displayName": "batchscoringdlsa",
"parameterValues": {
"accountName": "[parameters('storage_account_name')]",
"accessKey": "[parameters('storage_account_key')]"
},
"api": {
"id": "[concat('/subscriptions/', parameters('subscription_id'), '/providers/Microsoft.Web/locations/', parameters('location'), '/managedApis/azureblob')]"
}
},
"dependsOn": []
}
]
}

View file

@ -0,0 +1,160 @@
{
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"location": {
"type": "string"
},
"resource_group": {
"type": "string"
},
"name": {
"type": "string"
},
"subscription_id": {
"type": "string"
},
"storage_container_name": {
"type": "string"
},
"url_endpoint": {
"type": "string"
},
"aad_token": {
"type": "string"
},
"datastore_name": {
"type": "string"
},
"experiment_name": {
"type": "string"
}
},
"resources": [
{
"comments": "",
"type": "Microsoft.Logic/workflows",
"name": "[parameters('name')]",
"apiVersion": "2017-07-01",
"location": "[parameters('location')]",
"tags": {},
"scale": null,
"properties": {
"state": "Enabled",
"definition": {
"$schema": "https://schema.management.azure.com/providers/Microsoft.Logic/schemas/2016-06-01/workflowdefinition.json#",
"actions": {
"Condition": {
"actions": {
"HTTP": {
"inputs": {
"body": {
"DataPathAssignments": {
"video_path": {
"DataStoreName": "[parameters('datastore_name')]",
"RelativePath": "@{triggerBody()?['Name']}"
}
},
"ExperimentName": "[parameters('experiment_name')]"
},
"headers": {
"Accept": "application/json",
"Content-Type": "application/json"
},
"authentication": {
"type": "Raw",
"value": "[parameters('aad_token')]"
},
"method": "POST",
"uri": "[parameters('url_endpoint')]"
},
"runAfter": {},
"type": "Http"
}
},
"else": {
"actions": {
"Terminate": {
"inputs": {
"runStatus": "Cancelled"
},
"runAfter": {},
"type": "Terminate"
}
}
},
"expression": {
"and": [
{
"endsWith": [
"@triggerBody()?['Name']",
".mp4"
]
},
{
"not": {
"contains": [
"@triggerBody()?['Name']",
"processed"
]
}
}
]
},
"runAfter": {},
"type": "If"
}
},
"contentVersion": "1.0.0.0",
"outputs": {},
"parameters": {
"$connections": {
"defaultValue": {},
"type": "Object"
}
},
"triggers": {
"When_a_blob_is_added_or_modified_(properties_only)": {
"inputs": {
"host": {
"connection": {
"name": "@parameters('$connections')['azureblob']['connectionId']"
}
},
"method": "get",
"path": "/datasets/default/triggers/batch/onupdatedfile",
"queries": {
"folderId": "JTJmYW1s",
"maxFileCount": 10
}
},
"metadata": {
"JTJmYW1s": "[concat('/', parameters('storage_container_name'))]"
},
"recurrence": {
"frequency": "Minute",
"interval": 1
},
"splitOn": "@triggerBody()",
"type": "ApiConnection"
}
}
},
"parameters": {
"$connections": {
"value": {
"azureblob": {
"connectionId": "[concat('/subscriptions/', parameters('subscription_id'), '/resourceGroups/', parameters('resource_group'), '/providers/microsoft.Web/connections/azureblob')]",
"connectionName": "azureblob",
"id": "[concat('/subscriptions/', parameters('subscription_id'), '/providers/Microsoft.Web/locations/', parameters('location'), '/managedApis/azureblob')]"
}
}
}
}
},
"dependsOn": [
]
}
]
}

View file

@ -0,0 +1,30 @@
from azureml.core.authentication import (AuthenticationException,
AzureCliAuthentication,
InteractiveLoginAuthentication,
ServicePrincipalAuthentication)
import logging
import os
def get_auth():
logger = logging.getLogger(__name__)
if os.environ.get("AML_SP_PASSWORD", None):
logger.debug("Trying to create Workspace with Service Principal")
aml_sp_password = os.environ.get("AML_SP_PASSWORD")
aml_sp_tennant_id = os.environ.get("AML_SP_TENNANT_ID")
aml_sp_username = os.environ.get("AML_SP_USERNAME")
auth = ServicePrincipalAuthentication(
tenant_id=aml_sp_tennant_id,
username=aml_sp_username,
password=aml_sp_password,
)
else:
logger.debug("Trying to create Workspace with CLI Authentication")
try:
auth = AzureCliAuthentication()
auth.get_authentication_header()
except AuthenticationException:
logger.debug("Trying to create Workspace with Interactive login")
auth = InteractiveLoginAuthentication()
return auth
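As a usage sketch, get_auth plugs straight into the AML SDK when fetching a workspace; the workspace name, resource group, and subscription id below are placeholders.
from azureml.core import Workspace

ws = Workspace.get(
    name="myworkspace",
    resource_group="myresourcegroup",
    subscription_id="<subscription-id>",
    auth=get_auth(),
)
print(ws.name, ws.location)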

104
archectures/Python-Keras-Training/.gitignore vendored Normal file
View file

@ -0,0 +1,104 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/

View file

@ -0,0 +1,113 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": "# Data Processing\nIn this notebook we convert the ImageNet data to the appropriate format so that we can use it for training.\n\nThe dataset has many versions, the one commonly used for image classification is ILSVRC 2012. Go to the [download page](http://www.image-net.org/download-images) (you may need to register an account), and find the page for ILSVRC2012. You will need to download two files ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "from pathlib import Path"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "DATA=Path(\"/data\")"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!mkdir -p {DATA/\"train\"}\n!tar -C {DATA/\"train\"} -xf {DATA/\"ILSVRC2012_img_train.tar\"}"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import tarfile\nfrom tqdm import tqdm_notebook\nimport os"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "filenames = list((DATA/\"train\").glob(\"*.tar\"))\npbar = tqdm_notebook(total=len(filenames))\nfor class_tar in filenames:\n pbar.set_description('Extracting '+class_tar.name+ '...')\n class_dir = os.path.splitext(class_tar)[0]\n os.mkdir(class_dir)\n with tarfile.open(class_tar) as f:\n f.extractall(class_dir)\n os.remove(class_tar)\n pbar.update(1)"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!rm -r {DATA/\"validation\"}"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!mkdir -p {DATA/\"validation\"}\n!tar -C {DATA/\"validation\"} -xf {DATA/\"ILSVRC2012_img_val.tar\"}"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "The validation data comes without labels so wee ned to run a script to asign the images to the appropriate classes."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "validation_path = DATA/\"validation\"\nvalidation_preparation_script = Path(os.getcwd())/\"valprep.sh\""
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!bash -c \"cd {validation_path} && {validation_preparation_script}\""
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Finally we package the processed directories so that we can upload them quicker."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!cd {DATA} && tar -czvf train.tar.gz train"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!cd {DATA} && tar -czvf validation.tar.gz validation"
}
],
"metadata": {
"jupytext": {
"text_representation": {
"extension": ".py",
"format_name": "light",
"format_version": "1.3",
"jupytext_version": "0.8.6"
}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
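After extraction, a quick sanity check is worthwhile (a sketch assuming the /data layout above): ILSVRC2012 should yield 1000 training class directories and 50,000 validation images.
from pathlib import Path

DATA = Path("/data")
n_classes = sum(1 for p in (DATA / "train").iterdir() if p.is_dir())
n_val = sum(1 for _ in (DATA / "validation").rglob("*.JPEG"))
print(n_classes, n_val)  # expect 1000 and 50000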

View file

@ -0,0 +1,349 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": "# Create Azure and Batch AI Resources\nIn this notebook we will create the necessary resources to train a ResNet50 model([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the ImageNet dataset. If you plan on using fake data then the sections marked optional can be skipped. This notebook will take you through the following steps:\n * [Create Azure Resources](#azure_resources)\n * [Create Fileserver(NFS)](#create_fileshare)\n * [Upload Data to Blob (Optional)](#upload_data)\n * [Configure Batch AI Cluster](#configure_cluster)"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import sys\nsys.path.append(\"common\") \n\nfrom dotenv import set_key\nimport os\nimport json\nfrom utils import get_password, dotenv_for\nfrom pathlib import Path"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Below are the variables that describe our experiment. By default we are using the NC24rs_v3 (Standard_NC24rs_v3) VMs which have V100 GPUs and Infiniband. By default we are using 2 nodes with each node having 4 GPUs, this equates to 8 GPUs. Feel free to increase the number of nodes but be aware what limitations your subscription may have.\n\nSet the USE_FAKE to True if you want to use fake data rather than the Imagenet dataset. This is often a good way to debug your models as well as checking what IO overhead is."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": "# Variables for Batch AI - change as necessary\nID = \"dtdemo\"\nGROUP_NAME = f\"batch{ID}rg\"\nSTORAGE_ACCOUNT_NAME = f\"batch{ID}st\"\nFILE_SHARE_NAME = f\"batch{ID}share\"\nSELECTED_SUBSCRIPTION = \"<YOUR_SUBSCRIPTION>\"\nWORKSPACE = \"workspace\"\nNUM_NODES = 2\nCLUSTER_NAME = \"msv100\"\nVM_SIZE = \"Standard_NC24rs_v3\"\nGPU_TYPE = \"V100\"\nPROCESSES_PER_NODE = 4\nLOCATION = \"eastus\"\nNFS_NAME = f\"batch{ID}nfs\"\nUSERNAME = \"batchai_user\"\nUSE_FAKE = False\nDOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\")\nDATA = Path(\"/data\")\nCONTAINER_NAME = f\"batch{ID}container\"\nDOCKER_PWD = \"<YOUR_DOCKER_PWD>\"\n\ndotenv_path = dotenv_for()\nset_key(dotenv_path, 'DOCKER_PWD', DOCKER_PWD)\nset_key(dotenv_path, 'GROUP_NAME', GROUP_NAME)\nset_key(dotenv_path, 'FILE_SHARE_NAME', FILE_SHARE_NAME)\nset_key(dotenv_path, 'WORKSPACE', WORKSPACE)\nset_key(dotenv_path, 'NUM_NODES', str(NUM_NODES))\nset_key(dotenv_path, 'CLUSTER_NAME', CLUSTER_NAME)\nset_key(dotenv_path, 'GPU_TYPE', GPU_TYPE)\nset_key(dotenv_path, 'PROCESSES_PER_NODE', str(PROCESSES_PER_NODE))\nset_key(dotenv_path, 'STORAGE_ACCOUNT_NAME', STORAGE_ACCOUNT_NAME)"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='azure_resources'></a>\n## Create Azure Resources\nFirst we need to log in to our Azure account. "
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az login -o table"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "If you have more than one Azure account you will need to select it with the command below. If you only have one account you can skip this step."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az account set --subscription \"$SELECTED_SUBSCRIPTION\""
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az account list -o table"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Next we create the group that will hold all our Azure resources."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az group create -n $GROUP_NAME -l $LOCATION -o table"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "We will create the storage account that will store our fileshare where all the outputs from the jobs will be stored."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "json_data = !az storage account create -l $LOCATION -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME --sku Standard_LRS\nprint('Storage account {} provisioning state: {}'.format(STORAGE_ACCOUNT_NAME, \n json.loads(''.join(json_data))['provisioningState']))"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "json_data = !az storage account keys list -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME\nstorage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['value']"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az storage share create --account-name $STORAGE_ACCOUNT_NAME \\\n--account-key $storage_account_key --name $FILE_SHARE_NAME"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az storage directory create --share-name $FILE_SHARE_NAME --name scripts \\\n--account-name $STORAGE_ACCOUNT_NAME --account-key $storage_account_key"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Here we are setting some defaults so we don't have to keep adding them to every command"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az configure --defaults location=$LOCATION\n!az configure --defaults group=$GROUP_NAME"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "%env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME\n%env AZURE_STORAGE_KEY=$storage_account_key"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "#### Create Workspace\nBatch AI has the concept of workspaces and experiments. Below we will create the workspace for our work."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai workspace create -n $WORKSPACE -g $GROUP_NAME"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='upload_data'></a>\n## Upload Data to Blob (Optional)\nIn this section we will create a blob container and upload the imagenet data we prepared locally in the previous notebook.\n\n**You only need to run this section if you want to use real data. If USE_FAKE is set to False the commands below won't be executed.**\n"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "if USE_FAKE is False:\n !az storage container create --account-name {STORAGE_ACCOUNT_NAME} \\\n --account-key {storage_account_key} \\\n --name {CONTAINER_NAME}"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "if USE_FAKE is False:\n # Should take about 20 minutes\n !azcopy --source {DATA/\"train.tar.gz\"} \\\n --destination https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/train.tar.gz \\\n --dest-key {storage_account_key} --quiet"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "if USE_FAKE is False:\n !azcopy --source {DATA/\"validation.tar.gz\"} \\\n --destination https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/validation.tar.gz \\\n --dest-key {storage_account_key} --quiet"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='create_fileshare'></a>\n## Create Fileserver\nIn this example we will store the data on an NFS fileshare. It is possible to use many storage solutions with Batch AI. NFS offers the best tradeoff between performance and ease of use. The best performance is achieved by loading the data locally but this can be cumbersome since it requires that the data is download by the all the nodes which with the ImageNet dataset can take hours. If you are using fake data we won't be using the fileserver but we will create one so that if you want to run the real ImageNet data later the server is ready."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai file-server create -n $NFS_NAME --disk-count 4 --disk-size 250 -w $WORKSPACE \\\n-s Standard_DS4_v2 -u $USERNAME -p {get_password(dotenv_for())} -g $GROUP_NAME --storage-sku Premium_LRS"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai file-server list -o table -w $WORKSPACE -g $GROUP_NAME"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "json_data = !az batchai file-server list -w $WORKSPACE -g $GROUP_NAME\nnfs_ip=json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['mountSettings']['fileServerPublicIp']"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "After we have created the NFS share we need to copy the data to it. To do this we write the script below which will be executed on the fileserver. It installs a tool called azcopy and then downloads and extracts the data to the appropriate directory."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "nodeprep_script = f\"\"\"\n#!/usr/bin/env bash\nwget https://gist.githubusercontent.com/msalvaris/073c28a9993d58498957294d20d74202/raw/87a78275879f7c9bb8d6fb9de8a2d2996bb66c24/install_azcopy\nchmod 777 install_azcopy\nsudo ./install_azcopy\n\nmkdir -p /data/imagenet\n\nazcopy --source https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/validation.tar.gz \\\n --destination /data/imagenet/validation.tar.gz\\\n --source-key {storage_account_key}\\\n --quiet\n\n\nazcopy --source https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net/{CONTAINER_NAME}/train.tar.gz \\\n --destination /data/imagenet/train.tar.gz\\\n --source-key {storage_account_key}\\\n --quiet\n\ncd /data/imagenet\ntar -xzf train.tar.gz\ntar -xzf validation.tar.gz\n\"\"\""
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "with open('nodeprep.sh', 'w') as f:\n f.write(nodeprep_script)"
},
{
"cell_type": "markdown",
"metadata": {
"lines_to_next_cell": 2
},
"source": "Next we will copy the file over and run it on the NFS VM. This will install azcopy and download and prepare the data"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "if USE_FAKE:\n raise Warning(\"You should not be running this section if you simply want to use fake data\")"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "if USE_FAKE is False:\n !sshpass -p {get_password(dotenv_for())} scp -o \"StrictHostKeyChecking=no\" nodeprep.sh $USERNAME@{nfs_ip}:~/"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "if USE_FAKE is False:\n !sshpass -p {get_password(dotenv_for())} ssh -o \"StrictHostKeyChecking=no\" $USERNAME@{nfs_ip} \"sudo chmod 777 ~/nodeprep.sh && ./nodeprep.sh\""
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='configure_cluster'></a>\n## Configure Batch AI Cluster\nWe then upload the scripts we wish to execute onto the fileshare. The fileshare will later be mounted by Batch AI. An alternative to uploading the scripts would be to embedd them inside the Docker image."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az storage file upload --share-name $FILE_SHARE_NAME --source HorovodPytorch/cluster_config/docker.service --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source HorovodPytorch/cluster_config/nodeprep.sh --path scripts"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Below it the command to create the cluster. "
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai cluster create \\\n -w $WORKSPACE \\\n --name $CLUSTER_NAME \\\n --image UbuntuLTS \\\n --vm-size $VM_SIZE \\\n --min $NUM_NODES --max $NUM_NODES \\\n --afs-name $FILE_SHARE_NAME \\\n --afs-mount-path extfs \\\n --user-name $USERNAME \\\n --password {get_password(dotenv_for())} \\\n --storage-account-name $STORAGE_ACCOUNT_NAME \\\n --storage-account-key $storage_account_key \\\n --nfs $NFS_NAME \\\n --nfs-mount-path nfs \\\n --config-file HorovodPytorch/cluster_config/cluster.json"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Let's check that the cluster was created succesfully."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai cluster show -n $CLUSTER_NAME -w $WORKSPACE"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai cluster list -w $WORKSPACE -o table"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai cluster node list -c $CLUSTER_NAME -w $WORKSPACE -o table"
}
],
"metadata": {
"jupytext": {
"text_representation": {
"extension": ".py",
"format_name": "light",
"format_version": "1.3",
"jupytext_version": "0.8.6"
}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
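Before submitting jobs it can help to wait until every node in the cluster is idle. The sketch below polls az batchai cluster show from Python; it assumes the CLI is logged in and that the response carries a nodeStateCounts field, and reuses the values set earlier in this notebook.
import json
import subprocess
import time

def idle_node_count(cluster, workspace, group):
    out = subprocess.run(
        ["az", "batchai", "cluster", "show", "-n", cluster, "-w", workspace, "-g", group],
        capture_output=True, text=True, check=True)
    return json.loads(out.stdout)["nodeStateCounts"]["idleNodeCount"]

while idle_node_count("msv100", "workspace", "batchdtdemorg") < 2:  # NUM_NODES from above
    time.sleep(30)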

View file

@ -0,0 +1,64 @@
FROM ubuntu:16.04
COPY environment.yml .
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
cmake \
wget \
curl \
gfortran \
apt-transport-https \
jq \
locales \
git \
sshpass \
openssh-client \
software-properties-common && \
rm -rf /var/lib/apt/lists/*
RUN locale-gen en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
# Install Docker
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - && \
apt-key fingerprint 0EBFCD88 && \
add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
$(lsb_release -cs) \
stable" &&\
apt-get update && apt-get install -y --no-install-recommends docker-ce
ENV ENV_NAME=py3.6
RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda env create -q --name $ENV_NAME -f environment.yml && \
/opt/conda/bin/conda clean -ya && \
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
echo "conda activate $ENV_NAME" >> ~/.bashrc
ENV PATH /opt/conda/envs/$ENV_NAME/bin:/opt/conda/bin:$PATH
COPY jupyter_notebook_config.py /root/.jupyter/
# Install Azure CLI
RUN echo "deb [arch=amd64] https://packages.microsoft.com/repos/azure-cli/ xenial main" | \
tee /etc/apt/sources.list.d/azure-cli.list && \
curl -L https://packages.microsoft.com/keys/microsoft.asc | apt-key add - && \
apt-get update && \
apt-get install -y --no-install-recommends \
azure-cli
# Install AzCopy
RUN echo "deb [arch=amd64] https://packages.microsoft.com/repos/microsoft-ubuntu-xenial-prod/ xenial main" > azure.list &&\
cp ./azure.list /etc/apt/sources.list.d/ &&\
apt-key adv --keyserver packages.microsoft.com --recv-keys B02C46DF417A0893 &&\
apt-get update &&\
apt-get install -y --no-install-recommends azcopy
WORKDIR /workspace
CMD /bin/bash

View file

@ -0,0 +1,17 @@
channels:
- conda-forge
dependencies:
- python=3.6
- numpy
- pyyaml
- scipy
- ipython
- pandas
- jupyter
- ipykernel
- scikit-learn
- pillow
- bokeh=0.13.0
- pip:
- https://github.com/theskumar/python-dotenv/archive/master.zip
- docker

View file

@ -0,0 +1,6 @@
# Configuration file for jupyter-notebook.
c.NotebookApp.ip = "0.0.0.0"
c.NotebookApp.port = 9999
c.NotebookApp.open_browser = False
c.NotebookApp.allow_root = True

View file

@ -0,0 +1,111 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": "# Create Docker Image for Keras\nIn this notebook we will create the Docker image for our Keras script to run in. We will go through the process of creating the image and testing it locally to make sure it runs before submitting it to the cluster. It is often recommended you do this rather than debugging on the cluster since debugging on a cluster can be much more difficult and time consuming.\n \n**You will need to be running everything on a GPU enabled VM to run this notebook.** "
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import sys\nsys.path.append(\"../common\") \n\nfrom dotenv import get_key\nimport os\nfrom utils import dotenv_for\nimport docker"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "We will use fake data here since we don't want to have to download the data etc. Using fake data is often a good way to debug your models as well as checking what IO overhead is. Here we are setting the number of processes (NUM_PROCESSES) to 2 since the VM we are testing on has 2 GPUs. If you are running on a machine with 1 GPU set NUM_PROCESSES to 1."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": "dotenv_path = dotenv_for()\nUSE_FAKE = True\nDOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\")\nNUM_PROCESSES = 2\nDOCKER_PWD = get_key(dotenv_path, 'DOCKER_PWD')"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "dc = docker.from_env()"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "image, log_iter = dc.images.build(path='Docker', \n tag='{}/caia-horovod-keras'.format(DOCKERHUB))"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "container_labels = {'containerName': 'kerasgpu'}\nenvironment ={\n \"DISTRIBUTED\":True,\n \"PYTHONPATH\":'/workspace/common/',\n}\n\nvolumes = {\n os.getenv('EXT_PWD'): {\n 'bind': '/workspace', \n 'mode': 'rw'\n }\n}\n\nif USE_FAKE:\n environment['FAKE'] = True\nelse:\n environment['FAKE'] = False\n volumes[os.getenv('EXT_DATA')]={'bind': '/mnt/input', 'mode': 'rw'}\n environment['AZ_BATCHAI_INPUT_TRAIN'] = '/mnt/input/train'\n environment['AZ_BATCHAI_INPUT_TEST'] = '/mnt/input/validation'"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "cmd=f'mpirun -np {NUM_PROCESSES} -H localhost:{NUM_PROCESSES} '\\\n 'python -u /workspace/HorovodTF/src/imagenet_estimator_tf_horovod.py'\ncontainer = dc.containers.run(image.tags[0], \n command=cmd,\n detach=True, \n labels=container_labels,\n runtime='nvidia',\n volumes=volumes,\n environment=environment,\n shm_size='8G',\n privileged=True)"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "With the code below we are simply monitoring what is happening in the container. Feel free to stop the notebook when you are happy that everything is working."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "for line in container.logs(stderr=True, stream=True):\n print(line.decode(\"utf-8\"),end =\"\")"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "container.reload() # Refresh state\nif container.status is 'running':\n container.kill()"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "for line in dc.images.push(image.tags[0], \n stream=True,\n auth_config={\"username\": DOCKERHUB,\n \"password\": DOCKER_PWD}):\n print(line)"
}
],
"metadata": {
"jupytext": {
"text_representation": {
"extension": ".py",
"format_name": "light",
"format_version": "1.3",
"jupytext_version": "0.8.6"
}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
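Rather than hard-coding NUM_PROCESSES, the GPU count of the test VM can be derived automatically; a sketch assuming nvidia-smi is installed:
import subprocess

# nvidia-smi -L prints one "GPU <n>: ..." line per device
out = subprocess.run(["nvidia-smi", "-L"], capture_output=True, text=True, check=True)
NUM_PROCESSES = sum(1 for line in out.stdout.splitlines() if line.startswith("GPU "))
print(NUM_PROCESSES)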

View file

@ -0,0 +1,279 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": "# Train Keras Model Distributed on Batch AI\nIn this notebook we will train a Keras model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n * [Create Experiment](#experiment)\n * [Upload Training Scripts](#training_scripts)\n * [Submit and Monitor Job](#job)\n * [Clean Up Resources](#clean_up)"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import sys\nsys.path.append(\"../common\") \n\nimport json\nfrom dotenv import get_key\nimport os\nfrom utils import write_json_to_file, dotenv_for"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Set the USE_FAKE to True if you want to use fake data rather than the ImageNet dataset. This is often a good way to debug your models as well as checking what IO overhead is."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": "# Variables for Batch AI - change as necessary\ndotenv_path = dotenv_for()\nGROUP_NAME = get_key(dotenv_path, 'GROUP_NAME')\nFILE_SHARE_NAME = get_key(dotenv_path, 'FILE_SHARE_NAME')\nWORKSPACE = get_key(dotenv_path, 'WORKSPACE')\nNUM_NODES = int(get_key(dotenv_path, 'NUM_NODES'))\nCLUSTER_NAME = get_key(dotenv_path, 'CLUSTER_NAME')\nGPU_TYPE = get_key(dotenv_path, 'GPU_TYPE')\nPROCESSES_PER_NODE = int(get_key(dotenv_path, 'PROCESSES_PER_NODE'))\nSTORAGE_ACCOUNT_NAME = get_key(dotenv_path, 'STORAGE_ACCOUNT_NAME')\n\nEXPERIMENT = f\"distributed_keras_{GPU_TYPE}\"\nUSE_FAKE = False\nDOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\")"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "FAKE='-x FAKE=True' if USE_FAKE else ''\nTOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='experiment'></a>\n# Create Experiment\nNext we create our experiment."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai experiment create -n $EXPERIMENT -g $GROUP_NAME -w $WORKSPACE"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='training_scripts'></a>\n# Upload Training Scripts\nWe need to upload our training scripts and associated files"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "json_data = !az storage account keys list -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME\nstorage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['value']"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "%env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME\n%env AZURE_STORAGE_KEY=$storage_account_key"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Upload our training scripts"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_keras_horovod.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source src/data_generator.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Let's check our cluster we created earlier"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai cluster list -w $WORKSPACE -o table"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='job'></a>\n## Submit and Monitor Job\nBelow we specify the job we wish to execute. "
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "jobs_dict = {\n \"$schema\": \"https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2017-09-01-preview/job.json\",\n \"properties\": {\n \"nodeCount\": NUM_NODES,\n \"customToolkitSettings\": {\n \"commandLine\": f\"echo $AZ_BATCH_HOST_LIST; \\\n cat $AZ_BATCHAI_MPI_HOST_FILE; \\\n mpirun -np {TOTAL_PROCESSES} --hostfile $AZ_BATCHAI_MPI_HOST_FILE \\\n -bind-to none -map-by slot \\\n -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \\\n -mca btl_tcp_if_include eth0 \\\n -x NCCL_SOCKET_IFNAME=eth0 \\\n -mca btl ^openib \\\n -x NCCL_IB_DISABLE=1 \\\n -x DISTRIBUTED=True \\\n -x AZ_BATCHAI_INPUT_TRAIN \\\n -x AZ_BATCHAI_INPUT_TEST \\\n --allow-run-as-root \\\n {FAKE} \\\n python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_leras_horovod.py\"\n },\n \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n \"inputDirectories\": [{\n \"id\": \"SCRIPTS\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts\"\n },\n {\n \"id\": \"TRAIN\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n },\n {\n \"id\": \"TEST\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n },\n ],\n \"outputDirectories\": [{\n \"id\": \"MODEL\",\n \"pathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n \"pathSuffix\": \"Models\"\n }],\n \"containerSettings\": {\n \"imageSourceRegistry\": {\n \"image\": f\"{DOCKERHUB}/caia-horovod-keras\"\n }\n }\n }\n}"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "write_json_to_file(jobs_dict, 'job.json')"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "JOB_NAME='keras-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "We now submit the job to Batch AI"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai job create -n $JOB_NAME --cluster $CLUSTER_NAME -w $WORKSPACE -e $EXPERIMENT -f job.json"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "With the command below we can check the status of the job"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai job list -w $WORKSPACE -e $EXPERIMENT -o table"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "To view the files that the job has generated use the command below"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai job file list -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "We are also able to stream the stdout and stderr that our job produces. This is great to check the progress of our job as well as debug issues."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stdout.txt"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stderr.txt"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "We can either wait for the job to complete or delete it with the command below."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai job delete -w $WORKSPACE -e $EXPERIMENT --name $JOB_NAME -y"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='clean_up'></a>\n## Clean Up Resources\nNext we wish to tidy up the resource we created. \nFirst we reset the default values we set earlier."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az configure --defaults group=''\n!az configure --defaults location=''"
},
{
"cell_type": "markdown",
"metadata": {},
"source": " Next we delete the cluster"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai cluster delete -w $WORKSPACE --name $CLUSTER_NAME -g $GROUP_NAME -y"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Once the cluster is deleted you will not incur any cost for the computation but you can still retain your experiments and workspace. If you wish to delete those as well execute the commands below."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai experiment delete -w $WORKSPACE --name $EXPERIMENT -g $GROUP_NAME -y"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai workspace delete -n $WORKSPACE -g $GROUP_NAME -y"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Finally we can delete the group and we will have deleted everything created for this tutorial."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az group delete --name $GROUP_NAME -y"
}
],
"metadata": {
"jupytext": {
"text_representation": {
"extension": ".py",
"format_name": "light",
"format_version": "1.3",
"jupytext_version": "0.8.6"
}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,63 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04
# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV PYTHON_VERSION=3.5
ENV TENSORFLOW_VERSION=1.9.0
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
ENV NCCL_VERSION=2.2.13-1+cuda9.0
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
RUN apt-get update && apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
build-essential \
cmake \
cpio \
git \
curl \
wget \
ca-certificates \
libdapl2 \
libcudnn7=${CUDNN_VERSION} \
libnccl2=${NCCL_VERSION} \
libnccl-dev=${NCCL_VERSION} \
libjpeg-dev \
libpng-dev \
libmlx4-1 \
libsm6 \
libxext6 \
python$PYTHON_VERSION \
python$PYTHON_VERSION-dev
# install intel MPI
RUN cd /tmp && \
wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz' && \
tar zxvf l_mpi_2017.3.196.tgz && \
sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' \
/tmp/l_mpi_2017.3.196/silent.cfg && \
sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
cd /tmp/l_mpi_2017.3.196 && \
./install.sh -s silent.cfg && \
cd .. && \
rm -rf l_mpi_2017.3.196* && \
echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc
ENV PATH $PATH:/opt/intel/compilers_and_libraries/linux/mpi/bin64
RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
# Install TensorFlow and Keras
RUN pip install --no-cache-dir tensorflow-gpu==$TENSORFLOW_VERSION h5py scipy jupyter ipykernel numpy toolz pandas \
scikit-learn keras pillow
# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
/bin/bash -c "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" && \
HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.13.2 && \
ldconfig

View file

@ -0,0 +1,53 @@
import numpy as np
import keras
import logging
def _get_logger():
return logging.getLogger(__name__)
def _create_data(batch_size, num_batches, dim, channels, seed=42):
np.random.seed(seed)
return np.random.rand(batch_size * num_batches,
dim[0],
dim[1],
channels).astype(np.float32)
def _create_labels(batch_size, num_batches, n_classes):
return np.random.choice(n_classes, batch_size * num_batches)
class FakeDataGenerator(keras.preprocessing.image.Iterator):
def __init__(self,
batch_size=32,
num_batches=20,
dim=(224, 224),
n_channels=3,
n_classes=10,
length=1000,
shuffle=True,
seed=42):
'Initialization'
super(FakeDataGenerator, self).__init__(length,
batch_size,
shuffle,
seed)
self.dim = dim
self.n_channels = n_channels
self.n_classes = n_classes
self.num_batches = num_batches
self._data = _create_data(self.batch_size, self.num_batches, self.dim, self.n_channels)
self._labels = _create_labels(self.batch_size, self.num_batches, self.n_classes)
self.translation_index = np.random.choice(len(self._labels), length)
def _get_batches_of_transformed_samples(self, index_array):
logger = _get_logger()
logger.debug('Retrieving samples')
logger.debug(str(index_array))
tr_index_array = self.translation_index[index_array]
return self._data[tr_index_array], keras.utils.to_categorical(self._labels[tr_index_array], num_classes=self.n_classes)
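A quick smoke test of the generator, as a sketch; it assumes the class above is importable, and with the defaults each batch is 224x224 RGB with one-hot labels.
gen = FakeDataGenerator(batch_size=4, num_batches=2, n_classes=10, length=16)
x, y = next(gen)
print(x.shape, y.shape)  # expect (4, 224, 224, 3) and (4, 10)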

View file

@ -0,0 +1,357 @@
"""
Trains ResNet50 in Keras using Horovod.
It requires the following env variables
AZ_BATCHAI_INPUT_TRAIN
AZ_BATCHAI_INPUT_TEST
AZ_BATCHAI_OUTPUT_MODEL
AZ_BATCHAI_JOB_TEMP_DIR
"""
import logging
import os
import sys
from functools import lru_cache
from timer import Timer
import keras
import tensorflow as tf
from data_generator import FakeDataGenerator
from keras import backend as K
from keras.preprocessing import image
def _str_to_bool(in_str):
if "t" in in_str.lower():
return True
else:
return False
_WIDTH = 224
_HEIGHT = 224
_CHANNELS = 3
_LR = 0.001
_EPOCHS = int(os.getenv("EPOCHS", 1))
_BATCHSIZE = 64
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
# Settings from https://arxiv.org/abs/1706.02677.
_WARMUP_EPOCHS = 5
_WEIGHT_DECAY = 0.00005
_NUM_WORKERS = int(os.getenv("NUM_WORKERS", 10))
_MAX_QUEUE_SIZE = int(os.getenv("MAX_QUEUE_SIZE", 10))
_MULTIPROCESSING = _str_to_bool(os.getenv("MULTIPROCESSING", "False"))
_DISTRIBUTED = _str_to_bool(os.getenv("DISTRIBUTED", "False"))
_FAKE = _str_to_bool(os.getenv("FAKE", "False"))
_DATA_LENGTH = int(
os.getenv("FAKE_DATA_LENGTH", 1281167)
) # How much fake data to simulate, default to size of imagenet dataset
_VALIDATION = _str_to_bool(os.getenv("VALIDATION", "False"))
if _DISTRIBUTED:
import horovod.keras as hvd
def _get_rank():
if _DISTRIBUTED:
try:
return hvd.rank()
except Exception:
return 0
else:
return 0
class HorovodAdapter(logging.LoggerAdapter):
def __init__(self, logger):
self._str_epoch = ""
self._gpu_rank = 0
super(HorovodAdapter, self).__init__(logger, {})
def set_epoch(self, epoch):
self._str_epoch = "[Epoch {}]".format(epoch)
def process(self, msg, kwargs):
kwargs["extra"] = {"gpurank": _get_rank(), "epoch": self._str_epoch}
return msg, kwargs
@lru_cache()
def _get_logger():
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
ch = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter(
"%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s"
)
ch.setFormatter(formatter)
logger.addHandler(ch)
adapter = HorovodAdapter(logger)
return adapter
def _create_model():
logger = _get_logger()
logger.info("Creating model")
# Set up standard ResNet-50 model.
model = keras.applications.resnet50.ResNet50(weights=None)
# ResNet-50 model that is included with Keras is optimized for inference.
# Add L2 weight decay & adjust BN settings.
model_config = model.get_config()
for layer, layer_config in zip(model.layers, model_config["layers"]):
if hasattr(layer, "kernel_regularizer"):
regularizer = keras.regularizers.l2(_WEIGHT_DECAY)
layer_config["config"]["kernel_regularizer"] = {
"class_name": regularizer.__class__.__name__,
"config": regularizer.get_config(),
}
if type(layer) == keras.layers.BatchNormalization:
layer_config["config"]["momentum"] = 0.9
layer_config["config"]["epsilon"] = 1e-5
model = keras.models.Model.from_config(model_config)
return model
def _validation_data_iterator_from():
# Validation data iterator.
test_gen = image.ImageDataGenerator(
zoom_range=(0.875, 0.875),
preprocessing_function=keras.applications.resnet50.preprocess_input,
)
test_iter = test_gen.flow_from_directory(
os.getenv("AZ_BATCHAI_INPUT_TEST"),
batch_size=_BATCHSIZE,
target_size=(224, 224),
)
return test_iter
def _training_data_iterator_from():
# Training data iterator.
train_gen = image.ImageDataGenerator(
width_shift_range=0.33,
height_shift_range=0.33,
zoom_range=0.5,
horizontal_flip=True,
preprocessing_function=keras.applications.resnet50.preprocess_input,
)
train_iter = train_gen.flow_from_directory(
os.getenv("AZ_BATCHAI_INPUT_TRAIN"),
batch_size=_BATCHSIZE,
target_size=(224, 224),
)
return train_iter
def _fake_data_iterator_from(length=_DATA_LENGTH):
return FakeDataGenerator(batch_size=_BATCHSIZE, n_classes=1000, length=length)
def _get_optimizer(params, is_distributed=_DISTRIBUTED):
if is_distributed:
# Horovod: adjust learning rate based on number of GPUs.
opt = keras.optimizers.SGD(
lr=params["learning_rate"] * hvd.size(), momentum=params["momentum"]
)
# Horovod: add Horovod Distributed Optimizer.
return hvd.DistributedOptimizer(opt)
else:
return keras.optimizers.SGD(
lr=params["learning_rate"], momentum=params["momentum"]
)
def _get_runconfig(is_distributed=_DISTRIBUTED):
if is_distributed:
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
else:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
return config
def _get_model_dir(is_distributed=_DISTRIBUTED):
if is_distributed:
# Horovod: save checkpoints only on worker 0 to prevent other workers from
# corrupting them.
return (
os.getenv("AZ_BATCHAI_OUTPUT_MODEL")
if hvd.rank() == 0
else os.getenv("AZ_BATCHAI_JOB_TEMP_DIR")
)
else:
return os.getenv("AZ_BATCHAI_OUTPUT_MODEL")
def _get_hooks(is_distributed=_DISTRIBUTED, verbose=1):
logger = _get_logger()
if is_distributed:
logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
return [
# Horovod: broadcast initial variable states from rank 0 to all other processes.
# This is necessary to ensure consistent initialization of all workers when
# training is started with random weights or restored from a checkpoint.
hvd.callbacks.BroadcastGlobalVariablesCallback(0),
# Horovod: average metrics among workers at the end of every epoch.
#
# Note: This callback must be in the list before the ReduceLROnPlateau,
# TensorBoard, or other metrics-based callbacks.
hvd.callbacks.MetricAverageCallback(),
# Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
# accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
# the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
hvd.callbacks.LearningRateWarmupCallback(
warmup_epochs=_WARMUP_EPOCHS, verbose=verbose
),
# Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
hvd.callbacks.LearningRateScheduleCallback(
start_epoch=_WARMUP_EPOCHS, end_epoch=30, multiplier=1.0
),
hvd.callbacks.LearningRateScheduleCallback(
start_epoch=30, end_epoch=60, multiplier=1e-1
),
hvd.callbacks.LearningRateScheduleCallback(
start_epoch=60, end_epoch=80, multiplier=1e-2
),
hvd.callbacks.LearningRateScheduleCallback(start_epoch=80, multiplier=1e-3),
]
else:
return []
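# Worked example of the resulting schedule (assuming 4 GPUs and _LR = 0.001):
# the optimizer is created with lr = 0.001 * 4 = 0.004, the warmup ramps from
# 0.001 up to 0.004 over epochs 0-5, the rate then holds at 0.004 until epoch
# 30 and decays to 4e-4, 4e-5 and 4e-6 at epochs 30, 60 and 80 respectively.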
class LoggerCallback(keras.callbacks.Callback):
def __init__(self, logger, data_length):
self._timer = Timer(
output=logger.info, prefix="Epoch duration: ", fmt="{:.3f} seconds"
)
self._data_length = data_length
def on_epoch_begin(self, epoch, logs):
logger = _get_logger()
logger.set_epoch(epoch)
self._timer.start()
def on_epoch_end(self, epoch, logs):
duration = self._timer.elapsed
_log_summary(self._data_length, duration)
def _is_master(is_distributed=_DISTRIBUTED):
if is_distributed:
if hvd.rank() == 0:
return True
else:
return False
else:
return True
def _log_summary(data_length, duration):
logger = _get_logger()
images_per_second = data_length / duration
logger.info("Data length: {}".format(data_length))
logger.info("Total duration: {:.3f}".format(duration))
logger.info("Total images/sec: {:.3f}".format(images_per_second))
logger.info(
"Batch size: (Per GPU {}: Total {})".format(
_BATCHSIZE, hvd.size() * _BATCHSIZE if _DISTRIBUTED else _BATCHSIZE
)
)
logger.info("Distributed: {}".format("True" if _DISTRIBUTED else "False"))
logger.info("Num GPUs: {:.3f}".format(hvd.size() if _DISTRIBUTED else 1))
logger.info("Dataset: {}".format("Synthetic" if _FAKE else "Imagenet"))
def main():
verbose = 1
logger = _get_logger()
if _DISTRIBUTED:
# Horovod: initialize Horovod.
hvd.init()
logger.info("Runnin Distributed")
verbose = 1 if hvd.rank() == 0 else 0
logger.info("Tensorflow version {}".format(tf.__version__))
K.set_session(tf.Session(config=_get_runconfig()))
# Horovod: broadcast resume_from_epoch from rank 0 (which will have
# checkpoints) to other ranks.
resume_from_epoch = 0
if _DISTRIBUTED:
resume_from_epoch = hvd.broadcast(
resume_from_epoch, 0, name="resume_from_epoch"
)
if _FAKE:
train_iter = _fake_data_iterator_from()
else:
train_iter = _training_data_iterator_from()
test_iter = _validation_data_iterator_from() if _VALIDATION else None
model = _create_model()
params = {"learning_rate": _LR, "momentum": 0.9}
opt = _get_optimizer(params)
model.compile(
loss=keras.losses.categorical_crossentropy,
optimizer=opt,
metrics=["accuracy", "top_k_categorical_accuracy"],
)
model_dir = _get_model_dir()
checkpoint_format = os.path.join(model_dir, "checkpoint-{epoch}.h5")
callbacks = _get_hooks()
callbacks.append(LoggerCallback(logger, len(train_iter) * _BATCHSIZE))
# Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
if _is_master():
callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format))
# callbacks.append(keras.callbacks.TensorBoard(log_dir))
# Restore from a previous checkpoint, if initial_epoch is specified.
# Horovod: restore on the first worker which will broadcast weights to other workers.
if resume_from_epoch > 0 and _is_master():
model.load_weights(checkpoint_format.format(epoch=resume_from_epoch))
logger.info("Training...")
# Train the model. The training will randomly sample 1 / N batches of training data and
# 3 / N batches of validation data on every worker, where N is the number of workers.
# Over-sampling of validation data helps to increase probability that every validation
# example will be evaluated.
num_workers = hvd.size() if _DISTRIBUTED else 1
model.fit_generator(
train_iter,
steps_per_epoch=len(train_iter) // num_workers,
callbacks=callbacks,
epochs=_EPOCHS,
verbose=verbose,
workers=_NUM_WORKERS,
max_queue_size=_MAX_QUEUE_SIZE,
use_multiprocessing=_MULTIPROCESSING,
initial_epoch=resume_from_epoch,
)
if _FAKE is False and _VALIDATION:
# Evaluate the model on the full data set.
with Timer(output=logger.info, prefix="Testing"):
logger.info("Testing...")
score = hvd.allreduce(
model.evaluate_generator(test_iter, len(test_iter), workers=10)
)
if verbose:
print("Test loss:", score[0])
print("Test accuracy:", score[1])
if __name__ == "__main__":
main()


@ -0,0 +1,111 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": "# Create Docker Image for PyTorch\nIn this notebook we will create the Docker image for our PyTorch script to run in. We will go through the process of creating the image and testing it locally to make sure it runs before submitting it to the cluster. It is often recommended you do this rather than debugging on the cluster since debugging on a cluster can be much more difficult and time consuming.\n \n**You will need to be running everything on a GPU enabled VM to run this notebook.** "
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import sys\nsys.path.append(\"../common\") \n\nfrom dotenv import get_key\nimport os\nfrom utils import dotenv_for\nimport docker"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "We will use fake data here since we don't want to have to download the data etc. Using fake data is often a good way to debug your models as well as checking what IO overhead is. Here we are setting the number of processes (NUM_PROCESSES) to 2 since the VM we are testing on has 2 GPUs. If you are running on a machine with 1 GPU set NUM_PROCESSES to 1."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": "dotenv_path = dotenv_for()\nUSE_FAKE = True\nDOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\")\nNUM_PROCESSES = 2\nDOCKER_PWD = get_key(dotenv_path, 'DOCKER_PWD')"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "dc = docker.from_env()"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "image, log_iter = dc.images.build(path='Docker', \n tag='{}/caia-horovod-pytorch'.format(DOCKERHUB))"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "container_labels = {'containerName': 'pytorchgpu'}\nenvironment ={\n \"DISTRIBUTED\":True,\n \"PYTHONPATH\":'/workspace/common/',\n}\n\nvolumes = {\n os.getenv('EXT_PWD'): {\n 'bind': '/workspace', \n 'mode': 'rw'\n }\n}\n\nif USE_FAKE:\n environment['FAKE'] = True\nelse:\n environment['FAKE'] = False\n volumes[os.getenv('EXT_DATA')]={'bind': '/mnt/input', 'mode': 'rw'}\n environment['AZ_BATCHAI_INPUT_TRAIN'] = '/mnt/input/train'\n environment['AZ_BATCHAI_INPUT_TEST'] = '/mnt/input/validation'"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "cmd=f'mpirun -np {NUM_PROCESSES} -H localhost:{NUM_PROCESSES} '\\\n 'python -u /workspace/HorovodPytorch/src/imagenet_pytorch_horovod.py'\ncontainer = dc.containers.run(image.tags[0], \n command=cmd,\n detach=True, \n labels=container_labels,\n runtime='nvidia',\n volumes=volumes,\n environment=environment,\n shm_size='8G',\n privileged=True)"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "With the code below we are simply monitoring what is happening in the container. Feel free to stop the notebook when you are happy that everything is working."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "for line in container.logs(stderr=True, stream=True):\n print(line.decode(\"utf-8\"),end =\"\")"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "container.reload() # Refresh state\nif container.status is 'running':\n container.kill()"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "for line in dc.images.push(image.tags[0], \n stream=True,\n auth_config={\"username\": DOCKERHUB,\n \"password\": DOCKER_PWD}):\n print(line)"
}
],
"metadata": {
"jupytext": {
"text_representation": {
"extension": ".py",
"format_name": "light",
"format_version": "1.3",
"jupytext_version": "0.8.6"
}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -0,0 +1,279 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": "# Train PyTorch Model Distributed on Batch AI\nIn this notebook we will train a PyTorch model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n * [Create Experiment](#experiment)\n * [Upload Training Scripts](#training_scripts)\n * [Submit and Monitor Job](#job)\n * [Clean Up Resources](#clean_up)"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import sys\nsys.path.append(\"../common\") \n\nimport json\nfrom dotenv import get_key\nimport os\nfrom utils import write_json_to_file, dotenv_for"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Set the USE_FAKE to True if you want to use fake data rather than the Imagenet dataset. This is often a good way to debug your models as well as checking what IO overhead is."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": "# Variables for Batch AI - change as necessary\ndotenv_path = dotenv_for()\nGROUP_NAME = get_key(dotenv_path, 'GROUP_NAME')\nFILE_SHARE_NAME = get_key(dotenv_path, 'FILE_SHARE_NAME')\nWORKSPACE = get_key(dotenv_path, 'WORKSPACE')\nNUM_NODES = int(get_key(dotenv_path, 'NUM_NODES'))\nCLUSTER_NAME = get_key(dotenv_path, 'CLUSTER_NAME')\nGPU_TYPE = get_key(dotenv_path, 'GPU_TYPE')\nPROCESSES_PER_NODE = int(get_key(dotenv_path, 'PROCESSES_PER_NODE'))\nSTORAGE_ACCOUNT_NAME = get_key(dotenv_path, 'STORAGE_ACCOUNT_NAME')\n\nEXPERIMENT = f\"distributed_pytorch_{GPU_TYPE}\"\nUSE_FAKE = False\nDOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\") #\"<YOUR DOCKERHUB>\""
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "FAKE='-x FAKE=True' if USE_FAKE else ''\nTOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='experiment'></a>\n# Create Experiment\nNext we create our experiment."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai experiment create -n $EXPERIMENT -g $GROUP_NAME -w $WORKSPACE"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='training_scripts'></a>\n# Upload Training Scripts\nWe need to upload our training scripts and associated files"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "json_data = !az storage account keys list -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME\nstorage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['value']"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "%env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME\n%env AZURE_STORAGE_KEY=$storage_account_key"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Upload our training scripts"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_pytorch_horovod.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Let's check our cluster we created earlier"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai cluster list -w $WORKSPACE -o table"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='job'></a>\n## Submit and Monitor Job\nBelow we specify the job we wish to execute. "
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "jobs_dict = {\n \"$schema\": \"https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2017-09-01-preview/job.json\",\n \"properties\": {\n \"nodeCount\": NUM_NODES,\n \"customToolkitSettings\": {\n \"commandLine\": f\"echo $AZ_BATCH_HOST_LIST; \\\n cat $AZ_BATCHAI_MPI_HOST_FILE; \\\n mpirun -np {TOTAL_PROCESSES} --hostfile $AZ_BATCHAI_MPI_HOST_FILE \\\n -bind-to none -map-by slot \\\n -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \\\n -mca btl_tcp_if_include eth0 \\\n -x NCCL_SOCKET_IFNAME=eth0 \\\n -mca btl ^openib \\\n -x NCCL_IB_DISABLE=1 \\\n -x DISTRIBUTED=True \\\n -x AZ_BATCHAI_INPUT_TRAIN \\\n -x AZ_BATCHAI_INPUT_TEST \\\n --allow-run-as-root \\\n {FAKE} \\\n python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_pytorch_horovod.py\"\n },\n \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n \"inputDirectories\": [{\n \"id\": \"SCRIPTS\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts\"\n },\n {\n \"id\": \"TRAIN\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n },\n {\n \"id\": \"TEST\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n },\n ],\n \"outputDirectories\": [{\n \"id\": \"MODEL\",\n \"pathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n \"pathSuffix\": \"Models\"\n }],\n \"containerSettings\": {\n \"imageSourceRegistry\": {\n \"image\": f\"{DOCKERHUB}/caia-horovod-pytorch\"\n }\n }\n }\n}"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "write_json_to_file(jobs_dict, 'job.json')"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "JOB_NAME='pytorch-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "We now submit the job to Batch AI"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai job create -n $JOB_NAME --cluster $CLUSTER_NAME -w $WORKSPACE -e $EXPERIMENT -f job.json"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "With the command below we can check the status of the job"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai job list -w $WORKSPACE -e $EXPERIMENT -o table"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "To view the files that the job has generated use the command below"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai job file list -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "We are also able to stream the stdout and stderr that our job produces. This is great to check the progress of our job as well as debug issues."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stdout.txt"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stderr.txt"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "We can either wait for the job to complete or delete it with the command below."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai job delete -w $WORKSPACE -e $EXPERIMENT --name $JOB_NAME -y"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='clean_up'></a>\n## Clean Up Resources\nNext we wish to tidy up the resource we created. \nFirst we reset the default values we set earlier."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az configure --defaults group=''\n!az configure --defaults location=''"
},
{
"cell_type": "markdown",
"metadata": {},
"source": " Next we delete the cluster"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai cluster delete -w $WORKSPACE --name $CLUSTER_NAME -g $GROUP_NAME -y"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Once the cluster is deleted you will not incur any cost for the computation but you can still retain your experiments and workspace. If you wish to delete those as well execute the commands below."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai experiment delete -w $WORKSPACE --name $EXPERIMENT -g $GROUP_NAME -y"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai workspace delete -n $WORKSPACE -g $GROUP_NAME -y"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Finally we can delete the group and we will have deleted everything created for this tutorial."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az group delete --name $GROUP_NAME -y"
}
],
"metadata": {
"jupytext": {
"text_representation": {
"extension": ".py",
"format_name": "light",
"format_version": "1.3",
"jupytext_version": "0.8.6"
}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -0,0 +1,79 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04
# PyTorch version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
ENV NCCL_VERSION=2.2.13-1+cuda9.0
ENV PYTORCH_VERSION=0.4.0
ENV PYTHON_VERSION=3.5
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
RUN apt-get update && apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
build-essential \
cmake \
git \
curl \
vim \
wget \
ca-certificates \
libcudnn7=${CUDNN_VERSION} \
libnccl2=${NCCL_VERSION} \
libnccl-dev=${NCCL_VERSION} \
libjpeg-dev \
libpng-dev \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev
RUN ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
# Install PyTorch
RUN pip install http://download.pytorch.org/whl/cu90/torch-${PYTORCH_VERSION}-cp35-cp35m-linux_x86_64.whl && \
pip install --no-cache-dir torchvision h5py scipy jupyter ipykernel numpy toolz pandas scikit-learn pillow
# Install Open MPI
RUN mkdir /tmp/openmpi && \
cd /tmp/openmpi && \
wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
tar zxf openmpi-3.0.0.tar.gz && \
cd openmpi-3.0.0 && \
./configure --enable-orterun-prefix-by-default && \
make -j $(nproc) all && \
make install && \
ldconfig && \
rm -rf /tmp/openmpi
# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod==0.13.2 && \
ldconfig
# Create a wrapper for OpenMPI to allow running as root by default
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
echo '#!/bin/bash' > /usr/local/bin/mpirun && \
echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
chmod a+x /usr/local/bin/mpirun
# Configure OpenMPI with good defaults:
# --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
# Set default NCCL parameters
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf && \
echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf
# Install OpenSSH for MPI to communicate between containers
RUN apt-get install -y --no-install-recommends openssh-client openssh-server && \
mkdir -p /var/run/sshd
# Allow OpenSSH to talk to containers without asking for confirmation
RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
WORKDIR "/examples"


@ -0,0 +1,11 @@
{
"properties": {
"nodeSetup": {
"setupTask": {
"commandLine": "$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts/nodeprep.sh",
"runElevated": "True",
"stdOutErrPathPrefix": "$AZ_BATCHAI_MOUNT_ROOT/extfs"
}
}
}
}


@ -0,0 +1,35 @@
[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network-online.target docker.socket firewalld.service
Wants=network-online.target
Requires=docker.socket
[Service]
EnvironmentFile=/etc/default/docker
Type=notify
# the default is not to use systemd for cgroups because the delegate issues still
# exists and systemd currently does not support the cgroup feature set required
# for containers run by docker
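# -g keeps Docker's data root on the node's local /mnt disk and
# --default-shm-size raises shared memory for NCCL and data-loader workers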
ExecStart=/usr/bin/dockerd --default-shm-size 8G -g /mnt/docker/ -H fd://
ExecReload=/bin/kill -s HUP $MAINPID
LimitNOFILE=1048576
# Having non-zero Limit*s causes performance problems due to accounting overhead
# in the kernel. We recommend using cgroups to do container-local accounting.
LimitNPROC=infinity
LimitCORE=infinity
# TasksMax is supported only by systemd 226 and above.
TasksMax=infinity
TimeoutStartSec=0
# set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
# kill only the docker process, not all processes in the cgroup
KillMode=process
# restart the docker process if it exits prematurely
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s
[Install]
WantedBy=multi-user.target


@ -0,0 +1,4 @@
#!/usr/bin/env bash
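# Install the customized docker.service (larger default shm size, Docker data
# root on /mnt) and restart the daemon so the new settings take effect.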
sudo cp $AZ_BATCHAI_MOUNT_ROOT/extfs/scripts/docker.service /lib/systemd/system
sudo systemctl daemon-reload
sudo systemctl restart docker


@ -0,0 +1,363 @@
"""
Trains ResNet50 in Keras using Horovod.
It requires the following env variables
AZ_BATCHAI_INPUT_TRAIN
AZ_BATCHAI_INPUT_TEST
AZ_BATCHAI_OUTPUT_MODEL
AZ_BATCHAI_JOB_TEMP_DIR
"""
import logging
import os
import sys
from functools import lru_cache
from os import path
from timer import Timer
import numpy as np
import pandas as pd
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data.distributed
import torchvision.models as models
from torch.utils.data import Dataset
from torchvision import transforms, datasets
def _str_to_bool(in_str):
if "t" in in_str.lower():
return True
else:
return False
_WIDTH = 224
_HEIGHT = 224
_CHANNELS = 3
_LR = 0.001
_EPOCHS = int(os.getenv("EPOCHS", 1))
_BATCHSIZE = 64
_RGB_MEAN = [0.485, 0.456, 0.406]
_RGB_SD = [0.229, 0.224, 0.225]
_SEED = 42
# Settings from https://arxiv.org/abs/1706.02677.
_WARMUP_EPOCHS = 5
_WEIGHT_DECAY = 0.00005
_FAKE = _str_to_bool(os.getenv("FAKE", "False"))
_DATA_LENGTH = int(
os.getenv("FAKE_DATA_LENGTH", 1281167)
) # How much fake data to simulate, default to size of imagenet dataset
_DISTRIBUTED = _str_to_bool(os.getenv("DISTRIBUTED", "False"))
if _DISTRIBUTED:
import horovod.torch as hvd
def _get_rank():
if _DISTRIBUTED:
try:
return hvd.rank()
        except Exception:
return 0
else:
return 0
class HorovodAdapter(logging.LoggerAdapter):
def __init__(self, logger):
self._str_epoch = ""
self._gpu_rank = 0
super(HorovodAdapter, self).__init__(logger, {})
def set_epoch(self, epoch):
self._str_epoch = "[Epoch {}]".format(epoch)
def process(self, msg, kwargs):
kwargs["extra"] = {"gpurank": _get_rank(), "epoch": self._str_epoch}
return msg, kwargs
@lru_cache()
def _get_logger():
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
ch = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter(
"%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s"
)
ch.setFormatter(formatter)
logger.addHandler(ch)
adapter = HorovodAdapter(logger)
return adapter
def _append_path_to(data_path, data_series):
return data_series.apply(lambda x: path.join(data_path, x))
def _load_training(data_dir):
logger = _get_logger()
logger.info("Reading training data from {}".format(data_dir))
train_df = pd.read_csv(path.join(data_dir, "train.csv"))
return train_df.assign(
filenames=_append_path_to(path.join(data_dir, "train"), train_df.filenames)
)
def _load_validation(data_dir):
logger = _get_logger()
logger.info("Reading validation data from {}".format(data_dir))
train_df = pd.read_csv(path.join(data_dir, "validation.csv"))
return train_df.assign(
filenames=_append_path_to(path.join(data_dir, "validation"), train_df.filenames)
)
def _create_data_fn(train_path, test_path):
train_df = _load_training(train_path)
validation_df = _load_validation(test_path)
# File-path
train_X = train_df["filenames"].values
validation_X = validation_df["filenames"].values
# One-hot encoded labels for torch
train_labels = train_df[["num_id"]].values.ravel()
validation_labels = validation_df[["num_id"]].values.ravel()
# Index starts from 0
train_labels -= 1
validation_labels -= 1
return train_X, train_labels, validation_X, validation_labels
def _create_data(batch_size, num_batches, dim, channels, seed=42):
np.random.seed(seed)
return np.random.rand(batch_size * num_batches, channels, dim[0], dim[1]).astype(
np.float32
)
def _create_labels(batch_size, num_batches, n_classes):
return np.random.choice(n_classes, batch_size * num_batches)
class FakeData(Dataset):
def __init__(
self,
batch_size=32,
num_batches=20,
dim=(224, 224),
n_channels=3,
n_classes=10,
length=_DATA_LENGTH,
seed=42,
data_transform=None,
):
self.dim = dim
self.n_channels = n_channels
self.n_classes = n_classes
self.num_batches = num_batches
self._data = _create_data(
batch_size, self.num_batches, self.dim, self.n_channels
)
self._labels = _create_labels(batch_size, self.num_batches, self.n_classes)
self.translation_index = np.random.choice(len(self._labels), length)
self._length = length
self._data_transform = data_transform
logger = _get_logger()
logger.info(
"Creating fake data {} labels and {} images".format(
n_classes, len(self._data)
)
)
def __getitem__(self, idx):
logger = _get_logger()
logger.debug("Retrieving samples")
logger.debug(str(idx))
tr_index_array = self.translation_index[idx]
if self._data_transform is not None:
data = self._data_transform(self._data[tr_index_array])
else:
data = self._data[tr_index_array]
return data, self._labels[tr_index_array]
def __len__(self):
return self._length
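# The translation index maps each of the `length` virtual samples onto one of
# the batch_size * num_batches pre-generated arrays, so an ImageNet-sized
# epoch can be simulated from a small in-memory buffer.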
def _is_master(is_distributed=_DISTRIBUTED):
if is_distributed:
if hvd.rank() == 0:
return True
else:
return False
else:
return True
def train(train_loader, model, criterion, optimizer, epoch):
logger = _get_logger()
msg = " duration({}) loss:{} total-samples: {}"
t = Timer()
t.start()
logger.set_epoch(epoch)
for i, (data, target) in enumerate(train_loader):
data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
optimizer.zero_grad()
# compute output
output = model(data)
loss = criterion(output, target)
# compute gradient and do SGD step
loss.backward()
optimizer.step()
if i % 100 == 0:
logger.info(msg.format(t.elapsed, loss.item(), i * len(data)))
t.start()
def validate(val_loader, model, criterion):
logger = _get_logger()
msg = "validation duration({}) loss:{} total-samples: {}"
t = Timer()
t.start()
model.eval()
with torch.no_grad():
        for i, (data, target) in enumerate(val_loader):
data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True)
# compute output
output = model(data)
loss = criterion(output, target)
            # validation computes the loss only; no backward pass or optimizer step
if i % 100 == 0:
logger.info(msg.format(t.elapsed, loss.item(), i * len(data)))
t.start()
def _log_summary(data_length, duration):
logger = _get_logger()
images_per_second = data_length / duration
logger.info("Data length: {}".format(data_length))
logger.info("Total duration: {:.3f}".format(duration))
logger.info("Total images/sec: {:.3f}".format(images_per_second))
logger.info(
"Batch size: (Per GPU {}: Total {})".format(
_BATCHSIZE, hvd.size() * _BATCHSIZE if _DISTRIBUTED else _BATCHSIZE
)
)
logger.info("Distributed: {}".format("True" if _DISTRIBUTED else "False"))
logger.info("Num GPUs: {:.3f}".format(hvd.size() if _DISTRIBUTED else 1))
logger.info("Dataset: {}".format("Synthetic" if _FAKE else "Imagenet"))
def _get_sampler(dataset, is_distributed=_DISTRIBUTED):
if is_distributed:
return torch.utils.data.distributed.DistributedSampler(
dataset, num_replicas=hvd.size(), rank=hvd.rank()
)
else:
return torch.utils.data.sampler.RandomSampler(dataset)
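# DistributedSampler gives each Horovod rank a disjoint 1/hvd.size() shard of
# the dataset; train_sampler.set_epoch() is called in the training loop below
# so the shards are reshuffled every epoch.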
def main():
logger = _get_logger()
if _DISTRIBUTED:
# Horovod: initialize Horovod.
hvd.init()
logger.info("Runnin Distributed")
torch.manual_seed(_SEED)
# Horovod: pin GPU to local rank.
torch.cuda.set_device(hvd.local_rank())
torch.cuda.manual_seed(_SEED)
logger.info("PyTorch version {}".format(torch.__version__))
if _FAKE:
logger.info("Setting up fake loaders")
train_dataset = FakeData(n_classes=1000, data_transform=torch.FloatTensor)
else:
normalize = transforms.Normalize(_RGB_MEAN, _RGB_SD)
logger.info("Setting up loaders")
train_dataset = datasets.ImageFolder(
os.getenv("AZ_BATCHAI_INPUT_TRAIN"),
transforms.Compose(
[
transforms.RandomResizedCrop(_WIDTH),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize,
]
),
)
validation_dataset = datasets.ImageFolder(
os.getenv("AZ_BATCHAI_INPUT_TRAIN"),
transforms.Compose(
[
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
normalize,
]
),
)
train_sampler = _get_sampler(train_dataset)
kwargs = {"num_workers": 5, "pin_memory": True}
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=_BATCHSIZE, sampler=train_sampler, **kwargs
)
# Autotune
cudnn.benchmark = True
logger.info("Loading model")
# Load symbol
model = models.__dict__["resnet50"](pretrained=False)
model.cuda()
if _DISTRIBUTED:
# Horovod: broadcast parameters.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
num_gpus = hvd.size() if _DISTRIBUTED else 1
# Horovod: scale learning rate by the number of GPUs.
optimizer = optim.SGD(model.parameters(), lr=_LR * num_gpus, momentum=0.9)
if _DISTRIBUTED:
# Horovod: wrap optimizer with DistributedOptimizer.
optimizer = hvd.DistributedOptimizer(
optimizer, named_parameters=model.named_parameters()
)
criterion = F.cross_entropy
if not _FAKE:
val_sampler = _get_sampler(validation_dataset)
val_loader = torch.utils.data.DataLoader(
validation_dataset, batch_size=_BATCHSIZE, sampler=val_sampler, **kwargs
)
# Main training-loop
logger.info("Training ...")
for epoch in range(_EPOCHS):
with Timer(output=logger.info, prefix="Training") as t:
model.train()
if _DISTRIBUTED:
train_sampler.set_epoch(epoch)
train(train_loader, model, criterion, optimizer, epoch)
_log_summary(len(train_dataset), t.elapsed)
if not _FAKE:
validate(val_loader, model, criterion)
if __name__ == "__main__":
main()


@ -0,0 +1,111 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": "# Create Docker Image for TensorFlow\nIn this notebook we will create the Docker image for our TensorFlow script to run in. We will go through the process of creating the image and testing it locally to make sure it runs before submitting it to the cluster. It is often recommended you do this rather than debugging on the cluster since debugging on a cluster can be much more difficult and time consuming.\n \n**You will need to be running everything on a GPU enabled VM to run this notebook.** "
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import sys\nsys.path.append(\"../common\") \n\nfrom dotenv import get_key\nimport os\nfrom utils import dotenv_for\nimport docker"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "We will use fake data here since we don't want to have to download the data etc. Using fake data is often a good way to debug your models as well as checking what IO overhead is. Here we are setting the number of processes (NUM_PROCESSES) to 2 since the VM we are testing on has 2 GPUs. If you are running on a machine with 1 GPU set NUM_PROCESSES to 1."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": "dotenv_path = dotenv_for()\nUSE_FAKE = True\nDOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\")\nNUM_PROCESSES = 2\nDOCKER_PWD = get_key(dotenv_path, 'DOCKER_PWD')"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "dc = docker.from_env()"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "image, log_iter = dc.images.build(path='Docker', \n tag='{}/caia-horovod-tensorflow'.format(DOCKERHUB))"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "container_labels = {'containerName': 'tensorflowgpu'}\nenvironment ={\n \"DISTRIBUTED\":True,\n \"PYTHONPATH\":'/workspace/common/',\n}\n\nvolumes = {\n os.getenv('EXT_PWD'): {\n 'bind': '/workspace', \n 'mode': 'rw'\n }\n}\n\nif USE_FAKE:\n environment['FAKE'] = True\nelse:\n environment['FAKE'] = False\n volumes[os.getenv('EXT_DATA')]={'bind': '/mnt/input', 'mode': 'rw'}\n environment['AZ_BATCHAI_INPUT_TRAIN'] = '/mnt/input/train'\n environment['AZ_BATCHAI_INPUT_TEST'] = '/mnt/input/validation'"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "cmd=f'mpirun -np {NUM_PROCESSES} -H localhost:{NUM_PROCESSES} '\\\n 'python -u /workspace/HorovodTF/src/imagenet_estimator_tf_horovod.py'\ncontainer = dc.containers.run(image.tags[0], \n command=cmd,\n detach=True, \n labels=container_labels,\n runtime='nvidia',\n volumes=volumes,\n environment=environment,\n shm_size='8G',\n privileged=True)"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "With the code below we are simply monitoring what is happening in the container. Feel free to stop the notebook when you are happy that everything is working."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "for line in container.logs(stderr=True, stream=True):\n print(line.decode(\"utf-8\"),end =\"\")"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "container.reload() # Refresh state\nif container.status is 'running':\n container.kill()"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "for line in dc.images.push(image.tags[0], \n stream=True,\n auth_config={\"username\": DOCKERHUB,\n \"password\": DOCKER_PWD}):\n print(line)"
}
],
"metadata": {
"jupytext": {
"text_representation": {
"extension": ".py",
"format_name": "light",
"format_version": "1.3",
"jupytext_version": "0.8.6"
}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -0,0 +1,279 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": "# Train TensorFlow Model Distributed on Batch AI\nIn this notebook we will train a TensorFlow model ([ResNet50](https://arxiv.org/abs/1512.03385)) in a distributed fashion using [Horovod](https://github.com/uber/horovod) on the Imagenet dataset. This tutorial will take you through the following steps:\n * [Create Experiment](#experiment)\n * [Upload Training Scripts](#training_scripts)\n * [Submit and Monitor Job](#job)\n * [Clean Up Resources](#clean_up)"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "import sys\nsys.path.append(\"../common\") \n\nimport json\nfrom dotenv import get_key\nimport os\nfrom utils import write_json_to_file, dotenv_for"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Set the USE_FAKE to True if you want to use fake data rather than the ImageNet dataset. This is often a good way to debug your models as well as checking what IO overhead is."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": "# Variables for Batch AI - change as necessary\ndotenv_path = dotenv_for()\nGROUP_NAME = get_key(dotenv_path, 'GROUP_NAME')\nFILE_SHARE_NAME = get_key(dotenv_path, 'FILE_SHARE_NAME')\nWORKSPACE = get_key(dotenv_path, 'WORKSPACE')\nNUM_NODES = int(get_key(dotenv_path, 'NUM_NODES'))\nCLUSTER_NAME = get_key(dotenv_path, 'CLUSTER_NAME')\nGPU_TYPE = get_key(dotenv_path, 'GPU_TYPE')\nPROCESSES_PER_NODE = int(get_key(dotenv_path, 'PROCESSES_PER_NODE'))\nSTORAGE_ACCOUNT_NAME = get_key(dotenv_path, 'STORAGE_ACCOUNT_NAME')\n\nEXPERIMENT = f\"distributed_tensorflow_{GPU_TYPE}\"\nUSE_FAKE = False\nDOCKERHUB = os.getenv('DOCKER_REPOSITORY', \"masalvar\")"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "FAKE='-x FAKE=True' if USE_FAKE else ''\nTOTAL_PROCESSES = PROCESSES_PER_NODE * NUM_NODES"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='experiment'></a>\n# Create Experiment\nNext we create our experiment."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai experiment create -n $EXPERIMENT -g $GROUP_NAME -w $WORKSPACE"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='training_scripts'></a>\n# Upload Training Scripts\nWe need to upload our training scripts and associated files"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "json_data = !az storage account keys list -n $STORAGE_ACCOUNT_NAME -g $GROUP_NAME\nstorage_account_key = json.loads(''.join([i for i in json_data if 'WARNING' not in i]))[0]['value']"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "%env AZURE_STORAGE_ACCOUNT $STORAGE_ACCOUNT_NAME\n%env AZURE_STORAGE_KEY=$storage_account_key"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Upload our training scripts"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az storage file upload --share-name $FILE_SHARE_NAME --source src/imagenet_estimator_tf_horovod.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source src/resnet_model.py --path scripts\n!az storage file upload --share-name $FILE_SHARE_NAME --source ../common/timer.py --path scripts"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Let's check our cluster we created earlier"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai cluster list -w $WORKSPACE -o table"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='job'></a>\n## Submit and Monitor Job\nBelow we specify the job we wish to execute. "
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "jobs_dict = {\n \"$schema\": \"https://raw.githubusercontent.com/Azure/BatchAI/master/schemas/2017-09-01-preview/job.json\",\n \"properties\": {\n \"nodeCount\": NUM_NODES,\n \"customToolkitSettings\": {\n \"commandLine\": f\"echo $AZ_BATCH_HOST_LIST; \\\n cat $AZ_BATCHAI_MPI_HOST_FILE; \\\n mpirun -np {TOTAL_PROCESSES} --hostfile $AZ_BATCHAI_MPI_HOST_FILE \\\n -bind-to none -map-by slot \\\n -x NCCL_DEBUG=INFO -x LD_LIBRARY_PATH \\\n -mca btl_tcp_if_include eth0 \\\n -x NCCL_SOCKET_IFNAME=eth0 \\\n -mca btl ^openib \\\n -x NCCL_IB_DISABLE=1 \\\n -x DISTRIBUTED=True \\\n -x AZ_BATCHAI_INPUT_TRAIN \\\n -x AZ_BATCHAI_INPUT_TEST \\\n --allow-run-as-root \\\n {FAKE} \\\n python -u $AZ_BATCHAI_INPUT_SCRIPTS/imagenet_estimator_tf_horovod.py\"\n },\n \"stdOutErrPathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n \"inputDirectories\": [{\n \"id\": \"SCRIPTS\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs/scripts\"\n },\n {\n \"id\": \"TRAIN\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n },\n {\n \"id\": \"TEST\",\n \"path\": \"$AZ_BATCHAI_MOUNT_ROOT/nfs/imagenet\",\n },\n ],\n \"outputDirectories\": [{\n \"id\": \"MODEL\",\n \"pathPrefix\": \"$AZ_BATCHAI_MOUNT_ROOT/extfs\",\n \"pathSuffix\": \"Models\"\n }],\n \"containerSettings\": {\n \"imageSourceRegistry\": {\n \"image\": f\"{DOCKERHUB}/caia-horovod-tensorflow\"\n }\n }\n }\n}"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "write_json_to_file(jobs_dict, 'job.json')"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "JOB_NAME='tensorflow-horovod-{}'.format(NUM_NODES*PROCESSES_PER_NODE)"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "We now submit the job to Batch AI"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai job create -n $JOB_NAME --cluster $CLUSTER_NAME -w $WORKSPACE -e $EXPERIMENT -f job.json"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "With the command below we can check the status of the job"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai job list -w $WORKSPACE -e $EXPERIMENT -o table"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "To view the files that the job has generated use the command below"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai job file list -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "We are also able to stream the stdout and stderr that our job produces. This is great to check the progress of our job as well as debug issues."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stdout.txt"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"stripout"
]
},
"outputs": [],
"source": "!az batchai job file stream -w $WORKSPACE -e $EXPERIMENT --j $JOB_NAME --output-directory-id stdouterr -f stderr.txt"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "We can either wait for the job to complete or delete it with the command below."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai job delete -w $WORKSPACE -e $EXPERIMENT --name $JOB_NAME -y"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "<a id='clean_up'></a>\n## Clean Up Resources\nNext we wish to tidy up the resource we created. \nFirst we reset the default values we set earlier."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az configure --defaults group=''\n!az configure --defaults location=''"
},
{
"cell_type": "markdown",
"metadata": {},
"source": " Next we delete the cluster"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai cluster delete -w $WORKSPACE --name $CLUSTER_NAME -g $GROUP_NAME -y"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Once the cluster is deleted you will not incur any cost for the computation but you can still retain your experiments and workspace. If you wish to delete those as well execute the commands below."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai experiment delete -w $WORKSPACE --name $EXPERIMENT -g $GROUP_NAME -y"
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az batchai workspace delete -n $WORKSPACE -g $GROUP_NAME -y"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "Finally we can delete the group and we will have deleted everything created for this tutorial."
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": "!az group delete --name $GROUP_NAME -y"
}
],
"metadata": {
"jupytext": {
"text_representation": {
"extension": ".py",
"format_name": "light",
"format_version": "1.3",
"jupytext_version": "0.8.6"
}
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -0,0 +1,59 @@
FROM nvidia/cuda:9.0-devel-ubuntu16.04
# TensorFlow version is tightly coupled to CUDA and cuDNN so it should be selected carefully
ENV PYTHON_VERSION=3.5
ENV TENSORFLOW_VERSION=1.9.0
ENV CUDNN_VERSION=7.0.5.15-1+cuda9.0
RUN echo "deb http://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
RUN apt-get update && apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
build-essential \
cmake \
cpio \
git \
curl \
wget \
ca-certificates \
libdapl2 \
libcudnn7=$CUDNN_VERSION \
libjpeg-dev \
libpng-dev \
libmlx4-1 \
libsm6 \
libxext6 \
python$PYTHON_VERSION \
python$PYTHON_VERSION-dev
# Install Intel MPI
RUN cd /tmp && \
wget -q 'http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/11595/l_mpi_2017.3.196.tgz' && \
tar zxvf l_mpi_2017.3.196.tgz && \
sed -i -e 's/^ACCEPT_EULA=decline/ACCEPT_EULA=accept/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
sed -i -e 's|^#ACTIVATION_LICENSE_FILE=|ACTIVATION_LICENSE_FILE=/tmp/l_mpi_2017.3.196/USE_SERVER.lic|g' \
/tmp/l_mpi_2017.3.196/silent.cfg && \
sed -i -e 's/^ACTIVATION_TYPE=exist_lic/ACTIVATION_TYPE=license_server/g' /tmp/l_mpi_2017.3.196/silent.cfg && \
cd /tmp/l_mpi_2017.3.196 && \
./install.sh -s silent.cfg && \
cd .. && \
rm -rf l_mpi_2017.3.196* && \
echo "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" >> ~/.bashrc
ENV PATH $PATH:/opt/intel/compilers_and_libraries/linux/mpi/bin64
RUN ln -s /usr/bin/python$PYTHON_VERSION /usr/bin/python
RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py
# Install TensorFlow
RUN pip install --no-cache-dir tensorflow-gpu==$TENSORFLOW_VERSION h5py scipy jupyter ipykernel numpy toolz pandas \
scikit-learn
# Install Horovod, temporarily using CUDA stubs
RUN ldconfig /usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs && \
/bin/bash -c "source /opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/bin/mpivars.sh" && \
HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod==0.13.2 && \
ldconfig
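# Example local build, matching the tag used in the accompanying notebook
# (replace <repo> with your own Docker Hub repository):
#   docker build -t <repo>/caia-horovod-tensorflow .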


@ -0,0 +1,459 @@
"""
Trains ResNet50 using Horovod.
It requires the following env variables
AZ_BATCHAI_INPUT_TRAIN
AZ_BATCHAI_INPUT_TEST
AZ_BATCHAI_OUTPUT_MODEL
AZ_BATCHAI_JOB_TEMP_DIR
"""
import logging
import os
import sys
from functools import lru_cache
from os import path
from timer import Timer
import numpy as np
import pandas as pd
import tensorflow as tf
from resnet_model import resnet_v1
from toolz import pipe
_WIDTH = 224
_HEIGHT = 224
_CHANNELS = 3
_LR = 0.001
_EPOCHS = int(os.getenv("EPOCHS", 1))
_BATCHSIZE = 64
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
_BUFFER = 256
def _str_to_bool(in_str):
if "t" in in_str.lower():
return True
else:
return False
_DISTRIBUTED = _str_to_bool(os.getenv("DISTRIBUTED", "False"))
_FAKE = _str_to_bool(os.getenv("FAKE", "False"))
_DATA_LENGTH = int(
os.getenv("FAKE_DATA_LENGTH", 1281167)
) # How much fake data to simulate, default to size of imagenet dataset
_VALIDATION = _str_to_bool(os.getenv("VALIDATION", "False"))
if _DISTRIBUTED:
import horovod.tensorflow as hvd
tf_logger = logging.getLogger("tensorflow")
tf_logger.setLevel(logging.INFO)
stout = logging.StreamHandler(stream=sys.stdout)
tf_logger.addHandler(stout)
def _get_rank():
if _DISTRIBUTED:
try:
return hvd.rank()
        except Exception:
return 0
else:
return 0
class HorovodAdapter(logging.LoggerAdapter):
def __init__(self, logger):
self._str_epoch = ""
self._gpu_rank = 0
super(HorovodAdapter, self).__init__(logger, {})
def set_epoch(self, epoch):
self._str_epoch = "[Epoch {}]".format(epoch)
def process(self, msg, kwargs):
kwargs["extra"] = {"gpurank": _get_rank(), "epoch": self._str_epoch}
return msg, kwargs
@lru_cache()
def _get_logger():
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
ch = logging.StreamHandler(stream=sys.stdout)
formatter = logging.Formatter(
"%(levelname)s:%(name)s:%(gpurank)d: %(epoch)s %(message)s"
)
ch.setFormatter(formatter)
logger.addHandler(ch)
adapter = HorovodAdapter(logger)
return adapter
def _load_image(filename, channels=_CHANNELS):
return tf.to_float(tf.image.decode_png(tf.read_file(filename), channels=channels))
def _resize(img, width=_WIDTH, height=_HEIGHT):
return tf.image.resize_images(img, [height, width])
def _centre(img, mean_subtraction=(_R_MEAN, _G_MEAN, _B_MEAN)):
return tf.subtract(img, list(mean_subtraction))
def _random_crop(img, width=_WIDTH, height=_HEIGHT, channels=_CHANNELS):
return tf.random_crop(img, [height, width, channels])
def _random_horizontal_flip(img):
return tf.image.random_flip_left_right(img)
def _preprocess_images(filename):
return pipe(filename, _load_image, _resize, _centre)
def _preprocess_labels(label):
return tf.cast(label, dtype=tf.int32)
def _transform_to_NCHW(img):
return tf.transpose(img, [2, 0, 1]) # Transform from NHWC to NCHW
def _parse_function_train(tensor, label):
img_rgb = pipe(tensor, _random_crop, _random_horizontal_flip, _transform_to_NCHW)
return img_rgb, label
def _prep(filename, label):
return tf.data.Dataset.from_tensor_slices(
([_preprocess_images(filename)], [_preprocess_labels(label)])
)
def _parse_function_eval(filename, label):
return (
pipe(filename, _preprocess_images, _transform_to_NCHW),
_preprocess_labels(label),
)
def _get_optimizer(params, is_distributed=_DISTRIBUTED):
if is_distributed:
# Horovod: add Horovod Distributed Optimizer.
return hvd.DistributedOptimizer(
tf.train.MomentumOptimizer(
learning_rate=params["learning_rate"] * hvd.size(), momentum=0.9
)
)
else:
return tf.train.MomentumOptimizer(
learning_rate=params["learning_rate"], momentum=0.9
)
def build_network(features, mode, params):
network = resnet_v1(
resnet_depth=50, num_classes=params["classes"], data_format="channels_first"
)
return network(inputs=features, is_training=(mode == tf.estimator.ModeKeys.TRAIN))
def model_fn(features, labels, mode, params):
"""
features: This is the x-arg from the input_fn.
labels: This is the y-arg from the input_fn,
see e.g. train_input_fn for these two.
mode: Either TRAIN, EVAL, or PREDICT
params: User-defined hyper-parameters, e.g. learning-rate.
"""
logger = _get_logger()
logger.info("Creating model in {} mode".format(mode))
logits = build_network(features, mode, params)
if mode == tf.estimator.ModeKeys.PREDICT:
# Softmax output of the neural network.
y_pred = tf.nn.softmax(logits=logits)
# Classification output of the neural network.
y_pred_cls = tf.argmax(y_pred, axis=1)
predictions = {
"class_ids": y_pred_cls,
"probabilities": y_pred,
"logits": logits,
}
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=logits, labels=labels
)
loss = tf.reduce_mean(cross_entropy)
if mode == tf.estimator.ModeKeys.EVAL:
# Softmax output of the neural network.
y_pred = tf.nn.softmax(logits=logits)
# Classification output of the neural network.
y_pred_cls = tf.argmax(y_pred, axis=1)
        # Labels are sparse integer class ids (see the loss above), so no argmax is needed.
        accuracy = tf.metrics.accuracy(
            labels=labels, predictions=y_pred_cls, name="acc_op"
        )
metrics = {"accuracy": accuracy}
tf.summary.scalar("accuracy", accuracy[1])
return tf.estimator.EstimatorSpec(mode=mode, eval_metric_ops=metrics, loss=loss)
optimizer = _get_optimizer(params)
train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def _append_path_to(data_path, data_series):
return data_series.apply(lambda x: path.join(data_path, x))
def _load_training(data_dir):
    # The CSV layout (filenames, num_id columns) mirrors the PyTorch variant of this script.
    df = pd.read_csv(path.join(data_dir, "train.csv"))
    return df.assign(filenames=_append_path_to(path.join(data_dir, "train"), df.filenames))
def _load_validation(data_dir):
    df = pd.read_csv(path.join(data_dir, "validation.csv"))
    return df.assign(filenames=_append_path_to(path.join(data_dir, "validation"), df.filenames))
def _create_data_fn(train_path, test_path):
logger = _get_logger()
logger.info("Reading training data info")
train_df = _load_training(train_path)
logger.info("Reading validation data info")
validation_df = _load_validation(test_path)
train_labels = train_df[["num_id"]].values.ravel() - 1
validation_labels = validation_df[["num_id"]].values.ravel() - 1
train_data = tf.data.Dataset.from_tensor_slices(
(train_df["filenames"].values, train_labels)
)
train_data_transform = tf.contrib.data.map_and_batch(
_parse_function_train, _BATCHSIZE, num_parallel_batches=5
)
train_data = train_data.apply(
tf.contrib.data.parallel_interleave(
_prep, cycle_length=5, buffer_output_elements=1024
)
)
train_data = (
train_data.shuffle(1024).repeat().apply(train_data_transform).prefetch(_BUFFER)
)
validation_data = tf.data.Dataset.from_tensor_slices(
(validation_df["filenames"].values, validation_labels)
)
validation_data_transform = tf.contrib.data.map_and_batch(
_parse_function_eval, _BATCHSIZE, num_parallel_batches=4
)
validation_data = validation_data.apply(validation_data_transform).prefetch(_BUFFER)
def _train_input_fn():
return train_data.make_one_shot_iterator().get_next()
def _validation_input_fn():
return validation_data.make_one_shot_iterator().get_next()
_train_input_fn.length = len(train_df)
_validation_input_fn.length = len(validation_df)
_train_input_fn.classes = 1000
_validation_input_fn.classes = 1000
return _train_input_fn, _validation_input_fn
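# Dataset metadata is attached to the input functions themselves so that
# main() can size the number of training steps without re-reading the CSVs.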
def _create_data(batch_size, num_batches, dim, channels, seed=42):
np.random.seed(seed)
return np.random.rand(batch_size * num_batches, channels, dim[0], dim[1]).astype(
np.float32
)
def _create_labels(batch_size, num_batches, n_classes):
return np.random.choice(n_classes, batch_size * num_batches)
def _create_fake_data_fn(train_length=_DATA_LENGTH, valid_length=50000, num_batches=40):
""" Creates fake dataset
Data is returned in NCHW since this tends to be faster on GPUs
"""
logger = _get_logger()
logger.info("Creating fake data")
data_array = _create_data(_BATCHSIZE, num_batches, (_HEIGHT, _WIDTH), _CHANNELS)
labels_array = _create_labels(_BATCHSIZE, num_batches, 1000)
def fake_data_generator():
for i in range(num_batches):
yield data_array[i * _BATCHSIZE : (i + 1) * _BATCHSIZE], labels_array[
i * _BATCHSIZE : (i + 1) * _BATCHSIZE
]
    train_data = tf.data.Dataset.from_generator(
fake_data_generator,
output_types=(tf.float32, tf.int32),
output_shapes=(
tf.TensorShape([None, _CHANNELS, _HEIGHT, _WIDTH]),
tf.TensorShape([None]),
),
)
train_data = train_data.shuffle(40 * _BATCHSIZE).repeat().prefetch(_BUFFER)
    validation_data = tf.data.Dataset.from_generator(
fake_data_generator,
output_types=(tf.float32, tf.int32),
output_shapes=(
tf.TensorShape([None, _CHANNELS, _HEIGHT, _WIDTH]),
tf.TensorShape([None]),
),
)
validation_data = validation_data.prefetch(_BUFFER)
def _train_input_fn():
return train_data.make_one_shot_iterator().get_next()
def _validation_input_fn():
return validation_data.make_one_shot_iterator().get_next()
_train_input_fn.length = train_length
_validation_input_fn.length = valid_length
_train_input_fn.classes = 1000
_validation_input_fn.classes = 1000
return _train_input_fn, _validation_input_fn
def _get_runconfig(is_distributed=_DISTRIBUTED):
if is_distributed:
# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
return tf.estimator.RunConfig(
save_checkpoints_steps=None,
save_checkpoints_secs=None,
session_config=config,
)
else:
return tf.estimator.RunConfig(save_checkpoints_steps=None)
def _get_model_dir(is_distributed=_DISTRIBUTED):
if is_distributed:
# Horovod: save checkpoints only on worker 0 to prevent other workers from
# corrupting them.
return (
os.getenv("AZ_BATCHAI_OUTPUT_MODEL")
if hvd.rank() == 0
else os.getenv("AZ_BATCHAI_JOB_TEMP_DIR")
)
else:
return os.getenv("AZ_BATCHAI_OUTPUT_MODEL")
def _get_hooks(is_distributed=_DISTRIBUTED):
logger = _get_logger()
if is_distributed:
bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(), hvd.size()))
return [bcast_hook]
else:
return []
def _is_master(is_distributed=_DISTRIBUTED):
if is_distributed:
if hvd.rank() == 0:
return True
else:
return False
else:
return True
def _log_summary(data_length, duration):
logger = _get_logger()
images_per_second = data_length / duration
logger.info("Data length: {}".format(data_length))
logger.info("Total duration: {:.3f}".format(duration))
logger.info("Total images/sec: {:.3f}".format(images_per_second))
logger.info(
"Batch size: (Per GPU {}: Total {})".format(
_BATCHSIZE, hvd.size() * _BATCHSIZE if _DISTRIBUTED else _BATCHSIZE
)
)
logger.info("Distributed: {}".format("True" if _DISTRIBUTED else "False"))
logger.info("Num GPUs: {:.3f}".format(hvd.size() if _DISTRIBUTED else 1))
logger.info("Dataset: {}".format("Synthetic" if _FAKE else "Imagenet"))
def main():
if _DISTRIBUTED:
# Horovod: initialize Horovod.
hvd.init()
logger = _get_logger()
logger.info("Runnin Distributed")
else:
logger = _get_logger()
logger.info("Tensorflow version {}".format(tf.__version__))
if _FAKE:
train_input_fn, validation_input_fn = _create_fake_data_fn()
else:
train_input_fn, validation_input_fn = _create_data_fn(
os.getenv("AZ_BATCHAI_INPUT_TRAIN"), os.getenv("AZ_BATCHAI_INPUT_TEST")
)
run_config = _get_runconfig()
model_dir = _get_model_dir()
params = {"learning_rate": _LR, "classes": train_input_fn.classes}
logger.info("Creating estimator with params: {}".format(params))
model = tf.estimator.Estimator(
model_fn=model_fn, params=params, model_dir=model_dir, config=run_config
)
hooks = _get_hooks()
num_gpus = hvd.size() if _DISTRIBUTED else 1
with Timer(output=logger.info, prefix="Training") as t:
logger.info("Training...")
model.train(
input_fn=train_input_fn,
steps=_EPOCHS * train_input_fn.length // (_BATCHSIZE * num_gpus),
hooks=hooks,
)
_log_summary(_EPOCHS * train_input_fn.length, t.elapsed)
if _is_master() and _FAKE is False and _VALIDATION:
with Timer(output=logger.info, prefix="Testing"):
logger.info("Testing...")
model.evaluate(input_fn=validation_input_fn)
if __name__ == "__main__":
main()
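# Illustrative launch commands (assumptions: this script is saved as train.py
# inside the Horovod-enabled container used in this tutorial, and Batch AI
# supplies the AZ_BATCHAI_* environment variables):
#   python train.py                                  # single-GPU run
#   mpirun -np 4 -H localhost:4 python train.py      # 4-process Horovod run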


@ -0,0 +1,320 @@
""" Taken from official Tensorflow TPU spec https://github.com/tensorflow/tpu
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
BATCH_NORM_DECAY = 0.9
BATCH_NORM_EPSILON = 1e-5
def batch_norm_relu(inputs, is_training, relu=True, init_zero=False,
data_format='channels_first'):
"""Performs a batch normalization followed by a ReLU.
Args:
inputs: `Tensor` of shape `[batch, channels, ...]`.
is_training: `bool` for whether the model is training.
relu: `bool` if False, omits the ReLU operation.
init_zero: `bool` if True, initializes scale parameter of batch
normalization with 0 instead of 1 (default).
data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last" for `[batch, height, width, channels]`.
Returns:
A normalized `Tensor` with the same `data_format`.
"""
if init_zero:
gamma_initializer = tf.zeros_initializer()
else:
gamma_initializer = tf.ones_initializer()
if data_format == 'channels_first':
axis = 1
else:
axis = 3
inputs = tf.layers.batch_normalization(
inputs=inputs,
axis=axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
center=True,
scale=True,
training=is_training,
fused=True,
gamma_initializer=gamma_initializer)
if relu:
inputs = tf.nn.relu(inputs)
return inputs
def fixed_padding(inputs, kernel_size, data_format='channels_first'):
"""Pads the input along the spatial dimensions independently of input size.
Args:
inputs: `Tensor` of size `[batch, channels, height, width]` or
`[batch, height, width, channels]` depending on `data_format`.
    kernel_size: `int` kernel size to be used for `conv2d` or `max_pool2d`
operations. Should be a positive integer.
data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last" for `[batch, height, width, channels]`.
Returns:
A padded `Tensor` of the same `data_format` with size either intact
(if `kernel_size == 1`) or padded (if `kernel_size > 1`).
"""
pad_total = kernel_size - 1
pad_beg = pad_total // 2
pad_end = pad_total - pad_beg
if data_format == 'channels_first':
padded_inputs = tf.pad(inputs, [[0, 0], [0, 0],
[pad_beg, pad_end], [pad_beg, pad_end]])
else:
padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
[pad_beg, pad_end], [0, 0]])
return padded_inputs
def conv2d_fixed_padding(inputs, filters, kernel_size, strides,
data_format='channels_first'):
"""Strided 2-D convolution with explicit padding.
The padding is consistent and is based only on `kernel_size`, not on the
dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
Args:
inputs: `Tensor` of size `[batch, channels, height_in, width_in]`.
filters: `int` number of filters in the convolution.
kernel_size: `int` size of the kernel to be used in the convolution.
strides: `int` strides of the convolution.
data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last" for `[batch, height, width, channels]`.
Returns:
A `Tensor` of shape `[batch, filters, height_out, width_out]`.
"""
if strides > 1:
inputs = fixed_padding(inputs, kernel_size, data_format=data_format)
return tf.layers.conv2d(
inputs=inputs, filters=filters, kernel_size=kernel_size, strides=strides,
padding=('SAME' if strides == 1 else 'VALID'), use_bias=False,
kernel_initializer=tf.variance_scaling_initializer(),
data_format=data_format)
def residual_block(inputs, filters, is_training, strides,
use_projection=False, data_format='channels_first'):
"""Standard building block for residual networks with BN after convolutions.
Args:
inputs: `Tensor` of size `[batch, channels, height, width]`.
    filters: `int` number of filters for the two convolutions.
is_training: `bool` for whether the model is in training.
strides: `int` block stride. If greater than 1, this block will ultimately
downsample the input.
use_projection: `bool` for whether this block should use a projection
shortcut (versus the default identity shortcut). This is usually `True`
for the first block of a block group, which may change the number of
filters and the resolution.
data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last" for `[batch, height, width, channels]`.
Returns:
The output `Tensor` of the block.
"""
shortcut = inputs
if use_projection:
# Projection shortcut in first layer to match filters and strides
shortcut = conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=1, strides=strides,
data_format=data_format)
shortcut = batch_norm_relu(shortcut, is_training, relu=False,
data_format=data_format)
inputs = conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=3, strides=strides,
data_format=data_format)
inputs = batch_norm_relu(inputs, is_training, data_format=data_format)
inputs = conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=3, strides=1,
data_format=data_format)
inputs = batch_norm_relu(inputs, is_training, relu=False, init_zero=True,
data_format=data_format)
return tf.nn.relu(inputs + shortcut)
def bottleneck_block(inputs, filters, is_training, strides,
use_projection=False, data_format='channels_first'):
"""Bottleneck block variant for residual networks with BN after convolutions.
Args:
inputs: `Tensor` of size `[batch, channels, height, width]`.
filters: `int` number of filters for the first two convolutions. Note that
the third and final convolution will use 4 times as many filters.
is_training: `bool` for whether the model is in training.
strides: `int` block stride. If greater than 1, this block will ultimately
downsample the input.
use_projection: `bool` for whether this block should use a projection
shortcut (versus the default identity shortcut). This is usually `True`
for the first block of a block group, which may change the number of
filters and the resolution.
data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last" for `[batch, height, width, channels]`.
Returns:
The output `Tensor` of the block.
"""
shortcut = inputs
if use_projection:
# Projection shortcut only in first block within a group. Bottleneck blocks
# end with 4 times the number of filters.
filters_out = 4 * filters
shortcut = conv2d_fixed_padding(
inputs=inputs, filters=filters_out, kernel_size=1, strides=strides,
data_format=data_format)
shortcut = batch_norm_relu(shortcut, is_training, relu=False,
data_format=data_format)
inputs = conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=1, strides=1,
data_format=data_format)
inputs = batch_norm_relu(inputs, is_training, data_format=data_format)
inputs = conv2d_fixed_padding(
inputs=inputs, filters=filters, kernel_size=3, strides=strides,
data_format=data_format)
inputs = batch_norm_relu(inputs, is_training, data_format=data_format)
inputs = conv2d_fixed_padding(
inputs=inputs, filters=4 * filters, kernel_size=1, strides=1,
data_format=data_format)
inputs = batch_norm_relu(inputs, is_training, relu=False, init_zero=True,
data_format=data_format)
return tf.nn.relu(inputs + shortcut)
def block_group(inputs, filters, block_fn, blocks, strides, is_training, name,
data_format='channels_first'):
"""Creates one group of blocks for the ResNet model.
Args:
inputs: `Tensor` of size `[batch, channels, height, width]`.
filters: `int` number of filters for the first convolution of the layer.
block_fn: `function` for the block to use within the model
blocks: `int` number of blocks contained in the layer.
strides: `int` stride to use for the first convolution of the layer. If
greater than 1, this layer will downsample the input.
is_training: `bool` for whether the model is training.
    name: `str` name for the Tensor output of the block layer.
data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last" for `[batch, height, width, channels]`.
Returns:
The output `Tensor` of the block layer.
"""
# Only the first block per block_group uses projection shortcut and strides.
inputs = block_fn(inputs, filters, is_training, strides,
use_projection=True, data_format=data_format)
for _ in range(1, blocks):
inputs = block_fn(inputs, filters, is_training, 1,
data_format=data_format)
return tf.identity(inputs, name)
def resnet_v1_generator(block_fn, layers, num_classes,
data_format='channels_first'):
"""Generator for ResNet v1 models.
Args:
block_fn: `function` for the block to use within the model. Either
`residual_block` or `bottleneck_block`.
layers: list of 4 `int`s denoting the number of blocks to include in each
of the 4 block groups. Each group consists of blocks that take inputs of
the same resolution.
num_classes: `int` number of possible classes for image classification.
data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last" for `[batch, height, width, channels]`.
Returns:
Model `function` that takes in `inputs` and `is_training` and returns the
output `Tensor` of the ResNet model.
"""
def model(inputs, is_training):
"""Creation of the model graph."""
inputs = conv2d_fixed_padding(
inputs=inputs, filters=64, kernel_size=7, strides=2,
data_format=data_format)
inputs = tf.identity(inputs, 'initial_conv')
inputs = batch_norm_relu(inputs, is_training, data_format=data_format)
inputs = tf.layers.max_pooling2d(
inputs=inputs, pool_size=3, strides=2, padding='SAME',
data_format=data_format)
inputs = tf.identity(inputs, 'initial_max_pool')
inputs = block_group(
inputs=inputs, filters=64, block_fn=block_fn, blocks=layers[0],
strides=1, is_training=is_training, name='block_group1',
data_format=data_format)
inputs = block_group(
inputs=inputs, filters=128, block_fn=block_fn, blocks=layers[1],
strides=2, is_training=is_training, name='block_group2',
data_format=data_format)
inputs = block_group(
inputs=inputs, filters=256, block_fn=block_fn, blocks=layers[2],
strides=2, is_training=is_training, name='block_group3',
data_format=data_format)
inputs = block_group(
inputs=inputs, filters=512, block_fn=block_fn, blocks=layers[3],
strides=2, is_training=is_training, name='block_group4',
data_format=data_format)
# The activation is 7x7 so this is a global average pool.
inputs = tf.layers.average_pooling2d(
inputs=inputs, pool_size=7, strides=1, padding='VALID',
data_format=data_format)
inputs = tf.identity(inputs, 'final_avg_pool')
inputs = tf.reshape(
inputs, [-1, 2048 if block_fn is bottleneck_block else 512])
inputs = tf.layers.dense(
inputs=inputs,
units=num_classes,
kernel_initializer=tf.random_normal_initializer(stddev=.01))
inputs = tf.identity(inputs, 'final_dense')
return inputs
model.default_image_size = 224
return model
def resnet_v1(resnet_depth, num_classes, data_format='channels_first'):
"""Returns the ResNet model for a given size and number of output classes."""
model_params = {
18: {'block': residual_block, 'layers': [2, 2, 2, 2]},
34: {'block': residual_block, 'layers': [3, 4, 6, 3]},
50: {'block': bottleneck_block, 'layers': [3, 4, 6, 3]},
101: {'block': bottleneck_block, 'layers': [3, 4, 23, 3]},
152: {'block': bottleneck_block, 'layers': [3, 8, 36, 3]},
200: {'block': bottleneck_block, 'layers': [3, 24, 36, 3]}
}
if resnet_depth not in model_params:
raise ValueError('Not a valid resnet_depth:', resnet_depth)
params = model_params[resnet_depth]
return resnet_v1_generator(
params['block'], params['layers'], num_classes, data_format)
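# Illustrative usage (an assumption, not part of the original file): build a
# ResNet-50 graph for 1000-way ImageNet classification in NCHW layout.
#   network = resnet_v1(resnet_depth=50, num_classes=1000,
#                       data_format='channels_first')
#   logits = network(inputs=images, is_training=True)  # images: [N, 3, 224, 224]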


@ -0,0 +1,21 @@
MIT License
Copyright (c) Microsoft Corporation. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


@ -0,0 +1,46 @@
define PROJECT_HELP_MSG
Usage:
make help show this message
make build build docker image
make push push container
make run run benchmarking container
make jupyter run jupyter notebook inside container
endef
export PROJECT_HELP_MSG
PWD:=$(shell pwd)
dockerhub:=
data:=
image_name:=$(dockerhub)/distributed-training-control
help:
echo "$$PROJECT_HELP_MSG" | less
build:
docker build -t $(image_name) Docker
jupyter:
docker run -p 9999:9999 \
-e EXT_PWD=$(PWD) \
-e EXT_DATA=$(data) \
-e DOCKER_REPOSITORY=$(dockerhub) \
-v $(PWD):/workspace \
-v $(data):/data \
-v /var/run/docker.sock:/var/run/docker.sock \
-it $(image_name) bash -c "jupyter notebook"
run:
docker run -p 9999:9999 \
-e EXT_PWD=$(PWD) \
-e EXT_DATA=$(data) \
-e DOCKER_REPOSITORY=$(dockerhub) \
-v $(PWD):/workspace \
-v $(data):/data \
-v /var/run/docker.sock:/var/run/docker.sock \
-it $(image_name)
push:
docker push $(image_name)
.PHONY: help build push run jupyter
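# Illustrative usage (mirrors the README; <dockerhub account> is your Dockerhub name):
#   make build dockerhub=<dockerhub account>
#   make jupyter dockerhub=<dockerhub account> data=<data_location>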


@ -0,0 +1,77 @@
# Distributed Training on Batch AI
This repo is a tutorial on how to train a CNN model in a distributed fashion using Batch AI.
The scenario covered is image classification, but the solution can be generalized for other deep learning scenarios such as segmentation and object detection.
![Distributed training diagram](images/dist_training_diag2.png "Distributed training diagram")
Image classification is a common task in computer vision applications and is often tackled by training a convolutional neural network (CNN).
For particularly large models with large datasets, the training process can take weeks or months on a single GPU.
In some situations, the models are so large that it isn't possible to fit reasonable batch sizes onto the GPU.
Using distributed training in these situations helps shorten the training time.
In this specific scenario, a ResNet50 CNN model is trained using Horovod on the ImageNet dataset as well as on synthetic data.
The tutorial demonstrates how to accomplish this using three of the most popular deep learning frameworks: TensorFlow, Keras, and PyTorch.
There are a number of ways to train a deep learning model in a distributed fashion, including data-parallel and model-parallel approaches based on synchronous or asynchronous updates.
Currently the most common scenario is data parallel with synchronous updates; it is the easiest to implement and is sufficient for the majority of use cases.
In data-parallel distributed training with synchronous updates, the model is replicated across N hardware devices and a
mini-batch of training samples is divided into N micro-batches (see the diagram above).
Each device performs the forward and backward pass for a micro-batch, and when it finishes it shares the
updates with the other devices. These are then used to calculate the updated weights of the entire mini-batch, and then the
weights are synchronized across the replicas. This is the scenario covered in this repository, though the same architecture can
be used for model-parallel training and asynchronous updates.
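The sketch below, a minimal illustration rather than the tutorials' full training scripts, shows how Horovod expresses this pattern in TensorFlow: each process owns one GPU and one micro-batch, the wrapped optimizer averages gradients across processes before each synchronous update, and a broadcast hook keeps the replicas' initial weights in sync.
```python
import horovod.tensorflow as hvd
import tensorflow as tf

hvd.init()  # one process per GPU

# Scale the learning rate with the number of workers (a common heuristic).
opt = tf.train.MomentumOptimizer(learning_rate=0.001 * hvd.size(), momentum=0.9)

# Average gradients across all workers before each synchronous weight update.
opt = hvd.DistributedOptimizer(opt)

# Broadcast the initial weights from rank 0 so all replicas start identically.
hooks = [hvd.BroadcastGlobalVariablesHook(0)]
```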
## Prerequisites
* Computer with Nvidia GPU (the tutorial was tested on an [Azure NC12 Ubuntu DSVM](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu))
* Linux
* [Docker](https://docs.docker.com/install/linux/docker-ce/ubuntu/) installed
* [Nvidia Docker runtime](https://github.com/NVIDIA/nvidia-container-runtime) installed
* [Dockerhub](https://hub.docker.com/) account
* Port 9999 open on the VM or computer
* ImageNet dataset (look at [this](00_DataProcessing.ipynb) notebook for details)
## Setup
Before you begin, make sure you are logged in to your Dockerhub account by running the following on your machine:
```bash
docker login
```
### Setup Execution Environment
Before running anything, you will need to set up the environment in which you will execute the Batch AI commands.
There are a number of dependencies, so we offer a dockerfile that will take care of them for you.
If you don't want to use Docker, simply look inside the Docker directory at the dockerfile and environment.yml file for the dependencies.
To build the container, run (replace all instances of <dockerhub account> with your own Dockerhub account name):
```bash
make build dockerhub=<dockerhub account>
```
Then run the following command to start the environment (replace <data_location> with a location on your file system; make sure it has at least 300 GB of free space for the ImageNet dataset):
```bash
make jupyter dockerhub=<dockerhub account> data=<data_location>
```
This will start a Jupyter notebook server on port 9999. Simply point your browser to the IP or DNS name of your machine.
From there you can navigate to [00_DataProcessing.ipynb](00_DataProcessing.ipynb) to process the ImageNet data.
Once you have completed the two prerequisite notebooks, [00_DataProcessing.ipynb](00_DataProcessing.ipynb) and [01_CreateResources.ipynb](01_CreateResources.ipynb), you can
navigate to the tutorials for each of the frameworks: [HorovodTF](HorovodTF), [HorovodPytorch](HorovodPytorch), and [HorovodKeras](HorovodKeras).
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.microsoft.com.
When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repos using our CLA.
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.


@ -0,0 +1,105 @@
import collections
import functools
import logging
from timeit import default_timer
class Timer(object):
"""
Keyword arguments:
output: if True, print output after exiting context.
if callable, pass output to callable.
        fmt: str.format string to be used for output; default "took {:.3f} seconds"
prefix: string to prepend (plus a space) to output
For convenience, if you only specify this, output defaults to True.
"""
def __init__(self,
timer=default_timer,
factor=1,
output=None,
fmt="took {:.3f} seconds",
prefix=""):
self._timer = timer
self._factor = factor
self._output = output
self._fmt = fmt
self._prefix = prefix
self._end = None
self._start = None
def start(self):
self._start = self()
def stop(self):
self._end = self()
def __call__(self):
""" Return the current time """
return self._timer()
def __enter__(self):
""" Set the start time """
self.start()
return self
def __exit__(self, exc_type, exc_value, exc_traceback):
""" Set the end time """
self.stop()
if self._output is True or (self._output is None and self._prefix):
self._output = print
if callable(self._output):
output = " ".join([self._prefix, self._fmt.format(self.elapsed)])
self._output(output)
def __str__(self):
return '%.3f' % (self.elapsed)
@property
def elapsed(self):
""" Return the elapsed time
"""
if self._end is None:
# if elapsed is called in the context manager scope
return (self() - self._start) * self._factor
else:
# if elapsed is called out of the context manager scope
return (self._end - self._start) * self._factor
def timer(logger=None,
level=logging.INFO,
fmt="function %(function_name)s execution time: %(execution_time).3f",
*func_or_func_args,
**timer_kwargs):
""" Function decorator displaying the function execution time
"""
def wrapped_f(f):
@functools.wraps(f)
def wrapped(*args, **kwargs):
with Timer(**timer_kwargs) as t:
out = f(*args, **kwargs)
context = {
'function_name': f.__name__,
'execution_time': t.elapsed,
}
if logger:
logger.log(
level,
fmt % context,
extra=context)
else:
print(fmt % context)
return out
return wrapped
    if (len(func_or_func_args) == 1
            and callable(func_or_func_args[0])):
return wrapped_f(func_or_func_args[0])
else:
return wrapped_f
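# Illustrative usage (not part of the original module):
#
#   with Timer(output=True, prefix="training") as t:
#       run_epoch()  # any workload
#   # prints e.g. "training took 12.345 seconds"; t.elapsed holds the duration
#
#   @timer(prefix="scoring")
#   def score(batch):
#       ...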


@ -0,0 +1,31 @@
import json
import os
from dotenv import dotenv_values, set_key, find_dotenv, get_key
from getpass import getpass
def _create_env(dotenv_path):
with open(dotenv_path, 'a'):
os.utime(dotenv_path)
def dotenv_for():
dotenv_path = find_dotenv()
if dotenv_path == '':
dotenv_path = '.env'
_create_env(dotenv_path)
return dotenv_path
def get_password(dotenv_path):
if 'PASSWORD' not in dotenv_values(dotenv_path=dotenv_path):
print('Password not set')
password = getpass('Please enter password to use for the cluster')
_ = set_key(dotenv_path, 'PASSWORD', password)
return get_key(dotenv_path, 'PASSWORD')
def write_json_to_file(json_dict, filename, mode='w'):
with open(filename, mode) as outfile:
json.dump(json_dict, outfile, indent=4, sort_keys=True)
outfile.write('\n\n')
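# Illustrative usage (not part of the original module):
#   dotenv_path = dotenv_for()            # find an existing .env file or create one
#   password = get_password(dotenv_path)  # prompt once, then cached under PASSWORD
#   write_json_to_file({"status": "ok"}, "result.json")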

Binary data: archectures/Python-Keras-Training/images/dist_training_diag2.png (new file, 65 KiB; image not shown)


@ -0,0 +1,19 @@
define PROJECT_HELP_MSG
Usage:
make help show this message
make build make image
make push push image
endef
export PROJECT_HELP_MSG
help:
echo "$$PROJECT_HELP_MSG" | less
build:
docker build -t $(image) $(dockerpath)
push:
docker push $(image)
.PHONY: help build push
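# Illustrative usage (image and dockerpath are passed on the command line):
#   make build image=<dockerhub account>/myimage dockerpath=Docker
#   make push image=<dockerhub account>/myimage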

Diff not shown for one file because of its size.
archectures/Python-ML-RealTimeServing/.gitignore vendored (new file, 32 lines)

@ -0,0 +1,32 @@
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Environments
.env
# Jupyter Notebook
.ipynb_checkpoints
#AML
aml_config/
scripts/aml_config/
assets/
scripts/assets/
.amlignore
scripts/.amlignore
__pycache__/
scripts/__pycache__/
# Products
*.tsv
*.txt
*.pkl
datafolder/
lgbmenv.yml
score.py
.idea


@ -0,0 +1,81 @@
![](https://dev.azure.com/customai/MLAKSDeployAMLPipeline/_apis/build/status/Microsoft.MLAKSDeployAML%20(master)?branchName=master)
### Authors: Fidan Boylu Uz, Yan Zhang
### Acknowledgements: Mario Bourgoin, Mathew Salvaris
# Deploying Python models for real-time scoring using Azure Machine Learning
In this repository there are a number of tutorials in Jupyter notebooks that have step-by-step instructions on (1) how to train a machine learning model using Python, and (2) how to deploy a trained machine learning model through Azure Machine Learning (AzureML). The tutorials cover how to deploy models on the following deployment targets:
- [Azure Kubernetes Service (AKS) Cluster](./{{cookiecutter.project_name}}/aks)
- [Azure IoT Edge](./{{cookiecutter.project_name}}/iotedge)
## Overview
This scenario shows how to deploy a Frequently Asked Questions (FAQ) matching model as a web service to provide predictions for user questions. For this scenario, “Input Data” in the [architecture diagram](https://docs.microsoft.com/en-us/azure/architecture/reference-architectures/ai/realtime-scoring-python) refers to text strings containing the user questions to match with a list of FAQs. The scenario is designed for the Scikit-Learn machine learning library for Python but can be generalized to any scenario that uses Python models to make real-time predictions.
## Design
<!-- ![alt text](Design.png "Design") -->
The scenario uses a subset of Stack Overflow question data which includes original questions tagged as JavaScript, their duplicate questions, and their answers. It trains a Scikit-Learn pipeline to predict the match probability of a duplicate question with each of the original questions. These predictions are made in real time using a REST API endpoint.
The application flow for this architecture is as follows:
1. The client sends a HTTP POST request with the encoded question data.
2. The webservice extracts the question from the request.
3. The question is then sent to the Scikit-learn pipeline model for featurization and scoring.
4. The matching FAQ questions with their scores are then piped into a JSON object and returned to the client.
An example app that consumes the results is included with the scenario.
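For illustration, a minimal client call might look like the sketch below; the endpoint URL and payload shape are assumptions, and the notebooks and the included example app show the exact schema the deployed service expects.
```python
import requests

scoring_url = "http://<service-ip>/score"  # hypothetical endpoint of the deployed web service

# Hypothetical payload: a user question to match against the FAQ list.
payload = {"text": "How do I remove an element from a JavaScript array?"}

response = requests.post(scoring_url, json=payload)
print(response.json())  # matching FAQ questions with their match probabilities
```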
## Prerequisites
1. Linux (Ubuntu).
2. [Anaconda Python](https://www.anaconda.com/download)
3. [Docker](https://docs.docker.com/v17.12/install/linux/docker-ee/ubuntu) installed.
4. [Azure account](https://azure.microsoft.com).
---
**NOTE**
You will need to be able to run docker commands without sudo to run this tutorial. Use the following commands to do this.
```bash
sudo usermod -aG docker $USER
newgrp docker
```
---
The tutorial was developed on an [Azure Ubuntu
DSVM](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro),
which addresses the first three prerequisites.
## Setup
To set up your environment to run these notebooks, please follow these steps. They set up the notebooks to use Docker and Azure seamlessly.
1. Create an _Ubuntu_ _Linux_ DSVM and perform the following steps.
2. Install [cookiecutter](https://cookiecutter.readthedocs.io/en/latest/installation.html), a tool that creates projects from project templates.
```bash
pip install cookiecutter
```
3. Use cookiecutter to clone this repository. Cookiecutter will prompt you with a series of questions where you will choose a specific framework, select your deployment settings, and obtain an Azure ML workspace.
```bash
cookiecutter https://github.com/Microsoft/MLAKSDeployAML.git
```
You will be asked to choose or enter information such as *project name*, *subscription id*, *resource group*, etc. in an interactive way. You can press *Enter* to accept the default value or enter a value of your choice. For example, if you want to learn how to deploy a machine learning model on an AKS cluster, you should choose the value "aks" for the variable *deployment_type*. If instead you want to learn about deploying a machine learning model on IoT Edge, you should select "iotedge" for the variable *deployment_type*.
Provide a valid value for "subscription_id", otherwise a `subscription id is missing` error will be generated **after** all the questions are asked, and you will have to perform Step 3 all over again. The full list of questions can be found in the [cookiecutter.json](./cookiecutter.json) file.
Please make sure all the information you enter is correct, as it is used to customize the content of your repo.
4. On your local machine, you should now have a repo with the *project_name* you specified. Find the README.md file in this repo and proceed with instructions specified in it.
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.microsoft.com.
When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repositories using our CLA.
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.


@ -0,0 +1,16 @@
{
"project_name":"pythonmodeldeploy",
"subscription_id": "",
"resource_group": "aksdeployrg",
"workspace_name": "workspace",
"workspace_region": [
"eastus",
"eastus2"
],
"image_name": "myimage",
"deployment_type": [
"aks",
"iotedge"
]
}


@ -0,0 +1,32 @@
import os
import shutil
PROJECT_DIRECTORY = os.path.realpath(os.path.curdir)
def remove_file(filepath):
os.remove(os.path.join(PROJECT_DIRECTORY, filepath))
def remove_dir(dirpath):
shutil.rmtree(os.path.join(PROJECT_DIRECTORY, dirpath))
def move_files(parentdir, subdir):
root = os.path.join(PROJECT_DIRECTORY, parentdir)
for filename in os.listdir(os.path.join(root, subdir)):
shutil.move(os.path.join(root, subdir, filename), os.path.join(root, filename))
os.rmdir(os.path.join(root, subdir))
if __name__ == "__main__":
if "{{ cookiecutter.deployment_type }}" == "aks":
remove_dir("./iotedge")
move_files(".", "./aks")
if "{{ cookiecutter.deployment_type }}" == "iotedge":
remove_dir("./aks")
move_files(".", "./iotedge")


@ -0,0 +1,51 @@
import re
import sys
MODULE_REGEX = r"^[_a-zA-Z][_a-zA-Z0-9]+$"
def check_module(module_name):
if not re.match(MODULE_REGEX, module_name):
print(
"ERROR: The project slug {} is not a valid Python module name. Please do not use a - and use _ instead".format(
module_name
)
)
# Exit to cancel project
sys.exit(1)
def check_sub_id(sub_id):
if len(sub_id) == 0:
print(
"ERROR: The subscription id is missing, please enter a valid subscription id slug"
)
# Exit to cancel project
sys.exit(1)
def check_image_name(image_name):
if "_" in image_name:
print(
"ERROR: The image name must not have underscores in it {}".format(
image_name
)
)
# Exit to cancel project
sys.exit(1)
if __name__ == "__main__":
check_module("{{cookiecutter.project_name}}")
check_sub_id("{{cookiecutter.subscription_id}}")
check_image_name("{{cookiecutter.image_name}}")
print("All checks passed")
if "{{ cookiecutter.deployment_type }}" == "aks":
print("Creating AKS project...")
if "{{ cookiecutter.deployment_type }}" == "iotedge":
print("Creating IOT Edge project...")


@ -0,0 +1,245 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Installation and configuration\n",
"This notebook configures the notebooks in this tutorial to connect to an Azure Machine Learning (AML) Workspace. You can use an existing workspace or create a new one."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import azureml.core\n",
"from azureml.core import Workspace\n",
"from dotenv import set_key, get_key, find_dotenv\n",
"from pathlib import Path\n",
"from utilities import get_auth"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prerequisites"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you have already completed the prerequisites and selected the correct Kernel for this notebook, the AML Python SDK is already installed. Let's check the AML SDK version."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"AML SDK Version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set up your Azure Machine Learning workspace"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To create or access an Azure ML Workspace, you will need the following information:\n",
"\n",
"* Your subscription id\n",
"* A resource group name\n",
"* A name for your workspace\n",
"* A region for your workspace\n",
"\n",
"**Note**: As with other Azure services, there are limits on certain resources like cluster size associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you have a workspace created already, you need to get your subscription and workspace information. You can find the values for those by visiting your workspace in the [Azure portal](http://portal.azure.com). If you don't have a workspace, the create workspace command in the next section will create a resource group and a workspace using the names you provide."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Replace the values in the following cell with your information. If you would like to use service principal authentication as described [here](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/manage-azureml-service/authentication-in-azureml/authentication-in-azure-ml.ipynb) make sure you provide the optional values as well. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Azure resources\n",
"subscription_id = \"{{cookiecutter.subscription_id}}\"\n",
"resource_group = \"{{cookiecutter.resource_group}}\" \n",
"workspace_name = \"{{cookiecutter.workspace_name}}\" \n",
"workspace_region = \"{{cookiecutter.workspace_region}}\"\n",
"image_name = (\n",
" \"{{cookiecutter.image_name}}\"\n",
") # e.g. image_name = \"{{cookiecutter.image_name}} (avoid underscore in names)\"\n",
"\n",
"tenant_id = \"YOUR_TENANT_ID\" # Optional for service principal authentication\n",
"username = \"YOUR_SERVICE_PRINCIPAL_APPLICATION_ID\" # Optional for service principal authentication\n",
"password = \"YOUR_SERVICE_PRINCIPAL_PASSWORD\" # Optional for service principal authentication"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create and initialize a dotenv file for storing parameters used in multiple notebooks."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv()\n",
"if env_path == \"\":\n",
" Path(\".env\").touch()\n",
" env_path = find_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"set_key(env_path, \"subscription_id\", subscription_id) # Replace YOUR_AZURE_SUBSCRIPTION\n",
"set_key(env_path, \"resource_group\", resource_group)\n",
"set_key(env_path, \"workspace_name\", workspace_name)\n",
"set_key(env_path, \"workspace_region\", workspace_region)\n",
"set_key(env_path, \"image_name\", image_name)\n",
"\n",
"set_key(env_path, \"tenant_id\", tenant_id)\n",
"set_key(env_path, \"username\", username)\n",
"set_key(env_path, \"password\", password)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Create the workspace\n",
"This cell will create an AML workspace for you in a subscription, provided you have the correct permissions.\n",
"\n",
"This will fail when:\n",
"1. You do not have permission to create a workspace in the resource group\n",
"2. You do not have permission to create a resource group if it's non-existing.\n",
"2. You are not a subscription owner or contributor and no Azure ML workspaces have ever been created in this subscription\n",
"\n",
"If workspace creation fails, please work with your IT admin to provide you with the appropriate permissions or to provision the required resources. If this cell succeeds, you're done configuring AML! "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.create(\n",
" name=workspace_name,\n",
" subscription_id=subscription_id,\n",
" resource_group=resource_group,\n",
" location=workspace_region,\n",
" create_resource_group=True,\n",
" auth=get_auth(env_path),\n",
" exist_ok=True,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check the details of the workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws.get_details()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's write the workspace configuration for the rest of the notebooks to connect to the workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws.write_config()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You are now ready to move on to the [data preperation](01_DataPrep.ipynb) notebook."
]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python [conda env:MLAKSDeployAML]",
"language": "python",
"name": "conda-env-MLAKSDeployAML-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -0,0 +1,707 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Preparation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook, we use a subset of [Stack Exchange network](https://archive.org/details/stackexchange) question data which includes original questions tagged as 'JavaScript', their duplicate questions and their answers. Here, we provide the steps to prepare the data to use in model development for training a model that will match a new question with an existing original question. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"from utilities import read_csv_gz, clean_text, round_sample_strat, random_merge"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below, we define some parameters that will be used in the data cleaning as well as train and test set preparation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The size of the test set\n",
"test_size = 0.21\n",
"# The minimum length of clean text\n",
"min_text = 150\n",
"# The minimum number of duplicates per question\n",
"min_dupes = 12\n",
"# The maximum number of duplicate matches\n",
"match = 20\n",
"# The output files path\n",
"outputs_path = os.path.join('.', 'data_folder')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data cleaning"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we download the questions, duplicate questions and answers and load the datasets into pandas dataframes using the helper functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# URLs to original questions, duplicate questions, and answers.\n",
"data_url = \"https://bostondata.blob.core.windows.net/stackoverflow/{}\"\n",
"questions_url = data_url.format(\"orig-q.tsv.gz\")\n",
"dupes_url = data_url.format(\"dup-q.tsv.gz\")\n",
"answers_url = data_url.format(\"ans.tsv.gz\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load datasets.\n",
"questions = read_csv_gz(questions_url, names=('Id', 'AnswerId', 'Text0', 'CreationDate'))\n",
"dupes = read_csv_gz(dupes_url, names=('Id', 'AnswerId', 'Text0', 'CreationDate'))\n",
"answers = read_csv_gz(answers_url, names=('Id', 'Text0'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's now check the dataframes. Notice that questions and duplicates have \"AnswerID\" column that would help match with the index of answers dataframe."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"questions.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dupes.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"answers.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check the first original question's text."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"questions.iloc[0,1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's now check the duplicates for that question."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dupes[dupes.AnswerId == questions.iloc[0, 0]]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is the answer to the original question."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"answers.at[questions.iloc[0,0],'Text0']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we use the helper functions to clean questions, duplicates and answers from unwanted text such as code, html tags and links. Notice that we add a new column 'Text' to each dataframe for clean text in lowercase."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Clean up all text, and keep only data with some clean text.\n",
"for df in (questions, dupes, answers):\n",
" df[\"Text\"] = df.Text0.apply(clean_text).str.lower()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"questions = questions[questions.Text.str.len() > 0]\n",
"answers = answers[answers.Text.str.len() > 0]\n",
"dupes = dupes[dupes.Text.str.len() > 0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's compare the first original question and cleaned version as an example."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Original question.\n",
"questions.iloc[0,1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# After cleaning.\n",
"questions.iloc[0,3]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"it turns out that some duplicate questions were also in original questions. Also, some original questions and some duplicate questions were duplicated in the datasets. In the following, we remove them from the dataframes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# First, remove dupes that are questions, then remove duplicated questions and dupes.\n",
"dupes = dupes[~dupes.index.isin(questions.index)]\n",
"questions = questions[~questions.index.duplicated(keep='first')]\n",
"dupes = dupes[~dupes.index.duplicated(keep='first')]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We also make sure we keep questions with answers and duplicates."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only questions with answers and dupes, answers to questions, and dupes of questions.\n",
"questions = questions[\n",
" questions.AnswerId.isin(answers.index) & questions.AnswerId.isin(dupes.AnswerId)\n",
"]\n",
"answers = answers[answers.index.isin(questions.AnswerId)]\n",
"dupes = dupes[dupes.AnswerId.isin(questions.AnswerId)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Verify data integrity.\n",
"assert questions.AnswerId.isin(answers.index).all()\n",
"assert answers.index.isin(questions.AnswerId).all()\n",
"assert questions.AnswerId.isin(dupes.AnswerId).all()\n",
"assert dupes.AnswerId.isin(questions.AnswerId).all()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below are some statistics on the data. Notice that some questions have very low number of duplicates while others may have a large number. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Report on the data.\n",
"print(\"Text statistics:\")\n",
"print(\n",
" pd.DataFrame(\n",
" [\n",
" questions.Text.str.len().describe().rename(\"questions\"),\n",
" answers.Text.str.len().describe().rename(\"answers\"),\n",
" dupes.Text.str.len().describe().rename(\"dupes\"),\n",
" ]\n",
" )\n",
")\n",
"print(\"\\nDuplication statistics:\")\n",
"print(pd.DataFrame([dupes.AnswerId.value_counts().describe().rename(\"duplications\")]))\n",
"print(\n",
" \"\\nLargest class: {:.2%}\".format(\n",
" dupes.AnswerId.value_counts().max() / dupes.shape[0]\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, we reset all indexes to use them as columns in the rest of the steps."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reset each dataframe's index.\n",
"questions.reset_index(inplace=True)\n",
"answers.reset_index(inplace=True)\n",
"dupes.reset_index(inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We filter the questions and duplicates to have at least min_text number of characters."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Apply the minimum text length to questions and dupes.\n",
"questions = questions[questions.Text.str.len() >= min_text]\n",
"dupes = dupes[dupes.Text.str.len() >= min_text]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only questions with dupes, and dupes of questions.\n",
"label_column = \"AnswerId\"\n",
"questions = questions[questions[label_column].isin(dupes[label_column])]\n",
"dupes = dupes[dupes[label_column].isin(questions[label_column])]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, we remove questions and their duplicates that are less than min_dupes parameter."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Restrict the questions to those with a minimum number of dupes.\n",
"answerid_count = dupes.groupby(label_column)[label_column].count()\n",
"answerid_min = answerid_count.index[answerid_count >= min_dupes]\n",
"questions = questions[questions[label_column].isin(answerid_min)]\n",
"dupes = dupes[dupes[label_column].isin(answerid_min)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
" # Verify data integrity.\n",
"assert questions[label_column].isin(dupes[label_column]).all()\n",
"assert dupes[label_column].isin(questions[label_column]).all()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here are some statistics on the resulting dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Report on the data.\n",
"print(\"Restrictions: min_text={}, min_dupes={}\".format(min_text, min_dupes))\n",
"print(\"Restricted text statistics:\")\n",
"print(\n",
" pd.DataFrame(\n",
" [\n",
" questions.Text.str.len().describe().rename(\"questions\"),\n",
" dupes.Text.str.len().describe().rename(\"dupes\"),\n",
" ]\n",
" )\n",
")\n",
"print(\"\\nRestricted duplication statistics:\")\n",
"print(\n",
" pd.DataFrame([dupes[label_column].value_counts().describe().rename(\"duplications\")])\n",
")\n",
"print(\n",
" \"\\nRestricted largest class: {:.2%}\".format(\n",
" dupes[label_column].value_counts().max() / dupes.shape[0]\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare train and test sets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this part, we prepare train and test sets. For training a binary classification model, we will need to construct match and non-match pairs from duplicates and their questions. Finding matching pairs can be accomplished by joining each duplicate with its question. However, non-match examples need to be constructed randomly. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a first step, to make sure we train and test the performance of the model on each question, we will need to have examples of match and non-match pairs for each question both in train and test sets. In order to achieve that, we split the duplicates in a stratified manner into train and test sets making sure at least 1 or more duplicates per question is in the test set depending on test_size parameter and number of duplicates per each question."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split dupes into train and test ensuring at least one of each label class is in test.\n",
"dupes_test = round_sample_strat(dupes, dupes[label_column], frac=test_size)\n",
"dupes_train = dupes[~dupes.Id.isin(dupes_test.Id)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert (dupes_test[label_column].unique().shape[0] == dupes[label_column].unique().shape[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The relevant columns for text pairs data.\n",
"balanced_pairs_columns = ['Id_x', 'AnswerId_x', 'Text_x', 'Id_y', 'Text_y', 'AnswerId_y', 'Label', 'n']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we pair each training duplicate in train set with its matching question and N-1 random questions using the helper function."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use AnswerId to pair each training dupe with its matching question and also with N-1 questions not its match.\n",
"%time balanced_pairs_train = random_merge(dupes_train, questions, N=match)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Labeling is done such that matching pairs are labeled as 1 and non-match pairs are labeled as 0."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Label records by matching AnswerIds.\n",
"balanced_pairs_train[\"Label\"] = (\n",
" balanced_pairs_train.AnswerId_x == balanced_pairs_train.AnswerId_y\n",
").astype(int)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the relevant data.\n",
"balanced_pairs_train = balanced_pairs_train[balanced_pairs_columns]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"balanced_pairs_train.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sort the data by dupe ID and Label.\n",
"balanced_pairs_train.sort_values(by=['Id_x', 'Label'], ascending=[True, False], inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In testing set, we match each duplicate with all the original questions and label them same way as training set."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use AnswerId to pair each testing dupe with all questions.\n",
"%time balanced_pairs_test = random_merge(dupes_test, questions, N=questions.shape[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Label records by matching AnswerIds.\n",
"balanced_pairs_test[\"Label\"] = (\n",
" balanced_pairs_test.AnswerId_x == balanced_pairs_test.AnswerId_y\n",
").astype(int)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the relevant data.\n",
"balanced_pairs_test = balanced_pairs_test[balanced_pairs_columns]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"balanced_pairs_test.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sort the data by dupe ID and Label.\n",
"balanced_pairs_test.sort_values(\n",
" by=[\"Id_x\", \"Label\"], ascending=[True, False], inplace=True\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, we report the final train and test sets and save as text files to be used by modeling."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Report on the datasets.\n",
"print(\n",
" \"balanced_pairs_train: {:,} rows with {:.2%} matches\".format(\n",
" balanced_pairs_train.shape[0], balanced_pairs_train.Label.mean()\n",
" )\n",
")\n",
"print(\n",
" \"balanced_pairs_test: {:,} rows with {:.2%} matches\".format(\n",
" balanced_pairs_test.shape[0], balanced_pairs_test.Label.mean()\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"os.makedirs(outputs_path, exist_ok=True)\n",
"\n",
"# Save the data.\n",
"balanced_pairs_train_path = os.path.join(outputs_path, \"balanced_pairs_train.tsv\")\n",
"print(\n",
" \"Writing {:,} to {}\".format(\n",
" balanced_pairs_train.shape[0], balanced_pairs_train_path\n",
" )\n",
")\n",
"balanced_pairs_train.to_csv(\n",
" balanced_pairs_train_path, sep=\"\\t\", header=True, index=False\n",
")\n",
"\n",
"balanced_pairs_test_path = os.path.join(outputs_path, \"balanced_pairs_test.tsv\")\n",
"print(\n",
" \"Writing {:,} to {}\".format(balanced_pairs_test.shape[0], balanced_pairs_test_path)\n",
")\n",
"balanced_pairs_test.to_csv(balanced_pairs_test_path, sep=\"\\t\", header=True, index=False)\n",
"\n",
"# Save original questions to be used for scoring later.\n",
"questions_path = os.path.join(outputs_path, \"questions.tsv\")\n",
"print(\"Writing {:,} to {}\".format(questions.shape[0], questions_path))\n",
"questions.to_csv(questions_path, sep=\"\\t\", header=True, index=False)\n",
"\n",
"# Save the test duplicate questions to be used with the scoring function.\n",
"dupes_test_path = os.path.join(outputs_path, \"dupes_test.tsv\")\n",
"print(\"Writing {:,} to {}\".format(dupes_test.shape[0], dupes_test_path))\n",
"dupes_test.to_csv(dupes_test_path, sep=\"\\t\", header=True, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can now move on to [train on local](02_TrainOnLocal.ipynb) notebook to train our model using Azure Machine Learning."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:MLAKSDeployAML]",
"language": "python",
"name": "conda-env-MLAKSDeployAML-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,649 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train Locally\n",
"In this notebook, you will perform the following using Azure Machine Learning.\n",
"* Load workspace.\n",
"* Configure & execute a local run in a user-managed Python environment.\n",
"* Configure & execute a local run in a system-managed Python environment.\n",
"* Configure & execute a local run in a Docker environment.\n",
"* Register model for operationalization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.core import Experiment\n",
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core import ScriptRunConfig\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"from dotenv import set_key, get_key, find_dotenv\n",
"from utilities import get_auth"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Model Hyperparameters"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook uses a training script that uses [lightgbm](https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api) . Here we set the number of estimators. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"num_estimators = \"10\"\n",
"set_key(env_path, \"num_estimators\", num_estimators)"
]
},
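{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, below is a minimal sketch of how this value is typically consumed. This is an illustration only, not the actual `create_model.py`: we assume the training script forwards the value to LightGBM's scikit-learn API as `n_estimators`, and that lightgbm is installed in this environment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (assumption): the training script is expected to forward the\n",
"# --estimators argument to LightGBM's scikit-learn API as n_estimators.\n",
"from lightgbm import LGBMClassifier\n",
"\n",
"LGBMClassifier(n_estimators=int(num_estimators))"
]
},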
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Workspace\n",
"\n",
"Initialize a workspace object from persisted configuration file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config(auth=get_auth(env_path))\n",
"print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")"
]
},
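{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `get_auth` helper is imported from `utilities.py` and is not shown in this notebook. Below is a rough, hypothetical sketch of what such a helper might look like, assuming it falls back from service principal credentials in the `.env` file to interactive login; the key names and logic are assumptions, not the actual implementation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sketch of a get_auth helper (the actual one is in utilities.py).\n",
"# The .env key names below are assumptions for illustration.\n",
"from azureml.core.authentication import (\n",
"    InteractiveLoginAuthentication,\n",
"    ServicePrincipalAuthentication,\n",
")\n",
"\n",
"\n",
"def get_auth_sketch(env_path):\n",
"    sp_password = get_key(env_path, \"sp_password\")\n",
"    if sp_password:\n",
"        return ServicePrincipalAuthentication(\n",
"            tenant_id=get_key(env_path, \"tenant_id\"),\n",
"            service_principal_id=get_key(env_path, \"sp_username\"),\n",
"            service_principal_password=sp_password,\n",
"        )\n",
"    return InteractiveLoginAuthentication()"
]
},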
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create An Experiment\n",
"**Experiment** is a logical container in an Azure ML Workspace. It hosts run records which can include run metrics and output artifacts from your experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"experiment_name = \"mlaks-train-on-local\"\n",
"exp = Experiment(workspace=ws, name=experiment_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## View `create_model.py`\n",
"\n",
"The script that trains the model `create_model.py` is already created for you. Let's check its contents."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(\"./scripts/create_model.py\", \"r\") as f:\n",
" print(f.read())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note `create_model.py` also references a `item_selector.py` and `label_rank.py` file. Let's check those scripts as well."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open('./scripts/item_selector.py', 'r') as f:\n",
" print(f.read())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open('./scripts/label_rank.py', 'r') as f:\n",
" print(f.read())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configure & Run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this section, we show three different ways of locally training your model through Azure ML SDK for demonstration purposes. Only one of these runs is sufficient to register the model."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"### User-managed environment\n",
"Below, we use a user-managed run, which means you are responsible to ensure all the necessary packages that are available in the Python environment you choose to run the script. We will use the environment created for this tutorial which has Azure ML SDK and other dependencies installed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Editing a run configuration property on-fly.\n",
"run_config_user_managed = RunConfiguration()\n",
"\n",
"run_config_user_managed.environment.python.user_managed_dependencies = True\n",
"\n",
"# Choose the specific Python environment of this tutorial by pointing to the Python path\n",
"run_config_user_managed.environment.python.interpreter_path = (\n",
" \"/anaconda/envs/MLAKSDeployAML/bin/python\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit script to run in the user-managed environment\n",
"Note that the whole `scripts` folder is submitted for execution, including the `item_selector.py` and `label_rank.py` files. The model will be written to `outputs` directory which is a special directory such that all content in this directory is automatically uploaded to your workspace. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scrpt = \"create_model.py\"\n",
"args = [\n",
" \"--inputs\",\n",
" os.path.abspath(\"./data_folder\"),\n",
" \"--outputs\",\n",
" \"outputs\",\n",
" \"--estimators\",\n",
" get_key(env_path, 'num_estimators'),\n",
" \"--match\",\n",
" \"5\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"src = ScriptRunConfig(\n",
" source_directory=\"./scripts\",\n",
" script=scrpt,\n",
" arguments=args,\n",
" run_config=run_config_user_managed,\n",
")\n",
"#run = exp.submit(src)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Get run history details"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Block to wait till run finishes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#%%time\n",
"#run.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check that the model is now available in your workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#run.get_file_names()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrive the accuarcy of the model from run logs by querying the run metrics."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#run.get_metrics()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### System-managed environment\n",
"You can also ask the system to build a new conda environment and execute your scripts in it. The environment is built once and will be reused in subsequent executions as long as the conda dependencies remain unchanged. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run_config_system_managed = RunConfiguration()\n",
"run_config_system_managed.environment.python.user_managed_dependencies = False\n",
"run_config_system_managed.auto_prepare_environment = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's specifiy the conda and pip dependencies."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Specify conda dependencies with scikit-learn and pandas\n",
"conda_pack = [\"scikit-learn==0.19.1\", \"pandas==0.23.3\"]\n",
"requirements = [\"lightgbm==2.1.2\", \"azureml-defaults==1.0.10\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cd = CondaDependencies.create(conda_packages=conda_pack, pip_packages=requirements)\n",
"run_config_system_managed.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit script to run in the system-managed environment\n",
"A new conda environment is built based on the conda dependencies object. If you are running this for the first time, this might take up to 5 minutes. But this conda environment is reused so long as you don't change the conda dependencies."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"src = ScriptRunConfig(\n",
" source_directory=\"./scripts\",\n",
" script=scrpt,\n",
" arguments=args,\n",
" run_config=run_config_system_managed,\n",
")\n",
"run = exp.submit(src)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Get run history details"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Block and wait till run finishes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.get_file_names()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.get_metrics()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Docker-based execution\n",
"**IMPORTANT**: You must have Docker engine installed locally in order to use this execution mode. If your kernel is already running in a Docker container, such as **Azure Notebooks**, this mode will **NOT** work.\n",
"\n",
"You can also ask the system to pull down a Docker image and execute your scripts in it. We will use the `continuumio/miniconda3` image for that purpose."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run_config_docker = RunConfiguration()\n",
"run_config_docker.environment.python.user_managed_dependencies = False\n",
"run_config_docker.auto_prepare_environment = True\n",
"run_config_docker.environment.docker.enabled = True\n",
"run_config_docker.environment.docker.base_image = \"continuumio/miniconda3\"\n",
"\n",
"# Specify conda and pip dependencies\n",
"cd = CondaDependencies.create(conda_packages=conda_pack, pip_packages=requirements)\n",
"run_config_docker.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, we map the local `data_folder` that includes the training and testing data to the docker container using `-v` flag."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"host_dir = os.path.abspath(\"./data_folder\")\n",
"container_dir = \"/data_folder\"\n",
"docker_arg = \"{}:{}\".format(host_dir, container_dir)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This time the run will use the mapped `data_folder` inside the docker container to find the data files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"args = [\n",
" \"--inputs\",\n",
" \"/data_folder\",\n",
" \"--outputs\",\n",
" \"outputs\",\n",
" \"--estimators\",\n",
" get_key(env_path, 'num_estimators'),\n",
" \"--match\",\n",
" \"5\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run_config_docker.environment.docker.arguments.append(\"-v\")\n",
"run_config_docker.environment.docker.arguments.append(docker_arg)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"src = ScriptRunConfig(\n",
" source_directory=\"./scripts\",\n",
" script=scrpt,\n",
" arguments=args,\n",
" run_config=run_config_docker,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
" run = exp.submit(src)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Get run history details\n",
"run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"run.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.get_file_names()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.get_metrics()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Register Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now register the model with the workspace so that we can later deploy the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# supply a model name, and the full path to the serialized model file.\n",
"model = run.register_model(\n",
" model_name=\"question_match_model\", model_path=\"./outputs/model.pkl\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(model.name, model.version, model.url, sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_version = str(model.version)\n",
"set_key(env_path, \"model_version\", model_version)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we [develop the scoring script](03_DevelopScoringScript.ipynb) for this model."
]
}
],
"metadata": {
"authors": [
{
"name": "roastala"
}
],
"kernelspec": {
"display_name": "Python [conda env:MLAKSDeployAML]",
"language": "python",
"name": "conda-env-MLAKSDeployAML-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,264 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Develop Scoring Script"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook, we will develop the scoring script and test it locally. We will use the scoring script to create the web service that will call the model for scoring."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import pandas as pd\n",
"from utilities import text_to_json, get_auth\n",
"import logging\n",
"from dotenv import set_key, get_key, find_dotenv\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.core.model import Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sys.path.append('./scripts/')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's load the workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config(auth=get_auth(env_path))\n",
"print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrive the model registered earlier and download it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_name = 'question_match_model'\n",
"model_version = int(get_key(env_path, 'model_version'))\n",
"model = Model(ws, name=model_name, version=model_version)\n",
"print(model.name, model.version, model.url, sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.download(target_dir=\".\", exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Scoring Script"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We use the writefile magic to write the contents of the below cell to `score.py` which includes the `init` and `run` functions required by AML.\n",
"- The init() function typically loads the model into a global object.\n",
"- The run(input_data) function uses the model to predict a value based on the input_data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"\n",
"import pandas as pd\n",
"import json\n",
"from duplicate_model import DuplicateModel\n",
"import logging\n",
"import timeit as t\n",
"\n",
"def init():\n",
" logger = logging.getLogger(\"scoring_script\")\n",
" global model\n",
" model_path = \"model.pkl\"\n",
" questions_path = \"./data_folder/questions.tsv\"\n",
" start = t.default_timer()\n",
" model = DuplicateModel(model_path, questions_path)\n",
" end = t.default_timer()\n",
" loadTimeMsg = \"Model loading time: {0} ms\".format(round((end-start)*1000, 2))\n",
" logger.info(loadTimeMsg)\n",
"\n",
"\n",
"def run(body):\n",
" logger = logging.getLogger(\"scoring_script\")\n",
" json_load_text = json.loads(body)\n",
" text_to_score = json_load_text[\"input\"]\n",
" start = t.default_timer()\n",
" resp = model.score(text_to_score)\n",
" end = t.default_timer()\n",
" logger.info(\"Prediction took {0} ms\".format(round((end-start)*1000, 2)))\n",
" return json.dumps(resp)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's test by running the score.py which will bring the imports and functions into the context of the notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"logging.basicConfig(level=logging.DEBUG)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%run score.py"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, let's use one of the duplicate questions to test our driver."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dupes_test_path = './data_folder/dupes_test.tsv'\n",
"dupes_test = pd.read_csv(dupes_test_path, sep='\\t', encoding='latin1')\n",
"text_to_score = dupes_test.iloc[0,4]\n",
"text_to_score"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, call the init() to initalize the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"init()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We convert the question text to json format and make predictions."
]
},
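{
"cell_type": "markdown",
"metadata": {},
"source": [
"`text_to_json` is imported from `utilities.py`, whose implementation is not shown in this notebook. Since `run()` reads the text from the `\"input\"` field of the JSON body, a minimal equivalent might look like the following sketch (an illustration, not the actual helper)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch of what text_to_json plausibly does (the actual helper lives\n",
"# in utilities.py). run() calls json.loads(body)[\"input\"], so the payload\n",
"# wraps the question text in an \"input\" field.\n",
"import json\n",
"\n",
"\n",
"def text_to_json_sketch(text):\n",
"    return json.dumps({\"input\": text})"
]
},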
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"jsontext = text_to_json(text_to_score)\n",
"r = run(jsontext)\n",
"r"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we move on to [creating the docker image which we will deploy](04_CreateImage.ipynb)."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:MLAKSDeployAML]",
"language": "python",
"name": "conda-env-MLAKSDeployAML-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,332 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create Image\n",
"In this notebook, we show the following steps for deploying a web service using AzureML:\n",
"- Create an image\n",
"- Test image locally"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from utilities import text_to_json, get_auth\n",
"from azureml.core.model import Model\n",
"from azureml.core.workspace import Workspace\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"from dotenv import set_key, get_key, find_dotenv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"AML will use the following information to create an image, provision a cluster and deploy a service. Replace the values in the following cell with your information."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image_name = get_key(env_path, 'image_name')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get workspace\n",
"Load existing workspace from the config file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config(auth=get_auth(env_path))\n",
"print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_name = 'question_match_model'\n",
"model_version = int(get_key(env_path, 'model_version'))\n",
"model = Model(ws, name=model_name, version=model_version)\n",
"print(model.name, model.version)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create an image\n",
"We will now modify the `score.py` created in the previous notebook for the `init()` function to use the model we registered to the workspace earlier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"\n",
"import sys\n",
"import pandas as pd\n",
"import json\n",
"from duplicate_model import DuplicateModel\n",
"import logging\n",
"import timeit as t\n",
"from azureml.core.model import Model\n",
"sys.path.append('./scripts/')\n",
"\n",
"def init():\n",
" logger = logging.getLogger(\"scoring_script\")\n",
" global model\n",
" model_name = 'question_match_model'\n",
" model_path = Model.get_model_path(model_name)\n",
" questions_path = './data_folder/questions.tsv'\n",
" start = t.default_timer()\n",
" model = DuplicateModel(model_path, questions_path)\n",
" end = t.default_timer()\n",
" loadTimeMsg = \"Model loading time: {0} ms\".format(round((end-start)*1000, 2))\n",
" logger.info(loadTimeMsg)\n",
"\n",
"def run(body):\n",
" logger = logging.getLogger(\"scoring_script\")\n",
" json_load_text = json.loads(body)\n",
" text_to_score = json_load_text['input']\n",
" start = t.default_timer()\n",
" resp = model.score(text_to_score) \n",
" end = t.default_timer()\n",
" logger.info(\"Prediction took {0} ms\".format(round((end-start)*1000, 2)))\n",
" return(json.dumps(resp))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's specifiy the conda and pip dependencies for the image."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"conda_pack = [\"scikit-learn==0.19.1\", \"pandas==0.23.3\"]\n",
"requirements = [\"lightgbm==2.1.2\", \"azureml-defaults==1.0.10\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lgbmenv = CondaDependencies.create(conda_packages=conda_pack, pip_packages=requirements)\n",
"\n",
"with open(\"lgbmenv.yml\", \"w\") as f:\n",
" f.write(lgbmenv.serialize_to_string())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.image import ContainerImage\n",
"\n",
"image_config = ContainerImage.image_configuration(\n",
" execution_script=\"score.py\",\n",
" runtime=\"python\",\n",
" conda_file=\"lgbmenv.yml\",\n",
" description=\"Image with lightgbm model\",\n",
" tags={\"area\": \"text\", \"type\": \"lightgbm\"},\n",
" dependencies=[\n",
" \"./data_folder/questions.tsv\",\n",
" \"./duplicate_model.py\",\n",
" \"./scripts/item_selector.py\",\n",
" ],\n",
")\n",
"\n",
"image = ContainerImage.create(\n",
" name=image_name,\n",
" # this is the model object\n",
" models=[model],\n",
" image_config=image_config,\n",
" workspace=ws,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"image.wait_for_creation(show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(image.name, image.version)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image_version = str(image.version)\n",
"set_key(env_path, \"image_version\", image_version)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can find the logs of image creation in the following location."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image.image_build_log_uri"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test image locally"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, let's use one of the duplicate questions to test our image."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dupes_test_path = './data_folder/dupes_test.tsv'\n",
"dupes_test = pd.read_csv(dupes_test_path, sep='\\t', encoding='latin1')\n",
"text_to_score = dupes_test.iloc[0,4]\n",
"text_to_score"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"jsontext = text_to_json(text_to_score)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"image.run(input_data=jsontext)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Conclusion\n",
"\n",
"We have created a docker Image using AzureML and registred this image on Azure Container Registry (ACR). This docker image encapsulates a trained machine learning model and scoring scripts. In the next step, we can take this image and deploy it on the compute target of your choice: Azure Kubernetes Service (AKS) Cluster or Azure IoT Edge."
]
}
],
"metadata": {
"authors": [
{
"name": "raymondl"
}
],
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python [conda env:MLAKSDeployAML]",
"language": "python",
"name": "conda-env-MLAKSDeployAML-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,194 @@
.ONESHELL:
SHELL=/bin/bash
define PROJECT_HELP_MSG
Makefile for testing notebooks
Make sure you have edited the dev_env_template file and renamed it to .dev_env
All the variables loaded in this makefile must come from the .dev_env file
Usage:
make test run all notebooks
make clean delete env and remove files
endef
export PROJECT_HELP_MSG
env_location=.dev_env
PWD:=$(shell pwd)
include ${env_location}
help:
echo "$$PROJECT_HELP_MSG" | less
test: setup test-notebook1 test-notebook2 test-notebook3 test-notebook4 test-notebook5 test-notebook6 test-notebook7 \
test-notebook8 test-notebook-iot1 test-notebook9 test-notebook-iot2
@echo All Notebooks Passed
setup:
conda env create -f environment.yml
ifndef TENANT_ID
@echo starting interactive login
az login -o table
az account set --subscription ${SUBSCRIPTION_ID}
else
@echo using service principal login
az login -t ${TENANT_ID} --service-principal -u ${SP_USERNAME} --password ${SP_PASSWORD}
endif
test-notebook1:
source activate MLAKSDeployAML
@echo Testing 00_AMLConfiguration.ipynb
papermill 00_AMLConfiguration.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3 \
-p subscription_id ${SUBSCRIPTION_ID} \
-p resource_group ${RESOURCE_GROUP} \
-p workspace_name ${WORKSPACE_NAME} \
-p workspace_region ${WORKSPACE_REGION} \
-p image_name ${IMAGE_NAME}
test-notebook2:
source activate MLAKSDeployAML
@echo Testing 01_DataPrep.ipynb
papermill 01_DataPrep.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-notebook3:
source activate MLAKSDeployAML
@echo Testing 02_TrainOnLocal.ipynb
papermill 02_TrainOnLocal.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-notebook4:
source activate MLAKSDeployAML
@echo Testing 03_DevelopScoringScript.ipynb
papermill 03_DevelopScoringScript.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
sleep 1m
test-notebook5:
source activate MLAKSDeployAML
@echo Testing 04_CreateImage.ipynb
papermill 04_CreateImage.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
sleep 30
test-notebook6:
source activate MLAKSDeployAML
@echo Testing 05_DeployOnAKS.ipynb
papermill aks/05_DeployOnAKS.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3 \
-p aks_name ${AKS_NAME} \
-p aks_location ${WORKSPACE_REGION} \
-p aks_service_name ${AKS_SERVICE_NAME}
test-notebook7:
source activate MLAKSDeployAML
@echo Testing 06_SpeedTestWebApp.ipynb
papermill aks/06_SpeedTestWebApp.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-notebook8:
source activate MLAKSDeployAML
@echo Testing 07_RealTimeScoring.ipynb
papermill aks/07_RealTimeScoring.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-notebook-iot1:
source activate MLAKSDeployAML
@echo Testing 05_DeployOnIOTedge.ipynb
export PYTHONPATH=${PWD}:${PYTHONPATH}
cd iotedge
mkdir ./data_folder
cp ../data_folder/dupes_test.tsv ./data_folder
papermill 05_DeployOnIOTedge.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3 \
-p iot_hub_name fstlstnameiothub \
-p device_id mydevice \
-p module_name mymodule
test-notebook9:
source activate MLAKSDeployAML
@echo Testing 08_TearDown.ipynb
papermill aks/08_TearDown.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-notebook-iot2:
source activate MLAKSDeployAML
@echo Testing 06_TearDown.ipynb
export PYTHONPATH=${PWD}:${PYTHONPATH}
papermill iotedge/06_TearDown.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3
test-cookiecutter-aks:
cookiecutter --no-input https://github.com/Microsoft/MLAKSDeployAML.git --checkout yzhang \
subscription_id="${SUBSCRIPTION_ID}" \
workspace_region=${WORKSPACE_REGION} \
deployment_type="aks"
test-cookiecutter-iot:
cookiecutter --no-input https://github.com/Microsoft/MLAKSDeployAML.git --checkout yzhang \
subscription_id=${SUBSCRIPTION_ID} \
workspace_region=${WORKSPACE_REGION} \
deployment_type="iotedge"
remove-notebook:
rm -f test.ipynb
clean: remove-notebook
conda remove --name MLAKSDeployAML -y --all
rm -rf aml_config
rm -rf __pycache__
rm -rf .ipynb_checkpoints
rm -rf data_folder
rm -rf azureml-models
rm -rf score.py lgbmenv.yml model.pkl
rm -rf iotedge/deployment.json iotedge/deviceconfig.sh
rm -rf iotedge/data_folder
notebook:
source activate MLAKSDeployAML
jupyter notebook --port 9999 --ip 0.0.0.0 --no-browser
install-jupytext:
source activate MLAKSDeployAML
conda install -c conda-forge jupytext
convert-to-py:
jupytext --set-formats ipynb,py_scripts//py --sync *.ipynb
sync:
jupytext --sync *.ipynb
convert-to-ipynb:
jupytext --set-formats ipynb *.ipynb
remove-py:
rm -r py_scripts
.PHONY: help test setup clean remove-notebook test-notebook1 test-notebook2 test-notebook3 test-notebook4 \
test-notebook5 test-notebook6 test-notebook7 test-notebook8 test-notebook-iot1 test-notebook9 test-notebook-iot2

View file

@ -0,0 +1,614 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Deploying a web service to Azure Kubernetes Service (AKS)\n",
"In this notebook, we show the following steps for deploying a web service using AzureML:\n",
"- Provision an AKS cluster (one time action)\n",
"- Deploy the service\n",
"- Test the web service\n",
"- Scale up the service"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import os\n",
"import subprocess\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import requests\n",
"from azureml.core.compute import AksCompute, ComputeTarget\n",
"from azureml.core.webservice import Webservice, AksWebservice\n",
"from azureml.core.workspace import Workspace\n",
"from dotenv import set_key, get_key, find_dotenv\n",
"from utilities import text_to_json, get_auth\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"AML will use the following information to create an image, provision a cluster and deploy a service. Replace the values in the following cell with your information."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image_name = get_key(env_path, 'image_name')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"aks_service_name = \"YOUR_AKS_SERVICE_NAME\"\n",
"aks_name = \"YOUR_AKS_NAME\"\n",
"aks_location = \"YOUR_AKS_LOCATION\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"set_key(env_path, \"aks_service_name\", aks_service_name)\n",
"set_key(env_path, \"aks_name\", aks_name)\n",
"set_key(env_path, \"aks_location\", aks_location)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get workspace\n",
"Load existing workspace from the config file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config(auth=get_auth(env_path))\n",
"print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image = ws.images[image_name]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Provision the AKS Cluster\n",
"This is a one time setup. You can reuse this cluster for multiple deployments after it has been created. If you delete the cluster or the resource group that contains it, then you would have to recreate it. Let's first check if there are enough cores in the subscription for the cluster ."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vm_dict = {\n",
" \"Dv2\": {\n",
" \"size\": \"Standard_D4_v2\",\n",
" \"cores\": 8\n",
" }\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vm_family = \"Dv2\"\n",
"node_count = 4\n",
"requested_cores = node_count * vm_dict[vm_family][\"cores\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results = subprocess.run([\n",
" \"az\", \"vm\", \"list-usage\", \n",
" \"--location\", get_key(env_path, \"aks_location\"), \n",
" \"--query\", \"[?contains(localName, '%s')].{max:limit, current:currentValue}\" % (vm_family)\n",
"], stdout=subprocess.PIPE)\n",
"quota = json.loads(''.join(results.stdout.decode('utf-8')))\n",
"diff = int(quota[0]['max']) - int(quota[0]['current'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if diff <= requested_cores:\n",
" print(\"Not enough cores in region, asking for {} but have {}\".format(requested_cores, diff))\n",
" raise Exception(\"Core Limit\", \"Note enough cores to satisfy request\")\n",
"print(\"There are enough cores, you may continue...\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prov_config = AksCompute.provisioning_configuration(\n",
" agent_count=4, vm_size=\"Standard_D4_v2\", location=aks_location\n",
")\n",
"\n",
"# Create the cluster\n",
"aks_target = ComputeTarget.create(\n",
" workspace=ws, name=aks_name, provisioning_configuration=prov_config\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_target.wait_for_completion(show_output = True)\n",
"print(aks_target.provisioning_state)\n",
"print(aks_target.provisioning_errors)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check that the cluster is created successfully."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_status = aks_target.get_status()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert aks_status == 'Succeeded', 'AKS failed to create'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy web service to AKS"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we deploy the web service. We deploy two pods with 1 CPU core each."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Set the web service configuration \n",
"aks_config = AksWebservice.deploy_configuration(num_replicas=2, cpu_cores=1)"
]
},
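{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an aside, the deployment configuration can instead enable autoscaling rather than a fixed replica count. Below is a sketch that is not used in this tutorial; the threshold values are illustrative assumptions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: an autoscaling alternative to the fixed two-replica configuration.\n",
"# The replica bounds and target utilization below are illustrative assumptions.\n",
"autoscale_config = AksWebservice.deploy_configuration(\n",
"    autoscale_enabled=True,\n",
"    autoscale_min_replicas=1,\n",
"    autoscale_max_replicas=4,\n",
"    autoscale_target_utilization=70,\n",
"    cpu_cores=1,\n",
")"
]
},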
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service = Webservice.deploy_from_image(\n",
" workspace=ws,\n",
" name=aks_service_name,\n",
" image=image,\n",
" deployment_config=aks_config,\n",
" deployment_target=aks_target,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_service.wait_for_deployment(show_output=True)\n",
"print(aks_service.state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can check the logs of the web service with the below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service.get_logs()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test the web service\n",
"We now test the web sevice."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dupes_test_path = './data_folder/dupes_test.tsv'\n",
"dupes_test = pd.read_csv(dupes_test_path, sep='\\t', encoding='latin1')\n",
"text_to_score = dupes_test.iloc[0,4]\n",
"text_to_score"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"jsontext = text_to_json(text_to_score)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"prediction = aks_service.run(input_data = jsontext)\n",
"print(prediction)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's try a few more duplicate questions and display their top 3 original matches. Let's first get the scoring URL and and API key for the web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scoring_url = aks_service.scoring_uri\n",
"api_key = aks_service.get_keys()[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"headers = {'content-type': 'application/json', 'Authorization':('Bearer '+ api_key)}\n",
"r = requests.post(scoring_url, data=jsontext, headers=headers) # Run the request twice since the first time takes a \n",
"%time r = requests.post(scoring_url, data=jsontext, headers=headers) # little longer due to the loading of the model\n",
"print(r)\n",
"r.json()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dupes_to_score = dupes_test.iloc[:5,4]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results = [\n",
" requests.post(scoring_url, data=text_to_json(text), headers=headers)\n",
" for text in dupes_to_score\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's print top 3 matches for each duplicate question."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"[eval(results[i].json())[0:3] for i in range(0, len(results))]"
]
},
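{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that each response body is doubly encoded: `run()` in `score.py` returns `json.dumps(resp)`, and the service wraps that string in another JSON layer. Since the inner payload is produced by `json.dumps`, `json.loads` can be used instead of `eval`; a sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# The inner payload comes from json.dumps in score.py, so json.loads is a\n",
"# safer way than eval to decode the doubly encoded responses.\n",
"[json.loads(r.json())[0:3] for r in results]"
]
},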
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next let's quickly check what the request response performance is for the deployed model on AKS cluster."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"text_data = list(map(text_to_json, dupes_to_score)) # Retrieve the text data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"timer_results = list()\n",
"for text in text_data:\n",
" res=%timeit -r 1 -o -q requests.post(scoring_url, data=text, headers=headers)\n",
" timer_results.append(res.best)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"timer_results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Average time taken: {0:4.2f} ms\".format(10 ** 3 * np.mean(timer_results)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Scaling"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this part, we scale the number of pods to make sure we fully utilize the AKS cluster. To connect to the Kubernetes cluster, we will use kubectl, the Kubernetes command-line client. To install, run the following:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!sudo az aks install-cli"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we will get the credentials to connect to the cluster."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"os.makedirs(os.path.join(os.path.expanduser('~'),'.kube'), exist_ok=True) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"config_path = os.path.join(os.path.expanduser('~'),'.kube/config')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(config_path, 'a') as f:\n",
" f.write(aks_target.get_credentials()['userKubeConfig'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check the nodes and pods of the cluster."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kubectl get nodes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kubectl get pods --all-namespaces"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kubectl get events"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can now scale up the number of pods."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kubectl scale --current-replicas=2 --replicas=10 {\"deployment/\" + aks_service_name}"
]
},
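{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternatively, the same scale-out can be requested through the AML SDK instead of kubectl. Below is a sketch using the `aks_service` object created earlier in this notebook; it is left commented out since the kubectl command above has already scaled the deployment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: request the same number of replicas via the AML SDK.\n",
"# aks_service.update(num_replicas=10)\n",
"# aks_service.wait_for_deployment(show_output=True)"
]
},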
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kubectl get pods --all-namespaces"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kubectl get deployment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we will test the [throughput of the web service](06_SpeedTestWebApp.ipynb)."
]
}
],
"metadata": {
"authors": [
{
"name": "raymondl"
}
],
"kernelspec": {
"display_name": "Python [conda env:MLAKSDeployAML]",
"language": "python",
"name": "conda-env-MLAKSDeployAML-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,319 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load Test deployed web application"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook, we test the latency of the deployed web application by sending a number of duplicate questions as asychronous requests."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import asyncio\n",
"import json\n",
"import urllib.request\n",
"from timeit import default_timer\n",
"\n",
"import aiohttp\n",
"import nest_asyncio\n",
"import pandas as pd\n",
"from azureml.core.webservice import AksWebservice\n",
"from azureml.core.workspace import Workspace\n",
"from dotenv import get_key, find_dotenv\n",
"from tqdm import tqdm\n",
"from utilities import text_to_json, get_auth\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(aiohttp.__version__) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"nest_asyncio.apply()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config(auth=get_auth(env_path))\n",
"print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrive the web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service_name = get_key(env_path, 'aks_service_name')\n",
"aks_service = AksWebservice(ws, name=aks_service_name)\n",
"aks_service.name"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will test our deployed service with 100 calls. We will only have 4 requests concurrently at any time. Feel free to try different values and see how the service responds."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"NUMBER_OF_REQUESTS = 100 # Total number of requests\n",
"CONCURRENT_REQUESTS = 4 # Number of requests at a time"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the scoring URL and API key of the service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scoring_url = aks_service.scoring_uri\n",
"api_key = aks_service.get_keys()[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dupes_test_path = './data_folder/dupes_test.tsv'\n",
"dupes_test = pd.read_csv(dupes_test_path, sep='\\t', encoding='latin1')\n",
"dupes_to_score = dupes_test.iloc[:NUMBER_OF_REQUESTS,4]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"url_list = [[scoring_url, jsontext] for jsontext in dupes_to_score.apply(text_to_json)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def decode(result):\n",
" return json.loads(result.decode(\"utf-8\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"async def fetch(url, session, data, headers):\n",
" start_time = default_timer()\n",
" async with session.request(\"post\", url, data=data, headers=headers) as response:\n",
" resp = await response.read()\n",
" elapsed = default_timer() - start_time\n",
" return resp, elapsed"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"async def bound_fetch(sem, url, session, data, headers):\n",
" # Getter function with semaphore.\n",
" async with sem:\n",
" return await fetch(url, session, data, headers)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"async def await_with_progress(coros):\n",
" results = []\n",
" for f in tqdm(asyncio.as_completed(coros), total=len(coros)):\n",
" result = await f\n",
" results.append((decode(result[0]), result[1]))\n",
" return results\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"async def run(url_list, num_concurrent=CONCURRENT_REQUESTS):\n",
" headers = {\n",
" \"content-type\": \"application/json\",\n",
" \"Authorization\": (\"Bearer \" + api_key),\n",
" }\n",
" tasks = []\n",
" # create instance of Semaphore\n",
" sem = asyncio.Semaphore(num_concurrent)\n",
"\n",
" # Create client session that will ensure we dont open new connection\n",
" # per each request.\n",
" async with aiohttp.ClientSession() as session:\n",
" for url, data in url_list:\n",
" # pass Semaphore and session to every POST request\n",
" task = asyncio.ensure_future(bound_fetch(sem, url, session, data, headers))\n",
" tasks.append(task)\n",
" return await await_with_progress(tasks)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below we run the 100 requests against our deployed service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"loop = asyncio.get_event_loop()\n",
"start_time = default_timer()\n",
"complete_responses = loop.run_until_complete(\n",
" asyncio.ensure_future(run(url_list, num_concurrent=CONCURRENT_REQUESTS))\n",
")\n",
"elapsed = default_timer() - start_time\n",
"print(\"Total Elapsed {}\".format(elapsed))\n",
"print(\"Avg time taken {0:4.2f} ms\".format(1000 * elapsed / len(url_list)))"
]
},
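{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each entry in `complete_responses` also carries the per-request elapsed time (the second element of each tuple), so we can look beyond the average; a short sketch of latency percentiles:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# Per-request latencies in ms; each response tuple is (payload, elapsed seconds).\n",
"latencies = [1000 * r[1] for r in complete_responses]\n",
"print(\"p50 {:4.2f} ms, p95 {:4.2f} ms\".format(\n",
"    np.percentile(latencies, 50), np.percentile(latencies, 95)\n",
"))"
]
},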
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Example response\n",
"complete_responses[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's use the number of original questions to count the succesful responses."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"no_questions = len(eval(complete_responses[0][0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"num_succesful = [len(eval(i[0])) for i in complete_responses].count(no_questions)\n",
"print(\"Succesful {} out of {}\".format(num_succesful, len(url_list)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we will explore the real-time scoring in an [iPyWidget app](07_RealTimeScoring.ipynb)."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:MLAKSDeployAML]",
"language": "python",
"name": "conda-env-MLAKSDeployAML-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,734 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": false
}
}
}
}
},
"source": [
"# Explore Duplicate Question Matches\n",
"Use this dashboard to explore the relationship between duplicate and original questions."
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"## Setup\n",
"This section loads needed packages, and defines useful functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"from __future__ import print_function\n",
"\n",
"import math\n",
"\n",
"import ipywidgets as widgets\n",
"import pandas as pd\n",
"import requests\n",
"from azureml.core.webservice import AksWebservice\n",
"from azureml.core.workspace import Workspace\n",
"from dotenv import get_key, find_dotenv\n",
"from utilities import read_questions, text_to_json, get_auth\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config(auth=get_auth(env_path))\n",
"print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service_name = get_key(env_path, 'aks_service_name')\n",
"aks_service = AksWebservice(ws, name=aks_service_name)\n",
"aks_service.name"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load the duplicate questions scoring app's URL."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scoring_url = aks_service.scoring_uri\n",
"api_key = aks_service.get_keys()[0]"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"A constructor function for ID-text contents. Constructs buttons and text areas for each text ID and text passage.\n",
"* Each buttons's description is set to a text's ID, and its click action is set to the handler.\n",
"* Each text area's content is set to a text.\n",
"* A dictionary is created to map IDs to text areas."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"def buttons_and_texts(\n",
" data, id, answerid, text, handle_click, layout=widgets.Layout(width=\"100%\"), n=15\n",
"):\n",
" \"\"\"Construct buttons, text areas, and a mapping from IDs to text areas.\"\"\"\n",
" items = []\n",
" text_map = {}\n",
" for i in range(min(n, len(data))):\n",
" button = widgets.Button(description=data.iloc[i][id])\n",
" button.answerid = data.iloc[i][answerid] if answerid in data else None\n",
" button.open = False\n",
" button.on_click(handle_click)\n",
" items.append(button)\n",
" text_area = widgets.Textarea(\n",
" data.iloc[i][text], placeholder=data.iloc[i][id], layout=layout\n",
" )\n",
" items.append(text_area)\n",
" text_map[data.iloc[i][id]] = text_area\n",
" return items, text_map\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"A constructor function for the duplicates and questions explorer widget. This builds a box containing duplicates and question tabs, each in turn containing boxes that contain the buttons and text areas."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"def duplicates_questions_widget(\n",
" duplicates, questions, layout=widgets.Layout(width=\"100%\")\n",
"):\n",
" \"\"\"Construct a duplicates and questions exploration widget.\"\"\"\n",
" # Construct the duplicates Tab of buttons and text areas.\n",
" duplicates_items, duplicates_map = buttons_and_texts(\n",
" duplicates,\n",
" duplicates_id,\n",
" duplicates_answerid,\n",
" duplicates_text,\n",
" duplicates_click,\n",
" n=duplicates.shape[0],\n",
" )\n",
" duplicates_tab = widgets.Tab(\n",
" [widgets.VBox(duplicates_items, layout=layout)],\n",
" layout=widgets.Layout(width=\"100%\", height=\"500px\", overflow_y=\"auto\"),\n",
" )\n",
" duplicates_tab.set_title(0, duplicates_title)\n",
" # Construct the questions Tab of buttons and text areas.\n",
" questions_items, questions_map = buttons_and_texts(\n",
" questions,\n",
" questions_id,\n",
" questions_answerid,\n",
" questions_text,\n",
" questions_click,\n",
" n=questions.shape[0],\n",
" )\n",
" questions_tab = widgets.Tab(\n",
" [widgets.VBox(questions_items, layout=layout)],\n",
" layout=widgets.Layout(width=\"100%\", height=\"500px\", overflow_y=\"auto\"),\n",
" )\n",
" questions_tab.set_title(0, questions_title)\n",
" # Put both tabs in an HBox.\n",
" duplicates_questions = widgets.HBox([duplicates_tab, questions_tab], layout=layout)\n",
" return duplicates_map, questions_map, duplicates_questions\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"A handler function for a question passage button press. If the passage's text window is open, it is collapsed. Otherwise, it is opened."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"def questions_click(button):\n",
" \"\"\"Respond to a click on a question button.\"\"\"\n",
" global questions_map\n",
" if button.open:\n",
" questions_map[button.description].rows = None\n",
" button.open = False\n",
" else:\n",
" questions_map[button.description].rows = 10\n",
" button.open = True\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"A handler function for a duplicate obligation button press. If the obligation is not selected, select it and update the questions tab with its top 15 question passages ordered by match score. Otherwise, if the duplicate's text window is open, it is collapsed, else it is opened."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"def duplicates_click(button):\n",
" \"\"\"Respond to a click on a duplicate button.\"\"\"\n",
" global duplicates_map\n",
" if select_duplicate(button):\n",
" duplicates_map[button.description].rows = 10\n",
" button.open = True\n",
" else:\n",
" if button.open:\n",
" duplicates_map[button.description].rows = None\n",
" button.open = False\n",
" else:\n",
" duplicates_map[button.description].rows = 10\n",
" button.open = True\n",
"\n",
"\n",
"def select_duplicate(button):\n",
" \"\"\"Update the displayed questions to correspond to the button's duplicate\n",
" selections. Returns whether or not the selected duplicate changed.\n",
" \"\"\"\n",
" global selected_button, questions_map, duplicates_questions\n",
" if \"selected_button\" not in globals() or button != selected_button:\n",
" if \"selected_button\" in globals():\n",
" selected_button.style.button_color = None\n",
" selected_button.style.font_weight = \"\"\n",
" selected_button = button\n",
" selected_button.style.button_color = \"yellow\"\n",
" selected_button.style.font_weight = \"bold\"\n",
" duplicates_text = duplicates_map[selected_button.description].value\n",
" questions_scores = score_text(duplicates_text)\n",
" ordered_questions = questions.loc[questions_scores[questions_id]]\n",
" questions_items, questions_map = buttons_and_texts(\n",
" ordered_questions,\n",
" questions_id,\n",
" questions_answerid,\n",
" questions_text,\n",
" questions_click,\n",
" n=questions_display,\n",
" )\n",
" if questions_button_color is True and selected_button.answerid is not None:\n",
" set_button_color(questions_items[::2], selected_button.answerid)\n",
" if questions_button_score is True:\n",
" questions_items = [\n",
" item\n",
" for button, text_area in zip(*[iter(questions_items)] * 2)\n",
" for item in (add_button_prob(button, questions_scores), text_area)\n",
" ]\n",
" duplicates_questions.children[1].children[0].children = questions_items\n",
" duplicates_questions.children[1].set_title(0, selected_button.description)\n",
" return True\n",
" else:\n",
" return False\n",
"\n",
"\n",
"def add_button_prob(button, questions_scores):\n",
" \"\"\"Return an HBox containing button and its probability.\"\"\"\n",
" id = button.description\n",
" prob = widgets.Label(\n",
" score_label\n",
" + \": \"\n",
" + str(\n",
" int(\n",
" math.ceil(score_scale * questions_scores.loc[id][questions_probability])\n",
" )\n",
" )\n",
" )\n",
" return widgets.HBox([button, prob])\n",
"\n",
"\n",
"def set_button_color(button, answerid):\n",
" \"\"\"Set each button's color according to its label.\"\"\"\n",
" for i in range(len(button)):\n",
" button[i].style.button_color = (\n",
" \"lightgreen\" if button[i].answerid == answerid else None\n",
" )\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Functions for interacting with the web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def score_text(text):\n",
" \"\"\"Return a data frame with the original question scores for the text.\"\"\"\n",
" headers = {\n",
" \"content-type\": \"application/json\",\n",
" \"Authorization\": (\"Bearer \" + api_key),\n",
" }\n",
" # jsontext = json.dumps({'input':'{0}'.format(text)})\n",
" jsontext = text_to_json(text)\n",
" result = requests.post(scoring_url, data=jsontext, headers=headers)\n",
" # scores = result.json()['result'][0]\n",
" scores = eval(result.json())\n",
" scores_df = pd.DataFrame(\n",
" scores, columns=[questions_id, questions_answerid, questions_probability]\n",
" )\n",
" scores_df[questions_id] = scores_df[questions_id].astype(str)\n",
" scores_df[questions_answerid] = scores_df[questions_answerid].astype(str)\n",
" scores_df = scores_df.set_index(questions_id, drop=False)\n",
" return scores_df"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"Control the appearance of cell output boxes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"%%html\n",
"<style>\n",
".output_wrapper, .output {\n",
" height:auto !important;\n",
" max-height:1000px; /* your desired max-height here */\n",
"}\n",
".output_scroll {\n",
" box-shadow:none !important;\n",
" webkit-box-shadow:none !important;\n",
"}\n",
"</style>"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"## Load data\n",
"\n",
"Load the pre-formatted text of questions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"questions_title = 'Questions'\n",
"questions_id = 'Id'\n",
"questions_answerid = 'AnswerId'\n",
"questions_text = 'Text'\n",
"questions_probability = 'Probability'\n",
"questions_path = './data_folder/questions.tsv'\n",
"questions = read_questions(questions_path, questions_id, questions_answerid)"
]
},
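  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check, we can score a sample text against the web service and look at the closest original questions. The question text here is a made-up example; any string will do."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score a hypothetical duplicate question and show the top five matches.\n",
    "sample_scores = score_text(\"How do I convert an array to a list?\")\n",
    "sample_scores.head()"
   ]
  },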
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"Load the pre-formatted text of duplicates."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"duplicates_title = 'Duplicates'\n",
"duplicates_id = 'Id'\n",
"duplicates_answerid = 'AnswerId'\n",
"duplicates_text = 'Text'\n",
"duplicates_path = './data_folder/dupes_test.tsv'\n",
"duplicates = read_questions(duplicates_path, duplicates_id, duplicates_answerid)"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": false
}
}
}
}
},
"source": [
"## Explore original questions matched up with duplicate questions\n",
"\n",
"Define other variables and settings used in creating the interface."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"questions_display = 15\n",
"questions_button_color = True\n",
"questions_button_score = True\n",
"score_label = 'Score'\n",
"score_scale = 100"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"This builds the exploration widget as a box containing duplicates and question tabs, each in turn containing boxes that have for each ID-text pair a button and a text area."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1.0,
"views": {
"grid_default": {},
"report_default": {
"hidden": false
}
}
}
}
},
"outputs": [],
"source": [
"duplicates_map, questions_map, duplicates_questions = duplicates_questions_widget(duplicates, questions)\n",
"duplicates_questions"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To tear down the cluster and related resources go to the [last notebook](08_TearDown.ipynb)."
]
}
],
"metadata": {
"extensions": {
"jupyter_dashboards": {
"activeView": "report_default",
"version": 1.0,
"views": {
"grid_default": {
"name": "grid",
"type": "grid"
},
"report_default": {
"name": "report",
"type": "report"
}
}
}
},
"kernelspec": {
"display_name": "Python [conda env:MLAKSDeployAML]",
"language": "python",
"name": "conda-env-MLAKSDeployAML-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -0,0 +1,235 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tear it all down\n",
"Use this notebook to clean up the web service, image, model and the AKS cluster created by the tutorial."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AksCompute\n",
"from azureml.core.image import Image\n",
"from azureml.core.model import Model\n",
"from azureml.core.webservice import AksWebservice\n",
"from azureml.core.workspace import Workspace\n",
"from dotenv import get_key, find_dotenv\n",
"from utilities import get_auth\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's get the workspace information."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config(auth=get_auth(env_path))\n",
"print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrieve the web service to delete."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service_name = get_key(env_path, 'aks_service_name')\n",
"aks_service = AksWebservice(ws, name=aks_service_name)\n",
"print(aks_service.name, aks_service.tags)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrieve the image to delete."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image_name = get_key(env_path, 'image_name')\n",
"image_version = int(get_key(env_path, 'image_version'))\n",
"image = Image(ws, name=image_name, version=image_version)\n",
"print(image.name, image.version)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrieve the model to delete."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_name = 'question_match_model'\n",
"model_version = int(get_key(env_path, 'model_version'))\n",
"model = Model(ws, name=model_name, version=model_version)\n",
"print(model.name, model.version)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrieve the AKS compute to delete."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_name = get_key(env_path, 'aks_name')\n",
"aks_target = AksCompute(ws, name=aks_name)\n",
"print(aks_target.name, aks_target.get_status())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Delete the service, image and model. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_service.delete()\n",
"image.delete()\n",
"model.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's delete the AKS compute from the workspace. Since we created the cluster through AML, the corresponding cloud based objects will also be deleted. If the custer was created externally and attached to the workspace, the below would raise an exception and nothing will be changed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_target.delete()"
]
},
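  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Had the cluster been attached externally rather than created by AML, you would detach it instead of deleting it; detaching removes it from the workspace but leaves the underlying resource intact. A minimal sketch, assuming the SDK's `detach` method, left commented out so it is not run for a cluster created by this tutorial:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only for an externally created cluster that was attached to the workspace:\n",
    "# aks_target.detach()"
   ]
  },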
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you also would like to delete the workspace and all experiments in it, you can use the following."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"ws.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, you can delete the resource group with the following."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"resource_group = get_key(env_path, 'resource_group')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az group delete --yes --name $resource_group"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:MLAKSDeployAML]",
"language": "python",
"name": "conda-env-MLAKSDeployAML-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}


@ -0,0 +1,73 @@
### Authors: Fidan Boylu Uz
### Acknowledgements: Mario Bourgoin, Mathew Salvaris
# Deploying Python models on a Kubernetes Cluster
To get started with the tutorial, please proceed with the following steps **in sequential order**.
* [Prerequisites](#prerequisites)
* [Steps](#steps)
* [Cleaning up](#cleanup)
<a id='prerequisites'></a>
## Prerequisites
1. Linux (Ubuntu).
2. [Anaconda Python](https://www.anaconda.com/download).
3. [Docker](https://docs.docker.com/v17.12/install/linux/docker-ee/ubuntu).
4. [Azure account](https://azure.microsoft.com).
The tutorial was developed on an [Azure Ubuntu
DSVM](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro),
which addresses the first three prerequisites.
<a id='steps'></a>
## Steps
To set up your environment to run these notebooks, please follow the steps below. They set up the notebooks to use Docker and Azure seamlessly.
1. Add your user to the docker group:
```
sudo usermod -aG docker $USER
newgrp docker
```
2. Log in to Docker with your username and password:
```
docker login
```
3. Create the Python MLAKSDeployAML virtual environment using the environment.yml:
```
conda env create -f environment.yml
```
4. Activate the virtual environment:
```
source activate MLAKSDeployAML
```
5. Log in to Azure:
```
az login
```
6. If you have more than one Azure subscription, select the one you want to use:
```
az account set --subscription <Your Azure Subscription>
```
7. Start the Jupyter notebook server in the virtual environment:
```
jupyter notebook
```
8. After following the setup instructions above, run the Jupyter notebooks in order starting with the first notebook.
<a id='cleanup'></a>
## Cleaning up
To remove the conda environment that was created, see [here](https://conda.io/docs/commands/env/conda-env-remove.html). The last Jupyter notebook also gives details on deleting the Azure resources associated with this repository.
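For instance, assuming the environment name used in the setup steps above, the environment can be removed with:
```
conda env remove --name MLAKSDeployAML
```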
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.microsoft.com.
When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repositories using our CLA.
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.


@ -0,0 +1,12 @@
# Fill in the fields below and rename to .dev_env
# TENANT_ID, SP_USERNAME, and SP_PASSWORD are optional. If they are not supplied, the Azure CLI will default to an interactive login.
TENANT_ID=
SP_USERNAME=
SP_PASSWORD=
SUBSCRIPTION_ID=
RESOURCE_GROUP="deployrg"
WORKSPACE_NAME="workspace"
WORKSPACE_REGION="eastus"
IMAGE_NAME="deployimg"
AKS_NAME="deployaks"
AKS_SERVICE_NAME="deployservice"


@ -0,0 +1,47 @@
# Copyright (C) Microsoft Corporation. All rights reserved.
# 23456789012345678901234567890123456789012345678901234567890123456789012345678
import pandas as pd
from sklearn.externals import joblib


class DuplicateModel(object):
    questions_cols = ['Id', 'AnswerId', 'Text']
    dup_col = 'Text_x'
    id_col = 'Id_y'
    answerId_col = 'AnswerId_y'
    orig_col = 'Text_y'
    feature_cols = [dup_col, orig_col]
    probabilities_col = 'probabilities'

    def __init__(self, model_path, questions_path):
        self.model_path = model_path
        self.questions_path = questions_path
        self.model = joblib.load(model_path)
        self.questions = pd.read_csv(questions_path, sep='\t',
                                     encoding='latin1')
        self.questions = self.questions[self.questions_cols]
        self.questions.columns = [
            self.id_col, self.answerId_col, self.orig_col]

    def score(self, Text):
        # Create a scoring dataframe.
        test = self.questions.copy()
        test[self.dup_col] = Text
        test_X = test[self.feature_cols]
        # Score the text.
        test[self.probabilities_col] = self.model.predict_proba(
            test_X)[:, 1]
        # Order the data by descending probability.
        test.sort_values(by=self.probabilities_col, ascending=False,
                         inplace=True)
        # Extract the original question ids, answer ids, and probabilities.
        scores = test[[self.id_col, self.answerId_col, self.probabilities_col]]
        pairs = [x[1:] for x in scores.itertuples()]
        # Return the result.
        return pairs


@ -0,0 +1,22 @@
name: MLAKSDeployAML
channels:
- conda-forge
dependencies:
- python=3.6.2
- tornado<6
- nb_conda_kernels==2.1.0
- ipywidgets==7.3.0
- pandas>=0.23.3
- scikit-learn>=0.19.1
- pip:
- urllib3==1.23
- aiohttp>=3.3.2
- nest_asyncio>=0.9.10
- toolz>=0.9.0
- tqdm>=4.23.4
- azure-cli>=2.0.50
- prompt_toolkit>=2.0.7
- lightgbm>=2.1.2
- papermill>=0.14.1
- python-dotenv==0.10.1
- azureml-sdk[notebooks]==1.0.33

File diff suppressed because one or more lines are too long

Some files were not shown because too many files changed in this diff.