Updating to remove cookiecutter, remove IoT Edge, and simplify folders. Also adding pytests.
This commit is contained in:
Daniel Ciborowski 2020-02-25 00:38:04 -05:00 committed by GitHub
Parent a36b122c3f
Commit b22f5ac4ec
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
26 changed files with 5264 additions and 190 deletions

View file

@ -3,7 +3,7 @@
# A GitHub Service Connection must also be created with the name "AIArchitecturesAndPractices-GitHub"
# https://docs.microsoft.com/en-us/azure/devops/pipelines/process/demands?view=azure-devops&tabs=yaml
#
# An Agent_Name Variable must be created in the Azure DevOps UI.
# An Agent_Name Variable must be created in the Azure DevOps UI.
# https://docs.microsoft.com/en-us/azure/devops/pipelines/process/variables?view=azure-devops&tabs=yaml%2Cbatch#secret-variables
#
# This must point to an Agent Pool with a self-hosted Linux VM that has Docker installed.
@ -32,6 +32,10 @@ stages:
- template: .ci/stages/deploy_notebooks_stages_v2.yml@aitemplates
parameters:
Agent: $(Agent_Name)
jobDisplayName: MLAKSDeployAMLJob
jobDisplayName: az-ml-realtime-score
DefaultWorkingDirectory: $(System.DefaultWorkingDirectory)
workload_vars: ../vars/ml_realtime_scoring.yml
workload_vars: ../vars/az-ml-realtime-score.yml
flighting_release: false
flighting_preview: false
flighting_master: false
post_cleanup: false

View file

@ -1,155 +1,50 @@
# AI Architecture Template TODO: update title
#
# A GitHub Service Connection must also be created with the name "AIArchitecturesAndPractices-GitHub"
# https://docs.microsoft.com/en-us/azure/devops/pipelines/process/demands?view=azure-devops&tabs=yaml
#
# An Agent_Name Variable must be created in the Azure DevOps UI.
# https://docs.microsoft.com/en-us/azure/devops/pipelines/process/variables?view=azure-devops&tabs=yaml%2Cbatch#secret-variables
#
# This must point to an Agent Pool with a self-hosted Linux VM that has Docker installed.
# https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/v2-linux?view=azure-devops
resources:
repositories:
- repository: aitemplates
type: github
name: microsoft/AI
endpoint: AIArchitecturesAndPractices-GitHub
schedules:
- cron: "*/10 * * * *"
displayName: Daily midnight build
always: true
branches:
include:
- master
# MLAKSDeploy Pipeline
trigger:
batch: true
branches:
include:
- master
- staging
variables:
- group: AzureKeyVault
jobs:
- job: MLAKSDeployAMLJob
timeoutInMinutes: 300
cancelTimeoutInMinutes: 2
pool:
vmImage: 'Ubuntu-16.04'
strategy:
maxParallel: 3
matrix: {"eastus": {"azureregion": "eastus", "azureresourcegroup" : "mlaksdplyamleastus"},"southcentralus": {"azureregion": "southcentralus", "azureresourcegroup" : "mlaksdplyamlsouthctrl" },"westus2": {"azureregion": "westus2", "azureresourcegroup" : "mlaksdplyamlwestus"}}
steps:
- bash: |
source /usr/share/miniconda/etc/profile.d/conda.sh
which conda
conda env create -f {{cookiecutter.project_name}}/environment.yml
conda env list
conda activate MLAKSDeployAML
conda env list
echo Login Azure Account
az login -t $(sptenent) --service-principal -u $(spidentity) --password $(spsecret)
cd {{cookiecutter.project_name}}
echo Execute 00_AMLConfiguration.ipynb
papermill 00_AMLConfiguration.ipynb 00_AMLConfiguration_Output.ipynb \
--log-output \
--no-progress-bar \
-k python3 \
-p subscription_id $(azuresubscription) \
-p resource_group $(azureresourcegroup) \
-p workspace_name $(workspacename) \
-p workspace_region $(azureregion) \
-p image_name $(aksimagename)
displayName: '00_AML_Configuration.ipynb'
- template: steps/papermill.yml
parameters:
notebook: 01_DataPrep.ipynb
location: "{{cookiecutter.project_name}}"
- bash: |
mkdir -p {{cookiecutter.project_name}}/iotedge/data_folder
mkdir -p {{cookiecutter.project_name}}/aks/data_folder
cd {{cookiecutter.project_name}}
cp data_folder/*.tsv iotedge/data_folder
cp data_folder/*.tsv aks/data_folder
displayName: 'Copying data'
- template: steps/papermill.yml
parameters:
notebook: 02_TrainOnLocal.ipynb
location: "{{cookiecutter.project_name}}"
- template: steps/papermill.yml
parameters:
notebook: 03_DevelopScoringScript.ipynb
location: "{{cookiecutter.project_name}}"
- template: steps/papermill.yml
parameters:
notebook: 04_CreateImage.ipynb
location: "{{cookiecutter.project_name}}"
- bash: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate MLAKSDeployAML
echo Execute 05_DeployOnAKS.ipynb
export PYTHONPATH=$(pwd)/{{cookiecutter.project_name}}:${PYTHONPATH}
cd {{cookiecutter.project_name}}/aks
papermill 05_DeployOnAKS.ipynb test.ipynb \
--log-output \
--no-progress-bar \
-k python3 \
-p aks_name $(aksname) \
-p aks_location $(azureregion) \
-p aks_service_name $(aksvcname)
displayName: '05_DeployOnAKS.ipynb'
- template: steps/papermill.yml
parameters:
notebook: 06_SpeedTestWebApp.ipynb
location: "{{cookiecutter.project_name}}/aks"
- template: steps/papermill.yml
parameters:
notebook: 07_RealTimeScoring.ipynb
location: "{{cookiecutter.project_name}}/aks"
# - bash: |
# source /usr/share/miniconda/etc/profile.d/conda.sh
# conda activate MLAKSDeployAML
# export PYTHONPATH=$(pwd)/{{cookiecutter.project_name}}:${PYTHONPATH}
# cd {{cookiecutter.project_name}}/iotedge
# echo Execute 05_DeployOnIOTedge.ipynb
# papermill 05_DeployOnIOTedge.ipynb test.ipynb \
# --log-output \
# --no-progress-bar \
# -k python3 \
# -p iot_hub_name fstlstnameiothub \
# -p device_id mydevice \
# -p module_name mymodule
# displayName: '05_DeployOnIOTedge.ipynb'
- template: steps/papermill.yml
parameters:
notebook: 08_TearDown.ipynb
location: "{{cookiecutter.project_name}}/aks"
# - template: steps/papermill.yml
# parameters:
# notebook: 06_TearDown.ipynb
# location: "{{cookiecutter.project_name}}/iotedge"
- bash: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate MLAKSDeployAML
echo Execute Resource Group Delete
existResponse=$(az group exists -n $(azureresourcegroup))
if [ "$existResponse" == "true" ]; then
echo Deleting project resource group
az group delete --name $(azureresourcegroup) --yes
else
echo Project resource group did not exist
fi
echo Done Cleanup
displayName: 'Backup Cleanup'
condition: or(canceled(),failed())
- task: CreateWorkItem@1
inputs:
workItemType: 'Issue'
title: $(System.TeamProject) - Build $(Build.BuildNumber) Failed
assignedTo: 'Fidan <fboylu@microsoft.com>'
associate: true
teamProject: $(System.TeamProject)
fieldMappings: |
Description=Branch $(Build.SourceBranch) failed to build. Go to Boards>WorkItems and tag the failure type.
displayName: 'Create work item on failure'
condition: failed()
pr:
autoCancel: true
branches:
include:
- master
stages:
- template: .ci/stages/deploy_notebooks_stages_v2.yml@aitemplates
parameters:
Agent: $(Agent_Name)
jobDisplayName: ai-architecture-template #TODO: Update with project name
DefaultWorkingDirectory: $(System.DefaultWorkingDirectory)
workload_vars: ../vars/ai-architecture-template.yml #TODO: Update with project name
flighting_release: false
flighting_preview: false
flighting_master: false

View file

@ -0,0 +1,64 @@
# AI Architecture Template TODO: update title
#
# A GitHub Service Connection must also be created with the name "AIArchitecturesAndPractices-GitHub"
# https://docs.microsoft.com/en-us/azure/devops/pipelines/process/demands?view=azure-devops&tabs=yaml
#
# An Agent_Name Variable must be created in the Azure DevOps UI.
# https://docs.microsoft.com/en-us/azure/devops/pipelines/process/variables?view=azure-devops&tabs=yaml%2Cbatch#secret-variables
#
# This must point to an Agent Pool with a self-hosted Linux VM that has Docker installed.
# https://docs.microsoft.com/en-us/azure/devops/pipelines/agents/v2-linux?view=azure-devops
parameters:
azureSubscription: ''
azure_subscription: ''
location: ''
azureresourcegroup: ''
workspacename: ''
azureregion: westus2
aksimagename: ''
aks_name: ''
aks_service_name: myimage
conda: ''
doCleanup: true
python_path: ''
flighting_release: false
flighting_preview: false
flighting_master: false
steps:
- template: config_conda.yml
parameters:
conda_location: .
azureSubscription: ${{parameters.azureSubscription}}
conda: ai-architecture-template
flighting_release: ${{parameters.flighting_release}}
flighting_preview: ${{parameters.flighting_preview}}
flighting_master: ${{parameters.flighting_master}}
- template: azpapermill.yml
parameters:
notebook: 00_AMLConfiguration.ipynb
location: ${{parameters.location}}
azureSubscription: ${{parameters.azureSubscription}}
conda: ai-architecture-template
azure_subscription: ${{parameters.azure_subscription}}
azureresourcegroup: ${{parameters.azureresourcegroup}}
workspacename: "aiarchtemplate"
azureregion: ${{parameters.azureregion}}
aksimagename: ${{parameters.aksimagename}}
# Insert more notebook steps here
- template: pytest_steps.yml
parameters:
location: ${{parameters.location}}
azureSubscription: ${{parameters.azureSubscription}}
conda: ai-architecture-template
- template: cleanuptask.yml
parameters:
azureSubscription: ${{parameters.azureSubscription}}
conda: ${{parameters.conda}}
azureresourcegroup: ${{parameters.azureresourcegroup}}
doCleanup: ${{parameters.doCleanup}}
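The pytest_steps.yml template above is what wires this commit's new pytests into the pipeline. As a hedged illustration only (the test name, notebook path, and kernel below are assumptions, not taken from this repo), such a test can execute a notebook end-to-end with papermill so that any failing cell fails the pytest run:

```python
# Hypothetical sketch of a papermill-driven notebook test; the notebook
# path, output name, and kernel are assumptions based on this commit.
import papermill as pm


def test_00_aml_configuration(tmp_path):
    # papermill raises PapermillExecutionError if any cell fails,
    # which pytest reports as a test failure.
    pm.execute_notebook(
        "notebooks/00_AMLConfiguration.ipynb",
        str(tmp_path / "00_AMLConfiguration_output.ipynb"),
        kernel_name="python3",
    )
```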

View file

@ -0,0 +1,6 @@
variables:
TridentWorkloadTypeShort: aiarchtemp # TODO: update with project short name
DeployLocation: westus
ProjectLocation: "notebooks/"
PythonPath: "."
Template: steps/ai-architecture-template.yml # TODO: update file name to project name

155
.gitignore vendored
View file

@ -1,13 +1,11 @@
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Environments
.env
# Jupyter Notebook
.ipynb_checkpoints
# Project Configuration Files
workspace_conf.yml
*.output_ipynb
.azureml
pylint-results.xml
.idea
score.py
#AML
aml_config/
@ -19,14 +17,137 @@ scripts/.amlignore
__pycache__/
scripts/__pycache__/
# Products
*.tsv
*.txt
*.pkl
datafolder/
lgbmenv.yml
score.py
# Environments
.env
.idea
# Jupyter Notebook
.ipynb_checkpoints
# Byte-compiled / optimized / DLL files
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/

584
.pylintrc Normal file
View file

@ -0,0 +1,584 @@
[MASTER]
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loading into the active Python interpreter and may
# run arbitrary code.
extension-pkg-whitelist=
# Add files or directories to the blacklist. They should be base names, not
# paths.
ignore=CVS
# Add files or directories matching the regex patterns to the blacklist. The
# regex matches against base names, not paths.
ignore-patterns=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
# number of processors available to use.
jobs=1
# Control the amount of potential inferred values when inferring a single
# object. This can help the performance when dealing with large functions or
# complex, nested conditions.
limit-inference-results=100
# List of plugins (as comma separated values of python module names) to load,
# usually to register additional checkers.
load-plugins=pylint_junit
# Pickle collected data for later comparisons.
persistent=yes
# Specify a configuration file.
#rcfile=
# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
confidence=
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=missing-module-docstring,
trailing-whitespace,
fixme,
print-statement,
parameter-unpacking,
unpacking-in-except,
old-raise-syntax,
backtick,
long-suffix,
old-ne-operator,
old-octal-literal,
import-star-module-level,
non-ascii-bytes-literal,
raw-checker-failed,
bad-inline-option,
locally-disabled,
file-ignored,
suppressed-message,
useless-suppression,
deprecated-pragma,
use-symbolic-message-instead,
apply-builtin,
basestring-builtin,
buffer-builtin,
cmp-builtin,
coerce-builtin,
execfile-builtin,
file-builtin,
long-builtin,
raw_input-builtin,
reduce-builtin,
standarderror-builtin,
unicode-builtin,
xrange-builtin,
coerce-method,
delslice-method,
getslice-method,
setslice-method,
no-absolute-import,
old-division,
dict-iter-method,
dict-view-method,
next-method-called,
metaclass-assignment,
indexing-exception,
raising-string,
reload-builtin,
oct-method,
hex-method,
nonzero-method,
cmp-method,
input-builtin,
round-builtin,
intern-builtin,
unichr-builtin,
map-builtin-not-iterating,
zip-builtin-not-iterating,
range-builtin-not-iterating,
filter-builtin-not-iterating,
using-cmp-argument,
eq-without-hash,
div-method,
idiv-method,
rdiv-method,
exception-message-attribute,
invalid-str-codec,
sys-max-int,
bad-python3-import,
deprecated-string-function,
deprecated-str-translate-call,
deprecated-itertools-function,
deprecated-types-field,
next-method-defined,
dict-items-not-iterating,
dict-keys-not-iterating,
dict-values-not-iterating,
deprecated-operator-function,
deprecated-urllib-function,
xreadlines-attribute,
deprecated-sys-function,
exception-escape,
comprehension-escape
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=c-extension-no-member
[REPORTS]
# Python expression which should return a score less than or equal to 10. You
# have access to the variables 'error', 'warning', 'refactor', and 'convention'
# which contain the number of messages in each category, as well as 'statement'
# which is the total number of statements analyzed. This score is used by the
# global evaluation report (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details.
#msg-template=
# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (visual studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
output-format=text
# Tells whether to display a full report or only the messages.
reports=no
# Activate the evaluation score.
score=yes
[REFACTORING]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
# Complete name of functions that never returns. When checking for
# inconsistent-return-statements if a never returning function is called then
# it will be considered as an explicit return statement and no message will be
# printed.
never-returning-functions=sys.exit
[BASIC]
# Naming style matching correct argument names.
argument-naming-style=snake_case
# Regular expression matching correct argument names. Overrides argument-
# naming-style.
#argument-rgx=
# Naming style matching correct attribute names.
attr-naming-style=snake_case
# Regular expression matching correct attribute names. Overrides attr-naming-
# style.
#attr-rgx=
# Bad variable names which should always be refused, separated by a comma.
bad-names=foo,
bar,
baz,
toto,
tutu,
tata
# Naming style matching correct class attribute names.
class-attribute-naming-style=any
# Regular expression matching correct class attribute names. Overrides class-
# attribute-naming-style.
#class-attribute-rgx=
# Naming style matching correct class names.
class-naming-style=PascalCase
# Regular expression matching correct class names. Overrides class-naming-
# style.
#class-rgx=
# Naming style matching correct constant names.
const-naming-style=snake_case
# Regular expression matching correct constant names. Overrides const-naming-
# style.
#const-rgx=
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
# Naming style matching correct function names.
function-naming-style=snake_case
# Regular expression matching correct function names. Overrides function-
# naming-style.
#function-rgx=
# Good variable names which should always be accepted, separated by a comma.
good-names=i,
j,
k,
ex,
Run,
_
# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no
# Naming style matching correct inline iteration names.
inlinevar-naming-style=any
# Regular expression matching correct inline iteration names. Overrides
# inlinevar-naming-style.
#inlinevar-rgx=
# Naming style matching correct method names.
method-naming-style=snake_case
# Regular expression matching correct method names. Overrides method-naming-
# style.
#method-rgx=
# Naming style matching correct module names.
module-naming-style=any
# Regular expression matching correct module names. Overrides module-naming-
# style.
#module-rgx=
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
# These decorators are taken in consideration only for invalid-name.
property-classes=abc.abstractproperty
# Naming style matching correct variable names.
variable-naming-style=snake_case
# Regular expression matching correct variable names. Overrides variable-
# naming-style.
#variable-rgx=
[FORMAT]
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
# tab).
indent-string=' '
# Maximum number of characters on a single line.
max-line-length=120
# Maximum number of lines in a module.
max-module-lines=1000
# List of optional constructs for which whitespace checking is disabled. `dict-
# separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
# `trailing-comma` allows a space between comma and closing bracket: (a, ).
# `empty-line` allows space-only lines.
no-space-check=trailing-comma,
dict-separator
# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
[LOGGING]
# Format style used to check logging format string. `old` means using %
# formatting, `new` is for `{}` formatting,and `fstr` is for f-strings.
logging-format-style=old
# Logging modules to check that the string format arguments are in logging
# function parameter format.
logging-modules=logging
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,
XXX,
TODO
[SIMILARITIES]
# Ignore comments when computing similarities.
ignore-comments=yes
# Ignore docstrings when computing similarities.
ignore-docstrings=yes
# Ignore imports when computing similarities.
ignore-imports=no
# Minimum lines number of a similarity.
min-similarity-lines=4
[SPELLING]
# Limits count of emitted suggestions for spelling mistakes.
max-spelling-suggestions=4
# Spelling dictionary name. Available dictionaries: none. To make it work,
# install the python-enchant package.
spelling-dict=
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains the private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to the private dictionary (see the
# --spelling-private-dict-file option) instead of raising a message.
spelling-store-unknown-words=no
[STRING]
# This flag controls whether the implicit-str-concat-in-sequence should
# generate a warning on implicit string concatenation in sequences defined over
# several lines.
check-str-concat-over-line-jumps=no
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes
# Tells whether to warn about missing members when the owner of the attribute
# is inferred to be None.
ignore-none=yes
# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis). It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# Show a hint with possible names when a member name was not found. The aspect
# of finding the hint is based on edit distance.
missing-member-hint=yes
# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1
# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1
# List of decorators that change the signature of a decorated function.
signature-mutators=
[VARIABLES]
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=
# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,
_cb
# A regular expression matching the name of dummy variables (i.e. expected to
# not be used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
# Argument names that match this expression will be ignored. Default to name
# with leading underscore.
ignored-argument-names=_.*|^ignored_|^unused_
# Tells whether we should check for unused import in __init__ files.
init-import=no
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
[CLASSES]
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
__new__,
setUp,
__post_init__
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
_fields,
_replace,
_source,
_make
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=cls
[DESIGN]
# Maximum number of arguments for function / method.
max-args=5
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5
# Maximum number of branch for function / method body.
max-branches=12
# Maximum number of locals for function / method body.
max-locals=15
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of return / yield for function / method body.
max-returns=6
# Maximum number of statements in function / method body.
max-statements=50
# Minimum number of public methods for a class (see R0903).
min-public-methods=2
[IMPORTS]
# List of modules that can be imported at any level, not just the top level
# one.
allow-any-import-level=
# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
# Deprecated modules which should not be used, separated by a comma.
deprecated-modules=optparse,tkinter.tix
# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled).
ext-import-graph=
# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled).
import-graph=
# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled).
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Couples of modules and preferred modules, separated by a comma.
preferred-modules=
[EXCEPTIONS]
# Exceptions that will emit a warning when being caught. Defaults to
# "BaseException, Exception".
overgeneral-exceptions=BaseException,
Exception
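The `load-plugins=pylint_junit` setting above registers the JUnit reporter, which is how the `pylint-results.xml` file excluded by .gitignore gets produced. A minimal sketch of invoking it; the `notebooks` target and output file name are assumptions, not taken from this commit:

```python
# Hedged sketch: run pylint with this rcfile and the pylint_junit
# reporter it loads. Target path and output name are assumptions.
import subprocess

with open("pylint-results.xml", "w") as results:
    # pylint exits non-zero when it finds issues, so no check=True here.
    subprocess.run(
        [
            "pylint",
            "--rcfile=.pylintrc",
            "--output-format=pylint_junit.JUnitReporter",
            "notebooks",
        ],
        stdout=results,
    )
```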

9
CODE_OF_CONDUCT.md Normal file
View file

@ -0,0 +1,9 @@
# Microsoft Open Source Code of Conduct
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
Resources:
- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns

View file

@ -6,9 +6,6 @@
In this repository there are a number of tutorials in Jupyter notebooks with step-by-step instructions on (1) how to train a machine learning model using Python; (2) how to deploy a trained machine learning model through Azure Machine Learning (AzureML). The tutorials cover how to deploy models on the following deployment targets:
- [Azure Kubernetes Service (AKS) Cluster](./{{cookiecutter.project_name}}/aks)
- [Azure IoT Edge](./{{cookiecutter.project_name}}/iotedge)
## Overview
This scenario shows how to deploy a Frequently Asked Questions (FAQ) matching model as a web service to provide predictions for user questions. For this scenario, “Input Data” in the [architecture diagram](https://docs.microsoft.com/en-us/azure/architecture/reference-architectures/ai/realtime-scoring-python) refers to text strings containing the user questions to match with a list of FAQs. The scenario is designed for the Scikit-Learn machine learning library for Python but can be generalized to any scenario that uses Python models to make real-time predictions.
@ -25,9 +22,10 @@ An example app that consumes the results is included with the scenario.
## Prerequisites
1. Linux (Ubuntu).
2. [Anaconda Python](https://www.anaconda.com/download)
3. [Docker](https://docs.docker.com/v17.12/install/linux/docker-ee/ubuntu) installed.
4. [Azure account](https://azure.microsoft.com).
1. [Anaconda Python](https://www.anaconda.com/download)
1. [Docker](https://docs.docker.com/v17.12/install/linux/docker-ee/ubuntu) installed.
1. [Azure account](https://azure.microsoft.com).
---
**NOTE**
@ -44,27 +42,46 @@ DSVM](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtu
which addresses the first three prerequisites.
## Setup
To set up your environment to run these notebooks, please follow these steps. They set up the notebooks to use Docker and Azure seamlessly.
1. Create an _Ubuntu_ _Linux_ DSVM and perform the following steps.
2. Install [cookiecutter](https://cookiecutter.readthedocs.io/en/latest/installation.html), a tool that creates projects from project templates.
```bash
pip install cookiecutter
```
To set up your environment to run these notebooks, please follow these steps. They set up the notebooks to use Azure seamlessly.
3. Use cookiecutter to clone this repository. Cookiecutter will prompt a series of questions where you will choose a specific framework, select your deployment settings, and obtain an Azure ML workspace.
```bash
cookiecutter https://github.com/Microsoft/MLAKSDeployAML.git
1. Create a _Linux_ _Ubuntu_ VM.
1. Log in to your VM. We recommend that you use a graphical client
such as
[X2Go](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro#x2go)
to access your VM. The remaining steps are to be done on the VM.
1. Open a terminal emulator.
1. Clone, fork, or download the zip file for this repository:
```
git clone https://github.com/Microsoft/az-ml-realtime-score.git
```
1. Enter the local repository:
```
cd az-ml-realtime-score
```
1. Copy `sample_workspace_conf.yml` to a new file, `workspace_conf.yml`, and fill in each field. This keeps secrets out of the source code, and git will ignore the file; see the sketch after these steps for the expected keys.
1. Create the az-ml-realtime-score conda environment from environment.yml:
```
conda env create -f environment.yml
```
1. Activate the virtual environment:
```
source activate az-ml-realtime-score
```
The remaining steps should be done in this virtual environment.
1. Log in to Azure:
```
az login
```
You can verify that you are logged in to your subscription by executing
the command:
```
az account show -o table
```
1. Start the Jupyter notebook server:
```
jupyter notebook
```
You will be asked to choose or enter information such as *project name*, *subscription id*, *resource group*, etc. interactively. You can press *Enter* to accept the default value or enter a value of your choice. For example, if you want to learn how to deploy a machine learning model on an AKS cluster, choose the value "aks" for the variable *deployment_type*. If instead you want to learn about deploying a machine learning model on IoT Edge, select "iotedge" for the variable *deployment_type*.
Provide a valid value for "subscription_id"; otherwise a `subscription id is missing` error will be generated **after** all the questions are asked, and you will have to perform Step 3 all over again. The full list of questions can be found in the [cookiecutter.json](./cookiecutter.json) file.
Please make sure all entered information is correct, as it is used to customize the content of your repo.
4. On your local machine, you should now have a repo with the *project_name* you specified. Find the README.md file in this repo and proceed with the instructions specified in it.
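As a hedged sketch of the `workspace_conf.yml` step above: the notebooks load this file with `load_configuration`, and the keys below are inferred from the parameter cell in 00_AMLConfiguration.ipynb. PyYAML availability is assumed via the environment's dependencies; the check itself is illustrative, not part of the repo:

```python
# Hedged sketch: verify that workspace_conf.yml carries the keys that
# 00_AMLConfiguration.ipynb reads; assumes PyYAML is importable.
import yaml

with open("workspace_conf.yml") as f:
    cfg = yaml.safe_load(f)

for key in ("subscription_id", "resource_group",
            "workspace_name", "workspace_region"):
    assert key in cfg, f"workspace_conf.yml is missing '{key}'"
```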
# Contributing
This project welcomes contributions and suggestions. Most contributions require you to agree to a

41
SECURITY.md Normal file
View file

@ -0,0 +1,41 @@
<!-- BEGIN MICROSOFT SECURITY.MD V0.0.3 BLOCK -->
## Security
Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
## Reporting Security Issues
**Please do not report security vulnerabilities through public GitHub issues.**
Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
* Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
* Full paths of source file(s) related to the manifestation of the issue
* The location of the affected source code (tag/branch/commit or direct URL)
* Any special configuration required to reproduce the issue
* Step-by-step instructions to reproduce the issue
* Proof-of-concept or exploit code (if possible)
* Impact of the issue, including how an attacker might exploit the issue
This information will help us triage your report more quickly.
If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.
## Preferred Languages
We prefer all communications to be in English.
## Policy
Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
<!-- END MICROSOFT SECURITY.MD BLOCK -->

21
environment.yml Normal file
View file

@ -0,0 +1,21 @@
name: az-ml-realtime-score
channels:
- conda-forge
dependencies:
- python=3.6.2
- pip
- jupyter
- pytest
- pytest-cov
- pylint
- pandas
- pip:
- papermill
- azureml-core==1.0.85.2
- pylint-junit
- pytest-nunit
- nbconvert
- junit-xml
- nbformat
- Microsoft-AI-Azure-Utility-Samples
- python-dotenv
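The notebooks in this commit name an `az-ml-realtime-score` kernelspec. One hedged way to register the activated environment under that name, assuming `ipykernel` is present (it is installed with the `jupyter` dependency above):

```python
# Hedged sketch: register the active conda environment as the Jupyter
# kernel the notebooks reference; assumes ipykernel is installed.
from ipykernel.kernelspec import install as install_kernel

install_kernel(
    user=True,
    kernel_name="az-ml-realtime-score",
    display_name="az-ml-realtime-score",
)
```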

View file

@ -0,0 +1,199 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ai-architecture-template - 00_AMLConfiguration.ipynb\n",
"TODO: Update with new repo name\n",
"\n",
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License.\n",
"\n",
"# Installation and configuration\n",
"This notebook configures the notebooks in this tutorial to connect to an Azure Machine Learning (AML) Workspace. \n",
"You can use an existing workspace or create a new one.\n",
"\n",
"## Prerequisites\n",
"\n",
"If you have already completed the prerequisites and selected the correct Kernel for this notebook, the AML Python SDK \n",
"is already installed. Let's load the imports and check the AML SDK version."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import json\n",
"\n",
"import azureml.core\n",
"from azure_utils.machine_learning.utils import load_configuration, get_or_create_workspace\n",
"\n",
"print(\"AML SDK Version:\", azureml.core.VERSION)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set up your Azure Machine Learning workspace\n",
"## Load Configurations from file\n",
"\n",
"Configurations are loaded from a file, to prevent accident commits of Azure secerts into source control.\n",
"This file name is included in the .gitignore to also prevent accident commits. A template file is included that should\n",
"be copied, and each parameter filled in."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"cfg = load_configuration(\"../workspace_conf.yml\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Load Configurations into Notebook.\n",
"\n",
"The following cell loads the configurations from the local file, into the notebook memory. The following cell is also\n",
"marked as a parameter cell. When using this notebook with [papermill](https://github.com/nteract/papermill), these\n",
"parameters can be override. See the tests for examples."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
},
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"subscription_id = cfg['subscription_id']\n",
"resource_group = cfg['resource_group']\n",
"workspace_name = cfg['workspace_name']\n",
"workspace_region = cfg['workspace_region']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create the workspace\n",
"This cell will create an AML workspace for you in a subscription, provided you have the correct permissions.\n",
"\n",
"This will fail when:\n",
"1. You do not have permission to create a workspace in the resource group\n",
"1. You do not have permission to create a resource group if it's non-existing.\n",
"1. You are not a subscription owner or contributor and no Azure ML workspaces have ever been created in this \n",
"subscription\n",
"\n",
"If workspace creation fails, please work with your IT admin to provide you with the appropriate permissions or to \n",
"provision the required resources. If this cell succeeds, you're done configuring AML!\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"ws = get_or_create_workspace(workspace_name, subscription_id, resource_group, workspace_region)\n",
"ws_json = ws.get_details()"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"Let's check the details of the workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"print(json.dumps(ws_json, indent=2))"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"You are now ready to move on to the [AutoML Local](01_DataPrep.ipynb) notebook."
]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "az-ml-realtime-score",
"language": "python",
"name": "az-ml-realtime-score"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
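The cell tagged `parameters` in this notebook is what papermill overrides when the pipeline templates run it. A hedged sketch of that invocation with placeholder values (the real values come from workspace_conf.yml or pipeline variables):

```python
# Hedged sketch: papermill injects these values after the tagged
# parameter cell; every value below is a placeholder.
import papermill as pm

pm.execute_notebook(
    "notebooks/00_AMLConfiguration.ipynb",
    "notebooks/00_AMLConfiguration_Output.ipynb",
    kernel_name="python3",
    parameters={
        "subscription_id": "<subscription-id>",
        "resource_group": "<resource-group>",
        "workspace_name": "<workspace-name>",
        "workspace_region": "westus2",
    },
)
```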

733
notebooks/01_DataPrep.ipynb Normal file
View file

@ -0,0 +1,733 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Preparation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook, we use a subset of [Stack Exchange network](https://archive.org/details/stackexchange) question data \n",
"which includes original questions tagged as 'JavaScript', their duplicate questions and their answers. Here, we \n",
"provide the steps to prepare the data to use in model development for training a model that will match a new \n",
"question with an existing original question. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import pandas as pd\n",
"from azure_utils.utilities import read_csv_gz, clean_text, round_sample_strat, random_merge\n",
"from notebooks import DIRECTORY"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below, we define some parameters that will be used in the data cleaning as well as train and test set preparation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The size of the test set\n",
"test_size = 0.21\n",
"# The minimum length of clean text\n",
"min_text = 150\n",
"# The minimum number of duplicates per question\n",
"min_dupes = 12\n",
"# The maximum number of duplicate matches\n",
"match = 20\n",
"# The output files path\n",
"outputs_path = DIRECTORY + \"/data_folder\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data cleaning"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we download the questions, duplicate questions and answers and load the datasets into pandas dataframes using \n",
"the helper functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# URLs to original questions, duplicate questions, and answers.\n",
"data_url = \"https://bostondata.blob.core.windows.net/stackoverflow/{}\"\n",
"questions_url = data_url.format(\"orig-q.tsv.gz\")\n",
"dupes_url = data_url.format(\"dup-q.tsv.gz\")\n",
"answers_url = data_url.format(\"ans.tsv.gz\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load datasets.\n",
"questions = read_csv_gz(questions_url, names=('Id', 'AnswerId', 'Text0', 'CreationDate'))\n",
"dupes = read_csv_gz(dupes_url, names=('Id', 'AnswerId', 'Text0', 'CreationDate'))\n",
"answers = read_csv_gz(answers_url, names=('Id', 'Text0'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's now check the dataframes. Notice that questions and duplicates have \"AnswerID\" column that would help match \n",
"ith the index of answers dataframe."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"questions.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dupes.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"answers.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check the first original question's text."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(questions.iloc[0, 1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's now check the duplicates for that question."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(dupes[dupes.AnswerId == questions.iloc[0, 0]])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is the answer to the original question."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(answers.at[questions.iloc[0, 0], 'Text0'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we use the helper functions to clean questions, duplicates and answers from unwanted text such as code, html \n",
"tags and links. Notice that we add a new column 'Text' to each dataframe for clean text in lowercase."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Clean up all text, and keep only data with some clean text.\n",
"for df in (questions, dupes, answers):\n",
" df[\"Text\"] = df.Text0.apply(clean_text).str.lower()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"questions = questions[questions.Text.str.len() > 0]\n",
"answers = answers[answers.Text.str.len() > 0]\n",
"dupes = dupes[dupes.Text.str.len() > 0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's compare the first original question and cleaned version as an example."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Original question.\n",
"print(questions.iloc[0, 1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# After cleaning.\n",
"print(questions.iloc[0, 3])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It turns out that some duplicate questions were also in original questions. Also, some original questions and some \n",
"duplicate questions were duplicated in the datasets. In the following, we remove them from the dataframes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# First, remove dupes that are questions, then remove duplicated questions and dupes.\n",
"dupes = dupes[~dupes.index.isin(questions.index)]\n",
"questions = questions[~questions.index.duplicated(keep='first')]\n",
"dupes = dupes[~dupes.index.duplicated(keep='first')]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We also make sure we keep questions with answers and duplicates."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only questions with answers and dupes, answers to questions, and dupes of questions.\n",
"questions = questions[\n",
" questions.AnswerId.isin(answers.index) & questions.AnswerId.isin(dupes.AnswerId)\n",
"]\n",
"answers = answers[answers.index.isin(questions.AnswerId)]\n",
"dupes = dupes[dupes.AnswerId.isin(questions.AnswerId)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Verify data integrity.\n",
"assert questions.AnswerId.isin(answers.index).all()\n",
"assert answers.index.isin(questions.AnswerId).all()\n",
"assert questions.AnswerId.isin(dupes.AnswerId).all()\n",
"assert dupes.AnswerId.isin(questions.AnswerId).all()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below are some statistics on the data. Notice that some questions have very low number of duplicates while others may \n",
"have a large number. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Report on the data.\n",
"print(\"Text statistics:\")\n",
"print(\n",
" pd.DataFrame(\n",
" [\n",
" questions.Text.str.len().describe().rename(\"questions\"),\n",
" answers.Text.str.len().describe().rename(\"answers\"),\n",
" dupes.Text.str.len().describe().rename(\"dupes\"),\n",
" ]\n",
" )\n",
")\n",
"print(\"\\nDuplication statistics:\")\n",
"print(pd.DataFrame([dupes.AnswerId.value_counts().describe().rename(\"duplications\")]))\n",
"print(\n",
" \"\\nLargest class: {:.2%}\".format(\n",
" dupes.AnswerId.value_counts().max() / dupes.shape[0]\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, we reset all indexes to use them as columns in the rest of the steps."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Reset each dataframe's index.\n",
"questions.reset_index(inplace=True)\n",
"answers.reset_index(inplace=True)\n",
"dupes.reset_index(inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We filter the questions and duplicates to have at least min_text number of characters."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Apply the minimum text length to questions and dupes.\n",
"questions = questions[questions.Text.str.len() >= min_text]\n",
"dupes = dupes[dupes.Text.str.len() >= min_text]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only questions with dupes, and dupes of questions.\n",
"label_column = \"AnswerId\"\n",
"questions = questions[questions[label_column].isin(dupes[label_column])]\n",
"dupes = dupes[dupes[label_column].isin(questions[label_column])]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, we remove questions and their duplicates that are less than min_dupes parameter."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Restrict the questions to those with a minimum number of dupes.\n",
"answerid_count = dupes.groupby(label_column)[label_column].count()\n",
"answerid_min = answerid_count.index[answerid_count >= min_dupes]\n",
"questions = questions[questions[label_column].isin(answerid_min)]\n",
"dupes = dupes[dupes[label_column].isin(answerid_min)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
" # Verify data integrity.\n",
"assert questions[label_column].isin(dupes[label_column]).all()\n",
"assert dupes[label_column].isin(questions[label_column]).all()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here are some statistics on the resulting dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Report on the data.\n",
"print(\"Restrictions: min_text={}, min_dupes={}\".format(min_text, min_dupes))\n",
"print(\"Restricted text statistics:\")\n",
"print(\n",
" pd.DataFrame(\n",
" [\n",
" questions.Text.str.len().describe().rename(\"questions\"),\n",
" dupes.Text.str.len().describe().rename(\"dupes\"),\n",
" ]\n",
" )\n",
")\n",
"print(\"\\nRestricted duplication statistics:\")\n",
"print(\n",
" pd.DataFrame([dupes[label_column].value_counts().describe().rename(\"duplications\")])\n",
")\n",
"print(\n",
" \"\\nRestricted largest class: {:.2%}\".format(\n",
" dupes[label_column].value_counts().max() / dupes.shape[0]\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare train and test sets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this part, we prepare train and test sets. For training a binary classification model, we will need to construct \n",
"match and non-match pairs from duplicates and their questions. Finding matching pairs can be accomplished by joining \n",
"each duplicate with its question. However, non-match examples need to be constructed randomly. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a first step, to make sure we train and test the performance of the model on each question, we will need to have \n",
"examples of match and non-match pairs for each question both in train and test sets. In order to achieve that, \n",
"we split the duplicates in a stratified manner into train and test sets making sure at least 1 or more duplicates per \n",
"question is in the test set depending on test_size parameter and number of duplicates per each question."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split dupes into train and test ensuring at least one of each label class is in test.\n",
"dupes_test = round_sample_strat(dupes, dupes[label_column], frac=test_size)\n",
"dupes_train = dupes[~dupes.Id.isin(dupes_test.Id)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert dupes_test[label_column].unique().shape[0] == dupes[label_column].unique().shape[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The relevant columns for text pairs data.\n",
"balanced_pairs_columns = ['Id_x', 'AnswerId_x', 'Text_x', 'Id_y', 'Text_y', 'AnswerId_y', 'Label', 'n']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we pair each training duplicate in train set with its matching question and N-1 random questions using the \n",
"helper function."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use AnswerId to pair each training dupe with its matching question and also with N-1 questions not its match.\n",
"balanced_pairs_train = random_merge(dupes_train, questions, N=match)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Labeling is done such that matching pairs are labeled as 1 and non-match pairs are labeled as 0."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Label records by matching AnswerIds.\n",
"balanced_pairs_train[\"Label\"] = (\n",
" balanced_pairs_train.AnswerId_x == balanced_pairs_train.AnswerId_y\n",
").astype(int)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the relevant data.\n",
"balanced_pairs_train = balanced_pairs_train[balanced_pairs_columns]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"balanced_pairs_train.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sort the data by dupe ID and Label.\n",
"balanced_pairs_train.sort_values(by=['Id_x', 'Label'], ascending=[True, False], inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In testing set, we match each duplicate with all the original questions and label them same way as training set."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use AnswerId to pair each testing dupe with all questions.\n",
"balanced_pairs_test = random_merge(dupes_test, questions, N=questions.shape[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Label records by matching AnswerIds.\n",
"balanced_pairs_test[\"Label\"] = (\n",
" balanced_pairs_test.AnswerId_x == balanced_pairs_test.AnswerId_y\n",
").astype(int)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the relevant data.\n",
"balanced_pairs_test = balanced_pairs_test[balanced_pairs_columns]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"balanced_pairs_test.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sort the data by dupe ID and Label.\n",
"balanced_pairs_test.sort_values(\n",
" by=[\"Id_x\", \"Label\"], ascending=[True, False], inplace=True\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, we report the final train and test sets and save as text files to be used by modeling."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Report on the datasets.\n",
"print(\n",
" \"balanced_pairs_train: {:,} rows with {:.2%} matches\".format(\n",
" balanced_pairs_train.shape[0], balanced_pairs_train.Label.mean()\n",
" )\n",
")\n",
"print(\n",
" \"balanced_pairs_test: {:,} rows with {:.2%} matches\".format(\n",
" balanced_pairs_test.shape[0], balanced_pairs_test.Label.mean()\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"os.makedirs(outputs_path, exist_ok=True)\n",
"\n",
"# Save the data.\n",
"balanced_pairs_train_path = os.path.join(outputs_path, \"balanced_pairs_train.tsv\")\n",
"print(\n",
" \"Writing {:,} to {}\".format(\n",
" balanced_pairs_train.shape[0], balanced_pairs_train_path\n",
" )\n",
")\n",
"balanced_pairs_train.to_csv(\n",
" balanced_pairs_train_path, sep=\"\\t\", header=True, index=False\n",
")\n",
"\n",
"balanced_pairs_test_path = os.path.join(outputs_path, \"balanced_pairs_test.tsv\")\n",
"print(\n",
" \"Writing {:,} to {}\".format(balanced_pairs_test.shape[0], balanced_pairs_test_path)\n",
")\n",
"balanced_pairs_test.to_csv(balanced_pairs_test_path, sep=\"\\t\", header=True, index=False)\n",
"\n",
"# Save original questions to be used for scoring later.\n",
"questions_path = os.path.join(outputs_path, \"questions.tsv\")\n",
"print(\"Writing {:,} to {}\".format(questions.shape[0], questions_path))\n",
"questions.to_csv(questions_path, sep=\"\\t\", header=True, index=False)\n",
"\n",
"# Save the test duplicate questions to be used with the scoring function.\n",
"dupes_test_path = os.path.join(outputs_path, \"dupes_test.tsv\")\n",
"print(\"Writing {:,} to {}\".format(dupes_test.shape[0], dupes_test_path))\n",
"dupes_test.to_csv(dupes_test_path, sep=\"\\t\", header=True, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can now move on to [train on local](02_TrainOnLocal.ipynb) notebook to train our model using Azure Machine \n",
"Learning."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "az-ml-realtime-score",
"language": "python",
"name": "az-ml-realtime-score"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,664 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train Locally\n",
"In this notebook, you will perform the following using Azure Machine Learning.\n",
"* Load workspace.\n",
"* Configure & execute a local run in a user-managed Python environment.\n",
"* Configure & execute a local run in a system-managed Python environment.\n",
"* Configure & execute a local run in a Docker environment.\n",
"* Register model for operationalization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from azure_utils.machine_learning.utils import get_workspace_from_config\n",
"from azureml.core import Experiment\n",
"from azureml.core import ScriptRunConfig\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"from azureml.core.runconfig import RunConfiguration\n",
"from notebooks import DIRECTORY"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Model Hyperparameters"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": [
"parameters"
]
},
"source": [
"This notebook uses a training script that uses \n",
"[lightgbm](https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api). \n",
"Here we set the number of estimators. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"num_estimators = \"10\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Initialize Workspace\n",
"\n",
"Initialize a workspace object from persisted configuration file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = get_workspace_from_config()\n",
"print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create An Experiment\n",
"**Experiment** is a logical container in an Azure ML Workspace. It hosts run records which can include run metrics \n",
"and output artifacts from your experiments."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"experiment_name = \"mlaks-train-on-local\"\n",
"exp = Experiment(workspace=ws, name=experiment_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configure & Run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this section, we show three different ways of locally training your model through Azure ML SDK for demonstration \n",
"purposes. Only one of these runs is sufficient to register the model."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"### User-managed environment\n",
"Below, we use a user-managed run, which means you are responsible to ensure all the necessary packages that are \n",
"available in the Python environment you choose to run the script. We will use the environment created for this \n",
"tutorial which has Azure ML SDK and other dependencies installed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Editing a run configuration property on-fly.\n",
"run_config_user_managed = RunConfiguration()\n",
"\n",
"run_config_user_managed.environment.python.user_managed_dependencies = True\n",
"\n",
"# Choose the specific Python environment of this tutorial by pointing to the Python path\n",
"run_config_user_managed.environment.python.interpreter_path = (\n",
" \"/anaconda/envs/az-ml-realtime-score/bin/python\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Submit script to run in the user-managed environment\n",
"Note that the whole `scripts` folder is submitted for execution, including the `item_selector.py` and `label_rank.py` \n",
"files. The model will be written to `outputs` directory which is a special directory such that all content in this \n",
"directory is automatically uploaded to your workspace. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.isdir(\"script\"):\n",
" os.mkdir(\"script\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile script/create_model.py\n",
"from azure_utils.machine_learning import create_model\n",
"\n",
"if __name__ == '__main__':\n",
" create_model.main()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scrpt = \"create_model.py\"\n",
"args = [\n",
" \"--inputs\",\n",
" os.path.abspath(DIRECTORY + \"/data_folder\"),\n",
" \"--outputs\",\n",
" \"outputs\",\n",
" \"--estimators\",\n",
" num_estimators,\n",
" \"--match\",\n",
" \"5\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"src = ScriptRunConfig(\n",
" source_directory=\"./script\",\n",
" script=scrpt,\n",
" arguments=args,\n",
" run_config=run_config_user_managed,\n",
")\n",
"#run = exp.submit(src)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Get run history details"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Block to wait till run finishes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"outputs": [],
"source": [
"# run.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check that the model is now available in your workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"outputs": [],
"source": [
"# run.get_file_names()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrieve the accuracy of the model from run logs by querying the run metrics."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"outputs": [],
"source": [
"# run.get_metrics()"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"source": [
"### System-managed environment\n",
"You can also ask the system to build a new conda environment and execute your scripts in it. The environment is built \n",
"once and will be reused in subsequent executions as long as the conda dependencies remain unchanged. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"outputs": [],
"source": [
"run_config_system_managed = RunConfiguration()\n",
"run_config_system_managed.environment.python.user_managed_dependencies = False\n",
"run_config_system_managed.auto_prepare_environment = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's specify the conda and pip dependencies."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# Specify conda dependencies with scikit-learn and pandas\n",
"conda_pack = [\"scikit-learn==0.19.1\", \"pandas==0.23.3\"]\n",
"requirements = [\"lightgbm==2.1.2\", \"azureml-defaults==1.0.57\", \"Microsoft-AI-Azure-Utility-Samples\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"outputs": [],
"source": [
"cd = CondaDependencies.create(conda_packages=conda_pack,\n",
" pip_packages=requirements)\n",
"run_config_system_managed.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"source": [
"#### Submit script to run in the system-managed environment\n",
"A new conda environment is built based on the conda dependencies object. If you are running this for the first time, \n",
"this might take up to 5 minutes. But this conda environment is reused so long as you don't change the conda \n",
"dependencies."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"outputs": [],
"source": [
"src = ScriptRunConfig(\n",
" source_directory=\"./script\",\n",
" script=scrpt,\n",
" arguments=args,\n",
" run_config=run_config_system_managed,\n",
")\n",
"run = exp.submit(src)\n",
"run"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"source": [
"Block and wait till run finishes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"outputs": [],
"source": [
"run.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"run.get_file_names()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"outputs": [],
"source": [
"run.get_metrics()"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"source": [
"### Docker-based execution\n",
"**IMPORTANT**: You must have Docker engine installed locally in order to use this execution mode. If your kernel is \n",
"already running in a Docker container, such as **Azure Notebooks**, this mode will **NOT** work.\n",
"\n",
"You can also ask the system to pull down a Docker image and execute your scripts in it. We will use the \n",
"`continuumio/miniconda3` image for that purpose."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"outputs": [],
"source": [
"run_config_docker = RunConfiguration()\n",
"run_config_docker.environment.python.user_managed_dependencies = False\n",
"run_config_docker.auto_prepare_environment = True\n",
"run_config_docker.environment.docker.enabled = True\n",
"run_config_docker.environment.docker.base_image = \"continuumio/miniconda3\"\n",
"\n",
"# Specify conda and pip dependencies\n",
"cd = CondaDependencies.create(conda_packages=conda_pack,\n",
" pip_packages=requirements)\n",
"run_config_docker.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, we map the local `data_folder` that includes the training and testing data to the docker container using `-v` \n",
"flag."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"outputs": [],
"source": [
"host_dir = os.path.abspath(DIRECTORY + \"/data_folder\")\n",
"container_dir = \"/data_folder\"\n",
"docker_arg = \"{}:{}\".format(host_dir, container_dir)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This time the run will use the mapped `data_folder` inside the docker container to find the data files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"args = [\n",
" \"--inputs\",\n",
" \"/data_folder\",\n",
" \"--outputs\",\n",
" \"outputs\",\n",
" \"--estimators\",\n",
" num_estimators,\n",
" \"--match\",\n",
" \"5\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run_config_docker.environment.docker.arguments.append(\"-v\")\n",
"run_config_docker.environment.docker.arguments.append(docker_arg)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"src = ScriptRunConfig(\n",
" source_directory=\"./script\",\n",
" script=scrpt,\n",
" arguments=args,\n",
" run_config=run_config_docker,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"run = exp.submit(src)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Register Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.get_metrics()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Register Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We now register the model with the workspace so that we can later deploy the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# supply a model name, and the full path to the serialized model file.\n",
"model = run.register_model(model_name=\"question_match_model\",\n",
" model_path=\"./outputs/model.pkl\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(model.name, model.version, model.url, sep=\"\\n\")"
]
}
],
"metadata": {
"authors": [
{
"name": "roastala"
}
],
"kernelspec": {
"display_name": "az-ml-realtime-score",
"language": "python",
"name": "az-ml-realtime-score"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,173 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License.\n",
"\n",
"# Develop Scoring Script\n",
"\n",
"In this notebook, we will develop the scoring script and test it locally. We will use the scoring script to create the \n",
"web service that will call the model for scoring."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import pandas as pd\n",
"\n",
"from azure_utils.machine_learning.utils import get_workspace_from_config\n",
"from azure_utils.utilities import text_to_json\n",
"from azureml.core.model import Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sys.path.append('./scripts/')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's load the workspace."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = get_workspace_from_config()\n",
"print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrieve the model registered earlier and download it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_name = 'question_match_model'\n",
"\n",
"model = Model(ws, name=model_name)\n",
"print(model.name, model.version, model.url, sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.download(target_dir=\".\", exist_ok=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Scoring Script"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We use the writefile magic to write the contents of the below cell to `score.py` which includes the `init` and `run` \n",
"functions required by AML.\n",
"- The init() function typically loads the model into a global object.\n",
"- The run(input_data) function uses the model to predict a value based on the input_data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"\n",
"import pandas as pd\n",
"\n",
"import json\n",
"import logging\n",
"import timeit as t\n",
"from azure_utils.machine_learning.duplicate_model import DuplicateModel\n",
"\n",
"def init():\n",
" logger = logging.getLogger(\"scoring_script\")\n",
" global model\n",
" model_path = \"model.pkl\"\n",
" questions_path = \"./data_folder/questions.tsv\"\n",
" start = t.default_timer()\n",
" model = DuplicateModel(model_path, questions_path)\n",
" end = t.default_timer()\n",
" loadTimeMsg = \"Model loading time: {0} ms\".format(\n",
" round((end - start) * 1000, 2))\n",
" logger.info(loadTimeMsg)\n",
"\n",
"\n",
"def run(body):\n",
" logger = logging.getLogger(\"scoring_script\")\n",
" json_load_text = json.loads(body)\n",
" text_to_score = json_load_text[\"input\"]\n",
" start = t.default_timer()\n",
" resp = model.score(text_to_score)\n",
" end = t.default_timer()\n",
" logger.info(\"Prediction took {0} ms\".format(round((end - start) * 1000,\n",
" 2)))\n",
" return json.dumps(resp)"
]
}
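,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's smoke test the scoring script locally before it is baked into a web service. This is a minimal sketch; it \n",
"assumes `model.pkl` was downloaded above and that the `data_folder` files produced by the data preparation notebook \n",
"are available under the current directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal local smoke test of score.py (file locations are assumptions).\n",
"import score\n",
"\n",
"# init() loads model.pkl and ./data_folder/questions.tsv from the current directory.\n",
"score.init()\n",
"\n",
"# Score one of the test duplicate questions; column 4 holds the question text.\n",
"dupes_test = pd.read_csv('./data_folder/dupes_test.tsv', sep='\\t', encoding='latin1')\n",
"score.run(text_to_json(dupes_test.iloc[0, 4]))"
]
}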
],
"metadata": {
"kernelspec": {
"display_name": "az-ml-realtime-score",
"language": "python",
"name": "az-ml-realtime-score"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,398 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License.\n",
"\n",
"# Create Image\n",
"In this notebook, we show the following steps for deploying a web service using AzureML:\n",
"- Create an image\n",
"- Test image locally"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azure_utils.machine_learning.utils import load_configuration, get_workspace_from_config\n",
"from azure_utils.utilities import text_to_json\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"from azureml.core.model import Model\n",
"from notebooks import DIRECTORY\n",
"\n",
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"source": [
"AML will use the following information to create an image, provision a cluster and deploy a service. Replace the \n",
"values in the following cell with your information."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cfg = load_configuration(\"../workspace_conf.yml\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
},
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"image_name = cfg['image_name']"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"## Get workspace\n",
"Load existing workspace from the config file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"ws = get_workspace_from_config()\n",
"print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"source": [
"## Load model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_name = 'question_match_model'\n",
"\n",
"model = Model(ws, name=model_name)\n",
"print(model.name, model.version)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create an image\n",
"We will now modify the `score.py` created in the previous notebook for the `init()` function to use the model we \n",
"registered to the workspace earlier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile score.py\n",
"\n",
"import sys\n",
"import pandas as pd\n",
"import json\n",
"import logging\n",
"import timeit as t\n",
"from sklearn.externals import joblib\n",
"from azureml.core.model import Model\n",
"from azureml.contrib.services.aml_request import rawhttp\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from azure_utils.machine_learning.duplicate_model import DuplicateModel\n",
"\n",
"sys.path.append('./scripts/')\n",
"\n",
"\n",
"def init():\n",
" logger = logging.getLogger(\"scoring_script\")\n",
" global model\n",
" model_name = 'question_match_model'\n",
" model_path = Model.get_model_path(model_name)\n",
" questions_path = './notebooks/data_folder/questions.tsv'\n",
" start = t.default_timer()\n",
" model = DuplicateModel(model_path, questions_path)\n",
" end = t.default_timer()\n",
" loadTimeMsg = \"Model loading time: {0} ms\".format(\n",
" round((end - start) * 1000, 2))\n",
" logger.info(loadTimeMsg)\n",
"\n",
"\n",
"@rawhttp\n",
"def run(request):\n",
" \"\"\"\n",
" Function runs on each request\n",
" \"\"\"\n",
" body = request.data\n",
" if request.method == 'POST':\n",
" logger = logging.getLogger(\"scoring_script\")\n",
" json_load_text = json.loads(body)\n",
" text_to_score = json_load_text['input']\n",
" start = t.default_timer()\n",
" resp = model.score(text_to_score)\n",
" end = t.default_timer()\n",
" logger.info(\"Prediction took {0} ms\".format(\n",
" round((end - start) * 1000, 2)))\n",
" return (json.dumps(resp))\n",
" if request.method == 'GET':\n",
" resp_body = {\n",
" \"azEnvironment\": \"Azure\",\n",
" \"location\": \"westus2\",\n",
" \"osType\": \"Ubuntu 16.04\",\n",
" \"resourceGroupName\": \"\",\n",
" \"resourceId\": \"\",\n",
" \"sku\": \"\",\n",
" \"subscriptionId\": \"\",\n",
" \"uniqueId\": \"PythonMLRST\",\n",
" \"vmSize\": \"\",\n",
" \"zone\": \"\",\n",
" \"isServer\": False,\n",
" \"version\": \"\"\n",
" }\n",
" return (resp_body)\n",
" return AMLResponse(\"bad request\", 500)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's specifiy the conda and pip dependencies for the image."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"conda_pack = [\"scikit-learn==0.19.1\", \"pandas==0.23.3\"]\n",
"requirements = [\n",
" \"lightgbm==2.1.2\", \"azureml-defaults==1.0.57\", \"azureml-contrib-services\", \n",
" \"Microsoft-AI-Azure-Utility-Samples\"\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"lgbmenv = CondaDependencies.create(conda_packages=conda_pack,\n",
" pip_packages=requirements)\n",
"\n",
"with open(\"lgbmenv.yml\", \"w\") as f:\n",
" f.write(lgbmenv.serialize_to_string())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.image import ContainerImage\n",
"\n",
"image_config = ContainerImage.image_configuration(\n",
" execution_script=\"score.py\",\n",
" runtime=\"python\",\n",
" conda_file=\"lgbmenv.yml\",\n",
" description=\"Image with lightgbm model\",\n",
" tags={\n",
" \"area\": \"text\",\n",
" \"type\": \"lightgbm\"\n",
" },\n",
" dependencies=[\n",
" \"./notebooks/data_folder/questions.tsv\"\n",
" ],\n",
")\n",
"\n",
"image = ContainerImage.create(\n",
" name=image_name,\n",
" # this is the model object\n",
" models=[model],\n",
" image_config=image_config,\n",
" workspace=ws,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"image.wait_for_creation(show_output=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(image.name, image.version)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image_version = str(image.version)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can find the logs of image creation in the following location."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image.image_build_log_uri"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test image locally"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, let's use one of the duplicate questions to test our image."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dupes_test_path = DIRECTORY + '/data_folder/dupes_test.tsv'\n",
"dupes_test = pd.read_csv(dupes_test_path, sep='\\t', encoding='latin1')\n",
"text_to_score = dupes_test.iloc[0, 4]\n",
"text_to_score"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"json_text = text_to_json(text_to_score)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"image.run(input_data=json_text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Conclusion\n",
"\n",
"We have created a docker Image using AzureML and registred this image on Azure Container Registry (ACR). This docker \n",
"image encapsulates a trained machine learning model and scoring scripts. In the next step, we can take this image \n",
"and deploy it on the compute target of your choice: Azure Kubernetes Service (AKS) Cluster or Azure IoT Edge."
]
}
],
"metadata": {
"authors": [
{
"name": "raymondl"
}
],
"kernelspec": {
"display_name": "az-ml-realtime-score",
"language": "python",
"name": "az-ml-realtime-score"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,646 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Deploying a web service to Azure Kubernetes Service (AKS)\n",
"In this notebook, we show the following steps for deploying a web service using AzureML:\n",
"- Provision an AKS cluster (one time action)\n",
"- Deploy the service\n",
"- Test the web service\n",
"- Scale up the service"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import requests\n",
"from azure_utils.machine_learning.utils import get_workspace_from_config\n",
"from azure_utils.machine_learning.utils import load_configuration\n",
"from azure_utils.utilities import text_to_json\n",
"from azureml.core.compute import AksCompute, ComputeTarget\n",
"from azureml.core.webservice import Webservice, AksWebservice\n",
"\n",
"from notebooks import DIRECTORY"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"AML will use the following information to create an image, provision a cluster and deploy a service. Replace the \n",
"values in the following cell with your information."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"cfg = load_configuration(\"../workspace_conf.yml\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"image_name = cfg['image_name']\n",
"aks_service_name = cfg['aks_service_name']\n",
"aks_name = cfg['aks_name']\n",
"aks_location = cfg['workspace_region']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get workspace\n",
"Load existing workspace from the config file."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = get_workspace_from_config()\n",
"print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image = ws.images[image_name]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Provision the AKS Cluster\n",
"This is a one time setup. You can reuse this cluster for multiple deployments after it has been created. If you delete \n",
"the cluster or the resource group that contains it, then you would have to recreate it. Let's first check if there are \n",
"enough cores in the subscription for the cluster ."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vm_family = \"Dv2\"\n",
"vm_size = \"Standard_D4_v2\"\n",
"vm_cores = 8\n",
"node_count = 4"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vm_dict = {vm_family: {\"size\": vm_size, \"cores\": vm_cores}}"
]
},
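{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of that check using the sizing variables above; the `az vm list-usage` CLI command assumes you are \n",
"logged in to the right subscription."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cores the cluster will consume under the sizing defined above.\n",
"required_cores = node_count * vm_dict[vm_family]['cores']\n",
"print('The AKS cluster will require {} {} cores'.format(required_cores, vm_family))\n",
"\n",
"# Compare against the regional quota, e.g. (assumes a logged-in Azure CLI):\n",
"# !az vm list-usage --location $aks_location --query \"[?contains(localName, 'Dv2')]\""
]
},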
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prov_config = AksCompute.provisioning_configuration(agent_count=node_count,\n",
" vm_size=vm_size,\n",
" location=aks_location)\n",
"\n",
"# Create the cluster\n",
"aks_target = ComputeTarget.create(workspace=ws,\n",
" name=aks_name,\n",
" provisioning_configuration=prov_config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_target.wait_for_completion(show_output=True)\n",
"print(aks_target.provisioning_state)\n",
"print(aks_target.provisioning_errors)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check that the cluster is created successfully."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_status = aks_target.get_status()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"assert aks_status == 'Succeeded', 'AKS failed to create'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deploy web service to AKS"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": [
"parameters"
]
},
"source": [
"Next, we deploy the web service. We deploy two pods with 1 CPU core each."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"num_replicas = 2\n",
"cpu_cores = 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Set the web service configuration\n",
"aks_config = AksWebservice.deploy_configuration(num_replicas=num_replicas,\n",
" cpu_cores=cpu_cores)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service = Webservice.deploy_from_image(\n",
" workspace=ws,\n",
" name=aks_service_name,\n",
" image=image,\n",
" deployment_config=aks_config,\n",
" deployment_target=aks_target,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_service.wait_for_deployment(show_output=True)\n",
"print(aks_service.state)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can check the logs of the web service with the below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service.get_logs()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test the web service\n",
"We now test the web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"num_dupes_to_score = 4"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dupes_test_path = DIRECTORY + '/data_folder/dupes_test.tsv'\n",
"dupes_test = pd.read_csv(dupes_test_path, sep='\\t', encoding='latin1')\n",
"text_to_score = dupes_test.iloc[0, num_dupes_to_score]\n",
"text_to_score"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"json_text = text_to_json(text_to_score)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"prediction = aks_service.run(input_data=json_text)\n",
"print(prediction)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's try a few more duplicate questions and display their top 3 original matches. Let's first get the scoring URL \n",
"and API key for the web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scoring_url = aks_service.scoring_uri\n",
"api_key = aks_service.get_keys()[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Write the URI and key to the statistics tracker."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"headers = {\n",
" 'content-type': 'application/json',\n",
" 'Authorization': ('Bearer ' + api_key)\n",
"}\n",
"r = requests.post(\n",
" scoring_url, data=json_text,\n",
" headers=headers) # Run the request twice since the first time takes a\n",
"%time r = requests.post(scoring_url, data=json_text, headers=headers) # little longer due to the loading of the model\n",
"print(r)\n",
"r.json()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dupes_to_score = dupes_test.iloc[:5, num_dupes_to_score]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results = [\n",
" requests.post(scoring_url, data=text_to_json(text), headers=headers)\n",
" for text in dupes_to_score\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's print top 3 matches for each duplicate question."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"[eval(results[i].json())[0:3] for i in range(0, len(results))]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next let's quickly check what the request response performance is for the deployed model on AKS cluster."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"text_data = list(map(text_to_json, dupes_to_score)) # Retrieve the text data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"timer_results = list()\n",
"for text in text_data:\n",
" res=%timeit -r 1 -o -q requests.post(scoring_url, data=text, headers=headers)\n",
" timer_results.append(res.best)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"timer_results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Average time taken: {0:4.2f} ms\".format(10 ** 3 * np.mean(timer_results)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Scaling"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this part, we scale the number of pods to make sure we fully utilize the AKS cluster. To connect to the Kubernetes \n",
"cluster, we will use kubectl, the Kubernetes command-line client. To install, run the following:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!sudo az aks install-cli"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we will get the credentials to connect to the cluster."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"os.makedirs(os.path.join(os.path.expanduser('~'),'.kube'), exist_ok=True) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"config_path = os.path.join(os.path.expanduser('~'),'.kube/config')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(config_path, 'a') as f:\n",
" f.write(aks_target.get_credentials()['userKubeConfig'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's check the nodes and pods of the cluster."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kubectl get nodes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kubectl get pods --all-namespaces"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kubectl get events"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can now scale up the number of pods."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"new_num_replicas = 10"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kubectl get namespaces"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kubectl scale --current-replicas=$num_replicas \\\n",
" --replicas=$new_num_replicas {\"deployments/\" + aks_service_name} \\\n",
" --namespace azureml-workspace"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kubectl get pods --all-namespaces"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!kubectl get deployment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Next, we will test the [throughput of the web service](06_SpeedTestWebApp.ipynb)."
]
}
],
"metadata": {
"authors": [
{
"name": "raymondl"
}
],
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "az-ml-realtime-score",
"language": "python",
"name": "az-ml-realtime-score"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,250 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load Test deployed web application"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook pulls some images and tests them against the deployed web application. We submit requests asychronously \n",
"which should reduce the contribution of latency."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from urllib.parse import urlparse\n",
"\n",
"from azure_utils.machine_learning.utils import get_workspace_from_config\n",
"from azureml.core.webservice import AksWebservice\n",
"from dotenv import get_key"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = get_workspace_from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep=\"\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrieve the web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service_name = get_key(env_path, 'aks_service_name')\n",
"aks_service = AksWebservice(ws, name=aks_service_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We will test our service concurrently but only have 4 concurrent requests at any time. We have only deployed one pod \n",
"on one node and increasing the number of concurrent calls does not really increase throughput. Feel free to try \n",
"different values and see how the service responds."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"CONCURRENT_REQUESTS = 4 # Number of requests at a time"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the scoring URL and API key of the service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scoring_url = aks_service.scoring_uri\n",
"api_key = aks_service.get_keys()[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below we are going to use [Locust](https://locust.io/) to load test our deployed model. First we need to write the \n",
"locustfile."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%writefile locustfile.py\n",
"from locust import HttpLocust, TaskSet, task\n",
"import os\n",
"import pandas as pd\n",
"from utilities import text_to_json\n",
"from itertools import cycle\n",
"\n",
"_NUMBER_OF_REQUESTS = os.getenv('NUMBER_OF_REQUESTS', 100)\n",
"dupes_test_path = './data_folder/dupes_test.tsv'\n",
"dupes_test = pd.read_csv(dupes_test_path, sep='\\t', encoding='latin1')\n",
"dupes_to_score = dupes_test.iloc[:_NUMBER_OF_REQUESTS, 4]\n",
"_SCORE_PATH = os.getenv('SCORE_PATH', \"/score\")\n",
"_API_KEY = os.getenv('API_KEY')\n",
"\n",
"\n",
"class UserBehavior(TaskSet):\n",
" def on_start(self):\n",
" print('Running setup')\n",
" self._text_generator = cycle(dupes_to_score.apply(text_to_json))\n",
" self._headers = {\n",
" \"content-type\": \"application/json\",\n",
" 'Authorization': ('Bearer {}'.format(_API_KEY))\n",
" }\n",
"\n",
" @task\n",
" def score(self):\n",
" self.client.post(_SCORE_PATH,\n",
" data=next(self._text_generator),\n",
" headers=self._headers)\n",
"\n",
"\n",
"class WebsiteUser(HttpLocust):\n",
" task_set = UserBehavior\n",
" # min and max time to wait before repeating task\n",
" min_wait = 10\n",
" max_wait = 200"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below we define the locust command we want to run. We are going to run at a hatch rate of 10 and the whole test will \n",
"last 1 minute. Feel free to adjust the parameters below and see how the results differ. The results of the test will \n",
"be saved to two csv files **modeltest_requests.csv** and **modeltest_distribution.csv**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"parsed_url = urlparse(scoring_url)\n",
"cmd = \"locust -H {host} --no-web -c {users} -r {rate} -t {duration} --csv=modeltest --only-summary\".format(\n",
" host=\"{url.scheme}://{url.netloc}\".format(url=parsed_url),\n",
" users=CONCURRENT_REQUESTS, # concurrent users\n",
" rate=10, # hatch rate (users / second)\n",
" duration='1m', # test duration\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"! API_KEY={api_key} SCORE_PATH={parsed_url.path} PYTHONPATH={os.path.abspath('../')} {cmd}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here are the summary results of our test and below that the distribution infromation of those tests. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pd.read_csv(\"modeltest_requests.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pd.read_csv(\"modeltest_distribution.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To tear down the cluster and all related resources go to the [tear down the cluster](07_TearDown.ipynb) notebook."
]
}
],
"metadata": {
"jupytext": {
"formats": "ipynb"
},
"kernelspec": {
"display_name": "az-ml-realtime-score",
"language": "python",
"name": "az-ml-realtime-score"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View file

@ -0,0 +1,741 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": false
}
}
}
}
},
"source": [
"# Explore Duplicate Question Matches\n",
"Use this dashboard to explore the relationship between duplicate and original questions."
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"## Setup\n",
"This section loads needed packages, and defines useful functions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"from __future__ import print_function\n",
"\n",
"import math\n",
"\n",
"import ipywidgets as widgets\n",
"import pandas as pd\n",
"import requests\n",
"from azureml.core.webservice import AksWebservice\n",
"from azureml.core.workspace import Workspace\n",
"from dotenv import get_key, find_dotenv\n",
"from azure_utils.machine_learning.utils import get_workspace_from_config\n",
"from azure_utils.utilities import read_questions, text_to_json, get_auth\n",
"from notebooks import DIRECTORY"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = get_workspace_from_config()\n",
"print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service_name = get_key(env_path, 'aks_service_name')\n",
"aks_service = AksWebservice(ws, name=aks_service_name)\n",
"aks_service.name"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load the duplicate questions scoring app's URL."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scoring_url = aks_service.scoring_uri\n",
"api_key = aks_service.get_keys()[0]"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"A constructor function for ID-text contents. Constructs buttons and text areas for each text ID and text passage.\n",
"* Each buttons's description is set to a text's ID, and its click action is set to the handler.\n",
"* Each text area's content is set to a text.\n",
"* A dictionary is created to map IDs to text areas."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"def buttons_and_texts(data,\n",
" id,\n",
" answerid,\n",
" text,\n",
" handle_click,\n",
" layout=widgets.Layout(width=\"100%\"),\n",
" n=15):\n",
" \"\"\"Construct buttons, text areas, and a mapping from IDs to text areas.\"\"\"\n",
" items = []\n",
" text_map = {}\n",
" for i in range(min(n, len(data))):\n",
" button = widgets.Button(description=data.iloc[i][id])\n",
" button.answerid = data.iloc[i][answerid] if answerid in data else None\n",
" button.open = False\n",
" button.on_click(handle_click)\n",
" items.append(button)\n",
" text_area = widgets.Textarea(data.iloc[i][text],\n",
" placeholder=data.iloc[i][id],\n",
" layout=layout)\n",
" items.append(text_area)\n",
" text_map[data.iloc[i][id]] = text_area\n",
" return items, text_map"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"A constructor function for the duplicates and questions explorer widget. This builds a box containing duplicates and \n",
"question tabs, each in turn containing boxes that contain the buttons and text areas."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"def duplicates_questions_widget(duplicates,\n",
" questions,\n",
" layout=widgets.Layout(width=\"100%\")):\n",
" \"\"\"Construct a duplicates and questions exploration widget.\"\"\"\n",
" # Construct the duplicates Tab of buttons and text areas.\n",
" duplicates_items, duplicates_map = buttons_and_texts(\n",
" duplicates,\n",
" duplicates_id,\n",
" duplicates_answerid,\n",
" duplicates_text,\n",
" duplicates_click,\n",
" n=duplicates.shape[0],\n",
" )\n",
" duplicates_tab = widgets.Tab(\n",
" [widgets.VBox(duplicates_items, layout=layout)],\n",
" layout=widgets.Layout(width=\"100%\", height=\"500px\", overflow_y=\"auto\"),\n",
" )\n",
" duplicates_tab.set_title(0, duplicates_title)\n",
" # Construct the questions Tab of buttons and text areas.\n",
" questions_items, questions_map = buttons_and_texts(\n",
" questions,\n",
" questions_id,\n",
" questions_answerid,\n",
" questions_text,\n",
" questions_click,\n",
" n=questions.shape[0],\n",
" )\n",
" questions_tab = widgets.Tab(\n",
" [widgets.VBox(questions_items, layout=layout)],\n",
" layout=widgets.Layout(width=\"100%\", height=\"500px\", overflow_y=\"auto\"),\n",
" )\n",
" questions_tab.set_title(0, questions_title)\n",
" # Put both tabs in an HBox.\n",
" duplicates_questions = widgets.HBox([duplicates_tab, questions_tab],\n",
" layout=layout)\n",
" return duplicates_map, questions_map, duplicates_questions"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"A handler function for a question passage button press. If the passage's text window is open, it is collapsed. \n",
"Otherwise, it is opened."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"def questions_click(button):\n",
" \"\"\"Respond to a click on a question button.\"\"\"\n",
" global questions_map\n",
" if button.open:\n",
" questions_map[button.description].rows = None\n",
" button.open = False\n",
" else:\n",
" questions_map[button.description].rows = 10\n",
" button.open = True"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"A handler function for a duplicate obligation button press. If the obligation is not selected, select it and update \n",
"the questions tab with its top 15 question passages ordered by match score. Otherwise, if the duplicate's text window \n",
"is open, it is collapsed, else it is opened."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"def duplicates_click(button):\n",
" \"\"\"Respond to a click on a duplicate button.\"\"\"\n",
" global duplicates_map\n",
" if select_duplicate(button):\n",
" duplicates_map[button.description].rows = 10\n",
" button.open = True\n",
" else:\n",
" if button.open:\n",
" duplicates_map[button.description].rows = None\n",
" button.open = False\n",
" else:\n",
" duplicates_map[button.description].rows = 10\n",
" button.open = True\n",
"\n",
"\n",
"def select_duplicate(button):\n",
" \"\"\"Update the displayed questions to correspond to the button's duplicate\n",
" selections. Returns whether or not the selected duplicate changed.\n",
" \"\"\"\n",
" global selected_button, questions_map, duplicates_questions\n",
" if \"selected_button\" not in globals() or button != selected_button:\n",
" if \"selected_button\" in globals():\n",
" selected_button.style.button_color = None\n",
" selected_button.style.font_weight = \"\"\n",
" selected_button = button\n",
" selected_button.style.button_color = \"yellow\"\n",
" selected_button.style.font_weight = \"bold\"\n",
" duplicates_text = duplicates_map[selected_button.description].value\n",
" questions_scores = score_text(duplicates_text)\n",
" ordered_questions = questions.loc[questions_scores[questions_id]]\n",
" questions_items, questions_map = buttons_and_texts(\n",
" ordered_questions,\n",
" questions_id,\n",
" questions_answerid,\n",
" questions_text,\n",
" questions_click,\n",
" n=questions_display,\n",
" )\n",
" if questions_button_color is True and selected_button.answerid is not None:\n",
" set_button_color(questions_items[::2], selected_button.answerid)\n",
" if questions_button_score is True:\n",
" questions_items = [\n",
" item for button, text_area in zip(*[iter(questions_items)] * 2)\n",
" for item in (add_button_prob(button, questions_scores),\n",
" text_area)\n",
" ]\n",
" duplicates_questions.children[1].children[0].children = questions_items\n",
" duplicates_questions.children[1].set_title(0,\n",
" selected_button.description)\n",
" return True\n",
" else:\n",
" return False\n",
"\n",
"\n",
"def add_button_prob(button, questions_scores):\n",
" \"\"\"Return an HBox containing button and its probability.\"\"\"\n",
" id = button.description\n",
" prob = widgets.Label(score_label + \": \" + str(\n",
" int(\n",
" math.ceil(score_scale *\n",
" questions_scores.loc[id][questions_probability]))))\n",
" return widgets.HBox([button, prob])\n",
"\n",
"\n",
"def set_button_color(button, answerid):\n",
" \"\"\"Set each button's color according to its label.\"\"\"\n",
" for i in range(len(button)):\n",
" button[i].style.button_color = (\n",
" \"lightgreen\" if button[i].answerid == answerid else None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Functions for interacting with the web service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def score_text(text):\n",
" \"\"\"Return a data frame with the original question scores for the text.\"\"\"\n",
" headers = {\n",
" \"content-type\": \"application/json\",\n",
" \"Authorization\": (\"Bearer \" + api_key),\n",
" }\n",
" # jsontext = json.dumps({'input':'{0}'.format(text)})\n",
" jsontext = text_to_json(text)\n",
" result = requests.post(scoring_url, data=jsontext, headers=headers)\n",
" # scores = result.json()['result'][0]\n",
" scores = eval(result.json())\n",
" scores_df = pd.DataFrame(\n",
" scores,\n",
" columns=[questions_id, questions_answerid, questions_probability])\n",
" scores_df[questions_id] = scores_df[questions_id].astype(str)\n",
" scores_df[questions_answerid] = scores_df[questions_answerid].astype(str)\n",
" scores_df = scores_df.set_index(questions_id, drop=False)\n",
" return scores_df"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"Control the appearance of cell output boxes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"%%html\n",
"<style>\n",
".output_wrapper, .output {\n",
" height:auto !important;\n",
" max-height:1000px; /* your desired max-height here */\n",
"}\n",
".output_scroll {\n",
" box-shadow:none !important;\n",
" webkit-box-shadow:none !important;\n",
"}\n",
"</style>"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"## Load data\n",
"\n",
"Load the pre-formatted text of questions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"questions_title = 'Questions'\n",
"questions_id = 'Id'\n",
"questions_answerid = 'AnswerId'\n",
"questions_text = 'Text'\n",
"questions_probability = 'Probability'\n",
"questions_path = DIRECTORY + '/data_folder/questions.tsv'\n",
"questions = read_questions(questions_path, questions_id, questions_answerid)"
]
},
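{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, peek at the first few questions to confirm the load (an illustrative check only)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"questions.head()"
]
},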
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"Load the pre-formatted text of duplicates."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"duplicates_title = 'Duplicates'\n",
"duplicates_id = 'Id'\n",
"duplicates_answerid = 'AnswerId'\n",
"duplicates_text = 'Text'\n",
"duplicates_path = DIRECTORY + '/data_folder/dupes_test.tsv'\n",
"duplicates = read_questions(duplicates_path, duplicates_id, duplicates_answerid)"
]
},
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": false
}
}
}
}
},
"source": [
"## Explore original questions matched up with duplicate questions\n",
"\n",
"Define other variables and settings used in creating the interface."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"outputs": [],
"source": [
"questions_display = 15\n",
"questions_button_color = True\n",
"questions_button_score = True\n",
"score_label = 'Score'\n",
"score_scale = 100"
]
},
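{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick smoke test (a sketch only: it assumes `scoring_url` and `api_key` were set earlier in this notebook and that the service is up), score an illustrative sample question and inspect the top matches."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sample text; any question-like string works here.\n",
"sample_scores = score_text(\"How do I convert a string to a number?\")\n",
"sample_scores.head()"
]
},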
{
"cell_type": "markdown",
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": true
}
}
}
}
},
"source": [
"This builds the exploration widget as a box containing duplicates and question tabs, each in turn containing boxes \n",
"that have for each ID-text pair a button and a text area."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"extensions": {
"jupyter_dashboards": {
"version": 1,
"views": {
"grid_default": {},
"report_default": {
"hidden": false
}
}
}
}
},
"outputs": [],
"source": [
"duplicates_map, questions_map, duplicates_questions = duplicates_questions_widget(duplicates, questions)\n",
"duplicates_questions"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To tear down the cluster and related resources go to the [last notebook](08_TearDown.ipynb)."
]
}
],
"metadata": {
"extensions": {
"jupyter_dashboards": {
"activeView": "report_default",
"version": 1,
"views": {
"grid_default": {
"name": "grid",
"type": "grid"
},
"report_default": {
"name": "report",
"type": "report"
}
}
}
},
"kernelspec": {
"display_name": "az-ml-realtime-score",
"language": "python",
"name": "az-ml-realtime-score"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

246
notebooks/08_TearDown.ipynb Normal file

@ -0,0 +1,246 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
"\n",
"Licensed under the MIT License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tear it all down\n",
"Use this notebook to clean up the web service, image, model and the AKS cluster created by the tutorial."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import AksCompute\n",
"from azureml.core.image import Image\n",
"from azureml.core.model import Model\n",
"from azureml.core.webservice import AksWebservice\n",
"from azureml.core.workspace import Workspace\n",
"from dotenv import get_key, find_dotenv\n",
"from azure_utils.utilities import get_auth"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"env_path = find_dotenv(raise_error_if_not_found=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's get the workspace information."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ws = Workspace.from_config(auth=get_auth(env_path))\n",
"print(ws.name, ws.resource_group, ws.location, sep=\"\\n\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrieve the web service to delete."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_service_name = get_key(env_path, 'aks_service_name')\n",
"aks_service = AksWebservice(ws, name=aks_service_name)\n",
"print(aks_service.name, aks_service.tags)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrieve the image to delete."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"image_name = get_key(env_path, 'image_name')\n",
"image_version = int(get_key(env_path, 'image_version'))\n",
"image = Image(ws, name=image_name, version=image_version)\n",
"print(image.name, image.version)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrieve the model to delete."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_name = 'question_match_model'\n",
"model_version = int(get_key(env_path, 'model_version'))\n",
"model = Model(ws, name=model_name, version=model_version)\n",
"print(model.name, model.version)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's retrieve the AKS compute to delete."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"aks_name = get_key(env_path, 'aks_name')\n",
"aks_target = AksCompute(ws, name=aks_name)\n",
"print(aks_target.name, aks_target.get_status())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Delete the service, image and model. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_service.delete()\n",
"image.delete()\n",
"model.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's delete the AKS compute from the workspace. Since we created the cluster through AML, the corresponding cloud \n",
"based objects will also be deleted. If the custer was created externally and attached to the workspace, the below \n",
"would raise an exception and nothing will be changed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"aks_target.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you also would like to delete the workspace and all experiments in it, you can use the following."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"ws.delete()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, you can delete the resource group with the following."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"resource_group = get_key(env_path, 'resource_group')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!az group delete --yes --name $resource_group"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "az-ml-realtime-score",
"language": "python",
"name": "az-ml-realtime-score"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

194
notebooks/Makefile Normal file

@ -0,0 +1,194 @@
.ONESHELL:
SHELL=/bin/bash
define PROJECT_HELP_MSG
Makefile for testing notebooks
Make sure you have edited the dev_env_template file and renamed it to .dev_env
All the variables loaded in this Makefile must come from the .dev_env file
Usage:
    make test                  run all notebooks
    make clean                 delete the env and remove files
endef
export PROJECT_HELP_MSG
env_location=.dev_env
PWD:=$(shell pwd)
include ${env_location}
help:
	echo "$$PROJECT_HELP_MSG" | less

test: setup test-notebook1 test-notebook2 test-notebook3 test-notebook4 test-notebook5 test-notebook6 test-notebook7 \
	test-notebook8 test-notebook-iot1 test-notebook9 test-notebook-iot2
	@echo All Notebooks Passed

setup:
	conda env create -f environment.yml
ifndef TENANT_ID
	@echo starting interactive login
	az login -o table
	az account set --subscription ${SUBSCRIPTION_ID}
else
	@echo using service principal login
	az login -t ${TENANT_ID} --service-principal -u ${SP_USERNAME} --password ${SP_PASSWORD}
endif

test-notebook1:
	source activate MLAKSDeployAML
	@echo Testing 00_AMLConfiguration.ipynb
	papermill 00_AMLConfiguration.ipynb test.ipynb \
		--log-output \
		--no-progress-bar \
		-k python3 \
		-p subscription_id ${SUBSCRIPTION_ID} \
		-p resource_group ${RESOURCE_GROUP} \
		-p workspace_name ${WORKSPACE_NAME} \
		-p workspace_region ${WORKSPACE_REGION} \
		-p image_name ${IMAGE_NAME}

test-notebook2:
	source activate MLAKSDeployAML
	@echo Testing 01_DataPrep.ipynb
	papermill 01_DataPrep.ipynb test.ipynb \
		--log-output \
		--no-progress-bar \
		-k python3

test-notebook3:
	source activate MLAKSDeployAML
	@echo Testing 02_TrainOnLocal.ipynb
	papermill 02_TrainOnLocal.ipynb test.ipynb \
		--log-output \
		--no-progress-bar \
		-k python3

test-notebook4:
	source activate MLAKSDeployAML
	@echo Testing 03_DevelopScoringScript.ipynb
	papermill 03_DevelopScoringScript.ipynb test.ipynb \
		--log-output \
		--no-progress-bar \
		-k python3
	sleep 1m

test-notebook5:
	source activate MLAKSDeployAML
	@echo Testing 04_CreateImage.ipynb
	papermill 04_CreateImage.ipynb test.ipynb \
		--log-output \
		--no-progress-bar \
		-k python3
	sleep 30

test-notebook6:
	source activate MLAKSDeployAML
	@echo Testing 05_DeployOnAKS.ipynb
	papermill aks/05_DeployOnAKS.ipynb test.ipynb \
		--log-output \
		--no-progress-bar \
		-k python3 \
		-p aks_name ${AKS_NAME} \
		-p aks_location ${WORKSPACE_REGION} \
		-p aks_service_name ${AKS_SERVICE_NAME}

test-notebook7:
	source activate MLAKSDeployAML
	@echo Testing 06_SpeedTestWebApp.ipynb
	papermill aks/06_SpeedTestWebApp.ipynb test.ipynb \
		--log-output \
		--no-progress-bar \
		-k python3

test-notebook8:
	source activate MLAKSDeployAML
	@echo Testing 07_RealTimeScoring.ipynb
	papermill aks/07_RealTimeScoring.ipynb test.ipynb \
		--log-output \
		--no-progress-bar \
		-k python3

test-notebook-iot1:
	source activate MLAKSDeployAML
	@echo Testing 05_DeployOnIOTedge.ipynb
	export PYTHONPATH=${PWD}:${PYTHONPATH}
	cd iotedge
	mkdir ./data_folder
	cp ../data_folder/dupes_test.tsv ./data_folder
	papermill 05_DeployOnIOTedge.ipynb test.ipynb \
		--log-output \
		--no-progress-bar \
		-k python3 \
		-p iot_hub_name fstlstnameiothub \
		-p device_id mydevice \
		-p module_name mymodule

test-notebook9:
	source activate MLAKSDeployAML
	@echo Testing 08_TearDown.ipynb
	papermill aks/08_TearDown.ipynb test.ipynb \
		--log-output \
		--no-progress-bar \
		-k python3

test-notebook-iot2:
	source activate MLAKSDeployAML
	@echo Testing 06_TearDown.ipynb
	export PYTHONPATH=${PWD}:${PYTHONPATH}
	papermill iotedge/06_TearDown.ipynb test.ipynb \
		--log-output \
		--no-progress-bar \
		-k python3

test-cookiecutter-aks:
	cookiecutter --no-input https://github.com/Microsoft/MLAKSDeployAML.git --checkout yzhang \
		subscription_id="${SUBSCRIPTION_ID}" \
		workspace_region=${WORKSPACE_REGION} \
		deployment_type="aks"

test-cookiecutter-iot:
	cookiecutter --no-input https://github.com/Microsoft/MLAKSDeployAML.git --checkout yzhang \
		subscription_id=${SUBSCRIPTION_ID} \
		workspace_region=${WORKSPACE_REGION} \
		deployment_type="iotedge"

remove-notebook:
	rm -f test.ipynb

clean: remove-notebook
	conda remove --name MLAKSDeployAML -y --all
	rm -rf aml_config
	rm -rf __pycache__
	rm -rf .ipynb_checkpoints
	rm -rf data_folder
	rm -rf azureml-models
	rm -rf score.py lgbmenv.yml model.pkl
	rm -rf iotedge/deployment.json iotedge/deviceconfig.sh
	rm -rf iotedge/data_folder

notebook:
	source activate MLAKSDeployAML
	jupyter notebook --port 9999 --ip 0.0.0.0 --no-browser

install-jupytext:
	source activate MLAKSDeployAML
	conda install -c conda-forge jupytext

convert-to-py:
	jupytext --set-formats ipynb,py_scripts//py --sync *.ipynb

sync:
	jupytext --sync *.ipynb

convert-to-ipynb:
	jupytext --set-formats ipynb *.ipynb

remove-py:
	rm -r py_scripts

.PHONY: help test setup clean remove-notebook test-notebook1 test-notebook2 test-notebook3 test-notebook4 \
	test-notebook5 test-notebook6 test-notebook7 test-notebook8 test-notebook-iot1 test-notebook9 test-notebook-iot2
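For reference, each test-notebookN target above boils down to one papermill call. A minimal sketch using papermill's Python API instead of the CLI (the parameter values are placeholders taken from the dev_env_template defaults):

import papermill as pm

# Run the configuration notebook headlessly, as "make test-notebook1" does.
pm.execute_notebook(
    "00_AMLConfiguration.ipynb",
    "test.ipynb",
    kernel_name="python3",
    log_output=True,
    progress_bar=False,
    parameters={
        "subscription_id": "<subscription-id>",  # placeholder
        "resource_group": "deployrg",
        "workspace_name": "workspace",
        "workspace_region": "eastus",
        "image_name": "deployimg",
    },
)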

9
notebooks/__init__.py Normal file

@ -0,0 +1,9 @@
"""
az-ml-realtime-score - __init__.py
Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.
"""
import os

DIRECTORY = os.path.dirname(os.path.realpath(__file__))
WORKING_DIRECTORY = os.getcwd()

12
dev_env_template Normal file

@ -0,0 +1,12 @@
# Fill in the fields below and rename to .dev_env
# TENANT_ID, SP_USERNAME, and SP_PASSWORD are optional. If they are not supplied, the Azure CLI will default to interactive login
TENANT_ID=
SP_USERNAME=
SP_PASSWORD=
SUBSCRIPTION_ID=
RESOURCE_GROUP="deployrg"
WORKSPACE_NAME="workspace"
WORKSPACE_REGION="eastus"
IMAGE_NAME="deployimg"
AKS_NAME="deployaks"
AKS_SERVICE_NAME="deployservice"
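The Makefile includes this file directly; from Python, the same values can be read with python-dotenv. A minimal sketch (assuming the renamed .dev_env sits in the working directory):

from dotenv import find_dotenv, get_key

# Locate the renamed file and read one variable from it.
env_path = find_dotenv(filename=".dev_env", raise_error_if_not_found=True)
subscription_id = get_key(env_path, "SUBSCRIPTION_ID")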

2
pytest.ini Normal file

@ -0,0 +1,2 @@
[pytest]
junit_family=xunit1

11
sample_workspace_conf.yml Normal file

@ -0,0 +1,11 @@
subscription_id: "<>"
resource_group: "<>"
workspace_name: "<>"
workspace_region: "<>"
image_name: "<>"
aks_service_name: "<>"
aks_name: "<>"
aks_location: "<>"
storage_conn_string: "<>"

6
tests/__init__.py Normal file

@ -0,0 +1,6 @@
"""
az-ml-realtime-score - __init__.py
Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.
"""

28
tests/test_notebooks.py Normal file

@ -0,0 +1,28 @@
"""
az-ml-realtime-score - test_notebooks.py
Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.
"""
import pytest

from azure_utils.dev_ops.testing_utilities import run_notebook
from notebooks import DIRECTORY


@pytest.mark.parametrize(
"notebook",
[
DIRECTORY + "/00_AMLConfiguration.ipynb",
DIRECTORY + "/01_DataPrep.ipynb",
DIRECTORY + "/02_TrainOnLocal.ipynb",
DIRECTORY + "/03_DevelopScoringScript.ipynb",
DIRECTORY + "/04_CreateImage.ipynb",
DIRECTORY + "/05_DeployOnAKS.ipynb"
]
)
def test_notebook(notebook, add_nunit_attachment):
run_notebook(notebook, add_nunit_attachment, kernel_name="az-ml-realtime-score", root=DIRECTORY)
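A minimal sketch for invoking these notebook tests from Python rather than the pytest CLI (the report path is a placeholder; junit_family=xunit1 in pytest.ini controls the report schema):

import pytest

# Equivalent to: pytest tests/test_notebooks.py --junitxml=test-results.xml
pytest.main(["tests/test_notebooks.py", "--junitxml=test-results.xml"])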