First Japanese Translation PR (#74)
* Update deploy.yml (multiple commits)
* Update docusaurus.config.js (multiple commits)
* add japanese
* json for japanese
* add japanese translation
* fix: branch to main for github actions workflow
* add devcontainer.json
* docusaurus devcontainer for vscode developer
* japanese markdown translation message
* Translated into Japanese
* Translate workspace.md
* Translated installation.md into Japanese
* translate to japanese
* Small fix
* Add spaces before/after English words in Japanese sentences
* Replace single quotes with backquotes
* Translated logging.md into Japanese
* adopt to upstream main
* fix link path in japanese markdown

Co-authored-by: RyO <graphgear800@gmail.com>
Co-authored-by: Keisuke Takahashi <k14i@outlook.com>
@@ -0,0 +1,11 @@

```jsonc
// devcontainer json from docusaurus github repo
{
  "name": "Docusaurus Dev Container",
  "image": "mcr.microsoft.com/vscode/devcontainers/typescript-node:14-buster",
  "settings": {
    "terminal.integrated.shell.linux": "/bin/bash"
  },
  "extensions": ["dbaeumer.vscode-eslint", "orta.vscode-jest"],
  "forwardPorts": [3000],
  "postCreateCommand": "cd website && yarn install"
}
```
@@ -1,6 +1,10 @@

```js
const path = require('path');

module.exports = {
  i18n: {
    defaultLocale: 'en',
    locales: ['en', 'ja'],
  },
  title: 'Azure Machine Learning',
  tagline: 'Open source cheat sheets for Azure ML',
  url: 'https://github.com/Azure/',
```

@@ -21,6 +25,10 @@ module.exports = {

```js
    },
    items: [
      {to: '/docs/cheatsheets/python/v1/cheatsheet', label: 'Python SDK', position: 'left'},
      {
        type: 'localeDropdown',
        position: 'left',
      },
    ],
  },
  footer: {
```
@@ -0,0 +1,218 @@

```json
{
  "theme.NotFound.title": {
    "message": "ページが見つかりません",
    "description": "The title of the 404 page"
  },
  "theme.NotFound.p1": {
    "message": "お探しのページが見つかりませんでした。",
    "description": "The first paragraph of the 404 page"
  },
  "theme.NotFound.p2": {
    "message": "このページにリンクしているサイトの所有者に連絡をしてリンクが壊れていることを伝えてください。",
    "description": "The 2nd paragraph of the 404 page"
  },
  "theme.AnnouncementBar.closeButtonAriaLabel": {
    "message": "閉じる",
    "description": "The ARIA label for close button of announcement bar"
  },
  "theme.blog.paginator.navAriaLabel": {
    "message": "ブログ記事一覧のナビゲーション",
    "description": "The ARIA label for the blog pagination"
  },
  "theme.blog.paginator.newerEntries": {
    "message": "新しい記事",
    "description": "The label used to navigate to the newer blog posts page (previous page)"
  },
  "theme.blog.paginator.olderEntries": {
    "message": "過去の記事",
    "description": "The label used to navigate to the older blog posts page (next page)"
  },
  "theme.blog.post.readingTime.plurals": {
    "message": "約{readingTime}分",
    "description": "Pluralized label for \"{readingTime} min read\". Use as much plural forms (separated by \"|\") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)"
  },
  "theme.tags.tagsListLabel": {
    "message": "タグ:",
    "description": "The label alongside a tag list"
  },
  "theme.blog.post.readMore": {
    "message": "もっと見る",
    "description": "The label used in blog post item excerpts to link to full blog posts"
  },
  "theme.blog.post.paginator.navAriaLabel": {
    "message": "ブログ記事のナビゲーション",
    "description": "The ARIA label for the blog posts pagination"
  },
  "theme.blog.post.paginator.newerPost": {
    "message": "新しい記事",
    "description": "The blog post button label to navigate to the newer/previous post"
  },
  "theme.blog.post.paginator.olderPost": {
    "message": "過去の記事",
    "description": "The blog post button label to navigate to the older/next post"
  },
  "theme.tags.tagsPageTitle": {
    "message": "タグ",
    "description": "The title of the tag list page"
  },
  "theme.blog.post.plurals": {
    "message": "{count}件",
    "description": "Pluralized label for \"{count} posts\". Use as much plural forms (separated by \"|\") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)"
  },
  "theme.blog.tagTitle": {
    "message": "「{tagName}」タグの記事が{nPosts}あります",
    "description": "The title of the page for a blog tag"
  },
  "theme.tags.tagsPageLink": {
    "message": "全てのタグを見る",
    "description": "The label of the link targeting the tag list page"
  },
  "theme.CodeBlock.copyButtonAriaLabel": {
    "message": "クリップボードにコードをコピー",
    "description": "The ARIA label for copy code blocks button"
  },
  "theme.CodeBlock.copied": {
    "message": "コピーしました",
    "description": "The copied button label on code blocks"
  },
  "theme.CodeBlock.copy": {
    "message": "コピー",
    "description": "The copy button label on code blocks"
  },
  "theme.docs.sidebar.expandButtonTitle": {
    "message": "サイドバーを開く",
    "description": "The ARIA label and title attribute for expand button of doc sidebar"
  },
  "theme.docs.sidebar.expandButtonAriaLabel": {
    "message": "サイドバーを開く",
    "description": "The ARIA label and title attribute for expand button of doc sidebar"
  },
  "theme.docs.sidebar.collapseButtonTitle": {
    "message": "サイドバーを隠す",
    "description": "The title attribute for collapse button of doc sidebar"
  },
  "theme.docs.sidebar.collapseButtonAriaLabel": {
    "message": "サイドバーを隠す",
    "description": "The title attribute for collapse button of doc sidebar"
  },
  "theme.docs.sidebar.responsiveCloseButtonLabel": {
    "message": "メニューを閉じる",
    "description": "The ARIA label for close button of mobile doc sidebar"
  },
  "theme.docs.sidebar.responsiveOpenButtonLabel": {
    "message": "メニューを開く",
    "description": "The ARIA label for open button of mobile doc sidebar"
  },
  "theme.docs.paginator.navAriaLabel": {
    "message": "ドキュメントのナビゲーション",
    "description": "The ARIA label for the docs pagination"
  },
  "theme.docs.paginator.previous": {
    "message": "前へ",
    "description": "The label used to navigate to the previous doc"
  },
  "theme.docs.paginator.next": {
    "message": "次へ",
    "description": "The label used to navigate to the next doc"
  },
  "theme.docs.versions.unreleasedVersionLabel": {
    "message": "これはリリース前の{siteTitle} {versionLabel}のドキュメントです。",
    "description": "The label used to tell the user that he's browsing an unreleased doc version"
  },
  "theme.docs.versions.unmaintainedVersionLabel": {
    "message": "これは{siteTitle} {versionLabel}のドキュメントで現在はアクティブにメンテナンスされていません。",
    "description": "The label used to tell the user that he's browsing an unmaintained doc version"
  },
  "theme.docs.versions.latestVersionSuggestionLabel": {
    "message": "最新のドキュメントは{latestVersionLink} ({versionLabel}) を見てください。",
    "description": "The label used to tell the user that he's browsing an unmaintained doc version"
  },
  "theme.docs.versions.latestVersionLinkLabel": {
    "message": "最新バージョン",
    "description": "The label used for the latest version suggestion link label"
  },
  "theme.common.editThisPage": {
    "message": "このページを編集",
    "description": "The link label to edit the current page"
  },
  "theme.common.headingLinkTitle": {
    "message": "見出しへの直接リンク",
    "description": "Title for link to heading"
  },
  "theme.lastUpdated.atDate": {
    "message": "{date}に",
    "description": "The words used to describe on which date a page has been last updated"
  },
  "theme.lastUpdated.byUser": {
    "message": "{user}が",
    "description": "The words used to describe by who the page has been last updated"
  },
  "theme.lastUpdated.lastUpdatedAtBy": {
    "message": "{atDate}{byUser}最終更新",
    "description": "The sentence used to display when a page has been last updated, and by who"
  },
  "theme.common.skipToMainContent": {
    "message": "メインコンテンツまでスキップ",
    "description": "The skip to content label used for accessibility, allowing to rapidly navigate to main content with keyboard tab/enter navigation"
  },
  "theme.SearchPage.documentsFound.plurals": {
    "message": "{count}件のドキュメントが見つかりました",
    "description": "Pluralized label for \"{count} documents found\". Use as much plural forms (separated by \"|\") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)"
  },
  "theme.SearchPage.existingResultsTitle": {
    "message": "『{query}』の検索結果",
    "description": "The search page title for non-empty query"
  },
  "theme.SearchPage.emptyResultsTitle": {
    "message": "ドキュメントを検索",
    "description": "The search page title for empty query"
  },
  "theme.SearchPage.inputPlaceholder": {
    "message": "ここに検索するキーワードを入力してください",
    "description": "The placeholder for search page input"
  },
  "theme.SearchPage.inputLabel": {
    "message": "検索",
    "description": "The ARIA label for search page input"
  },
  "theme.SearchPage.algoliaLabel": {
    "message": "Algoliaで検索",
    "description": "The ARIA label for Algolia mention"
  },
  "theme.SearchPage.noResultsText": {
    "message": "検索結果が見つかりませんでした",
    "description": "The paragraph for empty search result"
  },
  "theme.SearchPage.fetchingNewResults": {
    "message": "新しい検索結果を取得しています...",
    "description": "The paragraph for fetching new search results"
  },
  "theme.SearchBar.label": {
    "message": "検索",
    "description": "The ARIA label and placeholder for search button"
  },
  "index.title": {
    "message": "Azure Machine Learning 日本語版"
  },
  "index.tagline": {
    "message": "オープンソースの Azure Machine Learning チートシート"
  },
  "section1": {
    "message": "GPU 分散学習"
  },
  "section0": {
    "message": "チートシート"
  },
  "section2": {
    "message": "環境"
  },
  "section0.desc": {
    "message": "Azure ML で頻出するコードに関するチートシートです。"
  },
  "section1.desc": {
    "message": "Azure ML で分散学習をするためのガイドです。"
  },
  "section2.desc": {
    "message": "Azure ML で Python パッケージと Docker イメージを構築・管理します。"
  }
}
```
@@ -0,0 +1,26 @@

```json
{
  "version.label": {
    "message": "Next",
    "description": "The label for version current"
  },
  "sidebar.pythonSidebar.category.Python": {
    "message": "Python",
    "description": "The label for category Python in sidebar pythonSidebar"
  },
  "sidebar.pythonSidebar.category.Getting Started": {
    "message": "Getting Started",
    "description": "The label for category Getting Started in sidebar pythonSidebar"
  },
  "sidebar.pythonSidebar.category.Azure ML Resources": {
    "message": "Azure ML Resources",
    "description": "The label for category Azure ML Resources in sidebar pythonSidebar"
  },
  "sidebar.pythonSidebar.category.Guides": {
    "message": "Guides",
    "description": "The label for category Guides in sidebar pythonSidebar"
  },
  "sidebar.cliSidebar.category.CLI (preview)": {
    "message": "CLI (preview)",
    "description": "The label for category CLI (preview) in sidebar cliSidebar"
  }
}
```
@@ -0,0 +1 @@
@@ -0,0 +1,225 @@

---
title: チートシート
id: cheatsheet
description: A cheat sheet for Azure ML.
keywords:
- azure machine learning
- aml
- cheatsheet
- overview
---

## 基本セットアップ

### ワークスペースへの接続

```python
from azureml.core import Workspace
ws = Workspace.from_config()
```

この Workspace オブジェクトは Azure ML 操作における基本的なオブジェクトで、一連のコードを通して共有されます。(`ws` という変数名で参照されることが多いです。)

ワークスペースの詳細: [Workspaces](./workspace.md)

### コンピューティングターゲットへの接続

```python
compute_target = ws.compute_targets['<compute-target-name>']
```

**使用例**

```python
compute_target = ws.compute_targets['powerful-gpu']

config = ScriptRunConfig(
    compute_target=compute_target,  # train.py スクリプトを実行するために使用されるコンピューティングターゲット
    source_directory='.',
    script='train.py',
)
```

コンピューティングターゲットの詳細: [コンピューティングターゲット](./compute-targets.md)

### Python 環境の準備

pip の `requirements.txt` ファイルや Conda の `env.yml` ファイルを使い、コンピューティング環境の Python 環境を Environment オブジェクトとして定義することができます。

```python
from azureml.core import Environment
# 選択肢 1: pip
environment = Environment.from_pip_requirements('<env-name>', '<path/to/requirements.txt>')
# 選択肢 2: Conda
environment = Environment.from_conda_specification('<env-name>', '<path/to/env.yml>')
```

Docker イメージを使って環境を準備することもできます。
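
例えば、本ドキュメントの分散学習の節でも使われている Docker ベースイメージを指定する場合は次のようになります (最小限のスケッチです。`<env-name>` は任意の環境名です)。

```python
from azureml.core import Environment

# Docker ベースイメージから環境を定義する例
environment = Environment('<env-name>')
environment.docker.enabled = True
environment.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04'
environment.python.user_managed_dependencies = True  # イメージ内の Python 環境をそのまま使う場合
```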

**使用例**

```python
environment = Environment.from_pip_requirements('<env-name>', '<path/to/requirements.txt>')

config = ScriptRunConfig(
    environment=environment,  # Python 環境を設定する
    source_directory='.',
    script='train.py',
)
```

環境の詳細: [環境](./environment.md)

## コードをサブミットする

Azure ML 上でコードを実行するためには:

1. エントリーポイントとなるコードのパス、コードを実行するコンピューティングターゲット、そしてコードを実行する Python 環境の**設定情報を作成**します。
2. Azure ML の実験を新規作成または再利用して**サブミット**します。

### ScriptRunConfig

典型的なディレクトリ構成例:

```bash
source_directory/
    script.py    # エントリーポイントとなるコード
    module1.py   # script.py により呼ばれるモジュール
    ...
```

リモートコンピューティングクラスター `target: ComputeTarget` 上の、Python 環境 `env: Environment` で、`$ (env) python <path/to/code>/script.py [arguments]` を実行するには、`ScriptRunConfig` クラスを使用します。

```python
from azureml.core import ScriptRunConfig

config = ScriptRunConfig(
    source_directory='<path/to/code>',  # 相対パスでも OK
    script='script.py',
    compute_target=compute_target,
    environment=environment,
    arguments=arguments,
)
```

ScriptRunConfig の引数の詳細: [Command line arguments](./script-run-config.md#command-line-arguments)

:::info
- `compute_target`: もし引数が与えられなかった場合は、スクリプトはローカルマシン上で実行されます。
- `environment`: もし引数が与えられなかった場合、Azure ML のデフォルト Python 環境が使用されます。環境の詳細: [Environment](./environment.md)
:::

#### コマンド

明示的なコマンドを与えることもできます。

```python
command = 'echo cool && python script.py'.split()

config = ScriptRunConfig(
    source_directory='<path/to/code>',  # 相対パスでも OK
    command=command,
    compute_target=compute_target,
    environment=environment,
    arguments=arguments,
)
```

コマンドの詳細: [コマンドライン引数](./script-run-config.md#コマンドライン引数)

### 実験

コードをサブミットするには`実験`を作成します。実験は、サブミットされた一連のコードをグルーピングしてコードの実行履歴を追跡する軽量のコンテナです。(参照: [Run History](./run-history.md))

```python
exp = Experiment(ws, '<experiment-name>')
run = exp.submit(config)
print(run.get_portal_url())
```

上記コードで返される Azure ML Studio へのリンクにより、実験の実行をモニタリングすることができます。

詳細: [ScriptRunConfig](./script-run-config.md)

### 使用例

以下はコマンドラインから Conda 環境を使ってトレーニングスクリプト `train.py` をローカルマシン上で実行する典型的な例です。

```bash
$ conda env create -f env.yml  # pytorch という名前の conda 環境を作成
$ conda activate pytorch
(pytorch) $ cd <path/to/code>
(pytorch) $ python train.py --learning_rate 0.001 --momentum 0.9
```

このスクリプトを Azure 上の GPU を使って実行したいと仮定します。

```python
ws = Workspace.from_config()
compute_target = ws.compute_targets['powerful-gpu']
environment = Environment.from_conda_specification('pytorch', 'env.yml')

config = ScriptRunConfig(
    source_directory='<path/to/code>',
    script='train.py',
    environment=environment,
    arguments=['--learning_rate', 0.001, '--momentum', 0.9],
)

run = Experiment(ws, 'PyTorch model training').submit(config)
```

## 分散 GPU 学習

分散 GPU 学習を有効にするために `ScriptRunConfig` を変更します。

```python {3,8-9,12,19}
from azureml.core import Workspace, Experiment, ScriptRunConfig
from azureml.core import Environment
from azureml.core.runconfig import MpiConfiguration

ws = Workspace.from_config()
compute_target = ws.compute_targets['powerful-gpu']
environment = Environment.from_conda_specification('pytorch', 'env.yml')
environment.docker.enabled = True
environment.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04'

# それぞれ 4 つの GPU を搭載した 2 つのノード上でトレーニングを行う
mpiconfig = MpiConfiguration(process_count_per_node=4, node_count=2)

config = ScriptRunConfig(
    source_directory='<path/to/code>',  # train.py が含まれるディレクトリ
    script='train.py',
    environment=environment,
    arguments=['--learning_rate', 0.001, '--momentum', 0.9],
    distributed_job_config=mpiconfig,  # 分散学習のための設定を追加
)

run = Experiment(ws, 'PyTorch model training').submit(config)
```

:::info
- `mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04` は OpenMPI の Docker イメージです。このイメージは Azure ML 上で分散学習を実行する際に必要となります。
- `MpiConfiguration` はトレーニングを行うノード数とノードあたりの GPU 数を指定するために使います。
:::

詳細: [Distributed GPU Training](./distributed-training.md)

## データへの接続

ワークスペース `ws` のデフォルトデータストアにあるデータをトレーニングスクリプトから扱うためには:

```python
datastore = ws.get_default_datastore()
dataset = Dataset.File.from_files(path=(datastore, '<path/on/datastore>'))
```

詳細: [Data](./data.md)

コマンドライン引数に以下を渡すことで上記の `dataset` を使用できます。

```python
arguments=['--data', dataset.as_mount()]
```
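
スクリプト側では、マウントされたデータのパスが通常のコマンドライン引数として渡されます (最小限のスケッチです。引数名 `--data` は上記の例に合わせた仮定です)。

```python
# train.py 側の例: --data にマウントされたパスが渡される
import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str)
args = parser.parse_args()

print(os.listdir(args.data))  # データストア上のファイル一覧を表示
```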

@@ -0,0 +1,182 @@

---
title: Developing on Azure ML
description: Guide to developing your code on Azure ML.
keywords:
- ssh
- development
- compute
---

:::note
このコンテンツはお使いの言語では利用できません。
:::

This guide gives some pointers for developing your code on Azure ML. A typical
scenario might be testing your distributed training code, or some other aspect
of your code that isn't well represented on your local devbox.

A common pain point in these scenarios is that iteration on Azure ML can feel
slow, especially when compared to developing on a VM.

**Learning objective.** To improve the development experience on Azure ML
to match, or even exceed, that of a "bare" VM.

## 🚧 The hurdles

Two main reasons developing on Azure ML can feel slow compared to a VM are:

- Any change to my Python environment forces a Docker image rebuild, which can
  take more than 5 minutes.

- Compute resources are _released_ between iterations, forcing me to wait for
  new compute to warm up (e.g. pulling Docker images).

Below we provide some techniques to address these issues, as well as some advantages
of working with Azure ML compute directly. We also provide an [example](#example) applying these
techniques.

## 🕰️ Prepare compute for development

When creating your _compute instance / cluster_ there are a few things you can
do to prepare for development:

1. **Enable SSH on compute.**

   Supported on both _compute instance_ and _compute targets_. This will allow you to
   use your compute just like you would a VM.

   :::tip VS Code Remote Extension.
   VS Code's [remote extension](https://code.visualstudio.com/docs/remote/ssh)
   allows you to connect to your Azure ML compute resources via SSH.
   This way you can develop directly in the cloud.
   :::

2. **Increase "Idle seconds before scale down".**

   For compute targets you can increase this parameter, e.g. to 30 minutes. This means
   the cluster won't be released between runs while you iterate.

   :::warning
   Don't forget to roll this back when you're done iterating.
   :::
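
Both settings can also be applied from the SDK when provisioning a cluster. A minimal sketch (the cluster name and VM size are illustrative; `idle_seconds_before_scaledown`, `admin_username`, and `admin_user_ssh_key` are real `AmlCompute.provisioning_configuration` parameters):

```python
from azureml.core import Workspace
from azureml.core.compute import AmlCompute, ComputeTarget

ws = Workspace.from_config()

config = AmlCompute.provisioning_configuration(
    vm_size='STANDARD_NC6',              # illustrative GPU VM size
    max_nodes=2,
    idle_seconds_before_scaledown=1800,  # keep nodes warm for 30 min between runs
    admin_username='azureuser',          # enable SSH access to the nodes
    admin_user_ssh_key='<public-ssh-key>',
)

target = ComputeTarget.create(ws, 'dev-cluster', config)
target.wait_for_completion(show_output=True)
```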

## 🏃‍♀️ Commands

Typically you will submit your code to Azure ML via a `ScriptRunConfig`, a little like this:

```python
config = ScriptRunConfig(
    source_directory='<path/to/source_directory>',
    script='script.py',
    compute_target=target,
    environment=env,
    ...
)
```

:::info
For more details on using `ScriptRunConfig` to submit your code see
[Running Code in the cloud](script-run-config).
:::

By using the [`command`](script-run-config#commands) argument you can improve your agility.
Commands allow you to chain together several steps in one, e.g.:

```python
command = "pip install torch && python script.py --learning_rate 2e-5".split()
```

Another example would be to include a setup script:

```bash title="setup.sh"
echo "Running setup script"
pip install torch
pip install -r requirements.txt
export PYTHONPATH=$PWD
```

and then calling it in your command

```python
command = "bash setup.sh && python script.py --learning_rate 2e-5".split()
```

This way Azure ML doesn't have to rebuild the Docker image for incremental changes.

## Advantages

In addition to matching the development experience on a VM, there are certain benefits to
developing on Azure ML compute directly.

- **Production-ready.** By developing directly in Azure ML you avoid the additional step of porting your
  VM-developed code to Azure ML later. This is particularly relevant if you intend to
  run your production code on Azure ML.
- **Data access.** If your training script makes use of data in Azure you can use the Azure ML
  Python SDK to read it (see [Data](data) for examples). The alternative is that you might have to
  find some way of getting your data onto the VM you are developing on.
- **Notebooks.** Azure ML's _compute instances_ come with Jupyter notebooks which can help with quick
  debugging. Moreover, these notebooks can easily be run against different compute infrastructure
  and can be a great way to collaborate.

## Example

We provide a simple example demonstrating the mechanics of the above steps. Consider the following
setup:

```bash
src/
    .azureml/
        config.json     # workspace connection config
    train.py            # python script we are developing
    setup.sh            # to run on compute before train.py
    azureml_run.py      # submit job to azure
```

```bash title="setup.sh"
echo "Running setup script"
pip install numpy
```

```python title="train.py"
import numpy as np
print(np.random.rand())
```

Now from your local machine you can use the Azure ML Python SDK
to execute your command in the cloud:

```python title="azureml_run.py"
from azureml.core import Workspace, Experiment, ScriptRunConfig

# get workspace
ws = Workspace.from_config()
target = ws.compute_targets['cpucluster']
exp = Experiment(ws, 'dev-example')

command = "bash setup.sh && python train.py".split()

# set up script run configuration
config = ScriptRunConfig(
    source_directory='.',
    command=command,
    compute_target=target,
)

# submit script to AML
run = exp.submit(config)
print(run.get_portal_url())  # link to ml.azure.com
run.wait_for_completion(show_output=True)
```

Now if you needed to update your Python environment, for example, you can simply
add commands to `setup.sh`:

```bash title="setup.sh"
echo "Running setup script"
pip install numpy
pip install pandas                 # add additional libraries
export CUDA_VISIBLE_DEVICES="0,1"  # set environment variables
nvidia-smi                         # run helpful command-line tools
```

without having to rebuild any Docker images.
@@ -0,0 +1,95 @@

---
title: コンピューティングターゲット
description: Guide to setting up and using Azure compute resources in Azure ML.
keywords:
- compute
- cpu
- gpu
---

Compute Target (コンピューティングターゲット) は AML の計算環境の概念を抽象化したものです。対象はローカルマシンから Azure VM で構成されるクラスターまで様々です。

### Compute Target の取得

ワークスペース `ws` にある既存の Compute Target の取得:

```python
from azureml.core import ComputeTarget
target = ComputeTarget(ws, '<compute_target_name>')
```

### 既存 Compute Target のリスト

ワークスペース `ws` にある Compute Target のリストの取得:

```python
ComputeTarget.list(ws): List[ComputeTarget]
```

### 空き状況の確認

ワークスペースをチームで共有するときには、ジョブを実行する前にワークスペース `ws` の計算環境が利用可能か確認することがよくあります。

[studio](https://ml.azure.com) から簡単に確認することができます。

![](img/compute-target.png)
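
SDK からも確認できます。最小限のスケッチです (`get_status()` は `AmlCompute` の実際のメソッドですが、出力の詳細は環境によって異なります)。

```python
from azureml.core.compute import AmlCompute

target = ws.compute_targets['<compute_target_name>']
if isinstance(target, AmlCompute):
    status = target.get_status()  # 現在のノード数・状態を取得
    print(status.serialize())
```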

## Compute Target の作成

[studio](https://ml.azure.com) から簡単に新しい Compute Target が作成できます。

"コンピューティング" のメニューを選択 > "コンピューティングクラスター" のタブを選択 > "+ 新規作成" ボタンを選択:

![](img/create-compute.png)

作成時に次の情報を入力します:

- **コンピューティング名**: 後に studio や Python SDK から参照するのに利用されます。入力必須です。名前の長さは 2 から 16 文字の間でなければなりません。有効な文字は英字、数字、- 文字です。
- **仮想マシンの種類**: "CPU" もしくは "GPU"
- **仮想マシンの優先度**: "専用" もしくは "低優先度"
  > 低優先度の仮想マシンは安く使えますが、計算環境の確保を保証していません。ジョブが途中で中断される場合があります。
- **仮想マシンのサイズ**: ドロップダウンリストから選択します。利用可能な仮想マシンのサイズの一覧は[こちら](https://azure.microsoft.com/global-infrastructure/services/?products=virtual-machines)です。
- **最小 / 最大ノード数**: Compute Target は実行されたジョブの数に応じて最小ノード数と最大ノード数の間でオートスケールします。最小ノード数を 0 に設定すると、計算環境上のジョブが完了したときに自動で 0 台に縮小されるため、コストを節約できます。
- **スケールダウンする前のアイドル時間 (秒)**: 計算環境をスケールダウンする前のアイドル時間を指定します。

備考: 計算環境は常に Azure Machine Learning Workspace と同じリージョンに作成されます。

### SSH の利用

管理者ユーザー名とパスワードまたは SSH キーを設定することで、Compute Target に対して SSH で接続できます。

![](img/create-compute-ssh.png)

### 低優先度の Compute Target

低優先度の仮想マシンは安く使えますが、計算環境の確保を保証していません。ジョブが途中で中断される場合があります。

![](img/create-compute-lp.png)

### SDK 経由での作成

SDK 経由での Compute Target の作成:

```python
from azureml.core import Workspace
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

ws = Workspace.from_config()  # .azureml フォルダのファイルから接続情報を参照

# CPU クラスターの名前を選択
cpu_cluster_name = "cpu-cluster"

# 既存のクラスターが無いことを確認
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4,
                                                           idle_seconds_before_scaledown=2400)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)
```
@@ -0,0 +1,319 @@

---
title: Data
description: Guide to working with data in Azure ML.
keywords:
- data
- dataset
- datastore
---

:::note
このコンテンツはお使いの言語では利用できません。
:::

## Concepts

AzureML provides two basic assets for working with data:

- Datastore
- Dataset

### Datastore

Provides an interface for numerous Azure Machine Learning storage accounts.

Each Azure ML workspace comes with a default datastore:

```python
from azureml.core import Workspace
ws = Workspace.from_config()
datastore = ws.get_default_datastore()
```

which can also be accessed directly from the [Azure Portal](https://portal.azure.com) (under the same
resource group as your Azure ML Workspace).

Datastores are attached to workspaces and are used to store connection information to Azure storage services, so you can refer to them by name and don't need to remember the connection information and secrets used to connect to the storage services.

Use this class to perform management operations, including register, list, get, and remove datastores.

### Dataset

A dataset is a reference to data, either in a datastore or behind a public URL.

Datasets provide enhanced capabilities including data lineage (with the notion of versioned datasets).

## Get Datastore

### Default datastore

Each workspace comes with a default datastore.

```python
datastore = ws.get_default_datastore()
```

### Register datastore

Connect to, or create, a datastore backed by one of the multiple data-storage options
that Azure provides. For example:

- Azure Blob Container
- Azure Data Lake (Gen1 or Gen2)
- Azure File Share
- Azure MySQL
- Azure PostgreSQL
- Azure SQL
- Azure Databricks File System

See the SDK for a comprehensive list of datastore types and authentication options:
[Datastores (SDK)](https://docs.microsoft.com/python/api/azureml-core/azureml.core.datastore(class)?view=azure-ml-py).

#### Register a new datastore

- To register a store via an **account key**:

  ```python
  datastores = Datastore.register_azure_blob_container(
      workspace=ws,
      datastore_name='<datastore-name>',
      container_name='<container-name>',
      account_name='<account-name>',
      account_key='<account-key>',
  )
  ```

- To register a store via a **SAS token**:

  ```python
  datastores = Datastore.register_azure_blob_container(
      workspace=ws,
      datastore_name='<datastore-name>',
      container_name='<container-name>',
      account_name='<account-name>',
      sas_token='<sas-token>',
  )
  ```

### Connect to datastore

The workspace object `ws` has access to its datastores via

```python
ws.datastores: Dict[str, Datastore]
```

Any datastore that is registered to the workspace can thus be accessed by name.

```python
datastore = ws.datastores['<name-of-registered-datastore>']
```

### Link datastore to Azure Storage Explorer

The workspace object `ws` is a very powerful handle when it comes to managing assets the
workspace has access to. For example, we can use the workspace to connect to a datastore
in Azure Storage Explorer.

```python
from azureml.core import Workspace
ws = Workspace.from_config()
datastore = ws.datastores['<name-of-datastore>']
```

- For a datastore that was created using an **account key** we can use:

  ```python
  account_name, account_key = datastore.account_name, datastore.account_key
  ```

- For a datastore that was created using a **SAS token** we can use:

  ```python
  sas_token = datastore.sas_token
  ```

The account_name and account_key can then be used directly in Azure Storage Explorer to
connect to the Datastore.

## Blob Datastore

Move data to and from your [AzureBlobDatastore](https://docs.microsoft.com/python/api/azureml-core/azureml.data.azure_storage_datastore.azureblobdatastore?view=azure-ml-py) object `datastore`.

### Upload to Blob Datastore

The AzureBlobDatastore provides APIs for data upload:

```python
datastore.upload(
    src_dir='./data',
    target_path='<path/on/datastore>',
    overwrite=True,
)
```

Alternatively, if you are working with multiple files in different locations you can use

```python
datastore.upload_files(
    files,  # List[str] of absolute paths of files to upload
    target_path='<path/on/datastore>',
    overwrite=False,
)
```

### Download from Blob Datastore

Download the data from the blob container to the local file system.

```python
datastore.download(
    target_path,  # str: local directory to download to
    prefix='<path/on/datastore>',
    overwrite=False,
)
```

### Via Storage Explorer

Azure Storage Explorer is a free tool to easily manage your Azure cloud storage
resources from Windows, macOS, or Linux. Download it from [here](https://azure.microsoft.com/features/storage-explorer/).

Azure Storage Explorer gives you a (graphical) file explorer, so you can literally drag and drop
files into and out of your datastores.

See ["Link datastore to Azure Storage Explorer"](#link-datastore-to-azure-storage-explorer)
above for more details.

## Read from Datastore

Reference data in a `Datastore` in your code, for example to use in a remote setting.

### DataReference

First, connect to your basic assets: `Workspace`, `ComputeTarget` and `Datastore`.

```python
from azureml.core import Workspace
ws: Workspace = Workspace.from_config()
compute_target: ComputeTarget = ws.compute_targets['<compute-target-name>']
ds: Datastore = ws.get_default_datastore()
```

Create a `DataReference`, either as mount:

```python
data_ref = ds.path('<path/on/datastore>').as_mount()
```

or as download:

```python
data_ref = ds.path('<path/on/datastore>').as_download()
```

:::info
To mount a datastore the workspace needs to have read and write access to the underlying storage. For a read-only datastore, `as_download` is the only option.
:::

#### Consume DataReference in ScriptRunConfig

Add this DataReference to a ScriptRunConfig as follows.

```python
config = ScriptRunConfig(
    source_directory='.',
    script='script.py',
    arguments=[str(data_ref)],  # returns environment variable $AZUREML_DATAREFERENCE_example_data
    compute_target=compute_target,
)

config.run_config.data_references[data_ref.data_reference_name] = data_ref.to_config()
```

The command-line argument `str(data_ref)` returns the environment variable `$AZUREML_DATAREFERENCE_example_data`.
Finally, `data_ref.to_config()` instructs the run to mount the data to the compute target and to assign the
above environment variable appropriately.
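
On the script side, the reference then resolves to a local path that arrives as an ordinary command-line argument. A minimal sketch (matching the `script.py` in the config above):

```python
# script.py (sketch): the data reference above resolves to a local mount path
import os
import sys

data_dir = sys.argv[1]  # runtime value of $AZUREML_DATAREFERENCE_example_data
print(os.listdir(data_dir))
```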

#### Without specifying argument

Specify a `path_on_compute` to reference your data without the need for command-line arguments.

```python
data_ref = ds.path('<path/on/datastore>').as_mount()
data_ref.path_on_compute = '/tmp/data'

config = ScriptRunConfig(
    source_directory='.',
    script='script.py',
    compute_target=compute_target,
)

config.run_config.data_references[data_ref.data_reference_name] = data_ref.to_config()
```

## Create Dataset

### From local data

#### Upload to datastore

To upload a local directory `./data/`:

```python
datastore = ws.get_default_datastore()
datastore.upload(src_dir='./data', target_path='<path/on/datastore>', overwrite=True)
```

This will upload the entire directory `./data` from local to the default datastore associated
with your workspace `ws`.

#### Create dataset from files in datastore

To create a dataset from a directory on a datastore at `<path/on/datastore>`:

```python
datastore = ws.get_default_datastore()
dataset = Dataset.File.from_files(path=(datastore, '<path/on/datastore>'))
```
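
To get the lineage and versioning capabilities mentioned under [Concepts](#concepts), you can register the dataset with the workspace. A minimal sketch (`register` and `Dataset.get_by_name` are real SDK methods; the dataset name is illustrative):

```python
from azureml.core import Dataset

# register the dataset so it can be retrieved (and versioned) by name
dataset = dataset.register(
    workspace=ws,
    name='example-dataset',    # illustrative name
    create_new_version=True,   # bump the version if the name already exists
)

# later, retrieve it by name
dataset = Dataset.get_by_name(ws, name='example-dataset')
```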

## Use Dataset

### ScriptRunConfig

To reference data from a dataset in a ScriptRunConfig you can either mount or download the
dataset using:

- `dataset.as_mount(path_on_compute)`: mount dataset to a remote run
- `dataset.as_download(path_on_compute)`: download the dataset to a remote run

**Path on compute** Both `as_mount` and `as_download` accept an (optional) parameter `path_on_compute`.
This defines the path on the compute target where the data is made available.

- If `None`, the data will be downloaded into a temporary directory.
- If `path_on_compute` starts with a `/` it will be treated as an **absolute path**. (If you have
  specified an absolute path, please make sure that the job has permission to write to that directory.)
- Otherwise it will be treated as relative to the working directory.

Reference this data in a remote run, for example in mount-mode:

```python title="run.py"
arguments=[dataset.as_mount()]
config = ScriptRunConfig(source_directory='.', script='train.py', arguments=arguments)
experiment.submit(config)
```

and consumed in `train.py`:

```python title="train.py"
import os
import sys
data_dir = sys.argv[1]

print("===== DATA =====")
print("DATA PATH: " + data_dir)
print("LIST FILES IN DATA DIR...")
print(os.listdir(data_dir))
print("================")
```

For more details: [ScriptRunConfig](script-run-config)
@@ -0,0 +1,75 @@

---
title: Debugging
description: Guide to debugging in Azure ML.
keywords:
- debug
- log files
---

:::note
このコンテンツはお使いの言語では利用できません。
:::

## Azure ML Log Files

Azure ML's log files are an essential resource for debugging your Azure ML workloads.

| Log file | Description |
| --- | --- |
| `20_image_build_log*.txt` | Docker build logs. Only applicable when updating your Environment; otherwise Azure ML will reuse a cached image. <br/><br/> If successful, contains image registry details for the corresponding image. |
| `55_azureml-execution*.txt` | Pulls the image to the compute target. Note: this log only appears once you have secured compute resources. |
| `65_job_prep*.txt` | Job preparation: downloads your code to the compute target and datastores (if requested). |
| **`70_driver_log.txt`** | **The standard output from your script. This is where your code's logs (e.g. print statements) show up.** <br/><br/> In the majority of cases you will monitor the logs here. |
| `75_job_post*.txt` | Job release: sends logs, releases the compute resources back to Azure. |

:::info
You will not necessarily see every file for every run. For example, the `20_image_build_log*.txt` only appears when a new image is built (e.g. when you change your environment).
:::

### Find logs in the Studio

These log files are available via the Studio UI at https://ml.azure.com under Workspace > Experiment >
Run > "Outputs and logs".

![](img/log-files.png)

### Streaming logs

It is also possible to stream these logs directly to your local terminal using a `Run` object,
for example:

```python
from azureml.core import Workspace, Experiment, ScriptRunConfig
ws = Workspace.from_config()
config = ScriptRunConfig(...)
run = Experiment(ws, 'my-amazing-experiment').submit(config)
run.wait_for_completion(show_output=True)
```
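
You can also download a run's log files for offline inspection. A minimal sketch (`Run.get_all_logs` is a real SDK method; the destination directory is illustrative):

```python
# download all log files for the run into a local directory
run.get_all_logs(destination='./logs')
```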

## SSH

It can be useful to SSH into your compute for a variety of reasons, including to assist in debugging.

:::warning Enable SSH at compute creation
SSH needs to be enabled when you create the compute instance / target - see [Compute Targets](compute-targets#with-ssh) for details.
:::

1. Get the **public IP** and **port number** for your compute.

   Visit [ml.azure.com](https://ml.azure.com/) > select the "Compute" tab > locate the desired compute instance / target.

   **Note.** The compute needs to be running in order to connect.
   - In the case of a compute instance this just requires turning it on.
   - For compute targets there should be something running on the cluster. In this case you can select the "Nodes" tab of the cluster ([ml.azure.com](https://ml.azure.com/) > Compute > _your compute target_ > Nodes) to get the public IP & port number for each node.

2. Open your favorite shell and run:

   ```bash
   ssh azureuser@<public-ip> -p <port-number>
   ```

:::info SSH key pair using RSA
We recommend setting up an SSH public-private key pair: see [here](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/mac-create-ssh-keys) for more details.
:::
@@ -0,0 +1,349 @@

---
title: Distributed GPU Training
id: distributed-training
description: Guide to distributed training in Azure ML.
keywords:
- distributed training
- mpi
- process group
- pytorch
- horovod
- tensorflow
---

:::note
このコンテンツはお使いの言語では利用できません。
:::

## Basic Concepts

We assume readers already understand the basic concepts of distributed GPU training, such as _data parallelism, distributed data parallelism, and model parallelism_. This guide aims to help readers run their existing distributed training code on Azure ML.

:::info
If you don't know which type of parallelism to use, for >90% of the time you should use __Distributed Data Parallelism__.
:::

## MPI

Azure ML offers an MPI job to launch a given number of processes in each node. Users can adopt this approach to run distributed training using either a per-process launcher or a per-node launcher, depending on whether `process_count_per_node` is set to 1 (the default) for per-node launch, or equal to the number of devices/GPUs for per-process launch. Azure ML constructs the full MPI launch command (`mpirun`) behind the scenes.

:::note
Azure ML currently does not allow users to provide the full head-node-launcher command like `mpirun` or the DeepSpeed launcher. This functionality may be added in a future release.
:::

:::caution
To use the Azure ML MPI job, the base Docker image used by the job needs to have an MPI library installed. [Open MPI](https://www.open-mpi.org/) is included in all the [AzureML GPU base images](https://github.com/Azure/AzureML-Containers). If you are using a custom Docker image, you are responsible for making sure the image includes an MPI library. Open MPI is recommended, but you can also use a different MPI implementation such as Intel MPI. Azure ML also provides [curated environments](https://docs.microsoft.com/en-us/azure/machine-learning/resource-curated-environments) for popular frameworks.
:::

To run distributed training using MPI, follow these steps:
1. Use an Azure ML environment with the preferred deep learning framework and MPI. Azure ML provides [curated environments](https://docs.microsoft.com/en-us/azure/machine-learning/resource-curated-environments) for popular frameworks.
2. Define `MpiConfiguration` with the desired `process_count_per_node` and `node_count`. `process_count_per_node` should be equal to the number of GPUs per node for per-process launch, or set to 1 (the default) for per-node launch if the user script will be responsible for launching the processes per node.
3. Pass the `MpiConfiguration` object to the `distributed_job_config` parameter of `ScriptRunConfig`.

```python
from azureml.core import Workspace, ScriptRunConfig, Environment, Experiment
from azureml.core.runconfig import MpiConfiguration

curated_env_name = 'AzureML-PyTorch-1.6-GPU'
pytorch_env = Environment.get(workspace=ws, name=curated_env_name)
distr_config = MpiConfiguration(process_count_per_node=4, node_count=2)

run_config = ScriptRunConfig(
    source_directory='./src',
    script='train.py',
    compute_target=compute_target,
    environment=pytorch_env,
    distributed_job_config=distr_config,
)

# submit the run configuration to start the job
run = Experiment(ws, "experiment_name").submit(run_config)
```

### Horovod

If you are using [Horovod](https://horovod.readthedocs.io/en/stable/index.html) for distributed training with the deep learning framework of your choice, you can run distributed training on Azure ML using the MPI job configuration.

Simply ensure that you have taken care of the following:
* The training code is instrumented correctly with Horovod.
* Your Azure ML environment contains Horovod and MPI. The PyTorch and TensorFlow curated GPU environments come pre-configured with Horovod and its dependencies.
* Create an `MpiConfiguration` with your desired distribution.

#### Example
* [azureml-examples: TensorFlow distributed training using Horovod](https://github.com/Azure/azureml-examples/tree/main/workflows/train/tensorflow/mnist-distributed-horovod)

### DeepSpeed

To run distributed training with the [DeepSpeed](https://www.deepspeed.ai/) library on Azure ML, do not use DeepSpeed's custom launcher. Instead, configure an MPI job to launch the training job [with MPI](https://www.deepspeed.ai/getting-started/#mpi-and-azureml-compatibility).

Ensure that you have taken care of the following:
* Your Azure ML environment contains DeepSpeed and its dependencies, Open MPI, and mpi4py.
* Create an `MpiConfiguration` with your desired distribution.

#### Example
* [azureml-examples: Distributed training with DeepSpeed on CIFAR-10](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed/cifar)

### Environment variables from Open MPI

When running MPI jobs with Open MPI images, the following environment variables are set for each process launched:
1. `OMPI_COMM_WORLD_RANK` - the rank of the process
2. `OMPI_COMM_WORLD_SIZE` - the world size
3. `AZ_BATCH_MASTER_NODE` - the master address with port, `MASTER_ADDR:MASTER_PORT`
4. `OMPI_COMM_WORLD_LOCAL_RANK` - the local rank of the process on the node
5. `OMPI_COMM_WORLD_LOCAL_SIZE` - the number of processes on the node

:::caution
Despite the name, the environment variable `OMPI_COMM_WORLD_NODE_RANK` does not correspond to the `NODE_RANK`. To use a per-node launcher, simply set `process_count_per_node=1` and use `OMPI_COMM_WORLD_RANK` as the `NODE_RANK`.
:::
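
A training script launched this way can translate these Open MPI variables into the ones `torch.distributed` expects. A minimal sketch (assuming a multi-node MPI job, where `AZ_BATCH_MASTER_NODE` is set):

```python
# sketch: map Open MPI environment variables to torch.distributed conventions
import os

master_addr, master_port = os.environ['AZ_BATCH_MASTER_NODE'].split(':')
os.environ['MASTER_ADDR'] = master_addr
os.environ['MASTER_PORT'] = master_port
os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
os.environ['LOCAL_RANK'] = os.environ['OMPI_COMM_WORLD_LOCAL_RANK']
```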

## PyTorch

Azure ML also supports running distributed jobs using PyTorch's native distributed training capabilities (`torch.distributed`).

:::tip torch.nn.parallel.DistributedDataParallel vs torch.nn.DataParallel and torch.multiprocessing
For data parallelism, the [official PyTorch guidance](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html#comparison-between-dataparallel-and-distributeddataparallel) is to use DistributedDataParallel (DDP) over DataParallel for both single-node and multi-node distributed training. PyTorch also [recommends using DistributedDataParallel over the multiprocessing package](https://pytorch.org/docs/stable/notes/cuda.html#use-nn-parallel-distributeddataparallel-instead-of-multiprocessing-or-nn-dataparallel). Azure ML documentation and examples will therefore focus on DistributedDataParallel training.
:::

### Process group initialization

The backbone of any distributed training is a group of processes that know each other and can communicate with each other using a backend. For PyTorch, the process group is created by calling [torch.distributed.init_process_group](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) in __all distributed processes__ to collectively form a process group.

```python
torch.distributed.init_process_group(backend='nccl', init_method='env://', ...)
```

The most common communication backends used are __mpi__, __nccl__ and __gloo__. For GPU-based training, __nccl__ is strongly recommended for best performance and should be used whenever possible.

`init_method` specifies how the processes discover each other and initialize as well as verify the process group using the communication backend. By default, if `init_method` is not specified, PyTorch will use the environment variable initialization method (`env://`). This is also the recommended initialization method to use in your training code to run distributed PyTorch on Azure ML. For environment variable initialization, PyTorch will look for the following environment variables:

- **MASTER_ADDR** - IP address of the machine that will host the process with rank 0.
- **MASTER_PORT** - A free port on the machine that will host the process with rank 0.
- **WORLD_SIZE** - The total number of processes. This should be equal to the total number of devices (GPUs) used for distributed training.
- **RANK** - The (global) rank of the current process. The possible values are 0 to (world size - 1).

For more information on process group initialization, see the [PyTorch documentation](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group).

Beyond these, many applications will also need the following environment variables:
- **LOCAL_RANK** - The local (relative) rank of the process within the node. The possible values are 0 to (# of processes on the node - 1). This information is useful because many operations, such as data preparation, should only be performed once per node, usually on local_rank = 0.
- **NODE_RANK** - The rank of the node for multi-node training. The possible values are 0 to (total # of nodes - 1).
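
Inside a training script, a typical DDP setup consumes these variables via the `env://` method. A minimal sketch (the model here is a placeholder):

```python
# sketch: typical DistributedDataParallel setup in train.py
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend='nccl', init_method='env://')  # reads MASTER_ADDR/PORT, RANK, WORLD_SIZE

local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)

model = torch.nn.Linear(16, 2).cuda(local_rank)  # placeholder model
model = DDP(model, device_ids=[local_rank])
```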
|
||||
|
||||
### Launch options
|
||||
|
||||
The Azure ML PyTorch job supports two types of options for launching distributed training:
|
||||
|
||||
1. __Per-process-launcher__: The system will launch all distributed processes for the user, with all the relevant information (e.g. environment variables) to set up the process group.
|
||||
2. __Per-node-launcher__: The user provides Azure ML with the utility launcher that will get run on each node. The utility launcher will handle launching each of the processes on a given node. Locally within each node, RANK and LOCAL_RANK is set up by the launcher. The **torch.distributed.launch** utility and PyTorch Lightning both belong in this category.
|
||||
|
||||
There are no fundamental differences between these launch options; it is largely up to the user's preference or the conventions of the frameworks/libraries built on top of vanilla PyTorch (such as Lightning or Hugging Face).
|
||||
|
||||
The following sections go into more detail on how to configure Azure ML PyTorch jobs for each of the launch options.
|
||||
|
||||
### DistributedDataParallel (per-process-launch)
|
||||
|
||||
Azure ML supports launching each process for the user without the user needing to use a launcher utility like `torch.distributed.launch`.
|
||||
|
||||
To run a distributed PyTorch job, you will just need to do the following:
|
||||
1. Specify the training script and arguments
|
||||
2. Create a `PyTorchConfiguration` and specify the `process_count` as well as the `node_count`. The `process_count` corresponds to the total number of processes you want to run for your job. This should typically equal `# GPUs per node x # nodes`. If `process_count` is not specified, Azure ML will by default launch one process per node.
|
||||
|
||||
Azure ML will set the MASTER_ADDR, MASTER_PORT, WORLD_SIZE, and NODE_RANK environment variables on each node, in addition to setting the process-level RANK and LOCAL_RANK environment variables.
|
||||
|
||||
:::caution
|
||||
In order to use this option for multi-process-per-node training, you will need to use Azure ML Python SDK `>= 1.22.0`, as process_count was introduced in 1.22.0.
|
||||
:::
|
||||
|
||||
```python
|
||||
from azureml.core import ScriptRunConfig, Environment, Experiment
|
||||
from azureml.core.runconfig import PyTorchConfiguration
|
||||
|
||||
curated_env_name = 'AzureML-PyTorch-1.6-GPU'
|
||||
pytorch_env = Environment.get(workspace=ws, name=curated_env_name)
|
||||
distr_config = PyTorchConfiguration(process_count=8, node_count=2)
|
||||
|
||||
run_config = ScriptRunConfig(
|
||||
source_directory='./src',
|
||||
script='train.py',
|
||||
arguments=['--epochs', 50],
|
||||
compute_target=compute_target,
|
||||
environment=pytorch_env,
|
||||
distributed_job_config=distr_config,
|
||||
)
|
||||
|
||||
run = Experiment(ws, 'experiment_name').submit(run_config)
|
||||
```
|
||||
|
||||
:::tip
|
||||
If your training script passes information like local rank or rank as script arguments, you can reference the environment variable(s) in the arguments:
|
||||
`arguments=['--epochs', 50, '--local_rank', $LOCAL_RANK]`.
|
||||
:::
|
||||
|
||||
#### Example
|
||||
- [azureml-examples: Distributed training with PyTorch on CIFAR-10](https://github.com/Azure/azureml-examples/tree/main/workflows/train/pytorch/cifar-distributed)
|
||||
|
||||
### Using `torch.distributed.launch` (per-node-launch)
|
||||
|
||||
PyTorch provides a launch utility in [torch.distributed.launch](https://pytorch.org/docs/stable/distributed.html#launch-utility) that users can use to launch multiple processes per node. The `torch.distributed.launch` module will spawn multiple training processes on each of the nodes.
|
||||
|
||||
The following steps will demonstrate how to configure a PyTorch job with a per-node-launcher on Azure ML that will achieve the equivalent of running the following command:
|
||||
|
||||
```bash
python -m torch.distributed.launch --nproc_per_node <num processes per node> \
    --nnodes <num nodes> --node_rank $NODE_RANK --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT --use_env \
    <your training script> <your script arguments>
```
|
||||
|
||||
1. Provide the `torch.distributed.launch` command to the `command` parameter of the `ScriptRunConfig` constructor. Azure ML will run this command on each node of your training cluster. `--nproc_per_node` should be less than or equal to the number of GPUs available on each node. MASTER_ADDR, MASTER_PORT, and NODE_RANK are all set by Azure ML, so you can simply reference these environment variables in the command. Azure ML sets MASTER_PORT to `6105`, but you can pass a different value to the `--master_port` argument of the `torch.distributed.launch` command if you wish. (The launch utility will reset the environment variables.)
|
||||
2. Create a `PyTorchConfiguration` and specify the `node_count`.
|
||||
|
||||
```python
|
||||
from azureml.core import ScriptRunConfig, Environment, Experiment
|
||||
from azureml.core.runconfig import PyTorchConfiguration
|
||||
|
||||
curated_env_name = 'AzureML-PyTorch-1.6-GPU'
|
||||
pytorch_env = Environment.get(workspace=ws, name=curated_env_name)
|
||||
distr_config = PyTorchConfiguration(node_count=2)
|
||||
launch_cmd = "python -m torch.distributed.launch --nproc_per_node 4 --nnodes 2 --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT --use_env train.py --epochs 50".split()
|
||||
|
||||
run_config = ScriptRunConfig(
|
||||
source_directory='./src',
|
||||
command=launch_cmd,
|
||||
compute_target=compute_target,
|
||||
environment=pytorch_env,
|
||||
distributed_job_config=distr_config,
|
||||
)
|
||||
|
||||
run = Experiment(ws, 'experiment_name').submit(run_config)
|
||||
```
|
||||
|
||||
:::tip Single-node multi-GPU training
|
||||
If you are using the launch utility to run single-node multi-GPU PyTorch training, you do not need to specify the `distributed_job_config` parameter of ScriptRunConfig.
|
||||
|
||||
```python
|
||||
launch_cmd = "python -m torch.distributed.launch --nproc_per_node 4 --use_env train.py --epochs 50".split()
|
||||
|
||||
run_config = ScriptRunConfig(
|
||||
source_directory='./src',
|
||||
command=launch_cmd,
|
||||
compute_target=compute_target,
|
||||
environment=pytorch_env,
|
||||
)
|
||||
```
|
||||
:::
|
||||
|
||||
#### Example
|
||||
- [azureml-examples: Distributed training with PyTorch on CIFAR-10](https://github.com/Azure/azureml-examples/tree/main/workflows/train/pytorch/cifar-distributed)
|
||||
|
||||
### PyTorch Lightning
|
||||
|
||||
[PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/) is a lightweight open-source library that provides a high-level interface for PyTorch. Lightning abstracts away much of the lower-level distributed training configuration required for vanilla PyTorch, and allows users to run their training scripts in single GPU, single-node multi-GPU, and multi-node multi-GPU settings. Behind the scenes, it launches multiple processes for the user, similar to `torch.distributed.launch`.
|
||||
|
||||
For single-node training (including single-node multi-GPU), you can run your code on Azure ML without needing to specify a `distributed_job_config`. For multi-node training, Lightning requires the following environment variables to be set on each node of your training cluster:
|
||||
|
||||
- MASTER_ADDR
|
||||
- MASTER_PORT
|
||||
- NODE_RANK
|
||||
|
||||
To run multi-node Lightning training on Azure ML, you can largely follow the [per-node-launch guide](#using-torchdistributedlaunch-per-node-launch):
|
||||
|
||||
- Define the `PyTorchConfiguration` and specify the desired `node_count`. Do not specify `process_count` as Lightning internally handles launching the worker processes for each node.
|
||||
- For PyTorch jobs, Azure ML handles setting the MASTER_ADDR, MASTER_PORT, and NODE_RANK environment variables required by Lightning.
|
||||
- Lightning will handle computing the world size from the Trainer flags `--gpus` and `--num_nodes` and manage rank and local rank internally.
|
||||
|
||||
```python
|
||||
from azureml.core import ScriptRunConfig, Experiment
|
||||
from azureml.core.runconfig import PyTorchConfiguration
|
||||
|
||||
nnodes = 2
|
||||
args = ['--max_epochs', 50, '--gpus', 2, '--accelerator', 'ddp', '--num_nodes', nnodes]
|
||||
distr_config = PyTorchConfiguration(node_count=nnodes)
|
||||
|
||||
run_config = ScriptRunConfig(
|
||||
source_directory='./src',
|
||||
script='train.py',
|
||||
arguments=args,
|
||||
compute_target=compute_target,
|
||||
environment=pytorch_env,
|
||||
distributed_job_config=distr_config,
|
||||
)
|
||||
|
||||
run = Experiment(ws, 'experiment_name').submit(run_config)
|
||||
```
|
||||
|
||||
#### Example
|
||||
* [azureml-examples: Multi-node training with PyTorch Lightning](https://github.com/Azure/azureml-examples/blob/main/tutorials/using-pytorch-lightning/4.train-multi-node-ddp.ipynb)
|
||||
|
||||
### Hugging Face Transformers
|
||||
|
||||
Hugging Face provides many [examples](https://github.com/huggingface/transformers/tree/master/examples) for using its Transformers library with `torch.distributed.launch` to run distributed training. To run these examples and your own custom training scripts using the Transformers Trainer API, follow the [Using `torch.distributed.launch`](#using-torchdistributedlaunch-per-node-launch) section.
|
||||
|
||||
Sample job configuration code to fine-tune the BERT large model on the text classification MNLI task using the `run_glue.py` script on one node with 8 GPUs:
|
||||
```python
|
||||
from azureml.core import ScriptRunConfig
|
||||
from azureml.core.runconfig import PyTorchConfiguration
|
||||
|
||||
distr_config = PyTorchConfiguration() # node_count defaults to 1
|
||||
launch_cmd = "python -m torch.distributed.launch --nproc_per_node 8 text-classification/run_glue.py --model_name_or_path bert-large-uncased-whole-word-masking --task_name mnli --do_train --do_eval --max_seq_length 128 --per_device_train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 3.0 --output_dir /tmp/mnli_output".split()
|
||||
|
||||
run_config = ScriptRunConfig(
|
||||
source_directory='./src',
|
||||
command=launch_cmd,
|
||||
compute_target=compute_target,
|
||||
environment=pytorch_env,
|
||||
distributed_job_config=distr_config,
|
||||
)
|
||||
```
|
||||
|
||||
You can also use the [per-process-launch](#distributeddataparallel-per-process-launch) option to run distributed training without using `torch.distributed.launch`. One thing to keep in mind if using this method is that the transformers [TrainingArguments](https://huggingface.co/transformers/main_classes/trainer.html?highlight=launch#trainingarguments) class expects the local rank to be passed in as an argument (`--local_rank`). `torch.distributed.launch` takes care of this when `--use_env=False`, but if you are using per-process-launch you will need to explicitly pass `--local_rank=$LOCAL_RANK` as an argument to the training script, since Azure ML only sets the LOCAL_RANK environment variable.
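For illustration, a sketch of that per-process variant of the job above, assuming `compute_target` and `pytorch_env` are defined as in the earlier examples (the argument list is abbreviated):

```python
# Sketch: per-process launch of run_glue.py; Azure ML starts all 8 processes,
# and the local rank is passed explicitly since only the env variable is set.
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import PyTorchConfiguration

distr_config = PyTorchConfiguration(process_count=8, node_count=1)

run_config = ScriptRunConfig(
    source_directory='./src',
    script='text-classification/run_glue.py',
    arguments=[
        '--model_name_or_path', 'bert-large-uncased-whole-word-masking',
        '--task_name', 'mnli', '--do_train', '--do_eval',
        '--output_dir', '/tmp/mnli_output',
        '--local_rank', '$LOCAL_RANK',
    ],
    compute_target=compute_target,   # assumed defined as above
    environment=pytorch_env,         # assumed defined as above
    distributed_job_config=distr_config,
)
```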
|
||||
|
||||
## TensorFlow
|
||||
|
||||
If you are using [native distributed TensorFlow](https://www.tensorflow.org/guide/distributed_training) in your training code, such as TensorFlow 2.x's `tf.distribute.Strategy` API, you can launch the distributed job via Azure ML using the `TensorflowConfiguration`.
|
||||
|
||||
To do so, specify a `TensorflowConfiguration` object to the `distributed_job_config` parameter of the `ScriptRunConfig` constructor. If you are using `tf.distribute.experimental.MultiWorkerMirroredStrategy`, specify the `worker_count` in the `TensorflowConfiguration` corresponding to the number of nodes for your training job.
|
||||
|
||||
```python
|
||||
from azureml.core import ScriptRunConfig, Environment, Experiment
|
||||
from azureml.core.runconfig import TensorflowConfiguration
|
||||
|
||||
curated_env_name = 'AzureML-TensorFlow-2.3-GPU'
|
||||
tf_env = Environment.get(workspace=ws, name=curated_env_name)
|
||||
distr_config = TensorflowConfiguration(worker_count=2, parameter_server_count=0)
|
||||
|
||||
run_config = ScriptRunConfig(
|
||||
source_directory='./src',
|
||||
script='train.py',
|
||||
compute_target=compute_target,
|
||||
environment=tf_env,
|
||||
distributed_job_config=distr_config,
|
||||
)
|
||||
|
||||
# submit the run configuration to start the job
|
||||
run = Experiment(ws, "experiment_name").submit(run_config)
|
||||
```
|
||||
|
||||
If your training script uses the parameter server strategy for distributed training, i.e. for legacy TensorFlow 1.x, you will also need to specify the number of parameter servers to use in the job, e.g. `tf_config = TensorflowConfiguration(worker_count=2, parameter_server_count=1)`.
|
||||
|
||||
### TF_CONFIG
|
||||
|
||||
In TensorFlow, the **TF_CONFIG** environment variable is required for training on multiple machines. For TensorFlow jobs, Azure ML will configure and set the TF_CONFIG variable appropriately for each worker before executing your training script.
|
||||
|
||||
You can access TF_CONFIG from your training script if you need to: `os.environ['TF_CONFIG']`.
|
||||
|
||||
Example TF_CONFIG set on a chief worker node:
|
||||
```json
|
||||
TF_CONFIG='{
|
||||
"cluster": {
|
||||
"worker": ["host0:2222", "host1:2222"]
|
||||
},
|
||||
"task": {"type": "worker", "index": 0},
|
||||
"environment": "cloud"
|
||||
}'
|
||||
```
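As a quick illustration, a `tf.distribute` training script generally does not need to parse TF_CONFIG by hand; creating the strategy picks it up from the environment. A minimal sketch (the model and compilation settings are placeholders):

```python
# train.py -- TF_CONFIG is read automatically when the strategy is created
import tensorflow as tf

strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

with strategy.scope():
    # any model definition works here; this one is a trivial placeholder
    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(10,))])
    model.compile(optimizer='adam', loss='mse')
```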
|
||||
|
||||
#### Example
|
||||
- [azureml-examples: Distributed TensorFlow training with MultiWorkerMirroredStrategy](https://github.com/Azure/azureml-examples/tree/main/workflows/train/tensorflow/mnist-distributed)
|
||||
|
||||
## Accelerating GPU training with InfiniBand
|
||||
|
||||
Certain Azure VM series, specifically the NC, ND, and H-series, now have RDMA-capable VMs with SR-IOV and InfiniBand support. These VMs communicate over the low-latency, high-bandwidth InfiniBand network, which is much more performant than Ethernet-based connectivity. SR-IOV for InfiniBand enables near bare-metal performance for any MPI library (MPI is leveraged by many distributed training frameworks and tools, including NVIDIA's NCCL software). These SKUs are intended to meet the needs of computationally intensive, GPU-accelerated machine learning workloads. For more information, see [Accelerating Distributed Training in Azure Machine Learning with SR-IOV](https://techcommunity.microsoft.com/t5/azure-ai/accelerating-distributed-training-in-azure-machine-learning/ba-p/1059050).
|
||||
|
||||
If you create an `AmlCompute` cluster of one of these RDMA-capable, InfiniBand-enabled sizes, such as `Standard_ND40rs_v2`, the OS image will come with the Mellanox OFED driver required to enable InfiniBand preinstalled and preconfigured.
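As a sketch, provisioning such a cluster looks the same as for any other `AmlCompute` size; only the `vm_size` differs (assumes `ws` is your `Workspace`):

```python
# Sketch: provisioning an RDMA-capable, InfiniBand-enabled cluster
from azureml.core.compute import AmlCompute, ComputeTarget

config = AmlCompute.provisioning_configuration(
    vm_size='Standard_ND40rs_v2',  # InfiniBand-enabled SKU
    max_nodes=2,
)
cluster = ComputeTarget.create(ws, 'ib-cluster', config)
cluster.wait_for_completion(show_output=True)
```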
|
|
---
|
||||
title: 'Azure ML Containers'
|
||||
description: Guide to containers in Azure ML.
|
||||
keywords:
|
||||
- containers
|
||||
- dockerfile
|
||||
- docker
|
||||
- environment
|
||||
---
|
||||
|
||||
:::note
|
||||
This content is not available in your language.
|
||||
:::
|
||||
|
||||
In this post we explain how Azure ML builds the containers used to run your code.
|
||||
|
||||
## Dockerfile
|
||||
|
||||
Each job in Azure ML runs with an associated `Environment`. In practice, each environment
|
||||
corresponds to a Docker image.
|
||||
|
||||
There are numerous ways to define an environment - from specifying a set of required Python packages
|
||||
through to directly providing a custom Docker image. In each case the contents of the associated
|
||||
dockerfile are available directly from the environment object.
|
||||
|
||||
For more background: [Environment](environment)
|
||||
|
||||
#### Example
|
||||
|
||||
Suppose you create an environment - in this example we will work with Conda:
|
||||
|
||||
```yml title="env.yml"
|
||||
name: pytorch
|
||||
channels:
|
||||
- defaults
|
||||
- pytorch
|
||||
dependencies:
|
||||
- python=3.7
|
||||
- pytorch
|
||||
- torchvision
|
||||
```
|
||||
|
||||
We can create and register this as an `Environment` in our workspace `ws` as follows:
|
||||
|
||||
```python
|
||||
from azureml.core import Environment
|
||||
env = Environment.from_conda_specification('pytorch', 'env.yml')
|
||||
env.register(ws)
|
||||
```
|
||||
|
||||
In order to consume this environment in a remote run, Azure ML builds a docker image
|
||||
that creates the corresponding python environment.
|
||||
|
||||
The dockerfile used to build this image is available directly from the environment object.
|
||||
|
||||
```python
|
||||
details = env.get_image_details(ws)
|
||||
print(details['ingredients']['dockerfile'])
|
||||
```
|
||||
|
||||
Let's take a look:
|
||||
|
||||
```docker title="Dockerfile" {1,7-12}
|
||||
FROM mcr.microsoft.com/azureml/intelmpi2018.3-ubuntu16.04:20200821.v1@sha256:8cee6f674276dddb23068d2710da7f7f95b119412cc482675ac79ba45a4acf99
|
||||
USER root
|
||||
RUN mkdir -p $HOME/.cache
|
||||
WORKDIR /
|
||||
COPY azureml-environment-setup/99brokenproxy /etc/apt/apt.conf.d/
|
||||
RUN if dpkg --compare-versions `conda --version | grep -oE '[^ ]+$'` lt 4.4.11; then conda install conda==4.4.11; fi
|
||||
COPY azureml-environment-setup/mutated_conda_dependencies.yml azureml-environment-setup/mutated_conda_dependencies.yml
|
||||
RUN ldconfig /usr/local/cuda/lib64/stubs && conda env create -p /azureml-envs/azureml_7459a71437df47401c6a369f49fbbdb6 -f azureml-environment-setup/mutated_conda_dependencies.yml && rm -rf "$HOME/.cache/pip" && conda clean -aqy && CONDA_ROOT_DIR=$(conda info --root) && rm -rf "$CONDA_ROOT_DIR/pkgs" && find "$CONDA_ROOT_DIR" -type d -name __pycache__ -exec rm -rf {} + && ldconfig
|
||||
# AzureML Conda environment name: azureml_7459a71437df47401c6a369f49fbbdb6
|
||||
ENV PATH /azureml-envs/azureml_7459a71437df47401c6a369f49fbbdb6/bin:$PATH
|
||||
ENV AZUREML_CONDA_ENVIRONMENT_PATH /azureml-envs/azureml_7459a71437df47401c6a369f49fbbdb6
|
||||
ENV LD_LIBRARY_PATH /azureml-envs/azureml_7459a71437df47401c6a369f49fbbdb6/lib:$LD_LIBRARY_PATH
|
||||
COPY azureml-environment-setup/spark_cache.py azureml-environment-setup/log4j.properties /azureml-environment-setup/
|
||||
RUN if [ $SPARK_HOME ]; then /bin/bash -c '$SPARK_HOME/bin/spark-submit /azureml-environment-setup/spark_cache.py'; fi
|
||||
ENV AZUREML_ENVIRONMENT_IMAGE True
|
||||
CMD ["bash"]
|
||||
```
|
||||
|
||||
Notice:
|
||||
|
||||
- The base image here is a standard image maintained by Azure ML. Dockerfiles for all base images are available on
|
||||
GitHub: https://github.com/Azure/AzureML-Containers
|
||||
- The dockerfile references `mutated_conda_dependencies.yml` to build the Python environment via Conda.
|
||||
|
||||
Get the contents of `mutated_conda_dependencies.yml` from the environment:
|
||||
|
||||
```python
|
||||
print(env.python.conda_dependencies.serialize_to_string())
|
||||
```
|
||||
|
||||
This looks like:
|
||||
|
||||
```yaml title="mutated_conda_dependencies.yml"
|
||||
channels:
|
||||
- defaults
|
||||
- pytorch
|
||||
dependencies:
|
||||
- python=3.7
|
||||
- pytorch
|
||||
- torchvision
|
||||
name: azureml_7459a71437df47401c6a369f49fbbdb6
|
||||
```
|
|
---
|
||||
title: Environment
|
||||
description: Guide to working with Python environments in Azure ML.
|
||||
keywords:
|
||||
- environment
|
||||
- python
|
||||
- conda
|
||||
- pip
|
||||
- docker
|
||||
- environment variables
|
||||
---
|
||||
|
||||
:::note
|
||||
This content is not available in your language.
|
||||
:::
|
||||
|
||||
Azure ML Environments are used to define the containers where your code will run. In the simplest case you can add custom Python libraries using pip, Conda or directly via the Azure ML Python SDK. If more customization is necessary you can use custom docker images.
|
||||
|
||||
This page provides examples of creating environments:
|
||||
|
||||
- From pip `requirements.txt` file
|
||||
- From Conda `env.yml` file
|
||||
- Directly via the Azure ML Python SDK
|
||||
- From custom Docker image
|
||||
|
||||
|
||||
## Azure ML Managed Python Environments
|
||||
|
||||
### From pip
|
||||
|
||||
Create Environment from pip `requirements.txt` file
|
||||
|
||||
```python
|
||||
from azureml.core import Environment
|
||||
env = Environment.from_pip_requirements('<env-name>', '<path/to/requirements.txt>')
|
||||
```
|
||||
|
||||
### From Conda
|
||||
|
||||
Create Environment from Conda `env.yml` file
|
||||
|
||||
```python
|
||||
from azureml.core import Environment
|
||||
env = Environment.from_conda_specification('<env-name>', '<path/to/env.yml>')
|
||||
```
|
||||
|
||||
### In Azure ML SDK
|
||||
|
||||
Use the `CondaDependencies` class to create a Python environment directly with the Azure ML
|
||||
Python SDK:
|
||||
|
||||
```python
|
||||
from azureml.core import Environment
|
||||
from azureml.core.conda_dependencies import CondaDependencies
|
||||
|
||||
conda = CondaDependencies()
|
||||
|
||||
# add channels
|
||||
conda.add_channel('pytorch')
|
||||
|
||||
# add conda packages
|
||||
conda.add_conda_package('python=3.7')
|
||||
conda.add_conda_package('pytorch')
|
||||
conda.add_conda_package('torchvision')
|
||||
|
||||
# add pip packages
|
||||
conda.add_pip_package('pyyaml')
|
||||
conda.add_pip_package('mpi4py')
|
||||
conda.add_pip_package('deepspeed')
|
||||
|
||||
# create environment
|
||||
env = Environment('pytorch')
|
||||
env.python.conda_dependencies = conda
|
||||
```
|
||||
|
||||
## Custom docker image / dockerfile
|
||||
|
||||
To create an `Environment` from a custom docker image:
|
||||
|
||||
```python
|
||||
import os
from azureml.core import Environment

env = Environment('<env-name>')
|
||||
env.docker.base_image = '<image-name>'
|
||||
env.docker.base_image_registry.address = '<container-registry-address>'
|
||||
env.docker.base_image_registry.username = '<acr-username>'
|
||||
env.docker.base_image_registry.password = os.environ.get("CONTAINER_PASSWORD")
|
||||
# optional
|
||||
env.python.user_managed_dependencies = True
|
||||
env.python.interpreter_path = '/opt/miniconda/envs/example/bin/python'
|
||||
```
|
||||
|
||||
For example, Azure Container Registry addresses are of the form `<acr-name>.azurecr.io`.
|
||||
|
||||
**Never check in passwords**. In this example we provide the password via an environment variable.
|
||||
|
||||
To create an `Environment` from a dockerfile:
|
||||
|
||||
```python
|
||||
env = Environment('<env-name>')
|
||||
env.docker.base_dockerfile = './Dockerfile' # path to your dockerfile
|
||||
# optional
|
||||
env.python.user_managed_dependencies = True
|
||||
env.python.interpreter_path = '/opt/miniconda/envs/example/bin/python'
|
||||
```
|
||||
**Remarks.**
|
||||
|
||||
- `user_managed_dependencies = True`: You are responsible for installing all necessary Python
|
||||
libraries, typically in your docker image.
|
||||
- `interpreter_path`: Only used when `user_managed_dependencies=True` and sets the Python interpreter
|
||||
path (e.g. `which python`).
|
||||
|
||||
|
||||
It is possible to have Azure ML manage your Python installation when providing a custom base image. For example, using pip `requirements.txt`:
|
||||
|
||||
```python
|
||||
env = Environment.from_pip_requirements('<env-name>', '<path/to/requirements.txt>')
|
||||
env.docker.base_dockerfile = './Dockerfile'
|
||||
```
|
||||
|
||||
**Note.** In this case Python libraries installed in `Dockerfile` will **not** be available.
|
||||
|
||||
### Build custom docker image for Azure ML
|
||||
|
||||
We **strongly** recommend building your docker image from one of the Azure ML base images available
|
||||
here: [AzureML-Containers GitHub Repo](https://github.com/Azure/AzureML-Containers) - like this:
|
||||
|
||||
```dockerfile title="Dockerfile"
|
||||
FROM mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04
|
||||
...
|
||||
```
|
||||
|
||||
These images come configured with all the requirements to run on Azure ML.
|
||||
|
||||
If you want to build an image from scratch, here is a list of requirements and recommendations to keep in mind:
|
||||
- **Conda**: Azure ML uses Conda to manage Python environments by default. If you intend to let Azure ML manage the Python environment, Conda is required.
- **libfuse**: Required when using `Dataset`
- **OpenMPI**: Required for distributed runs
- **nvidia/cuda**: (Recommended) For GPU-based training, build the image from [nvidia/cuda](https://hub.docker.com/r/nvidia/cuda)
- **Mellanox OFED user space drivers**: (Recommended) For SKUs with InfiniBand
|
||||
|
||||
We suggest looking at the [dockerfiles of the Azure ML base images](https://github.com/Azure/AzureML-Containers) as references.
|
||||
|
||||
### Use custom image from a private registry
|
||||
|
||||
Azure ML can use a custom image from a private registry as long as login information is provided.
|
||||
|
||||
```python
|
||||
env = Environment('<env-name>')
|
||||
env.docker.base_image = "/my/private/img:tag"  # image repository path
|
||||
env.docker.base_image_registry.address = "myprivateacr.azurecr.io" # private registry
|
||||
|
||||
# Retrieve username and password from the workspace key vault
|
||||
env.docker.base_image_registry.username = ws.get_default_keyvault().get_secret("username")
|
||||
env.docker.base_image_registry.password = ws.get_default_keyvault().get_secret("password")
|
||||
```
|
||||
|
||||
## Environment Management
|
||||
|
||||
### Registered Environments
|
||||
|
||||
Register an environment `env: Environment` to your workspace `ws` to reuse/share with your team.
|
||||
|
||||
```python
|
||||
env.register(ws)
|
||||
```
|
||||
|
||||
Registered environments can be obtained directly from the workspace handle `ws`:
|
||||
|
||||
```python
|
||||
envs: Dict[str, Environment] = ws.environments
|
||||
```
|
||||
|
||||
This dictionary contains custom environments that have been registered to the workspace as well as a
|
||||
collection of _curated environments_ maintained by Azure ML.
|
||||
|
||||
#### Example
|
||||
|
||||
```python
|
||||
# create / update, register environment
|
||||
env = Environment.from_pip_requirements('my-env', 'requirements.txt')
|
||||
env.register(ws)
|
||||
|
||||
# use later
|
||||
env = ws.environments['my-env']
|
||||
|
||||
# get a specific version
|
||||
env = Environment.get(ws, 'my-env', version=6)
|
||||
```
|
||||
|
||||
### Save / Load Environments
|
||||
|
||||
Save an environment to a local directory:
|
||||
|
||||
```python
|
||||
env.save_to_directory('<path/to/local/directory>', overwrite=True)
|
||||
```
|
||||
|
||||
This will generate a directory with two (human-understandable and editable) files:
|
||||
|
||||
- `azureml_environment.json` : Metadata including name, version, environment variables and Python and Docker configuration
|
||||
- `conda_dependencies.yml` : Standard conda dependencies YAML (for more details see [Conda docs](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-from-an-environment-yml-file)).
|
||||
|
||||
Load this environment later with
|
||||
|
||||
```python
|
||||
env = Environment.load_from_directory('<path/to/local/directory>')
|
||||
```
|
||||
|
||||
### Environment Variables
|
||||
|
||||
To set environment variables use the `environment_variables: Dict[str, str]` attribute. Environment variables
|
||||
are set on the process where the user script is executed.
|
||||
|
||||
```python
|
||||
env = Environment('example')
|
||||
env.environment_variables['EXAMPLE_ENV_VAR'] = 'EXAMPLE_VALUE'
|
||||
```
|
||||
|
||||
## Hints and tips
|
||||
|
||||
When the conda dependencies are managed by Azure ML (`user_managed_dependencies=False`, the default), Azure ML will check whether the same environment has already been materialized into a docker image in the Azure Container Registry associated with the Azure ML workspace. If it is a new environment, Azure ML will have a job preparation stage to build a new docker image for it. You will see an image build log file in the logs and can monitor the image build progress there. The job won't start until the image is built and pushed to the container registry.
|
||||
|
||||
This image building process can take some time and delay your job start. To avoid unnecessary image building, consider:
|
||||
|
||||
1. Register an environment that contains most of the packages you need and reuse it when possible.
2. If you only need a few extra packages on top of an existing environment:
   1. If the existing environment is a Docker image, use a dockerfile based on that image so you only need to add one layer to install the extra packages.
   2. Install extra Python packages in your user script so the package installation happens as part of your code, instead of asking Azure ML to treat them as a new environment. Consider using a [bootstrap script](#bootstrap-script).
|
||||
|
||||
Due to the intricacy of Python package dependencies and potential version conflicts, we recommend using custom Docker images and dockerfiles (based on Azure ML base images) to manage your own Python environment. This practice not only gives you full transparency into the environment, but also saves image-building time during the agile development stage.
|
||||
|
||||
### Build docker images locally and push to Azure Container Registry
|
||||
|
||||
If you have Docker installed locally, you can build the Docker image from an Azure ML environment locally, with the option to push the image to the workspace ACR directly. This is recommended when iterating on the dockerfile, since a local build can utilize cached layers.
|
||||
|
||||
```python
|
||||
from azureml.core import Environment
|
||||
myenv = Environment(name='<env-name>')
|
||||
registered_env = myenv.register(ws)
|
||||
registered_env.build_local(ws, useDocker=True, pushImageToWorkspaceAcr=True)
|
||||
```
|
||||
|
||||
### Bootstrap Script
|
||||
|
||||
It can be useful to invoke a `bootstrap.sh` script for faster development. One typical example
|
||||
would be to modify the Python installation _at runtime_ to avoid frequent image rebuilding.
|
||||
|
||||
This can be done quite simply with _commands_. First set up your `bootstrap.sh` script.
|
||||
|
||||
```bash title="bootstrap.sh"
|
||||
echo "Running bootstrap.sh"
|
||||
pip install torch==1.8.0+cu111
|
||||
...
|
||||
```
|
||||
|
||||
To have this run ahead of your training script `train.py` make use of the command:
|
||||
|
||||
```python
|
||||
cmd = "bash bootstrap.sh && python train.py --learning_rate 1e-5".split()
|
||||
|
||||
config = ScriptRunConfig(
|
||||
source_directory='<path/to/code>',
|
||||
command=command,
|
||||
compute_target=compute_target,
|
||||
environment=environment,
|
||||
)
|
||||
```
|
||||
|
||||
See [Running Code in the Cloud](script-run-config) for more details on `ScriptRunConfig`.
|
||||
|
||||
### Distributed bootstrapping
|
||||
|
||||
In some cases you may wish to run certain parts of your `bootstrap.sh` script
|
||||
on certain ranks in a distributed setup. This can be achieved with a little care
|
||||
as follows:
|
||||
|
||||
```bash title="bootstrap.sh"
|
||||
MARKER="/tmp/.azureml_bootstrap_complete"
|
||||
|
||||
if [[ $AZ_BATCHAI_TASK_INDEX = 0 ]] ; then
|
||||
echo "Running bootstrap.sh"
|
||||
echo "Installing transformers from source"
|
||||
pip install git+https://github.com/huggingface/transformers
|
||||
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"
|
||||
pip install datasets
|
||||
pip install tensorflow
|
||||
echo "Installation complete"
|
||||
touch $MARKER
|
||||
fi
|
||||
echo "Barrier..."
|
||||
while [[ ! -f $MARKER ]]
|
||||
do
|
||||
sleep 1
|
||||
done
|
||||
echo "Bootstrap complete!"
|
||||
```
|
||||
|
||||
This script will wait for local rank 0 (`$AZ_BATCHAI_TASK_INDEX`) to create its `MARKER` file
|
||||
before the other processes continue.
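If the bootstrapped installation is used in a distributed job, the same command pattern applies. A sketch, assuming `compute_target` and `environment` are defined as elsewhere on this page and reusing the `PyTorchConfiguration` from the distributed-training cheat sheet:

```python
from azureml.core import ScriptRunConfig
from azureml.core.runconfig import PyTorchConfiguration

# run the rank-aware bootstrap barrier ahead of the training script
cmd = "bash bootstrap.sh && python train.py".split()

config = ScriptRunConfig(
    source_directory='<path/to/code>',
    command=cmd,
    compute_target=compute_target,   # assumed defined elsewhere
    environment=environment,         # assumed defined elsewhere
    distributed_job_config=PyTorchConfiguration(process_count=8, node_count=2),
)
```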
|
||||
|
||||
### Use Keyvault to pass secrets
|
||||
|
||||
#### Workspace Default Keyvault
|
||||
|
||||
Each Azure ML workspace comes with a key vault (you can find this in the Azure Portal under the same resource
|
||||
group as your Workspace).
|
||||
|
||||
```python
|
||||
from azureml.core import Workspace
|
||||
|
||||
ws = Workspace.from_config()
|
||||
kv = ws.get_default_keyvault()
|
||||
```
|
||||
|
||||
This can be used both to get and set secrets:
|
||||
|
||||
```python
|
||||
import os
|
||||
from azureml.core import Keyvault
|
||||
|
||||
# add a secret to keyvault
|
||||
kv.set_secret(name="<my-secret>", value=os.environ.get("MY_SECRET"))
|
||||
|
||||
# get a secret from the keyvault
|
||||
secret = kv.get_secret(name="<my-secret>")
|
||||
|
||||
# equivalently, from within a submitted run (run = Run.get_context())
|
||||
secret = run.get_secret(name="<my-secret>")
|
||||
```
|
||||
|
||||
#### Generic Azure Keyvault
|
||||
|
||||
Of course you can also make use of other keyvaults you might have in Azure.
|
||||
|
||||
```python
|
||||
from azure.identity import DefaultAzureCredential
|
||||
from azure.keyvault.secrets import SecretClient
|
||||
|
||||
kv_url = "https://<my-keyvault>.vault.azure.net"  # your key vault URL
secret_name = "<my-secret>"

credential = DefaultAzureCredential()
|
||||
client = SecretClient(vault_url=kv_url, credential=credential)
|
||||
my_secret = client.get_secret(secret_name).value
|
||||
|
||||
env = Environment('example')
|
||||
env.environment_variables['POWERFUL_SECRET'] = my_secret
|
||||
```
|
||||
|
||||
Be sure to add `azure-identity` and `azure-keyvault` to your project's requirements in
|
||||
this case.
|
||||
|
||||
```bash
|
||||
pip install azure-identity azure-keyvault
|
||||
```
|
---
|
||||
title: Installation
|
||||
description: Guide to installing Azure ML Python SDK and setting up key resources.
|
||||
keywords:
|
||||
- azureml-sdk
|
||||
- installation
|
||||
- workspace
|
||||
- compute
|
||||
- cpu
|
||||
- gpu
|
||||
---
|
||||
|
||||
Install the Azure ML Python SDK:
|
||||
|
||||
```console
|
||||
pip install azureml-sdk
|
||||
```
|
||||
|
||||
### Create a Workspace
|
||||
|
||||
```python
|
||||
from azureml.core import Workspace
|
||||
|
||||
ws = Workspace.create(name='<my_workspace_name>', # your workspace name
            subscription_id='<azure-subscription-id>', # your subscription ID
            resource_group='<myresourcegroup>', # your resource group name
            create_resource_group=True,
            location='<NAME_OF_REGION>') # region to create the resources in, e.g. 'japaneast'

# write the workspace details to a config file: .azureml/config.json
||||
ws.write_config(path='.azureml')
|
||||
```
|
||||
|
||||
:::info
|
||||
From now on, you can access your workspace as easily as:
|
||||
```python
|
||||
from azureml.core import Workspace
|
||||
ws = Workspace.from_config()
|
||||
```
|
||||
:::
|
||||
|
||||
### コンピューティングターゲットの作成
|
||||
|
||||
The following example creates a compute target in your workspace with:

- VM type: CPU
- VM size: STANDARD_D2_V2
- Maximum number of nodes in the cluster: 4
- Idle time before the cluster scales down: 2400 seconds

Modify the code below to use a GPU or a different VM size.
|
||||
|
||||
```python
|
||||
from azureml.core import Workspace
|
||||
from azureml.core.compute import ComputeTarget, AmlCompute
|
||||
from azureml.core.compute_target import ComputeTargetException
|
||||
|
||||
ws = Workspace.from_config() # automatically looks for the .azureml/ directory
|
||||
|
||||
# choose a name for your cluster
|
||||
cpu_cluster_name = "cpu-cluster"
|
||||
|
||||
try:
|
||||
# check whether the cluster already exists
|
||||
cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
|
||||
print('Found existing cluster, use it.')
|
||||
except ComputeTargetException:
|
||||
# if not, create it
|
||||
compute_config = AmlCompute.provisioning_configuration(
|
||||
vm_size='STANDARD_D2_V2',
|
||||
max_nodes=4,
|
||||
idle_seconds_before_scaledown=2400,)
|
||||
cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
|
||||
cpu_cluster.wait_for_completion(show_output=True)
|
||||
```
|
||||
|
||||
:::info
|
||||
From now on, you can access the compute target as easily as:
|
||||
|
||||
```python
|
||||
from azureml.core import ComputeTarget
|
||||
cpu_cluster = ComputeTarget(ws, 'cpu-cluster')
|
||||
```
|
||||
:::
|
|
---
|
||||
title: Metrics
|
||||
description: Guide to metric logging in Azure ML.
|
||||
keywords:
|
||||
- metric
|
||||
- logging
|
||||
---
|
||||
|
||||
## Logging Metrics
|
||||
|
||||
Metrics are logged against a run in Azure ML, and multiple runs are grouped under a single experiment.
Azure ML stores the metric history and provides visualizations.
|
||||
|
||||
### `log`
|
||||
|
||||
Logs a single value of a metric to the run.
|
||||
|
||||
```python
|
||||
from azureml.core import Run
|
||||
run = Run.get_context()
|
||||
run.log('metric-name', metric_value)
|
||||
```
|
||||
|
||||
You can also log the same metric multiple times within a run; the logged values are then displayed as a chart.
|
||||
|
||||
### `log_row`
|
||||
|
||||
Logs a metric with multiple columns.
|
||||
|
||||
```python
|
||||
from azureml.core import Run
|
||||
run = Run.get_context()
|
||||
run.log_row("Y over X", x=1, y=0.4)
|
||||
```
|
||||
|
||||
:::info More logging options
|
||||
These are the most common APIs for logging metrics; see [here](https://docs.microsoft.com/azure/machine-learning/how-to-log-view-metrics#data-types) for the complete list.
|
||||
:::
|
||||
|
||||
## Viewing Metrics
|
||||
|
||||
Metrics are automatically viewable in Azure ML Studio. You can get there via [this link](https://ml.azure.com) or from the SDK:
|
||||
|
||||
```python
|
||||
run.get_workspace_url()
|
||||
```
|
||||
|
||||
"メトリック"タブを選択し、表示したいメトリックを選択します。
|
||||
|
||||
|
||||
![](/img/view-metrics.png)
|
||||
|
||||
### Viewing Metrics from the SDK
|
||||
|
||||
Check the metrics logged to a run (details: [Experiment and Run](run)):
|
||||
|
||||
|
||||
```python
|
||||
metrics = run.get_metrics()
|
||||
# metrics is of type Dict[str, List[float]],
# mapping metric names to the list of values logged to the run

metrics.get('metric-name')
# list of values for 'metric-name', in the order they were logged
|
||||
```
|
||||
|
||||
To view all records of the metric `my-metric` in the experiment `my-experiment`:
|
||||
|
||||
```python
|
||||
experiments = ws.experiments
|
||||
# dict of experiment names to experiment objects
|
||||
|
||||
exp = experiments['my-experiment']
|
||||
for run in exp.get_runs():
|
||||
metrics = run.get_metrics()
|
||||
|
||||
my_metric = metrics.get('my-metric')
|
||||
if my_metric:
|
||||
print(my_metric)
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Logging with MLFlow
|
||||
|
||||
Log metrics to Azure ML with [MLFlow](https://mlflow.org/).
|
||||
|
||||
```python
|
||||
from azureml.core import Run
|
||||
|
||||
# connect to the workspace containing the current experiment and run
|
||||
run = Run.get_context()
|
||||
ws = run.experiment.workspace
|
||||
|
||||
# get the MLFlow tracking URI associated with the workspace
|
||||
mlflow_url = ws.get_mlflow_tracking_uri()
|
||||
```
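A sketch of how the tracking URI might then be used with the `mlflow` client (assumes the `azureml-mlflow` package is installed; the metric name is illustrative):

```python
import mlflow

# point the MLFlow client at the Azure ML workspace
mlflow.set_tracking_uri(mlflow_url)
mlflow.set_experiment(run.experiment.name)

mlflow.log_metric('example-metric', 0.5)  # recorded in the Azure ML run history
```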
|
||||
|
||||
### Logging with PyTorch Lightning
|
||||
|
||||
This example:
- includes Lightning's `TensorBoardLogger`
- sets up Lightning's `MLFlowLogger` using Azure ML's `Run.get_context()`
- adds this logger only when used as part of an Azure ML run
|
||||
|
||||
```python
|
||||
import pytorch_lightning as pl
|
||||
|
||||
run = None
|
||||
try:
|
||||
from azureml.core.run import Run, _OfflineRun
|
||||
run = Run.get_context()
|
||||
if isinstance(run, _OfflineRun):
|
||||
run = None
|
||||
except ImportError:
|
||||
print("Couldn't import azureml.core.run.Run")
|
||||
|
||||
def get_logger():
|
||||
tb_logger = pl.loggers.TensorBoardLogger('logs/')
|
||||
logger = [tb_logger]
|
||||
|
||||
if run is not None:
|
||||
mlflow_url = run.experiment.workspace.get_mlflow_tracking_uri()
|
||||
mlf_logger = pl.loggers.MLFlowLogger(
|
||||
experiment_name=run.experiment.name,
|
||||
tracking_uri=mlflow_url,
|
||||
)
|
||||
mlf_logger._run_id = run.id
|
||||
logger.append(mlf_logger)
|
||||
|
||||
return logger
|
||||
```
|
||||
|
||||
Now include this logger in Lightning's `Trainer` class:
|
||||
|
||||
```python
|
||||
logger = get_logger()
|
||||
|
||||
trainer = pl.Trainer.from_argparse_args(
|
||||
args=args,
|
||||
logger=logger,
|
||||
)
|
||||
trainer.fit(model)
|
||||
```
|
|
---
|
||||
title: Run History
|
||||
---
|
||||
|
||||
:::note
|
||||
This content is not available in your language.
|
||||
:::
|
||||
|
||||
|
||||
Azure ML can supercharge your ML workloads in (at least!) two ways:
|
||||
|
||||
- AML Compute: Providing powerful compute resources to train larger models
- Run history: Best-in-class lineage and reproducibility
|
||||
|
||||
In this article we focus on Run History - and why you need it in your life!
|
||||
|
||||
As teams progress to running dozens, and eventually hundreds of experiments, having
|
||||
some way to organize them is essential. Run History is a service that provides a number of
features that quickly become essential to your ML model builders:
|
||||
|
||||
### Experiments and Runs
|
||||
|
||||
When you are running dozens of experiments in multiple different projects, having a clear
|
||||
way to organize and search through the results is key. Azure ML provides two concepts to help
|
||||
with this: `Run`s and `Experiment`s.
|
||||
|
||||
#### Runs
|
||||
A run is a single execution of your code - usually a training script. The run has a life-cycle:
the code is prepared and submitted to Azure ML (e.g. via a `ScriptRunConfig`).

Once the code is submitted, a `Run` object is
created. The compute target is prepared (nodes are provisioned, containers hosting your Python
|
||||
environment are fired up), the entry point script is called (`$ python run.py [args]`) and logs
|
||||
start being generated:
|
||||
|
||||
```console
|
||||
Files already downloaded and verified
|
||||
epoch=1, batch= 2000: loss 2.19
|
||||
epoch=1, batch= 4000: loss 1.82
|
||||
epoch=1, batch= 6000: loss 1.66
|
||||
...
|
||||
```
|
||||
|
||||
You may log metrics to Azure ML with `run.log('<metric_name>', metric_value)` and monitor them in the studio:
|
||||
|
||||
![](img/logging-metrics.png)
|
||||
|
||||
The training concludes, usually some model files are saved, and the nodes are
|
||||
released.
|
||||
|
||||
But the story doesn't end there. The run persists even after the nodes are returned
|
||||
to Azure. You can always return, either in code or via the studio, to see a history
|
||||
of your runs, all their outputs and metrics, and the exact code that was used to generate them.
|
||||
|
||||
#### Experiments
|
||||
|
||||
An Experiment is a collection of runs. All runs belong to an Experiment. Usually
|
||||
an Experiment is tied to a specific work item, for example, "Finetune Bert-Large",
|
||||
and will possess a number of runs as you iterate toward this goal.
|
||||
|
||||
### Snapshot
|
||||
|
||||
When you submit your code to run in Azure ML, a _snapshot_ is taken. This is a copy of the exact
|
||||
code that ran. Think of this as version control for your experiments. Want to reproduce the
|
||||
results from that experiment two months ago even though you've iterated on the model and the
|
||||
training script in the meantime? No problem, snapshot has you covered!
|
||||
|
||||
You have total control of what goes into the snapshot with the `.amlignore` file. This plays
|
||||
the same role as a `.gitignore` so you can efficiently manage what to include in the snapshot.
|
||||
|
||||
### Metrics
|
||||
|
||||
As you run experiments, you track metrics - from validation loss through to GPU load. Analysing these metrics is essential to determining your best model. With Run History, these metrics are stored for all your runs.
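For example, a short sketch of pulling those stored metrics back out with the SDK (assumes an `Experiment` handle `exp`; the metric name is illustrative):

```python
# print a stored metric for every run in an experiment
for run in exp.get_runs():
    metrics = run.get_metrics()
    print(run.id, metrics.get('validation'))
```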
|
|
---
|
||||
title: Experiment and Run
|
||||
description: Guide to running code with Azure ML
|
||||
keywords:
|
||||
- run
|
||||
- experiment
|
||||
- submit
|
||||
- remote
|
||||
- ScriptRunConfig
|
||||
---
|
||||
|
||||
:::note
|
||||
This content is not available in your language.
|
||||
:::
|
||||
|
||||
## Concepts
|
||||
|
||||
### Run
|
||||
|
||||
A run represents a single execution of your code.
|
||||
|
||||
Azure ML is a machine-learning service that facilitates running your code in
|
||||
the cloud. A `Run` is an abstraction layer around each such submission, and is used to
|
||||
monitor the job in real time as well as keep a history of your results.
|
||||
|
||||
### Experiments
|
||||
|
||||
An experiment is a light-weight container for `Run`s. Use experiments to submit
|
||||
and track runs.
|
||||
|
||||
Create an experiment in your workspace `ws`.
|
||||
|
||||
```python
|
||||
from azureml.core import Experiment
|
||||
exp = Experiment(ws, '<experiment-name>')
|
||||
```
|
||||
|
||||
## Create Run
|
||||
|
||||
### Via ScriptRunConfig
|
||||
|
||||
Usually a run is created by submitting a ScriptRunConfig.
|
||||
|
||||
```python
|
||||
from azureml.core import Workspace, Experiment, ScriptRunConfig
|
||||
ws = Workspace.from_config()
|
||||
exp = Experiment(ws, '<experiment-name>')
|
||||
|
||||
config = ScriptRunConfig(source_directory='<path/to/script>', script='train.py', ...)
|
||||
run = exp.submit(config)
|
||||
```
|
||||
|
||||
For more details: [ScriptRunConfig](script-run-config)
|
||||
|
||||
### Get Context
|
||||
|
||||
Code that is running within Azure ML is associated to a `Run`. The submitted code
|
||||
can access its own run.
|
||||
|
||||
```py
|
||||
from azureml.core import Run
|
||||
run = Run.get_context()
|
||||
```
|
||||
|
||||
#### Example: Logging metrics to current run context
|
||||
|
||||
A common use-case is logging metrics in a training script.
|
||||
|
||||
```py title="train.py"
|
||||
from azureml.core import Run
|
||||
|
||||
run = Run.get_context()
|
||||
|
||||
# training code
|
||||
for epoch in range(n_epochs):
|
||||
model.train()
|
||||
...
|
||||
val = model.evaluate()
|
||||
run.log('validation', val)
|
||||
```
|
||||
|
||||
When this code is submitted to Azure ML (e.g. via ScriptRunConfig) it will log metrics to its associated run.
|
||||
|
||||
For more details: [Logging Metrics](logging)
|
||||
|
||||
### Interactive
|
||||
|
||||
In an interactive setting, e.g. a Jupyter notebook:
|
||||
|
||||
```python
|
||||
run = exp.start_logging()
|
||||
```
|
||||
|
||||
#### Example: Jupyter notebook
|
||||
|
||||
A common use case for interactive logging is to train a model in a notebook.
|
||||
|
||||
```py
|
||||
from azureml.core import Workspace
|
||||
from azureml.core import Experiment
|
||||
ws = Workspace.from_config()
|
||||
exp = Experiment(ws, 'example')
|
||||
|
||||
run = exp.start_logging() # start interactive run
|
||||
print(run.get_portal_url()) # get link to studio
|
||||
|
||||
# toy example in place of e.g. model
|
||||
# training or exploratory data analysis
|
||||
import numpy as np
|
||||
for x in np.linspace(0, 10):
|
||||
y = np.sin(x)
|
||||
run.log_row('sine', x=x, y=y) # log metrics
|
||||
|
||||
run.complete() # stop interactive run
|
||||
```
|
||||
|
||||
Follow the link to the run to see the metric logging in real time.
|
||||
|
||||
![](img/run-ex-sine.png)
|
|
---
|
||||
title: Running Code in the Cloud
|
||||
description: Guide to running code with Azure ML
|
||||
keywords:
|
||||
- run
|
||||
- experiment
|
||||
- submit
|
||||
- remote
|
||||
- ScriptRunConfig
|
||||
---
|
||||
|
||||
## Experiments and Runs
|
||||
|
||||
Azure ML is a service that helps you run your machine-learning code in the cloud. A `Run` is an abstraction layer that not only keeps a history of the jobs submitted to Azure ML, but also lets you monitor them in real time.
|
||||
|
||||
- Run: Represents a single execution of your code. Details: [Runs](#runs)
- Experiment: An experiment is a light-weight container for `Run`s. Experiments are used to submit runs to Azure ML and track them.
|
||||
|
||||
Create an experiment in your workspace `ws`:
|
||||
|
||||
```python
|
||||
from azureml.core import Experiment
|
||||
exp = Experiment(ws, '<experiment-name>')
|
||||
```
|
||||
|
||||
## ScriptRunConfig
|
||||
|
||||
In Azure ML, you generally use a `ScriptRunConfig` to package up the code you want to run together with its run configuration, and submit it to run in the cloud.
|
||||
|
||||
Suppose the code you want to run has the following directory structure:
|
||||
|
||||
```bash
|
||||
source_directory/
|
||||
    script.py    # entry point of your code
    module1.py   # module called by script.py
|
||||
...
|
||||
```
|
||||
|
||||
Configuration to run `script.py` in the cloud using `ScriptRunConfig`:
|
||||
|
||||
```python
|
||||
config = ScriptRunConfig(
|
||||
source_directory='<path/to/source_directory>',
|
||||
script='script.py',
|
||||
compute_target=target,
|
||||
environment=env,
|
||||
arguments = [
|
||||
'--learning_rate', 0.001,
|
||||
'--momentum', 0.9,
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
Where:
|
||||
|
||||
- `source_directory='source_directory'` : The local directory containing the code to run.
- `script='script.py'` : The Python script to run. It does not have to be at the root of `source_directory`.
- `compute_target=target` : See [Compute Targets](compute-targets)
- `environment` : See [Environment](environment)
- `arguments` : See [Command-Line Arguments](#command-line-arguments)
|
||||
|
||||
Submit this code to Azure ML:
|
||||
|
||||
```python
|
||||
exp = Experiment(ws, '<exp-name>')
|
||||
run = exp.submit(config)
|
||||
print(run)
|
||||
run.wait_for_completion(show_output=True)
|
||||
```
|
||||
|
||||
This not only streams the logs to your terminal, but also prints a link for monitoring the submitted run on the web (https://ml.azure.com).
|
||||
|
||||
## Command-Line Arguments
|
||||
|
||||
To pass command-line arguments to your script, use the `arguments` parameter of `ScriptRunConfig`.
Arguments are specified as a list:
|
||||
|
||||
```python
|
||||
arguments = [first, second, third, ...]
|
||||
```
|
||||
|
||||
The arguments are then passed to the script as if run from the command line:
|
||||
|
||||
```console
|
||||
$ python script.py first second third ...
|
||||
```
|
||||
|
||||
Named arguments are also supported:
|
||||
|
||||
```python
|
||||
arguments = ['--first_arg', first_val, '--second_arg', second_val, ...]
|
||||
```
|
||||
|
||||
In addition to data types like `int`, `float`, and `str`, arguments can also reference other data.
|
||||
|
||||
More details on command-line arguments: [Use dataset in a remote run](dataset#use-dataset-in-a-remote-run)
|
||||
|
||||
### Example 1: `sys.argv`
|
||||
|
||||
In this example we pass two arguments to the script. Running from the console:
|
||||
|
||||
```console title="console"
|
||||
$ python script.py 0.001 0.9
|
||||
```
|
||||
|
||||
Expressed with `arguments` in the `ScriptRunConfig`:
|
||||
|
||||
```python title="run.py"
|
||||
arguments = [0.001, 0.9]
|
||||
|
||||
config = ScriptRunConfig(
|
||||
source_directory='.',
|
||||
script='script.py',
|
||||
arguments=arguments,
|
||||
)
|
||||
```
|
||||
|
||||
These arguments can then be used inside the script just like ordinary command-line arguments:
|
||||
|
||||
```python title="script.py"
|
||||
import sys
|
||||
learning_rate = sys.argv[1] # gets 0.001
momentum = sys.argv[2]      # gets 0.9
|
||||
```
|
||||
|
||||
### Example 2: `argparse`
|
||||
|
||||
In this example we pass two named arguments to the script. Running from the console:
|
||||
|
||||
```console title="console"
|
||||
$ python script.py --learning_rate 0.001 --momentum 0.9
|
||||
```
|
||||
|
||||
Expressed with `arguments` in the `ScriptRunConfig`:
|
||||
|
||||
```python title="run.py"
|
||||
arguments = [
|
||||
'--learning_rate', 0.001,
|
||||
'--momentum', 0.9,
|
||||
]
|
||||
|
||||
config = ScriptRunConfig(
|
||||
source_directory='.',
|
||||
script='script.py',
|
||||
arguments=arguments,
|
||||
)
|
||||
```
|
||||
|
||||
These arguments can then be used inside the script just like ordinary command-line arguments:
|
||||
|
||||
```python title="script.py"
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--learning_rate', type=float)
|
||||
parser.add_argument('--momentum', type=float)
|
||||
args = parser.parse_args()
|
||||
|
||||
learning_rate = args.learning_rate # gets 0.001
momentum = args.momentum           # gets 0.9
|
||||
```
|
||||
|
||||
## Commands
|
||||
|
||||
You can also explicitly provide the command to run:
|
||||
|
||||
```python
|
||||
command = 'python script.py'.split()
|
||||
|
||||
config = ScriptRunConfig(
|
||||
source_directory='<path/to/code>',
|
||||
command=command,
|
||||
compute_target=compute_target,
|
||||
environment=environment,
|
||||
)
|
||||
```
|
||||
|
||||
This example is equivalent to passing `script='script.py'` instead of the `command` argument.
|
||||
|
||||
This option gives you a lot of flexibility. For example:
|
||||
|
||||
- **Set environment variables**: Common examples:
|
||||
|
||||
```python
|
||||
command = 'export PYTHONPATH=$PWD && python script.py'.split()
|
||||
```
|
||||
|
||||
```python
|
||||
command = f'export RANK={rank} && python script.py'.split()
|
||||
```
|
||||
|
||||
- **Run a setup script**: Run a setup script that e.g. downloads data or sets environment variables.
|
||||
|
||||
```python
|
||||
command = 'python setup.py && python script.py'.split()
|
||||
```
|
||||
|
||||
## Using Datasets
|
||||
|
||||
### Via Arguments
|
||||
|
||||
Pass a dataset to the `ScriptRunConfig` as an argument:
|
||||
|
||||
```py
|
||||
# create a dataset
|
||||
datastore = ws.get_default_datastore()
|
||||
dataset = Dataset.File.from_files(path=(datastore, '<path/on/datastore>'))
|
||||
|
||||
arguments = ['--dataset', dataset.as_mount()]
|
||||
|
||||
config = ScriptRunConfig(
|
||||
source_directory='.',
|
||||
script='script.py',
|
||||
arguments=arguments,
|
||||
)
|
||||
```
|
||||
|
||||
In this example the dataset is mounted for the run, where `script.py` can reference it.
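On the script side, the mounted path arrives as an ordinary command-line argument. A minimal sketch of `script.py` under that assumption:

```python
# script.py - minimal sketch of consuming the mounted dataset path
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--dataset')
args = parser.parse_args()

print('dataset is mounted at:', args.dataset)
```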
|
||||
|
||||
## Runs
|
||||
|
||||
### Interactive
|
||||
|
||||
In an interactive setting, e.g. a Jupyter notebook:
|
||||
|
||||
```python
|
||||
run = exp.start_logging()
|
||||
```
|
||||
|
||||
#### Example: Jupyter Notebook
|
||||
|
||||
A common use case is to interactively display the logs of a model being trained inside a notebook.
|
||||
|
||||
```py
|
||||
from azureml.core import Workspace
|
||||
from azureml.core import Experiment
|
||||
ws = Workspace.from_config()
|
||||
exp = Experiment(ws, 'example')
|
||||
|
||||
run = exp.start_logging()       # start interactive run
print(run.get_portal_url())     # get a link to Azure ML Studio

# dummy model-training code;
# in practice: training, EDA, etc.
import numpy as np
for x in np.linspace(0, 10):
    y = np.sin(x)
    run.log_row('sine', x=x, y=y) # log metrics

run.complete()                  # stop interactive run
|
||||
```
|
||||
|
||||
Follow the link to Azure ML Studio to monitor the run and its logged metrics in real time.
|
||||
|
||||
![](img/run-ex-sine.png)
|
||||
|
||||
### Get Context
|
||||
|
||||
Code running within Azure ML is associated with a `Run`. The submitted code can access its own run.
|
||||
|
||||
```py
|
||||
from azureml.core import Run
|
||||
run = Run.get_context()
|
||||
```
|
||||
|
||||
#### Example: Logging metrics to current run context
|
||||
|
||||
A common use-case is logging metrics in a training script.
|
||||
|
||||
```py title="train.py"
|
||||
from azureml.core import Run
|
||||
|
||||
run = Run.get_context()
|
||||
|
||||
# training code
|
||||
for epoch in range(n_epochs):
|
||||
model.train()
|
||||
...
|
||||
val = model.evaluate()
|
||||
run.log('validation', val)
|
||||
```
|
||||
|
||||
When this code is submitted to Azure ML (e.g. via `ScriptRunConfig`), it logs metrics to its associated `Run`.
|
||||
|
||||
For more details: [Metrics](logging)
|
|
# Templates
|
||||
|
||||
## Introduction
|
||||
|
||||
Cookiecutter is a simple command-line tool that allows you to quickly create
|
||||
new projects from pre-defined templates. Let's see it in action!
|
||||
|
||||
First go ahead and get cookiecutter using your environment manager of choice,
|
||||
for example:
|
||||
|
||||
```bash
|
||||
pip install cookiecutter
|
||||
```
|
||||
|
||||
Then give this repo a home
|
||||
|
||||
```bash
|
||||
cd ~/repos # or wherever your repos call home :-)
|
||||
git clone <this-repo>
|
||||
```
|
||||
|
||||
To create a new project from the `ScriptRunConfig` template, for example, simply
|
||||
run
|
||||
|
||||
```bash
|
||||
cookiecutter path/to/cheatsheet/repo/templates/ScriptRunConfig
|
||||
```
|
||||
|
||||
See [ScriptRunConfig](#scriptrunconfig) for more details on this template.
|
||||
|
||||
## Project Templates
|
||||
|
||||
- ScriptRunConfig: Create a project to run a script in AML making use of the
|
||||
ScriptRunConfig class. This template is well suited for smaller projects and
|
||||
is especially handy for testing.
|
||||
|
||||
### ScriptRunConfig
|
||||
|
||||
[Cookiecutter](https://cookiecutter.readthedocs.io/en/1.7.2/README.html)
|
||||
template for setting up an AML
|
||||
[ScriptRunConfig](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.scriptrunconfig?view=azure-ml-py)
|
||||
used to run your script in Azure.
|
||||
|
||||
#### Usage
|
||||
|
||||
Run the cookiecutter command
|
||||
|
||||
```bash
|
||||
cookiecutter <path/to/cookiecutter/templates>/ScriptRunConfig
|
||||
```
|
||||
|
||||
to create a new `ScriptRunConfig` project.
|
||||
|
||||
**Note.** Install with `pip install cookiecutter` (see
|
||||
[cookiecutter docs](https://cookiecutter.readthedocs.io/en/1.7.2/installation.html)
|
||||
for more installation options)
|
||||
|
||||
You will be prompted for the following:
|
||||
|
||||
- `directory_name`: The desired name of the directory (default:
|
||||
"aml-src-script")
|
||||
- `script_name`: The name of the python script to be run in Azure (default:
|
||||
"script")
|
||||
- `subscription_id`: Your Azure Subscription ID
|
||||
- `resource_group`: Your Azure resource group name
|
||||
- `workspace_name`: Your Azure ML workspace name
|
||||
- `compute_target_name`: The name of the Azure ML compute target to run the
|
||||
script on (default: "local", will run on your box)
|
||||
|
||||
Cookiecutter creates a new project with the following layout:
|
||||
|
||||
```bash
|
||||
{directory_name}/
|
||||
{script_name}.py # the script you want to run in the cloud
|
||||
run.py # wraps your script in ScriptRunConfig to send to Azure
|
||||
config.json # your Azure ML metadata
|
||||
readme.md # this readme file!
|
||||
```
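For orientation, a sketch of what the generated `run.py` wrapper typically contains (names are assumptions; the actual generated file may differ):

```python
# run.py - illustrative sketch of the ScriptRunConfig wrapper
from azureml.core import Workspace, Experiment, ScriptRunConfig

ws = Workspace.from_config()  # reads the config.json created by the template

config = ScriptRunConfig(source_directory='.', script='script.py')
run = Experiment(ws, 'example').submit(config)
run.wait_for_completion(show_output=True)
```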
|
||||
|
||||
See
|
||||
[ScriptRunConfig](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.scriptrunconfig?view=azure-ml-py)
|
||||
for more options and details on configuring runs.
|
|
---
|
||||
title: Troubleshooting
|
||||
id: troubleshooting
|
||||
description: A cheat sheet for Azure ML.
|
||||
keywords:
|
||||
- azure machine learning
|
||||
- aml
|
||||
- troubleshooting
|
||||
---
|
||||
|
||||
:::note
|
||||
This content is not available in your language.
|
||||
:::
|
||||
|
||||
### Error: az acr - APIVersion 2020-11-01-preview is not available
|
||||
**Description**
|
||||
A `NotImplementedError` occurred when building an image using `az acr`:
|
||||
```bash
|
||||
az acr build --image $image_name --subscription $ws.subscription_id --registry $cr --file docker/Dockerfile docker/
|
||||
```
|
||||
The error:
|
||||
```text
|
||||
NotImplementedError: APIVersion 2020-11-01-preview is not available.
|
||||
```
|
||||
|
||||
**Solution** This is a problem related to the version of the az CLI. Update the az CLI by running:
|
||||
```bash
|
||||
az upgrade --yes
|
||||
```
|
||||
|
||||
|
||||
|
|
---
|
||||
title: Workspace
|
||||
description: Overview of the Azure ML workspace
|
||||
keywords:
|
||||
- workspace
|
||||
---
|
||||
|
||||
The workspace is a fundamental object used throughout Azure ML and appears in the constructors of many other classes.
Throughout this documentation we frequently omit the workspace-object instantiation and simply refer to `ws`.
|
||||
|
||||
If you need instructions on creating a new workspace, see [Installation](installation).
|
||||
|
||||
## Get a Workspace
|
||||
|
||||
Instantiate a `Workspace` object used to connect to your AML assets:
|
||||
|
||||
```python title="run.py"
|
||||
from azureml.core import Workspace
|
||||
ws = Workspace(
|
||||
subscription_id="<subscription_id>",
|
||||
resource_group="<resource_group>",
|
||||
workspace_name="<workspace_name>",
|
||||
)
|
||||
```
|
||||
|
||||
For convenience, store the workspace metadata in a `config.json`:
|
||||
|
||||
```json title=".azureml/config.json"
|
||||
{
|
||||
"subscription_id": <subscription-id>,
|
||||
"resource_group": <resource-group>,
|
||||
"workspace_name": <workspace-name>
|
||||
}
|
||||
```
|
||||
|
||||
### 役立つメソッド
|
||||
|
||||
- `ws.write_config(path, file_name)` : Writes the `config.json` for you. The `path` defaults to `.azureml/` in the current working directory and `file_name` defaults to `config.json`.
- `Workspace.from_config(path, _file_name)` : Reads the workspace configuration from a config file. The search starts in the current directory by default.
|
||||
|
||||
:::info
|
||||
We recommend storing the config in the `.azureml/` directory, since the `Workspace.from_config` method searches this path _by default_.
|
||||
:::
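
以下は `write_config` と `Workspace.from_config` を組み合わせた最小の例です（プレースホルダーは仮の値です）。

```python
from azureml.core import Workspace

# 初回のみ: ワークスペースのメタデータを .azureml/config.json に書き出す
ws = Workspace(
    subscription_id="<subscription_id>",
    resource_group="<resource_group>",
    workspace_name="<workspace_name>",
)
ws.write_config()  # デフォルトで .azureml/config.json に保存される

# 以降はコンフィグから再接続できる
ws = Workspace.from_config()
```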
|
||||
|
||||
## ワークスペースのアセットを取得する
|
||||
|
||||
ワークスペースは、以下の Azure ML アセットへのハンドラを提供します。
|
||||
|
||||
### Compute Targets
|
||||
|
||||
ワークスペースにアタッチされた全ての Compute ターゲットを取得します。
|
||||
|
||||
```python
|
||||
ws.compute_targets: Dict[str, ComputeTarget]
|
||||
```
|
||||
|
||||
### Datastores
|
||||
|
||||
ワークスペースに登録された全てのデータストアを取得します。
|
||||
|
||||
```python
|
||||
ws.datastores: Dict[str, Datastore]
|
||||
```
|
||||
|
||||
ワークスペースのデフォルトのデータストアを取得します。
|
||||
|
||||
```python
|
||||
ws.get_default_datastore(): Datastore
|
||||
```
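
取得したデフォルトのデータストアは、例えばローカルのデータのアップロードに利用できます（パスは仮のもので、Blob / File データストアを想定した最小の例です）。

```python
datastore = ws.get_default_datastore()
datastore.upload(src_dir="./data", target_path="data", overwrite=True)  # 仮のパス
```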
|
||||
|
||||
### Keyvault
|
||||
|
||||
ワークスペースのデフォルトの Keyvault を取得します。
|
||||
|
||||
```python
|
||||
ws.get_default_keyvault(): Keyvault
|
||||
```
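
Keyvault を使ってシークレットを保存・取得する最小の例です（シークレット名と値は仮のものです）。

```python
keyvault = ws.get_default_keyvault()
keyvault.set_secret(name="my-secret", value="<value>")  # 仮のシークレット名
retrieved = keyvault.get_secret(name="my-secret")
```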
|
||||
|
||||
### Environments
|
||||
|
||||
ワークスペースに登録された Environments を取得します。
|
||||
|
||||
```python
|
||||
ws.environments: Dict[str, Environment]
|
||||
```
|
||||
|
||||
### MLflow
|
||||
|
||||
MLflow の tracking URI を取得します。
|
||||
|
||||
```python
|
||||
ws.get_mlflow_tracking_uri(): str
|
||||
```
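
取得した tracking URI は、例えば MLflow クライアントに設定して利用できます（mlflow パッケージがインストールされている前提の最小のスケッチで、実験名は仮のものです）。

```python
import mlflow

mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment("my-experiment")  # 仮の実験名
```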
|
|
@ -0,0 +1 @@
|
|||
|
|
@ -0,0 +1,55 @@
|
|||
---
|
||||
title: Contributing
|
||||
description: Guide to contributing.
|
||||
---
|
||||
|
||||
## Issues
|
||||
|
||||
All forms of feedback are welcome through [issues](https://github.com/Azure/azureml-cheatsheets/issues) - please follow the pre-defined templates where applicable.
|
||||
|
||||
## Pull requests
|
||||
|
||||
Pull requests (PRs) to this repo require review and approval by the Azure Machine Learning team to merge. Please follow the pre-defined template and read all relevant sections below.
|
||||
|
||||
Make PRs against the `main` branch.
|
||||
|
||||
```bash
|
||||
git clone git@github.com:Azure/azureml-cheatsheets.git
|
||||
cd azureml-cheatsheets
|
||||
git checkout -b user/contrib
|
||||
...
|
||||
gh pr create
|
||||
```
|
||||
|
||||
- When a PR is opened against `main`, the GitHub Actions (deploy) workflow tests that the build succeeds
|
||||
- When the PR is merged, the change is automatically deployed to the `gh-pages` branch (and the webpage is updated)
|
||||
|
||||
99% of contributions should only need the following:
|
||||
|
||||
- Add markdown files to the `website/docs/cheatsheets` folder
|
||||
- Update the `sidebar.js` file to add a page to the sidebar
|
||||
- Put any images in `website/docs/cheatsheets/<path-to-cheat-sheet-directory>/img/` and refer to them like this: `![](img/<image-name>.png)`
|
||||
|
||||
If you need to do anything more than adding a new page to the sidebar (e.g.
|
||||
modify the nav bar) then please refer to the [Docusaurus 2 documentation](https://v2.docusaurus.io/).
|
||||
|
||||
## Previewing changes locally
|
||||
|
||||
- Install [npm](https://nodejs.org/en/download/) and [yarn](https://classic.yarnpkg.com/en/docs/install#windows-stable).
|
||||
|
||||
- Initial Docusaurus installation (**first time only**):
|
||||
|
||||
```bash
|
||||
cd website
|
||||
npm install
|
||||
```
|
||||
|
||||
|
||||
- Run local server while developing:
|
||||
|
||||
```bash
|
||||
cd website
|
||||
yarn start
|
||||
```
|
||||
|
||||
See Docusaurus instructions [here](https://v2.docusaurus.io/docs/installation) for more details.
|
|
@ -0,0 +1,34 @@
|
|||
---
|
||||
title: Deployment
|
||||
description: One-time website deployment setup.
|
||||
---
|
||||
|
||||
## Deployment
|
||||
|
||||
This article describes the one-time process for deploying the cheat sheets as a GitHub Pages website.
|
||||
|
||||
This repo has GitHub Actions workflows in place that automate deployment by watching the `main` branch.
|
||||
If you are interested in how deployment works then read on!
|
||||
|
||||
### GitHub Actions
|
||||
|
||||
We use GitHub Actions to automate deployment. Setup was as follows:
|
||||
|
||||
- Generated new SSH key
|
||||
- NB. Since there was an existing SSH key tied to the repo, a new key was generated in a different location: `/tmp/.ssh/id_rsa`
|
||||
- Added the public key to the repo's [deploy keys](https://developer.github.com/v3/guides/managing-deploy-keys/)
|
||||
- NB. Allowed write access
|
||||
- Added the private key as a [GitHub secret](https://help.github.com/en/actions/configuring-and-managing-workflows/creating-and-storing-encrypted-secrets)
|
||||
- We use a repo-level (not org-level) secret
|
||||
- The secret is named `GH_PAGES_DEPLOY`
|
||||
- `xclip -sel clip < /tmp/.ssh/id_rsa` copies the private key to the clipboard for pasting into the secret
|
||||
|
||||
### Manual
|
||||
|
||||
It is possible to make manual deployments without using the GitHub action above.
|
||||
|
||||
```console
|
||||
GIT_USER=<Your GitHub username> USE_SSH=true yarn deploy
|
||||
```
|
||||
|
||||
If you are using GitHub Pages for hosting, this command is a convenient way to build the website and push it to the `gh-pages` branch.
|
|
@ -0,0 +1,115 @@
|
|||
---
|
||||
title: Issue Triage Process
|
||||
id: issues
|
||||
description: GitHub issue triage process for Azure Machine Learning.
|
||||
keywords:
|
||||
- azure machine learning
|
||||
- aml
|
||||
- azure
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
This page defines the triage process for Azure Machine Learning (AML) repositories.
|
||||
|
||||
## Repositories
|
||||
|
||||
AML examples:
|
||||
|
||||
- https://github.com/Azure/MachineLearningNotebooks
|
||||
- https://github.com/Azure/azureml-examples
|
||||
|
||||
Azure core:
|
||||
|
||||
- https://github.com/Azure/azure-cli
|
||||
- https://github.com/Azure/azure-cli-extensions
|
||||
- https://github.com/Azure/azure-powershell
|
||||
- https://github.com/Azure/azure-rest-api-specs
|
||||
- https://github.com/Azure/azure-sdk-for-js
|
||||
- https://github.com/Azure/azure-sdk-for-python
|
||||
|
||||
> To request a repository to be added, [open an issue](https://github.com/Azure/azureml-web/issues)
|
||||
|
||||
## Code of Conduct
|
||||
|
||||
All interactions on GitHub must follow the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
|
||||
|
||||
## Priority
|
||||
|
||||
The priority is the GitHub user experience.
|
||||
|
||||
## Metrics
|
||||
|
||||
- FQR: first quality response
|
||||
- TTC: time to close
|
||||
|
||||
## Goals
|
||||
|
||||
- triage issue area and type in <3 hrs
|
||||
- FQR <8 hrs
|
||||
- TTC for questions <5 days
|
||||
- TTC for bugs <30 days
|
||||
|
||||
## SLA
|
||||
|
||||
- triage <1 day
|
||||
- FQR <3 days
|
||||
|
||||
## Labels
|
||||
|
||||
### Areas
|
||||
|
||||
#### Foundations
|
||||
|
||||
- `Foundations/Data`
|
||||
- `Foundations/Compute`
|
||||
- `Foundations/Infrastructure`
|
||||
- `Foundations/Admin`
|
||||
|
||||
#### Experiences
|
||||
|
||||
- `Experiences/UI`
|
||||
- `Experiences/Lifecycle`
|
||||
- `Experiences/Intelligence`
|
||||
- `Experiences/Inference`
|
||||
|
||||
#### Pipelines
|
||||
|
||||
- `Pipelines/UI`
|
||||
- `Pipelines/Aether`
|
||||
|
||||
### Issue types
|
||||
|
||||
- `bug`
|
||||
- `question`
|
||||
- `feature-request`
|
||||
|
||||
### Other
|
||||
|
||||
- `needs-details`: additional details needed from author
|
||||
- `v2`: planned for AMLv2
|
||||
|
||||
## Process
|
||||
|
||||
### Triage
|
||||
|
||||
Initial triage will be performed by the GitHub v-team. On initial triage, assign the correct area label and issue type.
|
||||
|
||||
If the issue needs obvious clarification before this can be done, kindly ask the user. If the issue has no path to closing without user response, mark it as `needs-details`.
|
||||
|
||||
After initial triage, it is up to each area (Experiences, Foundations, Pipelines) to further triage as necessary to the correct engineering team members.
|
||||
|
||||
One type of issue may be changed to another, e.g. an issue like “can I do X” could end up as a feature request for X. Simply change the issue labels as appropriate. In some cases, it might make sense to open a new issue and close the original instead of changing the label.
|
||||
|
||||
Once the issue is understood, it is up to each area to appropriately route through internal tools such as ADO, maintaining the GitHub issue as the point of communication with the user. Major developments should be communicated back to the user.
|
||||
|
||||
### Closing
|
||||
|
||||
Issues may be closed by their creator at any time, which is preferred, **especially for questions**.
|
||||
|
||||
Additionally, issues may be closed once:
|
||||
|
||||
- `needs-details`: user/author has not responded for 5+ days with no other path to closure
|
||||
- `question`: the question has been thoroughly answered with relevant links, documentation, and examples and has no follow-up questions from user(s) in 48 hrs
|
||||
- `bug`: the bug fix has been released, tested, and the user confirms the solution or does not respond for 48 hrs after being made aware of the fix
|
||||
- `feature-request`: the feature has been released, tested, and the user confirms the solution or does not respond for 48 hrs after being made aware of the release
|
|
@ -0,0 +1,481 @@
|
|||
{
|
||||
"Imports Group: Basic": {
|
||||
"prefix": ["import-basic"],
|
||||
"body": [
|
||||
"from azureml.core import Workspace # connect to workspace",
|
||||
"from azureml.core import Experiment # connect/create experiments",
|
||||
"from azureml.core import ComputeTarget # connect to compute",
|
||||
"from azureml.core import Environment # manage e.g. Python environments",
|
||||
"from azureml.core import Datastore, Dataset # work with data",
|
||||
"$0"
|
||||
],
|
||||
"description": "Import collection of basic Azure ML classes"
|
||||
},
|
||||
|
||||
"Import Workspace": {
|
||||
"prefix": ["import-workspace"],
|
||||
"body": [
|
||||
"from azureml.core import Workspace",
|
||||
"$0"
|
||||
],
|
||||
"description": "Import Workspace class"
|
||||
},
|
||||
|
||||
"Import Compute Target": {
|
||||
"prefix": ["import-compute-target"],
|
||||
"body": [
|
||||
"from azureml.core import ComputeTarget",
|
||||
"$0"
|
||||
],
|
||||
"description": "Import ComputeTarget class"
|
||||
},
|
||||
|
||||
"Import Environment": {
|
||||
"prefix": ["import-environment"],
|
||||
"body": [
|
||||
"from azureml.core import Environment",
|
||||
"$0"
|
||||
],
|
||||
"description": "Import Environment class"
|
||||
},
|
||||
|
||||
"Import ScriptRunConfig": {
|
||||
"prefix": ["import-script-run-config", "import-src"],
|
||||
"body": [
|
||||
"from azureml.core import ScriptRunConfig",
|
||||
"$0"
|
||||
],
|
||||
"description": "Import ScriptRunConfig class"
|
||||
},
|
||||
|
||||
"Import Dataset": {
|
||||
"prefix": ["import-dataset"],
|
||||
"body": [
|
||||
"from azureml.core import Dataset",
|
||||
"$0"
|
||||
],
|
||||
"description": "Import Dataset class"
|
||||
},
|
||||
|
||||
"Import Datastore": {
|
||||
"prefix": ["import-datastore"],
|
||||
"body": [
|
||||
"from azureml.core import Datastore",
|
||||
"$0"
|
||||
],
|
||||
"description": "Import Datastore class"
|
||||
},
|
||||
|
||||
"Import Run": {
|
||||
"prefix": ["import-run"],
|
||||
"body": [
|
||||
"from azureml.core import Run",
|
||||
"$0"
|
||||
],
|
||||
"description": "Import Run class"
|
||||
},
|
||||
|
||||
"Import Conda Dependencies": {
|
||||
"prefix": ["import-conda-dependencies"],
|
||||
"body": [
|
||||
"from azureml.core.conda_dependencies import CondaDependencies",
|
||||
"$0"
|
||||
],
|
||||
"description": "Import CondaDependencies class"
|
||||
},
|
||||
|
||||
"Get Workspace From Config": {
|
||||
"prefix": ["get-workspace-config", "ws-config"],
|
||||
"body": [
|
||||
"from azureml.core import Workspace",
|
||||
"ws = Workspace.from_config()",
|
||||
"$0"
|
||||
],
|
||||
"description": "Get Azure ML Workspace from config"
|
||||
},
|
||||
|
||||
"Get Workspace": {
|
||||
"prefix": ["get-workspace", "get-ws"],
|
||||
"body": [
|
||||
"from azureml.core import Workspace",
|
||||
"ws = Workspace.get(",
|
||||
" name='${1:name}',",
|
||||
" subscription_id='${2:subscription_id}',",
|
||||
" resource_group='${3:resource_group}',",
|
||||
")",
|
||||
"$0"
|
||||
],
|
||||
"description": "Get Azure ML Workspace"
|
||||
},
|
||||
|
||||
"Get Compute": {
|
||||
"prefix": ["get-compute"],
|
||||
"body": [
|
||||
"from azureml.core import ComputeTarget",
|
||||
"target = ComputeTarget(${2:ws}, '${1:<compute_target_name>}')",
|
||||
"$0"
|
||||
],
|
||||
"description": "Get Azure ML Compute Target"
|
||||
},
|
||||
|
||||
"Get Compute with SSH": {
|
||||
"prefix": ["get-compute-ssh"],
|
||||
"body": [
|
||||
"from azureml.core.compute import AmlCompute",
|
||||
"from azureml.core.compute_target import ComputeTargetException",
|
||||
"",
|
||||
"ssh_public_key = 'public-key-here'",
|
||||
"compute_config = AmlCompute.provisioning_configuration(",
|
||||
" vm_size='$2',",
|
||||
" min_nodes=$3,",
|
||||
" max_nodes=$4,",
|
||||
" admin_username='$5',",
|
||||
" admin_user_ssh_key=ssh_public_key,",
|
||||
" vm_priority='${6|lowpriority,dedicated|}',",
|
||||
" remote_login_port_public_access='Enabled',",
|
||||
" )",
|
||||
"",
|
||||
"cluster = ComputeTarget.create(",
|
||||
" workspace=${7:workspace_name},",
|
||||
" name='${8:target_name}',",
|
||||
" compute_config=compute_config,",
|
||||
")",
|
||||
"$0"
|
||||
],
|
||||
"description": "Get Azure ML Compute Target with SSH"
|
||||
},
|
||||
|
||||
"Get Environment": {
|
||||
"prefix": ["get-environment"],
|
||||
"body": [
|
||||
"from azureml.core import Environment",
|
||||
"${2:env} = Environment('${1:<env-name>}')",
|
||||
"$0"
|
||||
],
|
||||
"description": "Get Azure ML Environment"
|
||||
},
|
||||
|
||||
"Get Environment From Pip": {
|
||||
"prefix": ["get-environment-pip", "env-pip"],
|
||||
"body": [
|
||||
"from azureml.core import Environment",
|
||||
"env = Environment.from_pip_requirements(",
|
||||
" name='${1:env_name}',",
|
||||
" file_path='${2:requirements.txt}',",
|
||||
")",
|
||||
"$0"
|
||||
],
|
||||
"description": "Create environment from pip requirements.txt"
|
||||
},
|
||||
|
||||
"Get Environment From Conda": {
|
||||
"prefix": ["get-environment-conda", "env-conda"],
|
||||
"body": [
|
||||
"from azureml.core import Environment",
|
||||
"env = Environment.from_conda_specification(",
|
||||
" name='${1:env_name}',",
|
||||
" file_path='${2:env.yml}',",
|
||||
")",
|
||||
"$0"
|
||||
],
|
||||
"description": "Create environment from Conda env.yml file"
|
||||
},
|
||||
|
||||
"Get Environment From SDK": {
|
||||
"prefix": ["get-environment-sdk", "env-sdk"],
|
||||
"body": [
|
||||
"from azureml.core import Environment",
|
||||
"from azureml.core.conda_dependencies import CondaDependencies",
|
||||
"env = Environment('${1:my-env}')",
|
||||
"",
|
||||
"conda = CondaDependencies()",
|
||||
"",
|
||||
"# add channels",
|
||||
"conda.add_channel('$2')",
|
||||
"",
|
||||
"# add conda packages",
|
||||
"conda.add_conda_package('$3')",
|
||||
"",
|
||||
"# add pip packages",
|
||||
"conda.add_pip_package('$4')",
|
||||
"",
|
||||
"# add conda dependencies to environment",
|
||||
"env.python.conda_dependencies = conda",
|
||||
"$0"
|
||||
],
|
||||
"description": "Create environment using CondaDependencies class"
|
||||
},
|
||||
|
||||
"Get Environment From Custom image": {
|
||||
"prefix": ["get-environment-custom-image", "env-image"],
|
||||
"body": [
|
||||
"from azureml.core import Environment",
|
||||
"env = Environment('${1:my-env}')",
|
||||
"",
|
||||
"env.docker.enabled = True",
|
||||
"",
|
||||
"# base image for DockerHub",
|
||||
"env.docker.base_image = '${2}'",
|
||||
"",
|
||||
"# if you are using base image from a Dockerfile",
|
||||
"# env.docker.base_image = None",
|
||||
"# env.docker.base_dockerfile = './Dockerfile'",
|
||||
"",
|
||||
"# The user_managed_dependencies flag to True will use your custom image's built-in Python environment. ",
|
||||
"env.python.user_managed_dependencies = True",
|
||||
"",
|
||||
"$0"
|
||||
],
|
||||
"description": "Create environment using Custom image"
|
||||
},
|
||||
|
||||
"Workspace Compute Targets": {
|
||||
"prefix": ["ws-compute-target"],
|
||||
"body": [
|
||||
"target = ws.compute_targets['${1:target-name}']",
|
||||
"$0"
|
||||
],
|
||||
"description": "Get compute target from workspace"
|
||||
},
|
||||
|
||||
"Workspace Environments": {
|
||||
"prefix": ["ws-environment"],
|
||||
"body": [
|
||||
"env = ws.environments['${1:env-name}']",
|
||||
"$0"
|
||||
],
|
||||
"description": "Get environment from workspace"
|
||||
},
|
||||
|
||||
"Workspace Datastores": {
|
||||
"prefix": ["ws-datastore"],
|
||||
"body": [
|
||||
"datastore = ws.datastores['${1:datastore-name}']",
|
||||
"$0"
|
||||
],
|
||||
"description": "Get datastore from workspace"
|
||||
},
|
||||
|
||||
"Workspace Datasets": {
|
||||
"prefix": ["ws-dataset"],
|
||||
"body": [
|
||||
"dataset = ws.datasets['${1:dataset-name}']",
|
||||
"$0"
|
||||
],
|
||||
"description": "Get dataset from workspace"
|
||||
},
|
||||
|
||||
"Workspace Experiment": {
|
||||
"prefix": ["ws-experiment"],
|
||||
"body": [
|
||||
"exp = ws.experiments['${1:experiment-name}']",
|
||||
"$0"
|
||||
],
|
||||
"description": "Get (existing) experiment from workspace"
|
||||
},
|
||||
|
||||
"Workspace Models": {
|
||||
"prefix": ["ws-model"],
|
||||
"body": [
|
||||
"model = ws.models['${1:model-name}']",
|
||||
"$0"
|
||||
],
|
||||
"description": "Get model from workspace"
|
||||
},
|
||||
|
||||
"Script Run Config": {
|
||||
"prefix": ["script-run-config", "src"],
|
||||
"body": [
|
||||
"from azureml.core import Workspace, Experiment, ScriptRunConfig",
|
||||
"",
|
||||
"# get workspace",
|
||||
"ws = Workspace.from_config()",
|
||||
"",
|
||||
"# get compute target",
|
||||
"target = ws.compute_targets['${1:target-name}']",
|
||||
"",
|
||||
"# get registered environment",
|
||||
"env = ws.environments['${2:env-name}']",
|
||||
"",
|
||||
"# get/create experiment",
|
||||
"exp = Experiment(ws, '${3:experiment_name}')",
|
||||
"",
|
||||
"# set up script run configuration",
|
||||
"config = ScriptRunConfig(",
|
||||
" source_directory='${4:.}',",
|
||||
" script='${5:script.py}',",
|
||||
" compute_target=target,",
|
||||
" environment=env,",
|
||||
" arguments=[${6:'--meaning', 42}],",
|
||||
")",
|
||||
"",
|
||||
"# submit script to AML",
|
||||
"run = exp.submit(config)",
|
||||
"print(run.get_portal_url()) # link to ml.azure.com",
|
||||
"run.wait_for_completion(show_output=True)",
|
||||
"$0"
|
||||
],
|
||||
"description": "Set up ScriptRunConfig including compute target, environment and experiment"
|
||||
},
|
||||
|
||||
"Script Run Config with Command": {
|
||||
"prefix": ["script-run-config-command", "command-src", "src-command"],
|
||||
"body": [
|
||||
"from azureml.core import Workspace, Experiment, ScriptRunConfig",
|
||||
"",
|
||||
"# get workspace",
|
||||
"ws = Workspace.from_config()",
|
||||
"",
|
||||
"# get compute target",
|
||||
"target = ws.compute_targets['${1:target-name}']",
|
||||
"",
|
||||
"# get registered environment",
|
||||
"env = ws.environments['${2:env-name}']",
|
||||
"",
|
||||
"# get/create experiment",
|
||||
"exp = Experiment(ws, '${3:experiment_name}')",
|
||||
"",
|
||||
"# create command",
|
||||
"command = 'python ${4:script.py} ${5:--argument value}'.split()",
|
||||
"",
|
||||
"# set up script run configuration",
|
||||
"config = ScriptRunConfig(",
|
||||
" source_directory='${6:.}',",
|
||||
" command=command,",
|
||||
" compute_target=target,",
|
||||
" environment=env,",
|
||||
")",
|
||||
"",
|
||||
"# submit script to AML",
|
||||
"run = exp.submit(config)",
|
||||
"print(run.get_portal_url()) # link to ml.azure.com",
|
||||
"run.wait_for_completion(show_output=True)",
|
||||
"$0"
|
||||
],
|
||||
"description": "Set up ScriptRunConfig using command argument"
|
||||
},
|
||||
"Script Run Config with Distributed Config": {
|
||||
"prefix": ["script-run-config-distributed", "distributed-src", "src-distributed"],
|
||||
"body": [
|
||||
"from azureml.core import Workspace, ScriptRunConfig, Environment, Experiment",
|
||||
"from azureml.core.runconfig import MpiConfiguration",
|
||||
"",
|
||||
"# get workspace",
|
||||
"ws = Workspace.from_config()",
|
||||
"",
|
||||
"# get compute target",
|
||||
"target = ws.compute_targets['${1:target-name}']",
|
||||
"",
|
||||
"# get curated environment",
|
||||
"curated_env_name = '${2:AzureML-PyTorch-1.6-GPU}'",
|
||||
"env = Environment.get(workspace=ws, name=curated_env_name)",
|
||||
"",
|
||||
"# get/create experiment",
|
||||
"exp = Experiment(ws, '${3:experiment_name}')",
|
||||
"",
|
||||
"# distributed job configuration",
|
||||
"distributed_job_config = MpiConfiguration(process_count_per_node=4, node_count=2)",
|
||||
"",
|
||||
"# set up script run configuration",
|
||||
"config = ScriptRunConfig(",
|
||||
" source_directory='${4:.}',",
|
||||
" script='${5:script.py}',",
|
||||
" compute_target=target,",
|
||||
" environment=env,",
|
||||
" distributed_job_config=distributed_job_config,",
|
||||
")",
|
||||
"",
|
||||
"# submit script to AML",
|
||||
"run = exp.submit(config)",
|
||||
"print(run.get_portal_url()) # link to ml.azure.com",
|
||||
"run.wait_for_completion(show_output=True)",
|
||||
"$0"
|
||||
],
|
||||
"description": "Set up ScriptRunConfig for distributed training."
|
||||
},
|
||||
|
||||
"Run Details Widget": {
|
||||
"prefix": ["run-details-widget"],
|
||||
"body": [
|
||||
"from azureml.core import Workspace,Experiment,Run",
|
||||
"from azureml.widgets import RunDetails",
|
||||
"",
|
||||
"# get workspace",
|
||||
"ws = Workspace.from_config()",
|
||||
"",
|
||||
"# get/create experiment",
|
||||
"exp = Experiment(ws, '${1:experiment_name}')",
|
||||
"",
|
||||
"# get run",
|
||||
"run = Run(exp,'${2:run_id}')",
|
||||
"",
|
||||
"# submit script to AML",
|
||||
"RunDetails(run).show()",
|
||||
"$0"
|
||||
],
|
||||
"description": "Represents a Jupyter notebook widget used to view the progress of model training."
|
||||
},
|
||||
|
||||
"Consume Dataset": {
|
||||
"prefix": ["consume-dataset"],
|
||||
"body": [
|
||||
"#azureml-core of version 1.0.72 or higher is required",
|
||||
"from azureml.core import Workspace, Dataset",
|
||||
"",
|
||||
"# get/create experiment",
|
||||
"ws = Workspace.from_config()",
|
||||
"",
|
||||
"# get dataset",
|
||||
"dataset = Dataset.get_by_name(ws, name='${1:dataset_name}')",
|
||||
"dataset.download(target_path='.', overwrite=False)",
|
||||
"$0"
|
||||
],
|
||||
"description": "Download Azure ML dataset to current working directory"
|
||||
},
|
||||
|
||||
"Create Tabular Dataset": {
|
||||
"prefix": ["create-tabular-dataset"],
|
||||
"body": [
|
||||
"from azureml.core import Workspace, Datastore, Dataset",
|
||||
"",
|
||||
"datastore_name = '${1:datastore_name}'",
|
||||
"",
|
||||
"# get workspace",
|
||||
"ws = Workspace.from_config()",
|
||||
"",
|
||||
"# retrieve an existing datastore in the workspace by name",
|
||||
"datastore = Datastore.get(ws, datastore_name)",
|
||||
"",
|
||||
"# create a TabularDataset from 1 file paths in datastore",
|
||||
"datastore_paths = [(datastore, ${2:file_path})]",
|
||||
"",
|
||||
"custom_ds = Dataset.Tabular.from_delimited_files(path=datastore_paths)",
|
||||
"$0"
|
||||
],
|
||||
"description": "Create Azure ML tabular dataset."
|
||||
},
|
||||
|
||||
"Create File Dataset": {
|
||||
"prefix": ["create-file-dataset"],
|
||||
"body": [
|
||||
"# create a FileDataset pointing to files in 'animals' folder and its subfolders recursively",
|
||||
"from azureml.core import Workspace, Datastore, Dataset",
|
||||
"",
|
||||
"datastore_name = '${1:datastore_name}'",
|
||||
"",
|
||||
"# get workspace",
|
||||
"ws = Workspace.from_config()",
|
||||
"",
|
||||
"# retrieve an existing datastore in the workspace by name",
|
||||
"datastore = Datastore.get(ws, datastore_name)",
|
||||
"",
|
||||
"# create a FileDataset pointing to files in your folder and its subfolders recursively, you can also use public web urls paths",
|
||||
"datastore_paths = [(datastore, ${2:file_path})]",
|
||||
"",
|
||||
"custom_ds = Dataset.File.from_files(path=datastore_paths)",
|
||||
"$0"
|
||||
],
|
||||
"description": "Create Azure ML file dataset."
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
# VS Code Snippets
|
||||
|
||||
Notes for contributing Azure ML Snippets.
|
||||
|
||||
For instructions on using the snippets, see https://azure.github.io/azureml-web/docs/vs-code-snippets/snippets.
|
||||
|
||||
1. Add snippets to `python.json` (see the example entry below). For more details on VS Code snippets: [vs-code-docs](https://code.visualstudio.com/docs/editor/userdefinedsnippets)
|
||||
2. Run `python snippets-parser.py` to automatically update the `snippets.md` (which will document your changes)
|
||||
3. Make a PR to the `main` branch and request a review.
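
For reference, a minimal hypothetical entry in `python.json` could look like this (the name, prefix, and body are illustrative):

```json
"Import Experiment": {
    "prefix": ["import-experiment"],
    "body": [
        "from azureml.core import Experiment",
        "$0"
    ],
    "description": "Import Experiment class"
}
```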
|
|
@ -0,0 +1,112 @@
|
|||
import json
|
||||
from typing import List
|
||||
|
||||
|
||||
class Snippet:
|
||||
"""Handle json snippets
|
||||
|
||||
Parse a JSON (VS Code) snippets file and generate a markdown summary.
|
||||
"""
|
||||
|
||||
def __init__(self, name, snippet_json):
|
||||
self.name = name
|
||||
self.description = snippet_json.get("description")
|
||||
self.prefix = self._read_prefix(snippet_json.get("prefix"))
|
||||
self.body = snippet_json.get("body")
|
||||
|
||||
def __repr__(self):
|
||||
return f"Snippet({self.name})"
|
||||
|
||||
@staticmethod
|
||||
def _read_prefix(prefix):
|
||||
"""Guarentee prefix is of type List."""
|
||||
if isinstance(prefix, list):
|
||||
return prefix
|
||||
else:
|
||||
assert isinstance(prefix, str)
|
||||
return [prefix]
|
||||
|
||||
def to_markdown(self) -> List[str]:
|
||||
"""Convert snippet to markdown (as list of lines)."""
|
||||
lines = []
|
||||
|
||||
# add heading
|
||||
heading = f"### {self.name}"
|
||||
lines.append(heading)
|
||||
lines.append("")
|
||||
|
||||
# add description
|
||||
description = f"Description: {self.description}"
|
||||
lines.append(description)
|
||||
lines.append("")
|
||||
|
||||
# add prefix(es)
|
||||
if len(self.prefix) > 1:
|
||||
prefix = f"Prefixes: "
|
||||
else:
|
||||
prefix = f"Prefix: "
|
||||
for p in self.prefix:
|
||||
prefix += f"`{p}`, "
|
||||
prefix = prefix[:-2] # remove trailing comma and whitespace
|
||||
lines.append(prefix)
|
||||
lines.append("")
|
||||
|
||||
# add python snippet
|
||||
lines.append("```python")
|
||||
for line in self.body:
|
||||
if line == "$0":
|
||||
continue
|
||||
lines.append(line)
|
||||
lines.append("```")
|
||||
|
||||
return lines
|
||||
|
||||
@staticmethod
|
||||
def _convert_to_json(body):
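        """Quote each line and comma-join all but the last (JSON-style); note this helper is not called elsewhere in this script."""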
|
||||
json_body = []
|
||||
for line in body[:-1]:
|
||||
line = '"' + line + '",'
|
||||
json_body.append(line)
|
||||
line = '"' + body[-1] + '"'
|
||||
json_body.append(line)
|
||||
return json_body
|
||||
|
||||
|
||||
frontmatter = """---
|
||||
title: VS Code Snippets
|
||||
description: A collection of VS Code Snippets for working with Azure ML.
|
||||
---
|
||||
|
||||
We have compiled a collection of useful templates in the form of
|
||||
[VS code snippets](https://code.visualstudio.com/docs/editor/userdefinedsnippets).
|
||||
|
||||
![VS Code Snippets](vs-code-snippets-demo.gif)
|
||||
|
||||
To add these snippets to your VS Code: `ctrl+shift+p` > Type 'Configure user
|
||||
snippets' > Select `python.json`. All of these snippets are available here:
|
||||
[python.json](https://github.com/Azure/azureml-web/blob/main/website/docs/vs-code-snippets/python.json)
|
||||
|
||||
"""
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
# parse snippets
|
||||
with open("python.json") as f:
|
||||
snippets_file = json.load(f)
|
||||
|
||||
snippets = []
|
||||
for name, snippet_json in snippets_file.items():
|
||||
snippet = Snippet(name, snippet_json)
|
||||
snippets.append(snippet)
|
||||
|
||||
# create file and write frontmatter
|
||||
md_filename = "snippets.md"
|
||||
with open(md_filename, "w") as f:
|
||||
# write frontmatter
|
||||
f.writelines(frontmatter)
|
||||
|
||||
# write each snippet
|
||||
for snippet in snippets:
|
||||
lines = snippet.to_markdown()
|
||||
for line in lines:
|
||||
f.write(line + "\n")
|
|
@ -0,0 +1,494 @@
|
|||
---
|
||||
title: VS Code Snippets
|
||||
description: A collection of VS Code Snippets for working with Azure ML.
|
||||
---
|
||||
|
||||
We have compiled a collection of useful templates in the form of
|
||||
[VS code snippets](https://code.visualstudio.com/docs/editor/userdefinedsnippets).
|
||||
|
||||
![VS Code Snippets](vs-code-snippets-demo.gif)
|
||||
|
||||
To add these snippets to your VS Code: `ctrl+shift+p` > Type 'Configure user
|
||||
snippets' > Select `python.json`. All of these snippets are available here:
|
||||
[python.json](https://github.com/Azure/azureml-web/blob/main/website/docs/vs-code-snippets/python.json)
|
||||
|
||||
### Imports Group: Basic
|
||||
|
||||
Description: Import collection of basic Azure ML classes
|
||||
|
||||
Prefix: `import-basic`
|
||||
|
||||
```python
|
||||
from azureml.core import Workspace # connect to workspace
|
||||
from azureml.core import Experiment # connect/create experiments
|
||||
from azureml.core import ComputeTarget # connect to compute
|
||||
from azureml.core import Environment # manage e.g. Python environments
|
||||
from azureml.core import Datastore, Dataset # work with data
|
||||
```
|
||||
### Import Workspace
|
||||
|
||||
Description: Import Workspace class
|
||||
|
||||
Prefix: `import-workspace`
|
||||
|
||||
```python
|
||||
from azureml.core import Workspace
|
||||
```
|
||||
### Import Compute Target
|
||||
|
||||
Description: Import ComputeTarget class
|
||||
|
||||
Prefix: `import-compute-target`
|
||||
|
||||
```python
|
||||
from azureml.core import ComputeTarget
|
||||
```
|
||||
### Import Environment
|
||||
|
||||
Description: Import Environment class
|
||||
|
||||
Prefix: `import-environment`
|
||||
|
||||
```python
|
||||
from azureml.core import Environment
|
||||
```
|
||||
### Import ScriptRunConfig
|
||||
|
||||
Description: Import ScriptRunConfig class
|
||||
|
||||
Prefixes: `import-script-run-config`, `import-src`
|
||||
|
||||
```python
|
||||
from azureml.core import ScriptRunConfig
|
||||
```
|
||||
### Import Dataset
|
||||
|
||||
Description: Import Dataset class
|
||||
|
||||
Prefix: `import-dataset`
|
||||
|
||||
```python
|
||||
from azureml.core import Dataset
|
||||
```
|
||||
### Import Datastore
|
||||
|
||||
Description: Import Datastore class
|
||||
|
||||
Prefix: `import-datastore`
|
||||
|
||||
```python
|
||||
from azureml.core import Datastore
|
||||
```
|
||||
### Import Run
|
||||
|
||||
Description: Import Run class
|
||||
|
||||
Prefix: `import-run`
|
||||
|
||||
```python
|
||||
from azureml.core import Run
|
||||
```
|
||||
### Import Conda Dependencies
|
||||
|
||||
Description: Import CondaDependencies class
|
||||
|
||||
Prefix: `import-conda-dependencies`
|
||||
|
||||
```python
|
||||
from azureml.core.conda_dependencies import CondaDependencies
|
||||
```
|
||||
### Get Workspace From Config
|
||||
|
||||
Description: Get Azure ML Workspace from config
|
||||
|
||||
Prefixes: `get-workspace-config`, `ws-config`
|
||||
|
||||
```python
|
||||
from azureml.core import Workspace
|
||||
ws = Workspace.from_config()
|
||||
```
|
||||
### Get Workspace
|
||||
|
||||
Description: Get Azure ML Workspace
|
||||
|
||||
Prefixes: `get-workspace`, `get-ws`
|
||||
|
||||
```python
|
||||
from azureml.core import Workspace
|
||||
ws = Workspace.get(
|
||||
name='${1:name}',
|
||||
subscription_id='${2:subscription_id}',
|
||||
resource_group='${3:resource_group}',
|
||||
)
|
||||
```
|
||||
### Get Compute
|
||||
|
||||
Description: Get Azure ML Compute Target
|
||||
|
||||
Prefix: `get-compute`
|
||||
|
||||
```python
|
||||
from azureml.core import ComputeTarget
|
||||
target = ComputeTarget(${2:ws}, '${1:<compute_target_name>}')
|
||||
```
|
||||
### Get Compute with SSH
|
||||
|
||||
Description: Get Azure ML Compute Target with SSH
|
||||
|
||||
Prefix: `get-compute-ssh`
|
||||
|
||||
```python
|
||||
from azureml.core.compute import AmlCompute
|
||||
from azureml.core.compute_target import ComputeTargetException
|
||||
|
||||
ssh_public_key = 'public-key-here'
|
||||
compute_config = AmlCompute.provisioning_configuration(
|
||||
vm_size='$2',
|
||||
min_nodes=$3,
|
||||
max_nodes=$4,
|
||||
admin_username='$5',
|
||||
admin_user_ssh_key=ssh_public_key,
|
||||
vm_priority='${6|lowpriority,dedicated|}',
|
||||
remote_login_port_public_access='Enabled',
|
||||
)
|
||||
|
||||
cluster = ComputeTarget.create(
|
||||
workspace=${7:workspace_name},
|
||||
name='${8:target_name}',
|
||||
compute_config=compute_config,
|
||||
)
|
||||
```
|
||||
### Get Environment
|
||||
|
||||
Description: Get Azure ML Environment
|
||||
|
||||
Prefix: `get-environment`
|
||||
|
||||
```python
|
||||
from azureml.core import Environment
|
||||
${2:env} = Environment('${1:<env-name>}')
|
||||
```
|
||||
### Get Environment From Pip
|
||||
|
||||
Description: Create environment from pip requirements.txt
|
||||
|
||||
Prefixes: `get-environment-pip`, `env-pip`
|
||||
|
||||
```python
|
||||
from azureml.core import Environment
|
||||
env = Environment.from_pip_requirements(
|
||||
name='${1:env_name}',
|
||||
file_path='${2:requirements.txt}',
|
||||
)
|
||||
```
|
||||
### Get Environment From Conda
|
||||
|
||||
Description: Create environment from Conda env.yml file
|
||||
|
||||
Prefixes: `get-environment-conda`, `env-conda`
|
||||
|
||||
```python
|
||||
from azureml.core import Environment
|
||||
env = Environment.from_conda_specification(
|
||||
name='${1:env_name}',
|
||||
file_path='${2:env.yml}',
|
||||
)
|
||||
```
|
||||
### Get Environment From SDK
|
||||
|
||||
Description: Create environment using CondaDependencies class
|
||||
|
||||
Prefixes: `get-environment-sdk`, `env-sdk`
|
||||
|
||||
```python
|
||||
from azureml.core import Environment
|
||||
from azureml.core.conda_dependencies import CondaDependencies
|
||||
env = Environment('${1:my-env}')
|
||||
|
||||
conda = CondaDependencies()
|
||||
|
||||
# add channels
|
||||
conda.add_channel('$2')
|
||||
|
||||
# add conda packages
|
||||
conda.add_conda_package('$3')
|
||||
|
||||
# add pip packages
|
||||
conda.add_pip_package('$4')
|
||||
|
||||
# add conda dependencies to environment
|
||||
env.python.conda_dependencies = conda
|
||||
```
|
||||
### Get Environment From Custom image
|
||||
|
||||
Description: Create environment using Custom image
|
||||
|
||||
Prefixes: `get-environment-custom-image`, `env-image`
|
||||
|
||||
```python
|
||||
from azureml.core import Environment
|
||||
env = Environment('${1:my-env}')
|
||||
|
||||
env.docker.enabled = True
|
||||
|
||||
# base image for DockerHub
|
||||
env.docker.base_image = '${2}'
|
||||
|
||||
# if you are using base image from a Dockerfile
|
||||
# env.docker.base_image = None
|
||||
# env.docker.base_dockerfile = './Dockerfile'
|
||||
|
||||
# Setting user_managed_dependencies to True uses your custom image's built-in Python environment.
|
||||
env.python.user_managed_dependencies = True
|
||||
|
||||
```
|
||||
### Workspace Compute Targets
|
||||
|
||||
Description: Get compute target from workspace
|
||||
|
||||
Prefix: `ws-compute-target`
|
||||
|
||||
```python
|
||||
target = ws.compute_targets['${1:target-name}']
|
||||
```
|
||||
### Workspace Environments
|
||||
|
||||
Description: Get environment from workspace
|
||||
|
||||
Prefix: `ws-environment`
|
||||
|
||||
```python
|
||||
env = ws.environments['${1:env-name}']
|
||||
```
|
||||
### Workspace Datastores
|
||||
|
||||
Description: Get datastore from workspace
|
||||
|
||||
Prefix: `ws-datastore`
|
||||
|
||||
```python
|
||||
datastore = ws.datastores['${1:datastore-name}']
|
||||
```
|
||||
### Workspace Datasets
|
||||
|
||||
Description: Get dataset from workspace
|
||||
|
||||
Prefix: `ws-dataset`
|
||||
|
||||
```python
|
||||
dataset = ws.datasets['${1:dataset-name}']
|
||||
```
|
||||
### Workspace Experiment
|
||||
|
||||
Description: Get (existing) experiment from workspace
|
||||
|
||||
Prefix: `ws-experiment`
|
||||
|
||||
```python
|
||||
exp = ws.experiments['${1:experiment-name}']
|
||||
```
|
||||
### Workspace Models
|
||||
|
||||
Description: Get model from workspace
|
||||
|
||||
Prefix: `ws-model`
|
||||
|
||||
```python
|
||||
model = ws.models['${1:model-name}']
|
||||
```
|
||||
### Script Run Config
|
||||
|
||||
Description: Set up ScriptRunConfig including compute target, environment and experiment
|
||||
|
||||
Prefixes: `script-run-config`, `src`
|
||||
|
||||
```python
|
||||
from azureml.core import Workspace, Experiment, ScriptRunConfig
|
||||
|
||||
# get workspace
|
||||
ws = Workspace.from_config()
|
||||
|
||||
# get compute target
|
||||
target = ws.compute_targets['${1:target-name}']
|
||||
|
||||
# get registered environment
|
||||
env = ws.environments['${2:env-name}']
|
||||
|
||||
# get/create experiment
|
||||
exp = Experiment(ws, '${3:experiment_name}')
|
||||
|
||||
# set up script run configuration
|
||||
config = ScriptRunConfig(
|
||||
source_directory='${4:.}',
|
||||
script='${5:script.py}',
|
||||
compute_target=target,
|
||||
environment=env,
|
||||
arguments=[${6:'--meaning', 42}],
|
||||
)
|
||||
|
||||
# submit script to AML
|
||||
run = exp.submit(config)
|
||||
print(run.get_portal_url()) # link to ml.azure.com
|
||||
run.wait_for_completion(show_output=True)
|
||||
```
|
||||
### Script Run Config with Command
|
||||
|
||||
Description: Set up ScriptRunConfig using command argument
|
||||
|
||||
Prefixes: `script-run-config-command`, `command-src`, `src-command`
|
||||
|
||||
```python
|
||||
from azureml.core import Workspace, Experiment, ScriptRunConfig
|
||||
|
||||
# get workspace
|
||||
ws = Workspace.from_config()
|
||||
|
||||
# get compute target
|
||||
target = ws.compute_targets['${1:target-name}']
|
||||
|
||||
# get registered environment
|
||||
env = ws.environments['${2:env-name}']
|
||||
|
||||
# get/create experiment
|
||||
exp = Experiment(ws, '${3:experiment_name}')
|
||||
|
||||
# create command
|
||||
command = 'python ${4:script.py} ${5:--argument value}'.split()
|
||||
|
||||
# set up script run configuration
|
||||
config = ScriptRunConfig(
|
||||
source_directory='${6:.}',
|
||||
command=command,
|
||||
compute_target=target,
|
||||
environment=env,
|
||||
)
|
||||
|
||||
# submit script to AML
|
||||
run = exp.submit(config)
|
||||
print(run.get_portal_url()) # link to ml.azure.com
|
||||
run.wait_for_completion(show_output=True)
|
||||
```
|
||||
### Script Run Config with Distributed Config
|
||||
|
||||
Description: Set up ScriptRunConfig for distributed training.
|
||||
|
||||
Prefixes: `script-run-config-distributed`, `distributed-src`, `src-distributed`
|
||||
|
||||
```python
|
||||
from azureml.core import Workspace, ScriptRunConfig, Environment, Experiment
|
||||
from azureml.core.runconfig import MpiConfiguration
|
||||
|
||||
# get workspace
|
||||
ws = Workspace.from_config()
|
||||
|
||||
# get compute target
|
||||
target = ws.compute_targets['${1:target-name}']
|
||||
|
||||
# get curated environment
|
||||
curated_env_name = '${2:AzureML-PyTorch-1.6-GPU}'
|
||||
env = Environment.get(workspace=ws, name=curated_env_name)
|
||||
|
||||
# get/create experiment
|
||||
exp = Experiment(ws, '${3:experiment_name}')
|
||||
|
||||
# distributed job configuration
|
||||
distributed_job_config = MpiConfiguration(process_count_per_node=4, node_count=2)
|
||||
|
||||
# set up script run configuration
|
||||
config = ScriptRunConfig(
|
||||
source_directory='${4:.}',
|
||||
script='${5:script.py}',
|
||||
compute_target=target,
|
||||
environment=env,
|
||||
distributed_job_config=distributed_job_config,
|
||||
)
|
||||
|
||||
# submit script to AML
|
||||
run = exp.submit(config)
|
||||
print(run.get_portal_url()) # link to ml.azure.com
|
||||
run.wait_for_completion(show_output=True)
|
||||
```
|
||||
### Run Details Widget
|
||||
|
||||
Description: Represents a Jupyter notebook widget used to view the progress of model training.
|
||||
|
||||
Prefix: `run-details-widget`
|
||||
|
||||
```python
|
||||
from azureml.core import Workspace, Experiment, Run
|
||||
from azureml.widgets import RunDetails
|
||||
|
||||
# get workspace
|
||||
ws = Workspace.from_config()
|
||||
|
||||
# get/create experiment
|
||||
exp = Experiment(ws, '${1:experiment_name}')
|
||||
|
||||
# get run
|
||||
run = Run(exp,'${2:run_id}')
|
||||
|
||||
# show the run details widget
|
||||
RunDetails(run).show()
|
||||
```
|
||||
### Consume Dataset
|
||||
|
||||
Description: Download Azure ML dataset to current working directory
|
||||
|
||||
Prefix: `consume-dataset`
|
||||
|
||||
```python
|
||||
# azureml-core version 1.0.72 or higher is required
|
||||
from azureml.core import Workspace, Dataset
|
||||
|
||||
# get workspace
|
||||
ws = Workspace.from_config()
|
||||
|
||||
# get dataset
|
||||
dataset = Dataset.get_by_name(ws, name='${1:dataset_name}')
|
||||
dataset.download(target_path='.', overwrite=False)
|
||||
```
|
||||
### Create Tabular Dataset
|
||||
|
||||
Description: Create Azure ML tabular dataset.
|
||||
|
||||
Prefix: `create-tabular-dataset`
|
||||
|
||||
```python
|
||||
from azureml.core import Workspace, Datastore, Dataset
|
||||
|
||||
datastore_name = '${1:datastore_name}'
|
||||
|
||||
# get workspace
|
||||
ws = Workspace.from_config()
|
||||
|
||||
# retrieve an existing datastore in the workspace by name
|
||||
datastore = Datastore.get(ws, datastore_name)
|
||||
|
||||
# create a TabularDataset from file paths in the datastore
|
||||
datastore_paths = [(datastore, ${2:file_path})]
|
||||
|
||||
custom_ds = Dataset.Tabular.from_delimited_files(path=datastore_paths)
|
||||
```
|
||||
### Create File Dataset
|
||||
|
||||
Description: Create Azure ML file dataset.
|
||||
|
||||
Prefix: `create-file-dataset`
|
||||
|
||||
```python
|
||||
# create a FileDataset pointing to files in 'animals' folder and its subfolders recursively
|
||||
from azureml.core import Workspace, Datastore, Dataset
|
||||
|
||||
datastore_name = '${1:datastore_name}'
|
||||
|
||||
# get workspace
|
||||
ws = Workspace.from_config()
|
||||
|
||||
# retrieve an existing datastore in the workspace by name
|
||||
datastore = Datastore.get(ws, datastore_name)
|
||||
|
||||
# create a FileDataset pointing to files in your folder and its subfolders recursively; you can also use public web URL paths
|
||||
datastore_paths = [(datastore, ${2:file_path})]
|
||||
|
||||
custom_ds = Dataset.File.from_files(path=datastore_paths)
|
||||
```
|
Binary data
website/i18n/ja/docusaurus-plugin-content-docs/current/vs-code-snippets/vs-code-snippets-demo.gif
Normal file
After Width: | Height: | Size: 416 KiB |
|
@ -0,0 +1,46 @@
|
|||
{
|
||||
"link.title.Resources": {
|
||||
"message": "Resources",
|
||||
"description": "The title of the footer links column with title=Resources in the footer"
|
||||
},
|
||||
"link.title.Support": {
|
||||
"message": "Support",
|
||||
"description": "The title of the footer links column with title=Support in the footer"
|
||||
},
|
||||
"link.title.GitHub": {
|
||||
"message": "GitHub",
|
||||
"description": "The title of the footer links column with title=GitHub in the footer"
|
||||
},
|
||||
"link.item.label.Azure ML - Microsoft Docs": {
|
||||
"message": "Azure ML - Microsoft Docs",
|
||||
"description": "The label of footer link with label=Azure ML - Microsoft Docs linking to https://docs.microsoft.com/azure/machine-learning"
|
||||
},
|
||||
"link.item.label.Azure ML - Python API": {
|
||||
"message": "Azure ML - Python API",
|
||||
"description": "The label of footer link with label=Azure ML - Python API linking to https://docs.microsoft.com/python/api/overview/azure/ml/?view=azure-ml-py"
|
||||
},
|
||||
"link.item.label.GitHub issues": {
|
||||
"message": "GitHub issues",
|
||||
"description": "The label of footer link with label=GitHub issues linking to https://github.com/Azure/azureml-cheatsheets/issues"
|
||||
},
|
||||
"link.item.label.Stack Overflow": {
|
||||
"message": "Stack Overflow",
|
||||
"description": "The label of footer link with label=Stack Overflow linking to https://stackoverflow.microsoft.com/questions/tagged/10888"
|
||||
},
|
||||
"link.item.label.Cheat sheets": {
|
||||
"message": "Cheat sheets",
|
||||
"description": "The label of footer link with label=Cheat sheets linking to https://github.com/Azure/azureml-cheatsheets"
|
||||
},
|
||||
"link.item.label.Azure ML Examples": {
|
||||
"message": "Azure ML Examples",
|
||||
"description": "The label of footer link with label=Azure ML Examples linking to https://github.com/Azure/azureml-examples"
|
||||
},
|
||||
"link.item.label.Contribution": {
|
||||
"message": "Contribution",
|
||||
"description": "The label of footer link with label=Contribution linking to /docs/misc/contributing"
|
||||
},
|
||||
"copyright": {
|
||||
"message": "Copyright © 2021 Microsoft Corporation",
|
||||
"description": "The footer copyright"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,10 @@
|
|||
{
|
||||
"title": {
|
||||
"message": "Azure Machine Learning",
|
||||
"description": "The title in the navbar"
|
||||
},
|
||||
"item.label.Python SDK": {
|
||||
"message": "Python SDK",
|
||||
"description": "Navbar item with label Python SDK"
|
||||
}
|
||||
}
|
|
@ -5,37 +5,44 @@ import Link from '@docusaurus/Link';
|
|||
import useDocusaurusContext from '@docusaurus/useDocusaurusContext';
|
||||
import useBaseUrl from '@docusaurus/useBaseUrl';
|
||||
import styles from './styles.module.css';
|
||||
import Translate, {translate} from '@docusaurus/Translate';
|
||||
|
||||
const features = [
|
||||
{
|
||||
title: 'Cheat Sheet',
|
||||
title: <Translate id="section0">Cheat Sheet</Translate>,
|
||||
pageUrl: 'docs/cheatsheets/python/v1/cheatsheet',
|
||||
imageUrl: 'img/undraw_docusaurus_mountain.svg',
|
||||
description: (
|
||||
<>
|
||||
<Translate id="section0.desc">
|
||||
A cheat sheet for common use cases with AML.
|
||||
Get 80% of what you need in 20% of the documentation.
|
||||
</Translate>
|
||||
</>
|
||||
),
|
||||
},
|
||||
{
|
||||
title: 'Distributed GPU Training',
|
||||
title: <Translate id="section1">Distributed GPU Training</Translate>,
|
||||
pageUrl: 'docs/cheatsheets/python/v1/distributed-training',
|
||||
imageUrl: 'img/undraw_docusaurus_react.svg',
|
||||
description: (
|
||||
<>
|
||||
<Translate id="section1.desc">
|
||||
Guide to getting your distributed training code running in Azure ML.
|
||||
</Translate>
|
||||
</>
|
||||
),
|
||||
},
|
||||
{
|
||||
title: 'Environments',
|
||||
title: <Translate id="section2">Environments</Translate>,
|
||||
pageUrl: 'docs/cheatsheets/python/v1/environment',
|
||||
imageUrl: 'img/undraw_docusaurus_tree.svg',
|
||||
description: (
|
||||
<>
|
||||
<Translate id="section2.desc">
|
||||
Set up and manage your Python environments and docker images
|
||||
in Azure ML.
|
||||
</Translate>
|
||||
</>
|
||||
),
|
||||
},
|
||||
|
@ -65,8 +72,8 @@ function Home() {
|
|||
description="A user guide to Azure ML <head />">
|
||||
<header className={clsx('hero hero--primary', styles.heroBanner)}>
|
||||
<div className="container">
|
||||
<h1 className="hero__title">{siteConfig.title}</h1>
|
||||
<p className="hero__subtitle">{siteConfig.tagline}</p>
|
||||
<h1 className="hero__title"><Translate id="index.title">{siteConfig.title}</Translate></h1>
|
||||
<p className="hero__subtitle"><Translate id="index.tagline">{siteConfig.tagline}</Translate></p>
|
||||
<div className={styles.buttons}>
|
||||
<Link
|
||||
className={clsx(
|
||||
|
|