This commit is contained in:
Cindy Weng 2022-09-09 10:51:46 +01:00
Parents 352d9ebbf7 b5ed0dee2c
Commit b4ef102154
121 changed files with 26020 additions and 166 deletions

13
.github/ISSUE_TEMPLATE/config.yml vendored Normal file
View File

@ -0,0 +1,13 @@
blank_issues_enabled: false
contact_links:
- name: MLOps v2 solution accelerators discussions.
url: https://github.com/azure/mlops-v2/discussions
about: >-
Please ask questions and start open-ended discussions here.
Use issues for well-defined work in the solution accelerator repositories.
- name: Azure ML CLI issues.
url: https://github.com/azure/azure-cli-extensions/issues/new/choose
about: Please open issues with the Azure ML CLI extension here.
- name: Azure ML Python SDK issues.
url: https://github.com/azure/azure-sdk-for-python/issues/new/choose
about: Please open issues with the Azure ML Python SDK here.

21
.github/ISSUE_TEMPLATE/repository-issue.md vendored Normal file
View File

@ -0,0 +1,21 @@
---
name: Suggest an enhancement for this repository.
about: Have an idea for improvements to this repository?
title: '[repo] <title>'
labels: ''
assignees: ''
---
## Why?
<!-- What problem is this solving? -->
## How?
<!-- How are you suggesting it gets solved? -->
## Anything else?
<!--
Links? References? Anything that will give us more context about the issue that you are encountering!
-->

25
.github/ISSUE_TEMPLATE/solution-accelerator-request.md vendored Normal file
View File

@ -0,0 +1,25 @@
---
name: Request or suggest a new solution accelerator.
about: Have an idea for a new solution accelerator?
title: '[new accelerator] <title>'
labels: ''
assignees: ''
---
## Why doesn't an existing solution accelerator work?
<!-- Concisely explain why a new solution accelerator is needed. -->
## What work is needed?
<!--
Concisely explain the infrastructure and MLOps work needed.
Include as much detail as possible in how this would fit into the
overall solution accelerator.
-->
## Anything else?
<!--
Links? References? Anything that will give us more context about the issue that you are encountering!
-->

13
.github/PULL_REQUEST_TEMPLATE.md vendored Normal file
View File

@ -0,0 +1,13 @@
# PR into Azure/mlops-v2
## Checklist
I have:
- [ ] read and followed the contributing guidelines
## Changes
-
fixes #

14
.pre-commit-config.yaml Normal file
View File

@ -0,0 +1,14 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.2.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
# Opinionated code formatter so you don't have to think about formatting
- repo: https://github.com/psf/black
rev: 21.12b0
hooks:
- id: black
additional_dependencies: ['click==8.0.4']

View File

@ -12,3 +12,5 @@ dependencies:
- pandas==1.2.1
- joblib==1.0.0
- matplotlib==3.3.3
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-client
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector

View File

@ -0,0 +1,16 @@
channels:
- defaults
- anaconda
- conda-forge
dependencies:
- python=3.7.5
- pip
- pip:
- azureml-mlflow==1.38.0
- azureml-sdk==1.38.0
- scikit-learn==0.24.1
- pandas==1.2.1
- joblib==1.0.0
- matplotlib==3.3.3
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-client
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector

View File

@ -53,10 +53,18 @@ def parse_args():
parser.add_argument("--val_data", type=str, help="Path to test dataset")
parser.add_argument("--test_data", type=str, help="Path to test dataset")
parser.add_argument("--enable_monitoring", type=str, help="enable logging to ADX")
parser.add_argument("--table_name", type=str, default="mlmonitoring", help="Table name in ADX for logging")
args = parser.parse_args()
return args
def log_training_data(df, table_name):
from obs.collector import Online_Collector
collector = Online_Collector(table_name)
collector.batch_collect(df)
def main(args):
'''Read, split, and save datasets'''
@ -93,6 +101,9 @@ def main(args):
val.to_parquet((Path(args.val_data) / "val.parquet"))
test.to_parquet((Path(args.test_data) / "test.parquet"))
if (args.enable_monitoring.lower() == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower() == 'yes'):
log_training_data(data, args.table_name)
if __name__ == "__main__":

View File

@ -2,5 +2,5 @@ $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.sch
name: blue
endpoint_name: taxi-fare-online
model: azureml:taxi-model@latest
instance_type: Standard_F2s_v2
instance_type: Standard_DS2_v2
instance_count: 1

View File

View File

@ -8,6 +8,8 @@ inputs:
input: #using local data, will create an anonymous data asset
type: uri_folder
path: ../../../data/
enable_monitoring:
table_name: 'taximonitoring'
outputs:
train_data:
@ -35,9 +37,12 @@ jobs:
--train_data ${{outputs.train_data}}
--val_data ${{outputs.val_data}}
--test_data ${{outputs.test_data}}
--enable_monitoring ${{inputs.enable_monitoring}}
environment: azureml:taxi-train-env@latest
inputs:
raw_data: ${{parent.inputs.input}}
enable_monitoring: ${{parent.inputs.enable_monitoring}}
table_name: ${{parent.inputs.table_name}}
outputs:
train_data: ${{parent.outputs.train_data}}
val_data: ${{parent.outputs.val_data}}

View File

@ -11,7 +11,7 @@ variables:
- name: version
value: aml-cli-v2
- name: endpoint_name
value: taxi-fare-batch
value: taxi-batch-$(namespace)$(postfix)$(environment)
- name: endpoint_type
value: batch
@ -28,6 +28,8 @@ resources:
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: CreateBatchEndpoint

View File

@ -25,10 +25,14 @@ resources:
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
- repository: rai-vnext-preview # Template Repo
name: Azure/rai-vnext-preview # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
stages:
- stage: DeployTrainingPipeline
@ -50,6 +54,7 @@ stages:
build_type: conda
environment_name: taxi-train-env
environment_file: mlops/azureml/train/train-env.yml
enable_monitoring: $(enable_monitoring)
- checkout: rai-vnext-preview
path: s/
- template: register-rai-components.yml
@ -60,3 +65,6 @@ stages:
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
parameters:
pipeline_file: mlops/azureml/train/pipeline.yml
experiment_name: $(environment)_taxi_fare_train_$(Build.SourceBranchName)
display_name: $(environment)_taxi_fare_run_$(Build.BuildID)
enable_monitoring: $(enable_monitoring)

View File

@ -7,11 +7,11 @@ variables:
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- template: ../../../../config-infra-dev.yml
- name: version
value: aml-cli-v2
- name: endpoint_name
value: taxi-fare-online
value: taxi-online-$(namespace)$(postfix)$(environment)
- name: endpoint_type
value: online
@ -29,6 +29,7 @@ resources:
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: CreateOnlineEndpoint

View File

@ -0,0 +1,44 @@
name: deploy-batch-endpoint-pipeline
on:
workflow_dispatch:
jobs:
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
create-compute:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
with:
cluster_name: batch-cluster
size: STANDARD_DS3_V2
min_instances: 0
max_instances: 5
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-endpoint:
needs: [get-config,create-compute]
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/batch/batch-endpoint.yml
endpoint_name: ${{ format('taxi-batch-{0}', needs.get-config.outputs.bep) }}
endpoint_type: batch
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-deployment:
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
needs: [get-config,create-endpoint]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/batch/batch-deployment.yml
endpoint_name: ${{ format('taxi-batch-{0}', needs.get-config.outputs.bep) }}
endpoint_type: batch
deployment_name: eptestdeploy
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}

View File

@ -0,0 +1,29 @@
name: deploy-model-training-pipeline
on:
workflow_dispatch:
jobs:
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
register-environment:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/register-environment.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
environment_file: mlops/azureml/train/train-env.yml
conda_file: data-science/environment/train-conda.yml
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
run-pipeline:
needs: [get-config,register-environment]
uses: Azure/mlops-templates/.github/workflows/run-pipeline.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
parameters-file: mlops/azureml/train/pipeline.yml
job-name: test
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}

View File

@ -0,0 +1,42 @@
name: deploy-online-endpoint-pipeline
on:
workflow_dispatch:
jobs:
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
create-endpoint:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
endpoint_name: ${{ format('taxi-online-{0}', needs.get-config.outputs.oep) }}
endpoint_type: online
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-deployment:
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
needs: [get-config,create-endpoint]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-deployment.yml
endpoint_name: ${{ format('taxi-online-{0}', needs.get-config.outputs.oep) }}
endpoint_type: online
deployment_name: taxi-online-dp
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
allocate-traffic:
uses: Azure/mlops-templates/.github/workflows/allocate-traffic.yml@main
needs: [get-config,create-deployment]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
traffic_allocation: taxi-online-dp=100
endpoint_name: ${{ format('taxi-online-{0}', needs.get-config.outputs.oep) }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}

View File

@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
ap_vm_image: ubuntu-20.04
@ -16,7 +19,7 @@ variables:
training_env_name: credit-training
# Training AzureML Environment conda yaml
training_env_conda_yaml: data-science/environments/train.yml
training_env_conda_yaml: data-science/environment/train.yml
# Name for the training pipeline
training_pipeline_name: credit-training
@ -27,8 +30,12 @@ variables:
training_target_min_nodes: 0
training_target_max_nodes: 4
# Training arguments specification; use azureml:dataset_name:version to reference an AML Dataset for --data_path
training_arguments: --data_path azureml:uci-credit:1
# Training arguments specification
training_arguments: ''
# Training datasets specification
# Syntax: <name>:<version>:<mode>:<steps (names separated by +)>
training_datasets: uci-credit:1:download:prep
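For illustration only, a minimal Python sketch (not part of the templates; the real parsing lives in the mlops-templates repo) of how a value following the syntax above could be split into its parts:

```python
# Hypothetical helper showing the <name>:<version>:<mode>:<steps> syntax above.
def parse_dataset_spec(spec: str) -> dict:
    name, version, mode, steps = spec.split(":")
    return {
        "name": name,               # AML dataset name
        "version": version,         # dataset version
        "mode": mode,               # e.g. 'download' or 'mount'
        "steps": steps.split("+"),  # pipeline steps that consume the dataset
    }

print(parse_dataset_spec("uci-credit:1:download:prep"))
# {'name': 'uci-credit', 'version': '1', 'mode': 'download', 'steps': ['prep']}
```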
# Name under which the model will be registered
model_name: credit-ci
@ -47,7 +54,7 @@ variables:
batch_env_name: credit-batch
# Batch AzureML Environment conda yaml
batch_env_conda_yaml: data-science/environments/batch.yml
batch_env_conda_yaml: data-science/environment/batch.yml
# Name for the batch scoring pipeline
batch_pipeline_name: credit-batch-scoring
@ -73,3 +80,7 @@ variables:
batch_process_count_per_node: 1
batch_node_count: 1
# Monitoring settings
scoring_table_name: scoringdata
training_table_name: mlmonitoring

View File

@ -0,0 +1,18 @@
name: batch-monitoring
channels:
- defaults
- anaconda
- conda-forge
dependencies:
- python=3.7.5
- pip
- pip:
- azureml-defaults==1.38.0
- azureml-mlflow==1.38.0
- azureml-sdk==1.38.0
- azureml-interpret==1.38.0
- scikit-learn==0.24.1
- pandas==1.2.1
- joblib==1.0.0
- matplotlib==3.3.3
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector

View File

@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
name: mnist-train
channels:
- defaults

View File

@ -0,0 +1,22 @@
name: train
channels:
- defaults
- anaconda
- conda-forge
dependencies:
- python=3.7.5
- pip
- pip:
- azureml-mlflow==1.38.0
- azureml-sdk==1.38.0
- scikit-learn==0.24.1
- pandas==1.2.1
- joblib==1.0.0
- matplotlib==3.3.3
- fairlearn==0.7.0
- azureml-contrib-fairness==1.38.0
- interpret-community==0.24.1
- interpret-core==0.2.7
- azureml-interpret==1.38.0
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-client
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector

View File

@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import sys
import argparse
@ -40,10 +43,10 @@ def parse_args():
def main():
# Parse command-line arguments
args = parse_args()
prepared_data_path = os.path.join(args.prepared_data_path, run.parent.id)
model_path = os.path.join(args.model_path, run.parent.id)
explainer_path = os.path.join(args.explainer_path, run.parent.id)
evaluation_path = os.path.join(args.evaluation_path, run.parent.id)
prepared_data_path = args.prepared_data_path
model_path = args.model_path
explainer_path = args.explainer_path
evaluation_path = args.evaluation_path
# Make sure evaluation output path exists
if not os.path.exists(evaluation_path):
@ -111,6 +114,10 @@ def main():
for model_run in Model.list(ws):
if model_run.name == args.model_name:
mdl_path = Model.download(model_run, exist_ok=True)
if 'model.pkl' in mdl_path:
mdl = joblib.load(mdl_path)
else:
mdl = joblib.load(os.path.join(mdl_path, 'model.pkl'))
test_accuracies[model_run.id] = mdl.score(X_test, y_test)

View File

@ -1,3 +1,6 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import sys
import argparse
@ -17,14 +20,25 @@ ws = run.experiment.workspace
def parse_args():
parser = argparse.ArgumentParser(description="UCI Credit example")
parser.add_argument("--data_path", type=str, default='data/', help="Directory path to training data")
parser.add_argument("--uci-credit", type=str, default='data/', help="Directory path to training data")
parser.add_argument("--prepared_data_path", type=str, default='prepared_data/', help="prepared data directory")
return parser.parse_args()
parser.add_argument("--enable_monitoring", type=str, default="false", help="enable logging to ADX")
parser.add_argument("--table_name", type=str, default="mlmonitoring", help="Table name in ADX for logging")
return parser.parse_known_args()
def log_training_data(df, table_name):
from obs.collector import Online_Collector
from datetime import timedelta
print("If there is an Authorization error, check your Azure KeyVault secret named kvmonitoringspkey. Terraform might put single quotation marks around the secret. Remove the single quotes and the secret should work.")
collector = Online_Collector(table_name)
df["timestamp"] = [pd.to_datetime('now') - timedelta(days=x) for x in range(len(df))]
collector.batch_collect(df)
def main():
# Parse command-line arguments
args = parse_args()
prepared_data_path = os.path.join(args.prepared_data_path, run.parent.id)
args, unknown = parse_args()
prepared_data_path = args.prepared_data_path
# Make sure data output path exists
if not os.path.exists(prepared_data_path):
@ -34,7 +48,7 @@ def main():
mlflow.sklearn.autolog()
# Read training data
df = pd.read_csv(os.path.join(args.data_path, 'credit.csv'))
df = pd.read_csv(os.path.join(args.uci_credit, 'credit.csv'))
random_data = np.random.rand(len(df))
@ -62,5 +76,8 @@ def main():
val.to_csv(VAL_PATH, index=False)
test.to_csv(TEST_PATH, index=False)
if (args.enable_monitoring.lower() == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower() == 'yes'):
log_training_data(df, args.table_name)
if __name__ == '__main__':
main()

View File

@ -1,3 +1,5 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import glob
@ -6,33 +8,55 @@ import argparse
import numpy as np
import pandas as pd
import joblib
from datetime import timedelta
from azureml.core.model import Model
model = None
explainer = None
collector = None
def init():
global model, explainer
global model, explainer, collector
print("Started batch scoring by running init()")
parser = argparse.ArgumentParser('batch_scoring')
parser.add_argument('--model_name', type=str, help='Model to use for batch scoring')
parser = argparse.ArgumentParser("batch_scoring")
parser.add_argument("--model_name", type=str, help="Model to use for batch scoring")
parser.add_argument(
"--enable_monitoring", type=str, help="Enable Monitoring", default="false"
)
parser.add_argument("--table_name", type=str, help="Table Name for logging data")
args, _ = parser.parse_known_args()
model_path = Model.get_model_path(args.model_name)
print(f"Model path: {model_path}")
model = joblib.load(os.path.join(model_path, 'model.pkl'))
if "model.pkl" in model_path:
model = joblib.load(model_path)
else:
model = joblib.load(os.path.join(model_path, "model.pkl"))
# load the explainer
explainer_path = os.path.join(Model.get_model_path(args.model_name), "explainer")
# explainer = joblib.load(explainer_path)
if (
args.enable_monitoring.lower() == "true"
or args.enable_monitoring == "1"
or args.enable_monitoring.lower() == "yes"
):
from obs.collector import Online_Collector
collector = Online_Collector(args.table_name)
def run(file_list):
print(f"Files to process: {file_list}")
results = pd.DataFrame(columns=["Sno", "ProbaGoodCredit", "ProbaBadCredit", "FeatureImportance"])
results = pd.DataFrame(
columns=["Sno", "ProbaGoodCredit", "ProbaBadCredit", "FeatureImportance"]
)
all_results = []
for filename in file_list:
df = pd.read_csv(filename)
@ -55,5 +79,14 @@ def run(file_list):
# result = pd.concat([sno, proba, explanation], axis=1)
result = pd.concat([sno, proba], axis=1)
results = results.append(result)
all_results.append(pd.concat([df, proba], axis=1))
print(f"Batch scored: {filename}")
if collector:
full_results = pd.concat(all_results)
full_results["timestamp"] = [
pd.to_datetime("now") - timedelta(days=x) for x in range(len(full_results))
]
collector.batch_collect(full_results)
return results

View File

@ -1,3 +1,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import sys
import argparse
@ -30,8 +34,8 @@ def main():
# Parse command-line arguments
args = parse_args()
prepared_data_path = os.path.join(args.prepared_data_path, run.parent.id)
model_path = os.path.join(args.model_path, run.parent.id)
prepared_data_path = args.prepared_data_path
model_path = args.model_path
# Make sure model output path exists
if not os.path.exists(model_path):

View File

@ -0,0 +1,31 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
- template: ../../config-aml.yml
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: python-sdk
trigger:
- none
pool:
vmImage: $(ap_vm_image)
stages:
- stage: DeployDriftJob
displayName: Deploy Drift Job
jobs:
- job: DeployDriftJob
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/${{ variables.version }}/deploy-drift-detection.yml@mlops-templates

View File

@ -24,6 +24,7 @@ resources:
name: Azure/mlops-templates
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: DeployBatchScoringPipeline
@ -41,10 +42,14 @@ stages:
- template: templates/${{ variables.version }}/create-environment.yml@mlops-templates
parameters:
environment_name: $(batch_env_name)
environment_conda_yaml: $(batch_env_conda_yaml)
build_type: 'conda'
environment_file: $(batch_env_conda_yaml)
enable_monitoring: $(enable_monitoring)
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
parameters:
data_type: scoring
- template: templates/${{ variables.version }}/deploy-batch-scoring-pipeline.yml@mlops-templates
parameters:
enable_monitoring: $(enable_monitoring)
- template: templates/${{ variables.version }}/add-pipeline-to-endpoint.yml@mlops-templates
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates

View File

@ -24,6 +24,7 @@ resources:
name: Azure/mlops-templates # need to change org name from Azure when pulling the template
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: DeployTrainingPipeline
@ -41,7 +42,9 @@ stages:
- template: templates/${{ variables.version }}/create-environment.yml@mlops-templates
parameters:
environment_name: $(training_env_name)
environment_conda_yaml: $(training_env_conda_yaml)
build_type: 'conda'
environment_file: $(training_env_conda_yaml)
enable_monitoring: $(enable_monitoring)
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
parameters:
data_type: training
@ -49,5 +52,7 @@ stages:
parameters:
compute_type: training
- template: templates/${{ variables.version }}/deploy-training-pipeline.yml@mlops-templates
parameters:
enable_monitoring: $(enable_monitoring)
- template: templates/${{ variables.version }}/add-pipeline-to-endpoint.yml@mlops-templates
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates

View File

@ -6,17 +6,22 @@ variables:
# Global
ap_vm_image: ubuntu-20.04
namespace: mlopsv2
postfix: 0621
namespace: mlopsv2 # Note: a long namespace can cause storage account creation to fail because storage account names are limited to 24 characters.
postfix: 0659
location: westus
environment: dev
enable_aml_computecluster: true
enable_aml_secure_workspace: true
enable_monitoring: false
# Azure DevOps
ado_service_connection_rg: Azure-ARM-Dev
ado_service_connection_aml_ws: Azure-ARM-Dev
# IaC
# DO NOT TOUCH
# For pipeline reference
resource_group: rg-$(namespace)-$(postfix)$(environment)
aml_workspace: mlw-$(namespace)-$(postfix)$(environment)
application_insights: mlw-$(namespace)-$(postfix)$(environment)
@ -24,10 +29,10 @@ variables:
container_registry: cr$(namespace)$(postfix)$(environment)
storage_account: st$(namespace)$(postfix)$(environment)
# Terraform
# For terraform reference
terraform_version: 0.14.7
terraform_workingdir: infrastructure/terraform
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf-state
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tfstate
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tf
terraform_st_container_name: default
terraform_st_key: mlops-tab

View File

@ -7,17 +7,22 @@ variables:
# Global
ap_vm_image: ubuntu-20.04
namespace: mlopsv2
namespace: mlopsv2 # Note: a long namespace can cause storage account creation to fail because storage account names are limited to 24 characters.
postfix: 0518
location: westus
location: westeurope
environment: prod
enable_aml_computecluster: true
enable_aml_secure_workspace: false
enable_monitoring: true
# Azure DevOps
ado_service_connection_rg: Azure-ARM-Dev
ado_service_connection_aml_ws: Azure-ARM-Dev
ado_service_connection_rg: Azure-ARM-Prod
ado_service_connection_aml_ws: Azure-ARM-Prod
# IaC
# DO NOT TOUCH
# For pipeline reference
resource_group: rg-$(namespace)-$(postfix)$(environment)
aml_workspace: mlw-$(namespace)-$(postfix)$(environment)
application_insights: mlw-$(namespace)-$(postfix)$(environment)
@ -25,10 +30,10 @@ variables:
container_registry: cr$(namespace)$(postfix)$(environment)
storage_account: st$(namespace)$(postfix)$(environment)
# Terraform
# For terraform reference
terraform_version: 0.14.7
terraform_workingdir: infrastructure
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf-state
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tfstate
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tf
terraform_st_container_name: default
terraform_st_key: mlops-tab

View File

@ -0,0 +1,46 @@
# check release notes https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
FROM nvcr.io/nvidia/pytorch:22.04-py3
##############################################################################
# NCCL TESTS
##############################################################################
ENV NCCL_TESTS_TAG=v2.11.0
# NOTE: adding gencodes to support K80, M60, V100, A100
RUN mkdir /tmp/nccltests && \
cd /tmp/nccltests && \
git clone -b ${NCCL_TESTS_TAG} https://github.com/NVIDIA/nccl-tests.git && \
cd nccl-tests && \
make \
MPI=1 MPI_HOME=/opt/hpcx/ompi \
NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80" \
CUDA_HOME=/usr/local/cuda && \
cp ./build/* /usr/local/bin && \
rm -rf /tmp/nccltests
# Install dependencies missing in this container
# NOTE: container already has matplotlib==3.5.1 tqdm==4.62.0
COPY requirements.txt ./
RUN pip install -r requirements.txt
# RUN python -m pip install azureml-defaults==1.41.0 \
# mlflow==1.25.1 \
# azureml-mlflow==1.41.0 \
# transformers==4.18.0 \
# psutil==5.9.0
# add ndv4-topo.xml
RUN mkdir /opt/microsoft/
ADD ./ndv4-topo.xml /opt/microsoft
# to use on A100, enable env var below in your job
# ENV NCCL_TOPO_FILE="/opt/microsoft/ndv4-topo.xml"
# adjusts the level of info from NCCL tests
ENV NCCL_DEBUG="INFO"
ENV NCCL_DEBUG_SUBSYS="GRAPH,INIT,ENV"
# Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments.
ENV NCCL_IB_PCI_RELAXED_ORDERING="1"
ENV CUDA_DEVICE_ORDER="PCI_BUS_ID"
ENV NCCL_SOCKET_IFNAME="eth0"

View File

@ -0,0 +1,35 @@
<!-- This topology file was copied from https://github.com/Azure/azhpc-images/blob/master/common/network-tuning.sh -->
<system version="1">
<cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
<pci busid="ffff:ff:01.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0001:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0101:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
<pci busid="0002:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0102:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</cpu>
<cpu numaid="1" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
<pci busid="ffff:ff:02.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0003:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0103:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
<pci busid="0004:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0104:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</cpu>
<cpu numaid="2" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
<pci busid="ffff:ff:03.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="000b:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0105:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
<pci busid="000c:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0106:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</cpu>
<cpu numaid="3" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
<pci busid="ffff:ff:04.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="000d:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0107:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
<pci busid="000e:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
<pci busid="0108:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
</pci>
</cpu>
</system>

View File

@ -0,0 +1,14 @@
# for local testing (cpu)
torchvision==0.12.0
torch==1.11.0
transformers==4.18.0
# for metrics reporting/plotting
mlflow==1.25.1
azureml-mlflow==1.41.0
matplotlib==3.5.2
tqdm==4.64.0
psutil==5.9.0
# for unit testing
pytest==7.1.2

View File

@ -0,0 +1,20 @@
# NOTE: install these requirements to run the unit tests
# CV packages
torchvision==0.12.0
torch==1.11.0
transformers==4.18.0
# for metrics reporting/plotting
mlflow==1.25.1
azureml-mlflow==1.41.0
matplotlib==3.5.2
tqdm==4.64.0
psutil==5.9.0
# for unit testing
pytest==7.1.2
pytest-cov==2.12.1
# Fix: force protobuf downgrade to avoid exception
protobuf==3.20.1

View File

@ -0,0 +1,108 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Original Author: Jeff Omhover (MSFT)
"""
This script contains methods to handle inputs for PyTorch model training
using the COCO dataset https://cocodataset.org/.
"""
import glob
import logging
import os
import torchvision
def find_image_subfolder(current_root):
"""Identifies the right level of a directory
that matches with torchvision.datasets.ImageFolder requirements.
In particular, if images are in current_root/foo/bar/category_X/*.jpg
we will want to feed current_root/foo/bar/ to ImageFolder.
Args:
current_root (str): a given directory
Returns:
image_folder (str): the subfolder containing multiple subdirs
"""
if not os.path.isdir(current_root):
raise FileNotFoundError(
f"While identifying the image folder, provided current_root={current_root} is not a directory."
)
sub_directories = glob.glob(os.path.join(current_root, "*"))
if len(sub_directories) == 1:
# let's do it recursively
return find_image_subfolder(sub_directories[0])
if len(sub_directories) == 0:
raise FileNotFoundError(
f"While identifying image folder under {current_root}, we found no content at all. The image folder is empty."
)
else:
return current_root
def build_image_datasets(
train_images_dir: str,
valid_images_dir: str,
input_size: int = 224,
):
"""
Args:
train_images_dir (str): path to the directory containing training images
valid_images_dir (str): path to the directory containing validation images
input_size (int): input size expected by the model
Returns:
train_dataset (torchvision.datasets.VisionDataset): training dataset
valid_dataset (torchvision.datasets.VisionDataset): validation dataset
labels (Dict[str, int]): labels
"""
logger = logging.getLogger(__name__)
# identify the right level of sub directory
train_images_dir = find_image_subfolder(train_images_dir)
logger.info(f"Creating training dataset from {train_images_dir}")
train_transform = torchvision.transforms.Compose(
[
torchvision.transforms.RandomResizedCrop(input_size),
torchvision.transforms.RandomHorizontalFlip(),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
),
]
)
train_dataset = torchvision.datasets.ImageFolder(
root=train_images_dir, transform=train_transform
)
logger.info(
f"ImageFolder loaded training image from {train_images_dir}: samples={len(train_dataset)}, #classes={len(train_dataset.classes)} classes={train_dataset.classes}"
)
# identify the right level of sub directory
valid_images_dir = find_image_subfolder(valid_images_dir)
logger.info(f"Creating validation dataset from {valid_images_dir}")
valid_transform = torchvision.transforms.Compose(
[
torchvision.transforms.Resize(input_size),
torchvision.transforms.CenterCrop(input_size),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
),
]
)
valid_dataset = torchvision.datasets.ImageFolder(
root=valid_images_dir, transform=valid_transform
)
logger.info(
f"ImageFolder loaded validation image from {valid_images_dir}: samples={len(valid_dataset)}, #classes={len(valid_dataset.classes)} classes={valid_dataset.classes}"
)
return train_dataset, valid_dataset, train_dataset.classes
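A minimal usage sketch for the helper above (the directory paths are placeholders, not actual pipeline inputs):

```python
# Hypothetical local folders laid out as <root>/<category>/<image>.jpg
train_ds, valid_ds, labels = build_image_datasets(
    train_images_dir="/tmp/images/train",
    valid_images_dir="/tmp/images/valid",
    input_size=224,
)
print(f"{len(labels)} classes, {len(train_ds)} training samples, {len(valid_ds)} validation samples")
```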

View File

@ -0,0 +1 @@
from .model_loader import MODEL_ARCH_LIST, get_model_metadata, load_model

View File

@ -0,0 +1,88 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Original Author: Jeff Omhover (MSFT)
"""
This script provides code to load and set up a variety of models from multiple libraries.
"""
MODEL_ARCH_MAP = {
# TorchVision models
"resnet18": {"input_size": 224, "library": "torchvision"},
"resnet34": {"input_size": 224, "library": "torchvision"},
"resnet50": {"input_size": 224, "library": "torchvision"},
"resnet101": {"input_size": 224, "library": "torchvision"},
"resnet152": {"input_size": 224, "library": "torchvision"},
"alexnet": {"input_size": 224, "library": "torchvision"},
"vgg11": {"input_size": 224, "library": "torchvision"},
"vgg11_bn": {"input_size": 224, "library": "torchvision"},
"vgg13": {"input_size": 224, "library": "torchvision"},
"vgg13_bn": {"input_size": 224, "library": "torchvision"},
"vgg16": {"input_size": 224, "library": "torchvision"},
"vgg16_bn": {"input_size": 224, "library": "torchvision"},
"vgg19": {"input_size": 224, "library": "torchvision"},
"vgg19_bn": {"input_size": 224, "library": "torchvision"},
"densenet121": {"input_size": 224, "library": "torchvision"},
"densenet169": {"input_size": 224, "library": "torchvision"},
"densenet201": {"input_size": 224, "library": "torchvision"},
"densenet161": {"input_size": 224, "library": "torchvision"},
# Swin HuggingFace models
"microsoft/swin-tiny-patch4-window7-224": {"input_size": 224, "library": "swin"},
"microsoft/swin-small-patch4-window7-224": {"input_size": 224, "library": "swin"},
"microsoft/swin-base-patch4-window7-224": {"input_size": 224, "library": "swin"},
"microsoft/swin-base-patch4-window7-224-in22k": {
"input_size": 224,
"library": "swin",
},
"microsoft/swin-large-patch4-window7-224": {"input_size": 224, "library": "swin"},
"microsoft/swin-large-patch4-window7-224-in22k": {
"input_size": 224,
"library": "swin",
},
"microsoft/swin-base-patch4-window12-384": {"input_size": 384, "library": "swin"},
"microsoft/swin-base-patch4-window12-384-in22k": {
"input_size": 384,
"library": "swin",
},
"microsoft/swin-large-patch4-window12-384": {"input_size": 384, "library": "swin"},
"microsoft/swin-large-patch4-window12-384-in22k": {
"input_size": 384,
"library": "swin",
},
# test model (super small)
"test": {"input_size": 32, "library": "test"},
}
MODEL_ARCH_LIST = list(MODEL_ARCH_MAP.keys())
def get_model_metadata(model_arch: str):
"""Returns the model metadata"""
if model_arch in MODEL_ARCH_MAP:
return MODEL_ARCH_MAP[model_arch]
else:
raise NotImplementedError(f"model_arch={model_arch} is not implemented yet.")
def load_model(model_arch: str, output_dimension: int = 1, pretrained: bool = True):
"""Loads a model from a given arch and sets it up for training"""
if model_arch not in MODEL_ARCH_MAP:
raise NotImplementedError(f"model_arch={model_arch} is not implemented yet.")
if MODEL_ARCH_MAP[model_arch]["library"] == "torchvision":
from .torchvision_models import load_torchvision_model
return load_torchvision_model(model_arch, output_dimension, pretrained)
if MODEL_ARCH_MAP[model_arch]["library"] == "swin":
from .swin_models import load_swin_model
return load_swin_model(model_arch, output_dimension, pretrained)
if MODEL_ARCH_MAP[model_arch]["library"] == "test":
from .test_model import load_test_model
return load_test_model(model_arch, output_dimension, pretrained)
raise NotImplementedError(
f"library {MODEL_ARCH_MAP[model_arch]['library']} is not implemented yet."
)
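A quick illustration of how these helpers fit together (the architecture name and output dimension below are arbitrary examples):

```python
from model import MODEL_ARCH_LIST, get_model_metadata, load_model

arch = "resnet18"
assert arch in MODEL_ARCH_LIST
metadata = get_model_metadata(arch)   # {'input_size': 224, 'library': 'torchvision'}
model = load_model(arch, output_dimension=10, pretrained=True)
print(metadata["input_size"], type(model).__name__)
```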

View File

@ -0,0 +1,30 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Original Author: Jeff Omhover (MSFT)
"""
This script provides code to load and set up Swin image classification models from HuggingFace transformers.
"""
import logging
import torch
from transformers import SwinConfig, SwinForImageClassification
def load_swin_model(
model_arch: str, output_dimension: int = 1, pretrained: bool = True
):
"""Loads a model from a given arch and sets it up for training"""
logger = logging.getLogger(__name__)
logger.info(
f"Loading model from arch={model_arch} pretrained={pretrained} output_dimension={output_dimension}"
)
if pretrained:
model = SwinForImageClassification.from_pretrained(model_arch)
else:
model = SwinForImageClassification(config=SwinConfig())
model.classifier = torch.nn.Linear(model.swin.num_features, output_dimension)
return model

View File

@ -0,0 +1,40 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Original Author: Jeff Omhover (MSFT)
"""
Creates a super simple 32x32 CNN model for testing.
From the CIFAR10 tutorial https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
"""
import logging
import torch
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
def __init__(self, output_dimension):
super().__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(16 * 5 * 5, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, output_dimension)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = torch.flatten(x, 1) # flatten all dimensions except batch
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
def load_test_model(
model_arch: str, output_dimension: int = 1, pretrained: bool = True
):
"""Loads a model from a given arch and sets it up for training"""
return Net(output_dimension)

View File

@ -0,0 +1,44 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Original Author: Jeff Omhover (MSFT)
"""
This script provides code to load and set up a variety of models from torchvision.models.
"""
import logging
import torch
import torchvision.models as models
def load_torchvision_model(
model_arch: str, output_dimension: int = 1, pretrained: bool = True
):
"""Loads a model from a given arch and sets it up for training"""
logger = logging.getLogger(__name__)
logger.info(
f"Loading model from arch={model_arch} pretrained={pretrained} output_dimension={output_dimension}"
)
if hasattr(models, model_arch):
model = getattr(models, model_arch)(pretrained=pretrained)
else:
raise NotImplementedError(
f"model_arch={model_arch} is not implemented in torchvision model zoo."
)
# see https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html
if model_arch.startswith("resnet"):
model.fc = torch.nn.Linear(model.fc.in_features, output_dimension)
elif model_arch == "alexnet":
model.classifier[6] = torch.nn.Linear(4096, output_dimension)
elif model_arch.startswith("vgg"):
model.classifier[6] = torch.nn.Linear(4096, output_dimension)
elif model_arch.startswith("densenet"):
model.classifier = torch.nn.Linear(1024, output_dimension)
else:
raise NotImplementedError(
f"loading model_arch={model_arch} is not implemented yet in our custom code."
)
return model

View File

@ -0,0 +1,390 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Original Author: Jeff Omhover (MSFT)
"""
This script provides helper code for PyTorch profiling.
"""
import os
import time
import logging
import torch
import mlflow
import tempfile
from torch.profiler import ProfilerActivity
from typing import Any
def markdown_trace_handler(dir_name: str, rank: int = 0):
"""This handler can be used inside torch.profiler call to output
tables in markdown format"""
def _handler_fn(prof) -> None:
if not os.path.isdir(dir_name):
try:
os.makedirs(dir_name, exist_ok=True)
except Exception:
raise RuntimeError("Can't create directory: " + dir_name)
# Note: trying to identify a unique name for the file
file_name = os.path.join(
dir_name,
f"stacks_rank{rank}_step{prof.step_num}_t{int(time.time() * 1000)}.ms",
)
logging.getLogger(__name__).info(
f"Exporting profiler trace as markdown at {file_name}"
)
# generate report in markdown format
markdown = ["# Pytorch Profiler report"]
markdown.append("## Average by cuda time")
markdown.append("```")
markdown.append(
prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1)
)
markdown.append("```")
with open(file_name, "w") as out_file:
out_file.write("\n".join(markdown))
return _handler_fn
def composite_trace_handler(handler_list):
"""This can call multiple trace handlers inside one"""
def _handler_fn(prof) -> None:
for handler in handler_list:
handler(prof)
return _handler_fn
def export_stack_trace_handler(
dir_name: str, rank: int = 0, metrics=["self_cuda_time_total"]
):
"""This handler can be used inside torch.profiler call to output
tables in markdown format"""
def _handler_fn(prof) -> None:
if not os.path.isdir(dir_name):
try:
os.makedirs(dir_name, exist_ok=True)
except Exception:
raise RuntimeError("Can't create directory: " + dir_name)
# Note: trying to identify a unique name for the file
for metric in metrics:
file_name = os.path.join(
dir_name,
f"stacks_{metric}_rank{rank}_step{prof.step_num}_t{ int(time.time() * 1000)}.txt",
)
logging.getLogger(__name__).info(
f"Exporting {metric} stacks as text at {file_name}"
)
prof.export_stacks(file_name, metric)
return _handler_fn
class PyTorchProfilerHandler:
"""This class handles the initialization and setup of PyTorch profiler"""
def __init__(self, enabled=False, rank=None):
"""Constructor.
Args:
enabled (bool): is profiling enabled?
rank (int): rank of the current process/node
"""
self.logger = logging.getLogger(__name__)
self.enabled = enabled
self.rank = rank
self.profiler_output_tmp_dir = None
self.profiler = None
def start_profiler(self):
"""Setup and start the pytorch profiler.
Returns:
profiler (torch.profiler): the profiler
"""
if self.enabled:
self.profiler_output_tmp_dir = tempfile.TemporaryDirectory()
self.logger.info(
f"Starting profiler (enabled=True) with tmp dir {self.profiler_output_tmp_dir.name}."
)
## profiler activities CPU/GPU
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
self.logger.info(f"Enabling CUDA in profiler.")
activities.append(ProfilerActivity.CUDA)
## handlers for exporting profile at each step
# we're creating a list to export in multiple formats
trace_handlers = []
# export in markdown
markdown_logs_export = os.path.join(
self.profiler_output_tmp_dir.name, "markdown"
)
trace_handlers.append(
markdown_trace_handler(markdown_logs_export, rank=self.rank)
)
# export stacks in txt
stacks_logs_export = os.path.join(
self.profiler_output_tmp_dir.name, "stacks"
)
stack_metrics = ["self_cpu_time_total"]
if torch.cuda.is_available():
stack_metrics.append("self_cuda_time_total")
trace_handlers.append(
export_stack_trace_handler(
stacks_logs_export, rank=self.rank, metrics=stack_metrics
)
)
# export tensorboard
# NOTE: removed due to segfault in pytorch 1.11.0
# will need to be uncommented for pytorch 1.11.1 which has a fix
# tensorboard_logs_export = os.path.join(
# self.profiler_output_tmp_dir.name, "tensorboard_logs"
# )
# trace_handlers.append(torch.profiler.tensorboard_trace_handler(
# tensorboard_logs_export
# ))
# profiler takes 1 handler, we're composing all above in a single handler
trace_handler = composite_trace_handler(trace_handlers)
# process every single step
profiler_schedule = torch.profiler.schedule(wait=0, warmup=0, active=1)
# initialize profiler
self.profiler = torch.profiler.profile(
schedule=profiler_schedule,
record_shapes=True,
with_flops=True,
profile_memory=True,
activities=activities,
with_stack=True, # needed to export stacks
on_trace_ready=trace_handler,
)
self.profiler.start()
else:
self.logger.info(f"Profiler not started (enabled=False).")
self.profiler = None
return self.profiler
def stop_profiler(self) -> None:
"""Stops the pytorch profiler and logs the outputs using mlflow"""
if self.profiler:
self.logger.info(f"Stopping profiler.")
self.profiler.stop()
# log via mlflow
self.logger.info(
f"MLFLOW log {self.profiler_output_tmp_dir.name} as an artifact."
)
mlflow.log_artifacts(
self.profiler_output_tmp_dir.name, artifact_path="profiler"
)
self.logger.info(
f"Clean up profiler temp dir {self.profiler_output_tmp_dir.name}"
)
self.profiler_output_tmp_dir.cleanup()
else:
self.logger.info(
"Not stopping profiler as it was not started in the first place."
)
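A sketch of how this handler is typically wired into a training loop (the loop body here is a placeholder; the actual call sites are in the training script below):

```python
# Hypothetical wiring of PyTorchProfilerHandler around a training loop.
handler = PyTorchProfilerHandler(enabled=True, rank=0)
profiler = handler.start_profiler()

for step in range(3):          # stand-in for iterating over a DataLoader
    pass                       # forward/backward pass would go here
    if profiler:
        profiler.step()        # advance the profiler schedule

handler.stop_profiler()        # exports traces and logs them as mlflow artifacts
```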
class LogTimeBlock(object):
"""This class should be used to time a code block.
The time diff is computed from __enter__ to __exit__.
Example
-------
```python
with LogTimeBlock("my_perf_metric_name"):
print("(((sleeping for 1 second)))")
time.sleep(1)
```
"""
def __init__(self, name, **kwargs):
"""
Constructs the LogTimeBlock.
Args:
name (str): key for the time difference (for storing as metric)
kwargs (dict): any keyword will be added as properties to metrics for logging (work in progress)
"""
# kwargs
self.step = kwargs.get("step", None)
self.enabled = kwargs.get("enabled", True)
# internal variables
self.name = name
self.start_time = None
self._logger = logging.getLogger(__name__)
def __enter__(self):
"""Starts the timer, gets triggered at beginning of code block"""
if not self.enabled:
return
self.start_time = time.time() # starts "timer"
def __exit__(self, exc_type, value, traceback):
"""Stops the timer and stores accordingly
gets triggered at beginning of code block.
Note:
arguments are by design for with statements.
"""
if not self.enabled:
return
run_time = time.time() - self.start_time # stops "timer"
self._logger.info(
f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]"
)
mlflow.log_metric(self.name + ".time", run_time)
class LogDiskIOBlock(object):
def __init__(self, name, **kwargs):
"""
Constructs the LogDiskUsageBlock.
Args:
name (str): key for the time difference (for storing as metric)
kwargs (dict): any keyword will be added as properties to metrics for logging (work in progress)
"""
# kwargs
self.step = kwargs.get("step", None)
self.enabled = kwargs.get("enabled", True)
# internal variables
self.name = name
self.process_id = os.getpid() # focus on current process
self.start_time = None
self.start_disk_counters = None
self._logger = logging.getLogger(__name__)
def __enter__(self):
"""Get initial values, gets triggered at beginning of code block"""
if not self.enabled:
return
try:
import psutil
self.start_time = time.time()
self.start_disk_counters = psutil.Process(self.process_id).io_counters()
except ModuleNotFoundError:
self._logger.critical("import psutil failed, cannot display disk stats.")
def __exit__(self, exc_type, value, traceback):
"""Stops the timer and stores accordingly
gets triggered at beginning of code block.
Note:
arguments are by design for with statements.
"""
if not self.enabled:
return
try:
import psutil
except ModuleNotFoundError:
self._logger.critical("import psutil failed, cannot display disk stats.")
return
run_time = time.time() - self.start_time
disk_io_metrics = {}
end_disk_counters = psutil.Process(self.process_id).io_counters()
disk_io_metrics[f"{self.name}.disk.read"] = (
end_disk_counters.read_bytes - self.start_disk_counters.read_bytes
) / (1024 * 1024)
disk_io_metrics[f"{self.name}.disk.write"] = (
end_disk_counters.write_bytes - self.start_disk_counters.write_bytes
) / (1024 * 1024)
self._logger.info(
f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]"
)
self._logger.info(f"--- disk_io_metrics: {disk_io_metrics}s [step={self.step}]")
mlflow.log_metrics(disk_io_metrics)
class LogTimeOfIterator: # lgtm [py/iter-returns-non-self]
"""This class is intended to "wrap" an existing Iterator
and log metrics for each next() call"""
def __init__(
self,
wrapped_sequence: Any,
name: str,
enabled: bool = True,
async_collector: dict = None,
):
self.wrapped_sequence = wrapped_sequence
self.wrapped_iterator = None
# for metrics
self.enabled = enabled
self.name = name
self.iterator_times = []
self.metrics = {}
self.async_collector = async_collector
self._logger = logging.getLogger(__name__)
def __iter__(self):
"""Creates the iterator"""
if self.enabled:
start_time = time.time()
# if enabled, creates iterator from wrapped_sequence
self.wrapped_iterator = self.wrapped_sequence.__iter__()
self.metrics[f"{self.name}.init"] = time.time() - start_time
# return self
return self
else:
# if disabled, return the iterator from wrapped_sequence
# so that LogTimeOfIterator.__next__() will never get called
return self.wrapped_sequence.__iter__()
def __next__(self):
"""Iterates"""
try:
start_time = time.time()
next_val = self.wrapped_iterator.__next__()
self.iterator_times.append(time.time() - start_time)
return next_val
except StopIteration as e:
self.log_metrics()
raise e
def log_metrics(self):
"""Logs metrics once iterator is finished"""
self.metrics[f"{self.name}.count"] = len(self.iterator_times)
self.metrics[f"{self.name}.time.sum"] = sum(self.iterator_times)
self.metrics[f"{self.name}.time.first"] = self.iterator_times[0]
if self.async_collector is not None:
self._logger.info(f"Async MLFLOW: {self.metrics}")
for k in self.metrics:
self.async_collector[k] = self.metrics[k]
else:
self._logger.info(f"MLFLOW: {self.metrics}")
mlflow.log_metrics(self.metrics)
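A combined usage sketch for the three logging helpers above (the loop and iterable are stand-ins; mlflow picks up or starts a run when the metrics are logged):

```python
# Hypothetical usage of LogTimeBlock, LogDiskIOBlock and LogTimeOfIterator.
with LogTimeBlock("epoch.train", step=1), LogDiskIOBlock("epoch.train", step=1):
    for batch in LogTimeOfIterator(range(100), name="train_loader"):
        pass  # the training step would go here
```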

View File

@ -0,0 +1,960 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Original Author: Jeff Omhover (MSFT)
"""
This script implements a Distributed PyTorch training sequence.
IMPORTANT: We have tagged the code with the following expressions to walk you through
the key implementation details.
Using your editor, search for those strings to get an idea of how to implement:
- DISTRIBUTED : how to implement distributed pytorch
- MLFLOW : how to implement mlflow reporting of metrics and artifacts
- PROFILER : how to implement pytorch profiler
"""
import argparse
import json
import logging
import os
import sys
import time
import traceback
from distutils.util import strtobool
import mlflow
# the long list of torch imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.profiler import record_function
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm
from transformers.utils import ModelOutput
# add path to here, if necessary
COMPONENT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "."))
if COMPONENT_ROOT not in sys.path:
logging.info(f"Adding {COMPONENT_ROOT} to path")
sys.path.append(str(COMPONENT_ROOT))
from image_io import build_image_datasets
# internal imports
from model import get_model_metadata, load_model
from profiling import (
LogDiskIOBlock,
LogTimeBlock,
LogTimeOfIterator,
PyTorchProfilerHandler,
)
torch.set_default_dtype(torch.float64)
class PyTorchDistributedModelTrainingSequence:
"""Generic class to run the sequence for training a PyTorch model
using distributed training."""
def __init__(self):
"""Constructor"""
self.logger = logging.getLogger(__name__)
# DATA
self.training_data_sampler = None
self.training_data_loader = None
self.validation_data_loader = None
# MODEL
self.model = None
self.labels = []
self.model_signature = None
# DISTRIBUTED CONFIG
self.world_size = 1
self.world_rank = 0
self.local_world_size = 1
self.local_rank = 0
self.multinode_available = False
self.cpu_count = os.cpu_count()
self.device = None
# NOTE: if we're running multiple nodes, this indicates if we're on first node
self.self_is_main_node = True
# TRAINING CONFIGS
self.dataloading_config = None
self.training_config = None
# PROFILER
self.profiler = None
self.profiler_output_tmp_dir = None
#####################
### SETUP METHODS ###
#####################
def setup_config(self, args):
"""Sets internal variables using provided CLI arguments (see build_arguments_parser()).
In particular, sets device(cuda) and multinode parameters."""
self.dataloading_config = args
self.training_config = args
# verify parameter default values
if self.dataloading_config.num_workers is None:
self.dataloading_config.num_workers = 0
if self.dataloading_config.num_workers < 0:
self.dataloading_config.num_workers = self.cpu_count
if self.dataloading_config.num_workers == 0:
self.logger.warning(
"You specified num_workers=0, forcing prefetch_factor to be discarded."
)
self.dataloading_config.prefetch_factor = None
# NOTE: strtobool returns an int, converting to bool explicitly
self.dataloading_config.pin_memory = bool(self.dataloading_config.pin_memory)
self.dataloading_config.non_blocking = bool(
self.dataloading_config.non_blocking
)
# add this switch to test for different strategies
if self.dataloading_config.multiprocessing_sharing_strategy:
torch.multiprocessing.set_sharing_strategy(
self.dataloading_config.multiprocessing_sharing_strategy
)
# DISTRIBUTED: detect multinode config
# depending on the Azure ML distribution.type, different environment variables will be provided
# to configure DistributedDataParallel
self.distributed_backend = args.distributed_backend
if self.distributed_backend == "nccl":
self.world_size = int(os.environ.get("WORLD_SIZE", "1"))
self.world_rank = int(os.environ.get("RANK", "0"))
self.local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", "1"))
self.local_rank = int(os.environ.get("LOCAL_RANK", "0"))
self.multinode_available = self.world_size > 1
self.self_is_main_node = self.world_rank == 0
elif self.distributed_backend == "mpi":
# Note: Distributed pytorch package doesn't have MPI built in.
# MPI is only included if you build PyTorch from source on a host that has MPI installed.
self.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", "1"))
self.world_rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", "0"))
self.local_world_size = int(
os.environ.get("OMPI_COMM_WORLD_LOCAL_SIZE", "1")
)
self.local_rank = int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", "0"))
self.multinode_available = self.world_size > 1
self.self_is_main_node = self.world_rank == 0
else:
raise NotImplementedError(
f"distributed_backend={self.distributed_backend} is not implemented yet."
)
# Use CUDA if it is available
if not self.training_config.disable_cuda and torch.cuda.is_available():
self.logger.info(
f"Setting up torch.device for CUDA for local gpu:{self.local_rank}"
)
self.device = torch.device(self.local_rank)
else:
self.logger.info(f"Setting up torch.device for cpu")
self.device = torch.device("cpu")
if self.multinode_available:
self.logger.info(
f"Running in multinode with backend={self.distributed_backend} local_rank={self.local_rank} rank={self.world_rank} size={self.world_size}"
)
# DISTRIBUTED: this is required to initialize the pytorch backend
torch.distributed.init_process_group(
self.distributed_backend,
rank=self.world_rank,
world_size=self.world_size,
)
else:
self.logger.info(
f"Not running in multinode, so not initializing process group."
)
# DISTRIBUTED: in distributed mode, you want to report parameters
# only from main process (rank==0) to avoid conflict
if self.self_is_main_node:
# MLFLOW: report relevant parameters using mlflow
logged_params = {
# log some distribution params
"nodes": int(os.environ.get("AZUREML_NODE_COUNT", "1")),
"instance_per_node": self.world_size
// int(os.environ.get("AZUREML_NODE_COUNT", "1")),
"cuda_available": torch.cuda.is_available(),
"disable_cuda": self.training_config.disable_cuda,
"distributed": self.multinode_available,
"distributed_backend": self.distributed_backend,
# data loading params
"batch_size": self.dataloading_config.batch_size,
"num_workers": self.dataloading_config.num_workers,
"cpu_count": self.cpu_count,
"prefetch_factor": self.dataloading_config.prefetch_factor,
"persistent_workers": self.dataloading_config.persistent_workers,
"pin_memory": self.dataloading_config.pin_memory,
"non_blocking": self.dataloading_config.non_blocking,
"multiprocessing_sharing_strategy": self.dataloading_config.multiprocessing_sharing_strategy,
# training params
"model_arch": self.training_config.model_arch,
"model_arch_pretrained": self.training_config.model_arch_pretrained,
"optimizer.learning_rate": self.training_config.learning_rate,
"optimizer.momentum": self.training_config.momentum,
# profiling params
"enable_profiling": self.training_config.enable_profiling,
}
if not self.training_config.disable_cuda and torch.cuda.is_available():
# add some gpu properties
logged_params["cuda_device_count"] = torch.cuda.device_count()
cuda_device_properties = torch.cuda.get_device_properties(self.device)
logged_params["cuda_device_name"] = cuda_device_properties.name
logged_params["cuda_device_major"] = cuda_device_properties.major
logged_params["cuda_device_minor"] = cuda_device_properties.minor
logged_params[
"cuda_device_memory"
] = cuda_device_properties.total_memory
logged_params[
"cuda_device_processor_count"
] = cuda_device_properties.multi_processor_count
mlflow.log_params(logged_params)
def setup_datasets(
self,
training_dataset: torch.utils.data.Dataset,
validation_dataset: torch.utils.data.Dataset,
labels: list,
):
"""Creates and sets up dataloaders for training/validation datasets."""
self.labels = labels
# DISTRIBUTED: you need to use a DistributedSampler that wraps your dataset
# it will draw a different sample on each node/process to distribute data sampling
self.training_data_sampler = DistributedSampler(
training_dataset, num_replicas=self.world_size, rank=self.world_rank
)
# setting up DataLoader with the right arguments
optional_data_loading_kwargs = {}
if self.dataloading_config.num_workers > 0:
# NOTE: this option _ONLY_ applies if num_workers > 0
# otherwise DataLoader will raise an exception
optional_data_loading_kwargs[
"prefetch_factor"
] = self.dataloading_config.prefetch_factor
optional_data_loading_kwargs[
"persistent_workers"
] = self.dataloading_config.persistent_workers
self.training_data_loader = DataLoader(
training_dataset,
batch_size=self.dataloading_config.batch_size,
num_workers=self.dataloading_config.num_workers, # self.cpu_count,
pin_memory=self.dataloading_config.pin_memory,
# DISTRIBUTED: the sampler needs to be provided to the DataLoader
sampler=self.training_data_sampler,
# all other args
**optional_data_loading_kwargs,
)
# DISTRIBUTED: we don't need a sampler for validation set
# it is used as-is in every node/process
self.validation_data_loader = DataLoader(
validation_dataset,
batch_size=self.dataloading_config.batch_size,
num_workers=self.dataloading_config.num_workers, # self.cpu_count,
pin_memory=self.dataloading_config.pin_memory,
)
if self.self_is_main_node:
# MLFLOW: report relevant parameters using mlflow
mlflow.log_params({"num_classes": len(labels)})
def setup_model(self, model):
"""Configures a model for training."""
self.logger.info(f"Setting up model to use device {self.device}")
self.model = model.to(self.device)
# DISTRIBUTED: the model needs to be wrapped in a DistributedDataParallel class
if self.multinode_available:
self.logger.info(f"Setting up model to use DistributedDataParallel.")
self.model = torch.nn.parallel.DistributedDataParallel(self.model)
# fun: log the number of parameters
params_count = 0
for param in model.parameters():
if param.requires_grad:
params_count += param.numel()
self.logger.info(
"MLFLOW: model_param_count={:.2f} (millions)".format(
round(params_count / 1e6, 2)
)
)
if self.self_is_main_node:
mlflow.log_params({"model_param_count": round(params_count / 1e6, 2)})
return self.model
########################
### TRAINING METHODS ###
########################
def _epoch_eval(self, epoch, criterion):
"""Called during train() for running the eval phase of one epoch."""
with torch.no_grad():
num_correct = 0
num_total_images = 0
running_loss = 0.0
epoch_eval_metrics = {}
# PROFILER: here we're introducing a layer on top of the data loader to capture its performance
# in practice, we'd just use: for images, targets in tqdm(self.validation_data_loader)
for images, targets in LogTimeOfIterator(
tqdm(self.validation_data_loader),
"validation_data_loader",
async_collector=epoch_eval_metrics,
):
with record_function("eval.to_device"):
images = images.to(
self.device, non_blocking=self.dataloading_config.non_blocking
)
targets = targets.to(
self.device, non_blocking=self.dataloading_config.non_blocking
)
with record_function("eval.forward"):
outputs = self.model(images)
if isinstance(outputs, torch.Tensor):
# if we're training a regular pytorch model (ex: torchvision)
loss = criterion(outputs, targets)
_, predicted = torch.max(outputs.data, 1)
correct = predicted == targets
elif isinstance(outputs, ModelOutput):
# if we're training a HuggingFace model
loss = criterion(outputs.logits, targets)
_, predicted = torch.max(outputs.logits.data, 1)
correct = predicted == targets
else:
# anything else: raise an exception
raise ValueError(
f"outputs from model is type {type(outputs)} which is unknown."
)
running_loss += loss.item() * images.size(0)
num_correct += torch.sum(correct).item()
num_total_images += len(images)
epoch_eval_metrics["running_loss"] = running_loss
epoch_eval_metrics["num_correct"] = num_correct
epoch_eval_metrics["num_samples"] = num_total_images
return epoch_eval_metrics
def _epoch_train(self, epoch, optimizer, scheduler, criterion):
"""Called during train() for running the train phase of one epoch."""
self.model.train()
self.training_data_sampler.set_epoch(epoch)
num_correct = 0
num_total_images = 0
running_loss = 0.0
epoch_train_metrics = {}
# PROFILER: here we're introducing a layer on top of the data loader to capture its performance
# in practice, we'd just use: for images, targets in tqdm(self.training_data_loader)
for images, targets in LogTimeOfIterator(
tqdm(self.training_data_loader),
"training_data_loader",
async_collector=epoch_train_metrics,
):
# PROFILER: record_function will report to the profiler (if enabled)
# here a specific wall time for a given block of code
with record_function("train.to_device"):
images = images.to(
self.device, non_blocking=self.dataloading_config.non_blocking
)
targets = targets.to(
self.device, non_blocking=self.dataloading_config.non_blocking
)
with record_function("train.forward"):
# zero the parameter gradients
optimizer.zero_grad()
outputs = self.model(images)
# if self.model_signature is None:
# self.model_signature = infer_signature(images, outputs)
if isinstance(outputs, torch.Tensor):
# if we're training a regular pytorch model (ex: torchvision)
loss = criterion(outputs, targets)
_, predicted = torch.max(outputs.data, 1)
correct = predicted == targets
elif isinstance(outputs, ModelOutput):
# if we're training a HuggingFace model
loss = criterion(outputs.logits, targets)
_, predicted = torch.max(outputs.logits.data, 1)
correct = predicted == targets
else:
# anything else: raise an exception
raise ValueError(
f"outputs from model is type {type(outputs)} which is unknown."
)
running_loss += loss.item() * images.size(0)
num_correct += torch.sum(correct).item()
num_total_images += len(images)
# PROFILER: record_function will report to the profiler (if enabled)
# here a specific wall time for a given block of code
with record_function("train.backward"):
loss.backward()
optimizer.step()
scheduler.step()
epoch_train_metrics["running_loss"] = running_loss
epoch_train_metrics["num_correct"] = num_correct
epoch_train_metrics["num_samples"] = num_total_images
return epoch_train_metrics
def train(self, epochs: int = None, checkpoints_dir: str = None):
"""Trains the model.
Args:
epochs (int, optional): if not provided uses internal config
checkpoints_dir (str, optional): path to write checkpoints
"""
if epochs is None:
epochs = self.training_config.num_epochs
# Observe that all parameters are being optimized
optimizer = optim.SGD(
self.model.parameters(),
lr=self.training_config.learning_rate,
momentum=self.training_config.momentum,
nesterov=True,
# weight_decay=1e-4,
)
# criterion = nn.BCEWithLogitsLoss()
criterion = nn.CrossEntropyLoss()
# Decay LR by a factor of 0.1 every 7 epochs
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
# DISTRIBUTED: export checkpoint only from main node
if self.self_is_main_node and checkpoints_dir is not None:
# saving checkpoint before training
self.checkpoint_save(
self.model, optimizer, checkpoints_dir, epoch=-1, loss=0.0
)
# DISTRIBUTED: you'll note that this loop has nothing specifically "distributed"
# that's because most of the changes are in the backend (DistributedDataParallel)
for epoch in range(epochs):
self.logger.info(f"Starting epoch={epoch}")
# we'll collect metrics we want to report for this epoch
epoch_metrics = {}
# start timer for epoch time metric
epoch_train_start = time.time()
# TRAIN: loop on training set and return metrics
epoch_train_metrics = self._epoch_train(
epoch, optimizer, scheduler, criterion
)
self.logger.info(f"Epoch metrics: {epoch_train_metrics}")
# stop timer
epoch_metrics["epoch_train_time"] = time.time() - epoch_train_start
# record metrics of interest
epoch_metrics["training_data_loader.count"] = epoch_train_metrics[
"training_data_loader.count"
]
epoch_metrics["training_data_loader.time.sum"] = epoch_train_metrics[
"training_data_loader.time.sum"
]
epoch_metrics["training_data_loader.time.first"] = epoch_train_metrics[
"training_data_loader.time.first"
]
epoch_metrics["epoch_train_loss"] = (
epoch_train_metrics["running_loss"] / epoch_train_metrics["num_samples"]
)
epoch_metrics["epoch_train_acc"] = (
epoch_train_metrics["num_correct"] / epoch_train_metrics["num_samples"]
)
# start timer for epoch time metric
epoch_eval_start = time.time()
# EVAL: run evaluation on validation set and return metrics
epoch_eval_metrics = self._epoch_eval(epoch, criterion)
self.logger.info(f"Epoch metrics: {epoch_train_metrics}")
# stop timer
epoch_metrics["epoch_eval_time"] = time.time() - epoch_eval_start
# record metrics of interest
epoch_metrics["validation_data_loader.count"] = epoch_eval_metrics[
"validation_data_loader.count"
]
epoch_metrics["validation_data_loader.time.sum"] = epoch_eval_metrics[
"validation_data_loader.time.sum"
]
epoch_metrics["validation_data_loader.time.first"] = epoch_eval_metrics[
"validation_data_loader.time.first"
]
epoch_metrics["epoch_valid_loss"] = (
epoch_eval_metrics["running_loss"] / epoch_eval_metrics["num_samples"]
)
epoch_metrics["epoch_valid_acc"] = (
epoch_eval_metrics["num_correct"] / epoch_eval_metrics["num_samples"]
)
# start timer for epoch time metric
epoch_utility_start = time.time()
# PROFILER: use profiler.step() to mark a step in training
# the pytorch profiler uses it internally to trigger
# saving the traces in different files
if self.profiler:
self.profiler.step()
# DISTRIBUTED: export checkpoint only from main node
if self.self_is_main_node and checkpoints_dir is not None:
self.checkpoint_save(
self.model,
optimizer,
checkpoints_dir,
epoch=epoch,
loss=epoch_metrics["epoch_valid_loss"],
)
# report metric values in stdout
self.logger.info(f"MLFLOW: metrics={epoch_metrics} epoch={epoch}")
# MLFLOW / DISTRIBUTED: report metrics only from main node
if self.self_is_main_node:
mlflow.log_metrics(epoch_metrics)
mlflow.log_metric(
"epoch_utility_time", time.time() - epoch_utility_start, step=epoch
)
def runtime_error_report(self, runtime_exception):
"""Call this when catching a critical exception.
Will print all sorts of relevant information to the log."""
self.logger.critical(traceback.format_exc())
try:
import psutil
self.logger.critical(f"Memory: {str(psutil.virtual_memory())}")
except ModuleNotFoundError:
self.logger.critical(
"import psutil failed, cannot display virtual memory stats."
)
if torch.cuda.is_available():
self.logger.critical(
"Cuda memory summary:\n"
+ str(torch.cuda.memory_summary(device=None, abbreviated=False))
)
self.logger.critical(
"Cuda memory snapshot:\n"
+ json.dumps(torch.cuda.memory_snapshot(), indent=" ")
)
else:
self.logger.critical(
"Cuda is not available, cannot report cuda memory allocation."
)
def close(self):
"""Tear down potential resources"""
if self.multinode_available:
self.logger.info(
f"Destroying process group on local_rank={self.local_rank} rank={self.world_rank} size={self.world_size}"
)
# DISTRIBUTED: this will teardown the distributed process group
torch.distributed.destroy_process_group()
else:
self.logger.info(
f"Not running in multinode, so not destroying process group."
)
#################
### MODEL I/O ###
#################
def checkpoint_save(
self, model, optimizer, output_dir: str, epoch: int, loss: float
):
"""Saves model as checkpoint"""
# create output directory just in case
os.makedirs(output_dir, exist_ok=True)
model_output_path = os.path.join(
output_dir, f"model-checkpoint-epoch{epoch}-loss{loss}.pt"
)
self.logger.info(f"Exporting checkpoint to {model_output_path}")
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
# DISTRIBUTED: to export model, you need to get it out of the DistributedDataParallel class
self.logger.info(
"Model was distributed, we will checkpoint DistributedDataParallel.module"
)
model_to_save = model.module
else:
model_to_save = model
with record_function("checkpoint.save"):
torch.save(
{
"epoch": epoch,
"model_state_dict": model_to_save.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
"loss": loss,
},
model_output_path,
)
def save(
self,
output_dir: str,
name: str = "dev",
register_as: str = None,
) -> None:
# DISTRIBUTED: you want to save the model only from the main node/process
# in data distributed mode, all models should theoretically be the same
if self.self_is_main_node:
self.logger.info(f"Saving model and classes in {output_dir}...")
# create output directory just in case
os.makedirs(output_dir, exist_ok=True)
if isinstance(self.model, torch.nn.parallel.DistributedDataParallel):
# DISTRIBUTED: to export model, you need to get it out of the DistributedDataParallel class
self.logger.info(
"Model was distributed, we will export DistributedDataParallel.module"
)
model_to_save = self.model.module.to("cpu")
else:
model_to_save = self.model.to("cpu")
# Save the labels to a plain text file.
# This file will be required to map the output array
# from the API to the labels.
with open("label-mapping.txt", "w") as f:
f.write("\n".join(self.labels))
mlflow.log_artifact("label-mapping.txt")
# MLFLOW: mlflow has a nice method to export the model automatically
# add tags and environment for it. You can then use it in Azure ML
# to register your model to an endpoint.
mlflow.pytorch.log_model(
model_to_save,
artifact_path="final_model",
registered_model_name=register_as, # also register it if name is provided
signature=self.model_signature,
)
# MLFLOW: Register the model with the model registry
# This is useful for Azure ML to register your model
# to an endpoint.
if register_as is not None:
mlflow.register_model(
model_uri=f"runs:/{mlflow.active_run().info.run_id}/final_model",
name=register_as,
)
def build_arguments_parser(parser: argparse.ArgumentParser = None):
"""Builds the argument parser for CLI settings"""
if parser is None:
parser = argparse.ArgumentParser()
group = parser.add_argument_group(f"Training Inputs")
group.add_argument(
"--train_images",
type=str,
required=True,
help="Path to folder containing training images",
)
group.add_argument(
"--valid_images",
type=str,
required=True,
help="path to folder containing validation images",
)
group = parser.add_argument_group(f"Training Outputs")
group.add_argument(
"--model_output",
type=str,
required=False,
default=None,
help="Path to write final model",
)
group.add_argument(
"--checkpoints",
type=str,
required=False,
default=None,
help="Path to read/write checkpoints",
)
group.add_argument(
"--register_model_as",
type=str,
required=False,
default=None,
help="Name to register final model in MLFlow",
)
group = parser.add_argument_group(f"Data Loading Parameters")
group.add_argument(
"--batch_size",
type=int,
required=False,
default=64,
help="Train/valid data loading batch size (default: 64)",
)
group.add_argument(
"--num_workers",
type=int,
required=False,
default=None,
help="Num workers for data loader (default: -1 => all cpus available)",
)
group.add_argument(
"--prefetch_factor",
type=int,
required=False,
default=2,
help="Data loader prefetch factor (default: 2)",
)
group.add_argument(
"--persistent_workers",
type=strtobool,
required=False,
default=True,
help="Use persistent prefetching workers (default: True)",
)
group.add_argument(
"--pin_memory",
type=strtobool,
required=False,
default=True,
help="Pin Data loader prefetch factor (default: True)",
)
group.add_argument(
"--non_blocking",
type=strtobool,
required=False,
default=False,
help="Use non-blocking transfer to device (default: False)",
)
group = parser.add_argument_group(f"Model/Training Parameters")
group.add_argument(
"--model_arch",
type=str,
required=False,
default="resnet18",
help="Which model architecture to use (default: resnet18)",
)
group.add_argument(
"--model_arch_pretrained",
type=strtobool,
required=False,
default=True,
help="Use pretrained model (default: true)",
)
group.add_argument(
"--distributed_backend",
type=str,
required=False,
choices=["nccl", "mpi"],
default="nccl",
help="Which distributed backend to use.",
)
group.add_argument(
"--disable_cuda",
type=strtobool,
required=False,
default=False,
help="set True to force use of cpu (local testing).",
)
# DISTRIBUTED: torch.distributed.launch is passing this argument to your script
# it is likely to be deprecated in favor of os.environ['LOCAL_RANK']
# see https://pytorch.org/docs/stable/distributed.html#launch-utility
group.add_argument(
"--local_rank",
type=int,
required=False,
default=None,
help="Passed by torch.distributed.launch utility when running from cli.",
)
group.add_argument(
"--num_epochs",
type=int,
required=False,
default=1,
help="Number of epochs to train for",
)
group.add_argument(
"--learning_rate",
type=float,
required=False,
default=0.001,
help="Learning rate of optimizer",
)
group.add_argument(
"--momentum",
type=float,
required=False,
default=0.9,
help="Momentum of optimizer",
)
group = parser.add_argument_group(f"System Parameters")
group.add_argument(
"--enable_profiling",
type=strtobool,
required=False,
default=False,
help="Enable pytorch profiler.",
)
group.add_argument(
"--multiprocessing_sharing_strategy",
type=str,
choices=torch.multiprocessing.get_all_sharing_strategies(),
required=False,
default=None,
help="Check https://pytorch.org/docs/stable/multiprocessing.html",
)
return parser
def run(args):
"""Run the script using CLI arguments"""
logger = logging.getLogger(__name__)
logger.info(f"Running with arguments: {args}")
# MLFLOW: initialize mlflow (once in entire script)
mlflow.start_run()
# use a handler for the training sequence
training_handler = PyTorchDistributedModelTrainingSequence()
# sets cuda and distributed config
training_handler.setup_config(args)
# PROFILER: here we use a helper class to enable profiling
# see profiling.py for the implementation details
training_profiler = PyTorchProfilerHandler(
enabled=bool(args.enable_profiling),
rank=training_handler.world_rank,
)
# PROFILER: set profiler in trainer to call profiler.step() during training
training_handler.profiler = training_profiler.start_profiler()
# report the time and disk usage during this code block
with LogTimeBlock(
"build_image_datasets", enabled=training_handler.self_is_main_node
), LogDiskIOBlock(
"build_image_datasets", enabled=training_handler.self_is_main_node
):
# build the image folder datasets
train_dataset, valid_dataset, labels = build_image_datasets(
train_images_dir=args.train_images,
valid_images_dir=args.valid_images,
input_size=get_model_metadata(args.model_arch)["input_size"],
)
# creates data loaders from datasets for distributed training
training_handler.setup_datasets(train_dataset, valid_dataset, labels)
with LogTimeBlock("load_model", enabled=training_handler.self_is_main_node):
# creates the model architecture
model = load_model(
args.model_arch,
output_dimension=len(labels),
pretrained=args.model_arch_pretrained,
)
# logging of labels
logger.info(labels)
# sets the model for distributed training
training_handler.setup_model(model)
# runs training sequence
# NOTE: num_epochs is provided in args
try:
training_handler.train(checkpoints_dir=args.checkpoints)
except RuntimeError as runtime_exception: # if runtime error occurs (ex: cuda out of memory)
# then print some runtime error report in the logs
training_handler.runtime_error_report(runtime_exception)
# re-raise
raise runtime_exception
# stops profiling (and save in mlflow)
training_profiler.stop_profiler()
# saves final model
if args.model_output:
training_handler.save(
args.model_output,
name=f"epoch-{args.num_epochs}",
register_as=args.register_model_as,
)
# properly teardown distributed resources
training_handler.close()
# MLFLOW: finalize mlflow (once in entire script)
mlflow.end_run()
logger.info("run() completed")
def main(cli_args=None):
"""Main function of the script."""
# initialize root logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
formatter = logging.Formatter(
"%(asctime)s : %(levelname)s : %(name)s : %(message)s"
)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
# create argument parser
parser = build_arguments_parser()
# runs on cli arguments
args = parser.parse_args(cli_args) # if None, runs on sys.argv
# run the run function
run(args)
if __name__ == "__main__":
main()
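
For orientation, here is a minimal local smoke test of the script above. It is a hedged sketch, not part of the repository: it relies on the cli_args hook of main() shown above, assumes it is run from the component's src directory so that `import train` and its sibling modules resolve, and the two dataset folders are hypothetical ImageFolder-style directories.

# sketch: quick CPU-only run of train.py on a tiny local dataset (paths are hypothetical)
import train

train.main(cli_args=[
    "--train_images", "data/train",          # hypothetical folder of class subdirectories
    "--valid_images", "data/valid",          # hypothetical folder of class subdirectories
    "--batch_size", "8",
    "--num_workers", "0",                    # single-process data loading
    "--model_arch", "resnet18",
    "--model_arch_pretrained", "False",      # avoid downloading weights for a smoke test
    "--num_epochs", "1",
    "--disable_cuda", "True",                # force CPU
])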


@ -0,0 +1,20 @@
import os
import sys
import logging
import pytest
import tempfile
from unittest.mock import Mock
SRC_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src"))
if SRC_ROOT not in sys.path:
logging.info(f"Adding {SRC_ROOT} to path")
sys.path.append(str(SRC_ROOT))
@pytest.fixture()
def temporary_dir():
"""Creates a temporary directory for the tests"""
temp_directory = tempfile.TemporaryDirectory()
yield temp_directory.name
temp_directory.cleanup()


@ -0,0 +1,40 @@
"""
Tests running the model loader for every possible model in the list
"""
import pytest
import torch
# local imports
from model import (
MODEL_ARCH_LIST,
get_model_metadata,
load_model,
)
# IMPORTANT: see conftest.py for fixtures
@pytest.mark.parametrize("model_arch", MODEL_ARCH_LIST)
def test_model_loader(model_arch):
"""Tests src/components/pytorch_image_classifier/model/"""
model_metadata = get_model_metadata(model_arch)
assert model_metadata is not None
assert isinstance(model_metadata, dict)
assert "library" in model_metadata
assert "input_size" in model_metadata
# using pretrained=False to avoid downloading each time we unit test
model = load_model(model_arch, output_dimension=4, pretrained=False)
assert model is not None
assert isinstance(model, torch.nn.Module)
def test_model_loader_failure():
"""Test asking for a model that deosn't exist"""
with pytest.raises(NotImplementedError):
get_model_metadata("not_a_model")
with pytest.raises(NotImplementedError):
load_model("not_a_model", output_dimension=4, pretrained=False)


@ -0,0 +1,219 @@
"""
Tests running the train.py script end-to-end
on a randomly generated (small) dataset.
"""
import os
import sys
import tempfile
import pytest
from unittest.mock import patch
import numpy as np
from PIL import Image
# local imports
import train
from model import MODEL_ARCH_LIST
# IMPORTANT: see conftest.py for fixtures (ex: temporary_dir)
@pytest.fixture()
def random_image_in_folder_classes(temporary_dir):
image_dataset_path = os.path.join(temporary_dir, "image_in_folders")
os.makedirs(image_dataset_path, exist_ok=False)
n_samples = 100
n_classes = 4
for i in range(n_samples):
a = np.random.rand(300, 300, 3) * 255
im_out = Image.fromarray(a.astype("uint8")).convert("RGB")
class_dir = "class_{}".format(i % n_classes)
image_path = os.path.join(
image_dataset_path, class_dir, "random_image_{}.jpg".format(i)
)
os.makedirs(os.path.join(image_dataset_path, class_dir), exist_ok=True)
im_out.save(image_path)
return image_dataset_path
# IMPORTANT: we have to restrict the list of models for unit tests
# because GitHub Actions runners have only 7GB RAM and will OOM
TEST_MODEL_ARCH_LIST = [
"test",
"resnet18",
"resnet34",
]
# NOTE: we only care about patching those specific mlflow methods
# to avoid mlflow initialization conflicts between tests
@patch("mlflow.end_run") # we can have only 1 start/end per test session
@patch("mlflow.register_model") # patched to test model name registration
@patch("mlflow.pytorch.log_model") # patched to test model name registration
@patch("mlflow.log_params") # patched to avoid conflict in parameters
@patch("mlflow.start_run") # we can have only 1 start/end per test session
@pytest.mark.parametrize("model_arch", TEST_MODEL_ARCH_LIST)
def test_components_pytorch_image_classifier_single_node(
mlflow_start_run_mock,
mlflow_log_params_mock,
mlflow_pytorch_log_model_mock,
mlflow_register_model_mock,
mlflow_end_run_mock,
model_arch,
temporary_dir,
random_image_in_folder_classes,
):
"""Tests src/components/pytorch_image_classifier/train.py"""
model_dir = os.path.join(temporary_dir, "pytorch_image_classifier_model")
checkpoints_dir = os.path.join(
temporary_dir, "pytorch_image_classifier_checkpoints"
)
# create test arguments for the script
# fmt: off
script_args = [
"train.py",
"--train_images", random_image_in_folder_classes,
"--valid_images", random_image_in_folder_classes, # using same data for train/valid
"--batch_size", "16",
"--num_workers", "0", # single thread pre-fetching
"--prefetch_factor", "2", # will be discarded if num_workers=0
"--pin_memory", "True",
"--non_blocking", "False",
"--model_arch", model_arch,
"--model_arch_pretrained", "True",
"--num_epochs", "2",
"--model_output", model_dir,
"--checkpoints", checkpoints_dir,
"--register_model_as", "foo",
"--enable_profiling", "True",
]
# fmt: on
# replaces sys.argv with test arguments and run main
with patch.object(sys, "argv", script_args):
train.main()
# those mlflow calls must be unique in the script
mlflow_start_run_mock.assert_called_once()
mlflow_end_run_mock.assert_called_once()
# test all log_params calls
for log_params_call in mlflow_log_params_mock.call_args_list:
args, kwargs = log_params_call
assert isinstance(args[0], dict) # call has only 1 argument, and it's a dict
# test model registration with mlflow.pytorch.log_model()
log_model_calls = mlflow_pytorch_log_model_mock.call_args_list
assert len(log_model_calls) == 1
args, kwargs = log_model_calls[0] # unpack arguments
assert "artifact_path" in kwargs
assert kwargs["artifact_path"] == "final_model"
assert "registered_model_name" in kwargs
assert kwargs["registered_model_name"] == "foo"
# test model registration with mlflow.register_model()
register_model_calls = mlflow_register_model_mock.call_args_list
assert len(register_model_calls) == 1 # call should happen only once
args, kwargs = register_model_calls[0] # unpack arguments
assert "model_uri" in kwargs
assert kwargs["model_uri"].endswith("final_model")
assert "name" in kwargs
assert kwargs["name"] == "foo"
# test checkpoints presence
assert len(os.listdir(checkpoints_dir)) == 3 # 1 before training loop, + 2 epochs
@patch("mlflow.end_run") # we can have only 1 start/end per test session
@patch("mlflow.register_model") # patched to test model name registration
@patch("mlflow.pytorch.log_model") # patched to test model name registration
@patch("mlflow.log_params") # patched to avoid conflict in parameters
@patch("mlflow.start_run") # we can have only 1 start/end per test session
@patch("torch.distributed.init_process_group") # to avoid calling for the actual thing
@patch(
"torch.distributed.destroy_process_group"
) # to avoid calling for the actual thing
@patch(
"torch.nn.parallel.DistributedDataParallel"
) # to avoid calling for the actual thing
@pytest.mark.parametrize("backend", ["nccl", "mpi"])
def test_components_pytorch_image_classifier_second_of_two_nodes(
torch_ddp_mock,
torch_dist_destroy_process_group_mock,
torch_dist_init_process_group_mock,
mlflow_start_run_mock,
mlflow_log_params_mock,
mlflow_pytorch_log_model_mock,
mlflow_register_model_mock,
mlflow_end_run_mock,
backend,
temporary_dir,
random_image_in_folder_classes,
):
"""Tests src/components/pytorch_image_classifier/train.py"""
model_dir = os.path.join(
temporary_dir, "pytorch_image_classifier_distributed_model"
)
torch_ddp_mock.side_effect = lambda model: model # ddp would return just the model
# create some environment variables for the backend
if backend == "nccl":
backend_expected_env = {
# setup as if there were 2 nodes with 1 gpu each
"WORLD_SIZE": "2",
"RANK": "1",
"LOCAL_WORLD_SIZE": "1",
"LOCAL_RANK": "0",
}
elif backend == "mpi":
backend_expected_env = {
# setup as if there were 2 nodes with 1 gpu each
"OMPI_COMM_WORLD_SIZE": "2",
"OMPI_COMM_WORLD_RANK": "1",
"OMPI_COMM_WORLD_LOCAL_SIZE": "1",
"OMPI_COMM_WORLD_LOCAL_RANK": "0",
}
else:
raise Exception("backend {} used for testing is not implemented in script.")
with patch.dict(os.environ, backend_expected_env, clear=False):
# create test arguments for the script
# fmt: off
script_args = [
"train.py",
"--train_images", random_image_in_folder_classes,
"--valid_images", random_image_in_folder_classes, # using same data for train/valid
"--distributed_backend", backend,
"--batch_size", "16",
"--num_workers", "0", # single thread pre-fetching
"--prefetch_factor", "2", # will be discarded if num_workers=0
"--pin_memory", "True",
"--non_blocking", "False",
"--model_arch", "resnet18",
"--model_arch_pretrained", "True",
"--num_epochs", "1",
"--register_model_as", "foo",
]
# fmt: on
# replaces sys.argv with test arguments and run main
with patch.object(sys, "argv", script_args):
train.main()
# those mlflow calls must be unique in the script
mlflow_start_run_mock.assert_called_once()
mlflow_end_run_mock.assert_called_once()
mlflow_pytorch_log_model_mock.assert_not_called() # not saving from non-head nodes
mlflow_register_model_mock.assert_not_called() # not registering from non-head nodes
torch_dist_init_process_group_mock.assert_called_once()
torch_dist_init_process_group_mock.assert_called_with(backend, rank=1, world_size=2)
torch_dist_destroy_process_group_mock.assert_called_once()

File diff hidden because one or more lines are too long


@ -0,0 +1,6 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: blue
endpoint_name: dogs-classifier-online
model: azureml:resnet-dogs-classifier@latest
instance_type: Standard_DS2_v2
instance_count: 1


@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
name: dogs-classifier-online
description: Stanford Dogs Classifier
auth_mode: key


@ -0,0 +1,19 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
command: |
tar xvfm ${{inputs.archive}} --no-same-owner -C ${{outputs.images}} #TODO: Split data into Train-Validate-Test
inputs:
archive:
type: uri_file
path: http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar
outputs:
images:
type: uri_folder
mode: upload
path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/
environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest
compute: azureml:cpu-cluster


@ -0,0 +1,83 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
# <inputs_and_outputs>
inputs:
training_images:
type: uri_folder
mode: download # pick ro_mount, rw_mount or download
path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/**
# path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/places2/train//**
validation_images: #TODO: Use different datasets for validation
type: uri_folder
mode: download # pick ro_mount, rw_mount or download
path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/**
# path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/places2/valid/**
# </inputs_and_outputs>
# <jobs>
settings:
default_datastore: azureml:workspaceblobstore
continue_on_step_failure: true
jobs:
train:
type: command
component: file:train.yaml
compute: azureml:gpu-cluster
resources:
instance_count: 1 # number of nodes
distribution:
type: pytorch
process_count_per_instance: 1 # number of gpus
# NOTE: set env var if needed
environment_variables:
NCCL_DEBUG: "INFO" # adjusts the level of info from NCCL tests
# NCCL_TOPO_FILE: "/opt/microsoft/ndv4-topo.xml" # Use specific topology file for A100
# NCCL_IB_PCI_RELAXED_ORDERING: "1" # Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments.
# NCCL_IB_DISABLE: "1" # force disable infiniband (if set to "1")
# NCCL_NET_PLUGIN: "none" # to force NET/Plugin off (no rdma/sharp plugin at all)
# NCCL_NET: "Socket" # to force node-to-node comm to use Socket (slow)
# NCCL_SOCKET_IFNAME: "eth0" # to force Socket comm to use eth0 (use NCCL_NET=Socket)
# UCX_IB_PCI_RELAXED_ORDERING: "on"
# UCX_TLS: "tcp"
# UCX_NET_DEVICES: "eth0" # if you have Error: Failed to resolve UCX endpoint...
# CUDA_DEVICE_ORDER: "PCI_BUS_ID" # ordering of gpus
# TORCH_DISTRIBUTED_DEBUG: "DETAIL"
inputs:
# data inputs
train_images: ${{parent.inputs.training_images}}
valid_images: ${{parent.inputs.validation_images}}
# data loading
batch_size: 64
num_workers: 5
prefetch_factor: 4
persistent_workers: true
pin_memory: true
non_blocking: false
# model
model_arch: "resnet18"
model_arch_pretrained: true
# training
num_epochs: 1
learning_rate: 0.001
momentum: 0.9
# profiling
enable_profiling: false
# multiprocessing_sharing_strategy: "file_system" # WARNING: this can cause hang at job completion
# Model Registration
register_model_as: "resnet-dogs-classifier"
# </jobs>
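
As a side note, the same pipeline YAML could in principle be submitted from Python with the Azure ML SDK v2 (azure-ai-ml), assuming an installed SDK and a configured workspace; this is a hedged sketch and not part of the repository, which drives the YAML through the MLOps templates instead. The subscription, resource group, and workspace values are placeholders.

# sketch only: submit mlops/azureml/train/pipeline.yaml with the azure-ai-ml SDK (assumed installed)
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient, load_job

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<subscription-id>",      # placeholder
    resource_group_name="<resource-group>",   # placeholder
    workspace_name="<aml-workspace>",         # placeholder
)
pipeline_job = load_job("mlops/azureml/train/pipeline.yaml")
returned_job = ml_client.jobs.create_or_update(pipeline_job)
print(returned_job.studio_url)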


@ -0,0 +1,21 @@
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
name: nvidia_pytorch
build:
path: ../../../data-science/environment/
tags:
os: ubuntu
os_version: 20.04
hpcx: 2.10
mpi: openmpi
mpi_version: 4.1.2rc4
ucx: 1.12.0
cuda: 11.6.2
cudnn: 8.4.0.27
nccl: 2.12.10
rdma_core: 36.0
nsight_compute: 2022.1.1.2
nsight_systems: "2022.2.1.31-5fe97ab"
nccl_test: 2.11.0
azureml-defaults: 1.41.0
mlflow: 1.25.1
transformers: 4.18.0


@ -0,0 +1,123 @@
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
type: command
description: >-
Fine-tunes a pre-trained pytorch model for image classification.
Inputs should be provided as distinct directories containing distinct images
as we're using [ImageFolder](http://pytorch.org/vision/main/generated/torchvision.datasets.ImageFolder.html) to load data.
name: pytorch_image_classifier
display_name: Image Classification Model (PyTorch)
version: 1.0.4
inputs:
# data loading
train_images:
type: path
description: "Path to folder containing training images, stored in subdirectories according to their class."
valid_images:
type: path
description: "Path to folder containing validation images, stored in subdirectories according to their class."
# data loading
batch_size:
type: integer
min: 1
optional: true
description: "Train/valid data loading batch size (default: 64)"
num_workers:
type: integer
optional: true
description: "Num workers for data loader (default: -1 => all cpus available)"
prefetch_factor:
type: integer
optional: true
description: "Data loader prefetch factor (default: 2)"
persistent_workers:
type: boolean
optional: true
description: "Use persistent prefetching workers (default: True)"
pin_memory:
type: boolean
optional: true
description: "Pin Data loader prefetch factor (default: True)"
non_blocking:
type: boolean
optional: true
description: "Use non-blocking transfer to device (default: False)"
# model
model_arch:
type: string
optional: true
description: "Which model architecture to use (default: resnet18)"
model_arch_pretrained:
type: boolean
optional: true
description: "Use pretrained model (default: true)"
# training
num_epochs:
type: integer
optional: true
description: "Number of epochs to train for (default: 1)"
learning_rate:
type: number
optional: true
description: "Learning rate of optimizer (default: 0.001)"
momentum:
type: number
optional: true
description: "Momentum of optimizer (default: 0.9)"
# model registration
register_model_as:
type: string
optional: true
description: "Name to register final model in MLFlow"
# system parameters
enable_profiling:
type: boolean
default: false
description: "Enables profiler"
multiprocessing_sharing_strategy:
type: string
optional: true
description: "Check https://pytorch.org/docs/stable/multiprocessing.html"
outputs:
checkpoints:
type: path
description: "Path to export checkpoints"
trained_model:
type: path
description: "Path to the final model"
code: ../../../data-science/src
environment: azureml:nvidia_pytorch@latest
command: >-
python train.py
--train_images ${{inputs.train_images}}
--valid_images ${{inputs.valid_images}}
[--batch_size ${{inputs.batch_size}}]
[--num_workers ${{inputs.num_workers}}]
[--prefetch_factor ${{inputs.prefetch_factor}}]
[--persistent_workers ${{inputs.persistent_workers}}]
[--pin_memory ${{inputs.pin_memory}}]
[--non_blocking ${{inputs.non_blocking}}]
[--model_arch ${{inputs.model_arch}}]
[--model_arch_pretrained ${{inputs.model_arch_pretrained}}]
[--num_epochs ${{inputs.num_epochs}}]
[--learning_rate ${{inputs.learning_rate}}]
[--momentum ${{inputs.momentum}}]
--model_output ${{outputs.trained_model}}
--checkpoints ${{outputs.checkpoints}}
[--register_model_as ${{inputs.register_model_as}}]
--enable_profiling ${{inputs.enable_profiling}}
[--multiprocessing_sharing_strategy ${{inputs.multiprocessing_sharing_strategy}}]
distribution:
# NOTE: using type:pytorch will use all the right env variables for pytorch init_process_group
type: pytorch
process_count_per_instance: 1


@ -0,0 +1,63 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: aml-cli-v2
trigger:
- none
pool:
vmImage: ubuntu-20.04
resources:
repositories:
- repository: mlops-templates # Template Repo
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: DeployTrainingPipeline
displayName: Deploy Training Pipeline
jobs:
- job: DeployTrainingPipeline
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
- template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
parameters:
cluster_name: gpu-cluster
size: Standard_NC6
min_instances: 0
max_instances: 1
cluster_tier: dedicated
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
parameters:
build_type: docker
environment_name: nvidia_pytorch # Not used for docker builds
environment_file: mlops/azureml/train/train-env.yaml
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
parameters:
data_type: training
environment_file: mlops/azureml/train/create_stanford_dogs_dataset.yaml
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
parameters:
pipeline_file: mlops/azureml/train/pipeline.yaml
experiment_name: $(environment)_cv_train_$(Build.SourceBranchName)
display_name: $(environment)_cv_run_$(Build.BuildID)


@ -0,0 +1,61 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: aml-cli-v2
- name: endpoint_name
value: dogs-online-$(namespace)$(postfix)$(environment)
- name: endpoint_type
value: online
trigger:
- none
pool:
vmImage: ubuntu-20.04
resources:
repositories:
- repository: mlops-templates # Template Repo
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: CreateOnlineEndpoint
displayName: Create/Update Online Endpoint
jobs:
- job: DeployOnlineEndpoint
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
- template: templates/${{ variables.version }}/create-endpoint.yml@mlops-templates
parameters:
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
- template: templates/${{ variables.version }}/create-deployment.yml@mlops-templates
parameters:
deployment_name: dogs-online-dp
deployment_file: mlops/azureml/deploy/online/online-deployment.yml
- template: templates/${{ variables.version }}/allocate-traffic.yml@mlops-templates
parameters:
traffic_allocation: dogs-online-dp=100
- template: templates/${{ variables.version }}/test-deployment.yml@mlops-templates
parameters:
deployment_name: dogs-online-dp
sample_request: data/sample-request.json
request_type: json


@ -0,0 +1,51 @@
name: deploy-cv-model-training-pipeline
on:
workflow_dispatch:
jobs:
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
create-compute:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
with:
cluster_name: gpu-cluster
size: Standard_NC6
min_instances: 0
max_instances: 1
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
register-environment:
needs: [get-config,create-compute]
uses: ./.github/workflows/register-environment.yml
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
environment_file: mlops/azureml/train/train-env.yaml
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
register-dataset:
needs: [get-config,register-environment]
uses: Azure/mlops-templates/.github/workflows/register-dataset.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
data_file: mlops/azureml/train/create_stanford_dogs_dataset.yaml
file_type: Training
name: stanford_dogs
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
run-pipeline:
needs: [get-config,register-dataset]
uses: Azure/mlops-templates/.github/workflows/run-pipeline.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
parameters-file: mlops/azureml/train/pipeline.yaml
job-name: cv-train
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}


@ -0,0 +1,42 @@
name: deploy-online-endpoint-pipeline
on:
workflow_dispatch:
jobs:
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
create-endpoint:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
endpoint_name: dogs-classifier-online2
endpoint_type: online
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-deployment:
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
needs: [get-config,create-endpoint]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-deployment.yml
endpoint_name: dogs-classifier-online2
endpoint_type: online
deployment_name: dogs-online-dp
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
allocate-traffic:
uses: Azure/mlops-templates/.github/workflows/allocate-traffic.yml@main
needs: [get-config,create-deployment]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
traffic_allocation: dogs-online-dp=100
endpoint_name: dogs-classifier-online2
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}


@ -0,0 +1,43 @@
variables:
ap_vm_image: ubuntu-20.04
## Training pipeline settings
# Training dataset settings
training_dataset_name: dogs-imgs
training_dataset_description: 'Stanford Dogs Dataset (http://vision.stanford.edu/aditya86/ImageNetDogs/)'
training_dataset_local_path: data/training-imgs/
training_dataset_path_on_datastore: dogs-imgs/
training_dataset_type: local
training_dataset_storage_url: 'http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar'
labels_dataset_name: dogs-labels
labels_dataset_description: 'Labels for Stanford Dogs Dataset (http://vision.stanford.edu/aditya86/ImageNetDogs/)'
labels_dataset_local_path: data/training/
labels_dataset_path_on_datastore: dogs-labels/
labels_dataset_type: local
# Training AzureML Environment settings
training_env_name: nvidia_pytorch
training_env_path: data-science/environment/training/
# Compute target for pipeline
training_target: gpu-cluster
training_target_sku: Standard_NC6
training_target_min_nodes: 0
training_target_max_nodes: 1
# Name for the training pipeline
training_pipeline_name: resnet-dogs-training-pipeline
training_experiment_name: resnet-dogs-training
# Training arguments specification
training_arguments: --epochs 2 --batch-size 64 --training-mode feature-extraction
# Training datasets specification
# Syntax: <name>:<version>:<mode>:<steps (names separated by +)>
training_datasets: dogs-labels:1:download:prep dogs-imgs:latest:mount:train+eval
# Name under which the model will be registered
model_name: resnet-dogs-classifier
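
As an aside, a small hedged sketch of how the training_datasets specification above could be parsed, assuming the <name>:<version>:<mode>:<steps> syntax documented in the comment; the helper name is illustrative only and does not exist in the repository.

def parse_dataset_specs(spec: str):
    """Parse 'name:version:mode:step1+step2 ...' entries into dicts (illustrative only)."""
    entries = []
    for token in spec.split():
        name, version, mode, steps = token.split(":")
        entries.append({
            "name": name,
            "version": version,
            "mode": mode,                 # e.g. download or mount
            "steps": steps.split("+"),    # pipeline steps that consume the dataset
        })
    return entries

# parse_dataset_specs("dogs-labels:1:download:prep dogs-imgs:latest:mount:train+eval")
# -> [{'name': 'dogs-labels', 'steps': ['prep'], ...}, {'name': 'dogs-imgs', 'steps': ['train', 'eval'], ...}]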


@ -0,0 +1,44 @@
{
"name": "pytorch_manual",
"environmentVariables": {
"EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
},
"python": {
"userManagedDependencies": false,
"interpreterPath": "python",
"condaDependenciesFile": null,
"baseCondaEnvironment": null
},
"docker": {
"enabled": true,
"baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04",
"baseDockerfile": null,
"sharedVolumes": true,
"shmSize": "2g",
"arguments": [],
"baseImageRegistry": {
"address": null,
"username": null,
"password": null,
"registryIdentity": null
},
"platform": {
"os": "Linux",
"architecture": "amd64"
}
},
"spark": {
"repositories": [],
"packages": [],
"precachePackages": true
},
"databricks": {
"mavenLibraries": [],
"pypiLibraries": [],
"rcranLibraries": [],
"jarLibraries": [],
"eggLibraries": []
},
"r": null,
"inferencingStackVersion": null
}


@ -0,0 +1,17 @@
name: pytorch_manual
channels:
- conda-forge
dependencies:
- python=3.7
- pip=20.2.4
- pip:
- pandas==1.3.5
- scikit-learn==1.0.2
- matplotlib==3.5.2
- msrest==0.6.21
- mlflow==1.27.0
- azureml-core==1.43.0
- azureml-defaults==1.43.0
- azureml-mlflow==1.43.0
- torch==1.11.0
- torchvision==0.12.0



@ -0,0 +1,129 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import argparse
import pandas as pd
import mlflow
import torch
from sklearn import metrics as sklmetrics
import matplotlib.pyplot as plt
from model import CustomImageDataset, load_model
def main(labels_path, images_path, model_path, model_name, output_dir, deploy_flag_output):
# Load model
model_file = os.path.join(model_path, f'{model_name}.pth')
net = load_model(path=model_file)
# Load test data
labels_data = pd.read_csv(os.path.join(labels_path, 'labels_test.csv'))
labels_data = labels_data.set_index('path').squeeze() # Convert to appropriate format for CustomImageDataset
testset = CustomImageDataset(images_path, labels_data, mode='test')
print(f'Test size: {len(testset)}')
# Generate predictions
predictions = get_predictions(testset, net)
predictions.to_csv(os.path.join(output_dir, 'predictions.csv'), index=False)
# Evaluation metrics
metrics = evaluate(predictions.label_real, predictions.label_predicted)
for k, v in metrics.items():
mlflow.log_metric(k, v)
# Plot confusion matrix
plt.rcParams["figure.figsize"] = [40, 40]
sklmetrics.ConfusionMatrixDisplay.from_predictions(predictions.label_real, predictions.label_predicted, cmap='YlGnBu')
plt.savefig("confusion_matrix.png")
mlflow.log_artifact("confusion_matrix.png")
# Promote model
deploy_flag = is_new_model_better()
mlflow.log_metric("deploy flag", deploy_flag)
with open(deploy_flag_output, 'w') as f:
f.write('%d' % int(deploy_flag))
deploy_flag_str = 'not ' if not deploy_flag else ''
print(f'Finished. Model will {deploy_flag_str}be registered.')
def get_predictions(dataset, net, batch_size=256):
testloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2)
predictions = pd.DataFrame()
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
for data in testloader:
images, classes, paths = data
# calculate outputs by running images through the network
outputs = net(images)
# the class with the highest energy is what we choose as prediction
_, predicted = torch.max(outputs.data, 1)
predictions_batch = pd.DataFrame({
'path': paths,
'label_real': dataset.get_labels(classes),
'label_predicted': dataset.get_labels(predicted)
})
predictions = pd.concat([predictions, predictions_batch], ignore_index=True)
return predictions
def evaluate(labels_real, labels_pred):
metrics = {
'accuracy': sklmetrics.accuracy_score(labels_real, labels_pred),
'mcc': sklmetrics.matthews_corrcoef(labels_real, labels_pred),
'recall_micro': sklmetrics.recall_score(labels_real, labels_pred, average='micro'),
'recall_macro': sklmetrics.recall_score(labels_real, labels_pred, average='macro'),
'recall_weighted': sklmetrics.recall_score(labels_real, labels_pred, average='weighted'),
'precision_micro': sklmetrics.precision_score(labels_real, labels_pred, average='micro'),
'precision_macro': sklmetrics.precision_score(labels_real, labels_pred, average='macro'),
'precision_weighted': sklmetrics.precision_score(labels_real, labels_pred, average='weighted'),
'f1_micro': sklmetrics.f1_score(labels_real, labels_pred, average='micro'),
'f1_macro': sklmetrics.f1_score(labels_real, labels_pred, average='macro'),
'f1_weighted': sklmetrics.f1_score(labels_real, labels_pred, average='weighted'),
}
return metrics
def is_new_model_better():
return True # For simplicity
def parse_args(args_list=None):
parser = argparse.ArgumentParser()
parser.add_argument('--prepared_data_path', type=str, required=True, help='Directory path to test data (output from prep step)')
parser.add_argument('--dogs-imgs', type=str, help='Directory path to images')
parser.add_argument('--model_path', type=str, help='Model output directory')
parser.add_argument('--evaluation_path', type=str, default='evaluation_results/', help="Evaluation results output directory")
parser.add_argument('--deploy_flag', type=str, help='Output path for the deploy flag indicating whether to deploy the model')
args_parsed, unknown = parser.parse_known_args(args_list)
if unknown:
print(f"Unrecognized arguments. These won't be used: {unknown}")
return args_parsed
if __name__ == '__main__':
args = parse_args()
main(
labels_path=args.prepared_data_path,
images_path=args.dogs_imgs,
model_path=args.model_path,
model_name='model',
output_dir=args.evaluation_path,
deploy_flag_output=args.deploy_flag
)


@ -0,0 +1,5 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
from .dataset import CustomImageDataset
from .net import load_model


@ -0,0 +1,49 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import PIL
from torch.utils.data import Dataset
import torchvision.transforms as transforms
class CustomImageDataset(Dataset):
def __init__(self, img_dir, img_labels, mode='test'):
self.img_dir = img_dir
self.img_labels = img_labels
self.classes = img_labels.unique().tolist()
self.mode = mode
if self.mode == 'train':
self.transform = transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor()
])
else:
self.transform = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
])
def __len__(self):
return len(self.img_labels)
def __getitem__(self, idx):
img_path = self.img_labels.index[idx]
image = PIL.Image.open(os.path.join(self.img_dir, img_path)).convert('RGB')
image = self.transform(image)
img_label = self.img_labels[idx]
img_class = self.classes.index(img_label)
return image, img_class, img_path
def nclasses(self):
return len(self.classes)
def get_labels(self, indexes):
return [self.classes[i] for i in indexes]
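
For reference, a short hedged sketch of how CustomImageDataset is meant to be fed, mirroring the set_index('path').squeeze() pattern used by the train and evaluate scripts; the file and folder paths here are hypothetical.

# sketch: img_labels is a pandas Series indexed by relative image path
import pandas as pd

labels = pd.read_csv("labels_train.csv").set_index("path").squeeze()  # path -> label
dataset = CustomImageDataset("path/to/images", labels, mode="train")
image, class_idx, img_path = dataset[0]
print(dataset.nclasses(), dataset.get_labels([class_idx]))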


@ -0,0 +1,36 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import torch
import torch.nn as nn
import torchvision.models as models
import torch.optim as optim
def load_model(path=None, num_classes=2, mode='finetuning', learning_rate=0.001, momentum=0.9):
# Load existing model
if path:
print('Loading existing model from path...')
model_data = torch.load(path)
model = models.resnet18(pretrained=False)
model.fc = nn.Linear(model.fc.in_features, model_data['fc.weight'].shape[0])
model.load_state_dict(model_data)
return model
# Initialize new model
assert mode in ['finetuning', 'feature-extraction']
model = models.resnet18(pretrained=True)
if mode == 'feature-extraction': # Freeze layers
for param in model.parameters():
param.requires_grad = False
model.fc = nn.Linear(model.fc.in_features, num_classes)
criterion = nn.CrossEntropyLoss()
params_optim = model.parameters() if mode == 'finetuning' else model.fc.parameters() if mode == 'feature-extraction' else None
optimizer = optim.SGD(params_optim, lr=learning_rate, momentum=momentum)
return model, criterion, optimizer
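
Worth noting: load_model has two return shapes, a bare model when reloading from a checkpoint path and a (model, criterion, optimizer) tuple when initializing a new one. A hedged usage sketch under that assumption (the file name is illustrative):

# new model: returns (model, criterion, optimizer)
net, criterion, optimizer = load_model(num_classes=120, mode="feature-extraction")

# reload from a saved state_dict: returns only the model
# torch.save(net.state_dict(), "model.pth")
restored_net = load_model(path="model.pth")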


@ -0,0 +1,58 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import argparse
import numpy as np
import pandas as pd
import mlflow
def main(raw_data_path, prepared_data_path):
print(f'Raw data path: {raw_data_path}')
print(f'Output data path: {prepared_data_path}')
# Read data
labels_data = pd.read_csv(os.path.join(raw_data_path, 'image_labels.csv'))
mlflow.log_metric('total_labels', len(labels_data))
# Split data into train and test datasets
random_data = np.random.rand(len(labels_data))
labels_train = labels_data[random_data < 0.7]
labels_test = labels_data[random_data >= 0.7]
print(labels_train)
mlflow.log_metric('train_size', labels_train.shape[0])
mlflow.log_metric('test_size', labels_test.shape[0])
labels_train.to_csv(os.path.join(prepared_data_path, 'labels_train.csv'), index=False)
labels_test.to_csv(os.path.join(prepared_data_path, 'labels_test.csv'), index=False)
print('Finished.')
def parse_args(args_list=None):
parser = argparse.ArgumentParser()
parser.add_argument("--dogs-labels", type=str, required=True, help="Path to labels")
parser.add_argument("--prepared_data_path", type=str, required=True, help="Path for prepared data")
args_parsed, unknown = parser.parse_known_args(args_list)
if unknown:
print(f"Unrecognized arguments. These won't be used: {unknown}")
return args_parsed
if __name__ == "__main__":
args = parse_args()
main(
raw_data_path=args.dogs_labels,
prepared_data_path=args.prepared_data_path
)
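
A local-run sketch of the prep script above, with hypothetical folder names; parse_known_args lets the pipeline pass extra arguments without breaking the step:

args = parse_args([
    '--dogs-labels', 'data/raw',              # hypothetical folder containing image_labels.csv
    '--prepared_data_path', 'data/prepared',  # hypothetical output folder
    '--some-extra-flag', '1',                 # reported as unrecognized and ignored
])
main(raw_data_path=args.dogs_labels, prepared_data_path=args.prepared_data_path)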


@ -0,0 +1,118 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# Example adapted from:
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
# https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
import os
import argparse
import pandas as pd
import mlflow
import torch
from model import CustomImageDataset, load_model
def main(labels_path, images_path,
model_name, output_dir,
mode, epochs, batch_size, learning_rate, momentum):
labels_data = pd.read_csv(os.path.join(labels_path, 'labels_train.csv'))
labels_data = labels_data.set_index('path').squeeze()  # Convert to a Series indexed by image path, the format CustomImageDataset expects
trainset = CustomImageDataset(images_path, labels_data, mode='train')
print(f'Train size: {len(trainset)}')
print("Training...")
net = train(
trainset,
mode=mode,
epochs=epochs,
batch_size=batch_size,
learning_rate=learning_rate,
momentum=momentum
)
print('Finished training')
print(f"Saving model in folder {output_dir}...")
os.makedirs(output_dir, exist_ok=True)
model_path = os.path.join(output_dir, f'{model_name}.pth')
torch.save(net.state_dict(), model_path)
print('Finished.')
def train(dataset, mode='finetuning', epochs=2, batch_size=64, learning_rate=0.001, momentum=0.9, stats_freq=25):
trainloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=2)
net, criterion, optimizer = load_model(
num_classes=dataset.nclasses(),
mode=mode,
learning_rate=learning_rate,
momentum=momentum
)
global_iter = 0
for epoch in range(epochs): # loop over the dataset multiple times
print(f'----\nEpoch {epoch}\n----\n')
running_loss = 0.0
for i, data in enumerate(trainloader, 0):
# get the inputs; data is a list of [inputs, classes, paths]
inputs, classes, paths = data
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, classes)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if (i + 1) % stats_freq == 0:
mlflow.log_metric('loss', running_loss / stats_freq, step=global_iter)
mlflow.log_metric('epoch', epoch, step=global_iter)
running_loss = 0.0
global_iter += 1
return net
def parse_args(args_list=None):
parser = argparse.ArgumentParser()
parser.add_argument('--prepared_data_path', type=str, required=True, help='Directory path to training data (output from prep step)')
parser.add_argument('--dogs-imgs', type=str, help='Directory path to images')
parser.add_argument('--model_path', type=str, help='Model output directory')
parser.add_argument('--training-mode', type=str, default='feature-extraction', choices=['finetuning', 'feature-extraction'])
parser.add_argument('--epochs', type=int, default=2)
parser.add_argument('--batch-size', type=int, default=64)
parser.add_argument('--learning-rate', type=float, default=0.001)
parser.add_argument('--momentum', type=float, default=0.9)
args_parsed, unknown = parser.parse_known_args(args_list)
if unknown:
print(f"Unrecognized arguments. These won't be used: {unknown}")
return args_parsed
if __name__ == '__main__':
args = parse_args()
main(
labels_path=args.prepared_data_path,
images_path=args.dogs_imgs,
model_name='model',
output_dir=args.model_path,
mode=args.training_mode,
epochs=args.epochs,
batch_size=args.batch_size,
learning_rate=args.learning_rate,
momentum=args.momentum
)
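
A local smoke-test sketch mirroring the __main__ wiring above, with hypothetical paths and a single epoch:

args = parse_args([
    '--prepared_data_path', 'data/prepared',  # must contain labels_train.csv from the prep step
    '--dogs-imgs', 'data/images',
    '--model_path', 'outputs',
    '--epochs', '1',
])
main(
    labels_path=args.prepared_data_path,
    images_path=args.dogs_imgs,
    model_name='model',
    output_dir=args.model_path,
    mode=args.training_mode,
    epochs=args.epochs,
    batch_size=args.batch_size,
    learning_rate=args.learning_rate,
    momentum=args.momentum
)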


One file's diff is not shown because of its large size.


@ -0,0 +1,105 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
- template: ../../config-aml.yml
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: python-sdk
trigger:
- none
pool:
vmImage: $(ap_vm_image)
resources:
repositories:
- repository: mlops-templates # Template Repo
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
stages:
- stage: DeployTrainingPipeline
displayName: Deploy Training Pipeline
jobs:
- job: DeployTrainingPipeline
steps:
# Setup
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
# Environment
- template: templates/${{ variables.version }}/create-environment.yml@mlops-templates
parameters:
environment_name: $(training_env_name)
build_type: folder
environment_file: $(training_env_path)
# Compute
- template: templates/${{ variables.version }}/get-compute.yml@mlops-templates
parameters:
compute_type: training
# Datasets (images + labels)
# Images dataset
- task: Bash@3
displayName: 'Download data'
inputs:
targetType: inline
script: |
mkdir -p $(training_dataset_local_path)
curl $(training_dataset_storage_url) | tar xvf - --no-same-owner -C $(training_dataset_local_path)
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
parameters:
data_type: training
# Labels dataset
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
parameters:
data_type: training
datasetName: $(labels_dataset_name)
datasetDescription: $(labels_dataset_description)
datasetLocalPath: $(labels_dataset_local_path)
datasetPathOnDatastore: $(labels_dataset_path_on_datastore)
datasetType: $(labels_dataset_type)
# Deploy training pipeline
- template: templates/${{ variables.version }}/deploy-training-pipeline.yml@mlops-templates
- template: templates/${{ variables.version }}/add-pipeline-to-endpoint.yml@mlops-templates
- task: Bash@3
name: export_pipeline_id
displayName: "Export Pipeline ID"
inputs:
targetType: "inline"
script: |
echo "##vso[task.setvariable variable=pipeline_id;isOutput=true;]$(pipeline_id)"
# Run training
- job: invoke_pipeline
displayName: 'Invoke pipeline'
pool: server
timeoutInMinutes: 0
dependsOn: DeployTrainingPipeline
variables:
pipeline_id: $[ dependencies.DeployTrainingPipeline.outputs['export_pipeline_id.pipeline_id'] ]
steps:
- task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
displayName: 'Invoke AML Pipeline'
inputs:
azureSubscription: '$(ado_service_connection_aml_ws)'
PipelineId: '$(PIPELINE_ID)'
ExperimentName: '$(training_experiment_name)'

Binary files changed (not rendered):
documentation/architecturepattern/AzureML_CML_Architecture.png (new file, 144 KiB)
documentation/architecturepattern/AzureML_CML_Architecture.vsdx (new file)
Other binary images changed (before/after sizes: 323 KiB to 147 KiB; none to 145 KiB; 339 KiB to none)


@ -8,7 +8,7 @@ param env string
param tags object = {
Owner: 'mlops-v2'
Project: 'mlops-v2'
Environment: 'dev'
Environment: env
Toolkit: 'bicep'
Name: prefix
}
@ -28,7 +28,7 @@ module st './modules/storage_account.bicep' = {
name: 'st'
scope: resourceGroup(rg.name)
params: {
baseName: '${prefix}${postfix}${env}'
baseName: '${uniqueString(rg.id)}${env}'
location: location
tags: tags
}
@ -61,7 +61,7 @@ module cr './modules/container_registry.bicep' = {
name: 'cr'
scope: resourceGroup(rg.name)
params: {
baseName: '${prefix}${postfix}${env}'
baseName: '${uniqueString(rg.id)}${env}'
location: location
tags: tags
}


@ -4,10 +4,10 @@
variables:
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../../config-infra-prod.yml
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../../config-infra-dev.yml
- template: ../../config-infra-dev.yml
trigger:
- none
@ -24,7 +24,7 @@ stages :
steps:
- checkout: self
- script: |
az bicep build --file ./infrastructure/bicep/main.bicep
az bicep build --file ./infrastructure/main.bicep
name: LintBicepCode
displayName: Run Bicep Linter
@ -43,7 +43,7 @@ stages :
inlineScript: |
az deployment sub validate \
--name $(Build.DefinitionName) \
--template-file ./infrastructure/bicep/main.bicep \
--template-file ./infrastructure/main.bicep \
--location $(location) \
--parameters location=$(location) prefix=$(namespace) postfix=$(postfix) env=$(environment)
@ -54,7 +54,7 @@ stages :
displayName: Deploy Bicep
pool:
vmImage: $(ap_vm_image)
environment: dev
environment: $(environment)
strategy:
runOnce:
deploy:
@ -72,5 +72,5 @@ stages :
az deployment sub create \
--name $(Build.DefinitionName) \
--location $(location) \
--template-file ./infrastructure/bicep/main.bicep \
--template-file ./infrastructure/main.bicep \
--parameters location=$(location) prefix=$(namespace) postfix=$(postfix) env=$(environment)


@ -32,6 +32,11 @@ module "aml_workspace" {
enable_aml_computecluster = var.enable_aml_computecluster
storage_account_name = module.storage_account_aml.name
enable_aml_secure_workspace = var.enable_aml_secure_workspace
vnet_id = var.enable_aml_secure_workspace ? azurerm_virtual_network.vnet_default[0].id : ""
subnet_default_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
subnet_training_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_training[0].id : ""
tags = local.tags
}
@ -51,6 +56,10 @@ module "storage_account_aml" {
firewall_bypass = ["AzureServices"]
firewall_virtual_network_subnet_ids = []
enable_aml_secure_workspace = var.enable_aml_secure_workspace
vnet_id = var.enable_aml_secure_workspace ? azurerm_virtual_network.vnet_default[0].id : ""
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
tags = local.tags
}
@ -66,6 +75,10 @@ module "key_vault" {
postfix = var.postfix
env = var.environment
enable_aml_secure_workspace = var.enable_aml_secure_workspace
vnet_id = var.enable_aml_secure_workspace ? azurerm_virtual_network.vnet_default[0].id : ""
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
tags = local.tags
}
@ -96,5 +109,26 @@ module "container_registry" {
postfix = var.postfix
env = var.environment
enable_aml_secure_workspace = var.enable_aml_secure_workspace
vnet_id = var.enable_aml_secure_workspace ? azurerm_virtual_network.vnet_default[0].id : ""
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
tags = local.tags
}
module "data_explorer" {
source = "./modules/data-explorer"
rg_name = module.resource_group.name
location = module.resource_group.location
prefix = var.prefix
postfix = var.postfix
env = var.environment
key_vault_id = module.key_vault.id
enable_monitoring = var.enable_monitoring
client_secret = var.client_secret
tags = local.tags
}


@ -0,0 +1,37 @@
# Bastion
module "bastion" {
source = "./modules/bastion-host"
prefix = var.prefix
postfix = var.postfix
env = var.environment
rg_name = module.resource_group.name
location = module.resource_group.location
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_bastion[0].id : ""
enable_aml_secure_workspace = var.enable_aml_secure_workspace
tags = local.tags
}
# Virtual machine
module "virtual_machine_jumphost" {
source = "./modules/virtual-machine"
prefix = var.prefix
postfix = var.postfix
env = var.environment
rg_name = module.resource_group.name
location = module.resource_group.location
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
jumphost_username = var.jumphost_username
jumphost_password = var.jumphost_password
enable_aml_secure_workspace = var.enable_aml_secure_workspace
tags = local.tags
}


@ -18,12 +18,14 @@ resource "azurerm_machine_learning_workspace" "mlw" {
# Compute cluster
resource "azurerm_machine_learning_compute_cluster" "adl_aml_ws_compute_cluster" {
resource "azurerm_machine_learning_compute_cluster" "mlw_compute_cluster" {
name = "cpu-cluster"
location = var.location
vm_priority = "LowPriority"
vm_size = "Standard_DS3_v2"
machine_learning_workspace_id = azurerm_machine_learning_workspace.mlw.id
subnet_resource_id = var.enable_aml_secure_workspace ? var.subnet_training_id : ""
count = var.enable_aml_computecluster ? 1 : 0
scale_settings {
@ -33,70 +35,63 @@ resource "azurerm_machine_learning_compute_cluster" "adl_aml_ws_compute_cluster"
}
}
# # Datastore
# resource "azurerm_resource_group_template_deployment" "arm_aml_create_datastore" {
# name = "arm_aml_create_datastore"
# resource_group_name = var.rg_name
# deployment_mode = "Incremental"
# parameters_content = jsonencode({
# "WorkspaceName" = {
# value = azurerm_machine_learning_workspace.mlw.name
# },
# "StorageAccountName" = {
# value = var.storage_account_name
# }
# })
# depends_on = [time_sleep.wait_30_seconds]
# template_content = <<TEMPLATE
# {
# "$schema": "http://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
# "contentVersion": "1.0.0.0",
# "parameters": {
# "WorkspaceName": {
# "type": "String"
# },
# "StorageAccountName": {
# "type": "String"
# }
# },
# "resources": [
# {
# "type": "Microsoft.MachineLearningServices/workspaces/datastores",
# "apiVersion": "2021-03-01-preview",
# "name": "[concat(parameters('WorkspaceName'), '/default')]",
# "dependsOn": [],
# "properties": {
# "contents": {
# "accountName": "[parameters('StorageAccountName')]",
# "containerName": "default",
# "contentsType": "AzureBlob",
# "credentials": {
# "credentialsType": "None"
# },
# "endpoint": "core.windows.net",
# "protocol": "https"
# },
# "description": "Default datastore for mlops-tabular",
# "isDefault": false,
# "properties": {
# "ServiceDataAccessAuthIdentity": "None"
# },
# "tags": {}
# }
# }
# ]
# }
# TEMPLATE
# }
# resource "time_sleep" "wait_30_seconds" {
# depends_on = [
# azurerm_machine_learning_workspace.mlw
# ]
# create_duration = "30s"
# }
# DNS Zones
resource "azurerm_private_dns_zone" "mlw_zone_api" {
name = "privatelink.api.azureml.ms"
resource_group_name = var.rg_name
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_private_dns_zone" "mlw_zone_notebooks" {
name = "privatelink.notebooks.azure.net"
resource_group_name = var.rg_name
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Linking of DNS zones to Virtual Network
resource "azurerm_private_dns_zone_virtual_network_link" "mlw_zone_api_link" {
name = "${var.prefix}${var.postfix}_link_api"
resource_group_name = var.rg_name
private_dns_zone_name = azurerm_private_dns_zone.mlw_zone_api[0].name
virtual_network_id = var.vnet_id
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_private_dns_zone_virtual_network_link" "mlw_zone_notebooks_link" {
name = "${var.prefix}${var.postfix}_link_notebooks"
resource_group_name = var.rg_name
private_dns_zone_name = azurerm_private_dns_zone.mlw_zone_notebooks[0].name
virtual_network_id = var.vnet_id
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Private Endpoint configuration
resource "azurerm_private_endpoint" "mlw_pe" {
name = "pe-${azurerm_machine_learning_workspace.mlw.name}-amlw"
location = var.location
resource_group_name = var.rg_name
subnet_id = var.subnet_default_id
private_service_connection {
name = "psc-aml-${var.prefix}-${var.postfix}${var.env}"
private_connection_resource_id = azurerm_machine_learning_workspace.mlw.id
subresource_names = ["amlworkspace"]
is_manual_connection = false
}
private_dns_zone_group {
name = "private-dns-zone-group-ws"
private_dns_zone_ids = [azurerm_private_dns_zone.mlw_zone_api[0].id, azurerm_private_dns_zone.mlw_zone_notebooks[0].id]
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}


@ -58,3 +58,22 @@ variable "storage_account_name" {
type = string
description = "The Name of the Storage Account linked to AML workspace"
}
variable "enable_aml_secure_workspace" {
description = "Variable to enable or disable AML secure workspace"
}
variable "vnet_id" {
type = string
description = "The ID of the vnet that should be linked to the DNS zone"
}
variable "subnet_default_id" {
type = string
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
}
variable "subnet_training_id" {
type = string
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
}


@ -0,0 +1,31 @@
resource "azurerm_bastion_host" "bas" {
name = "bas-${var.prefix}-${var.postfix}${var.env}"
location = var.location
resource_group_name = var.rg_name
sku = "Standard"
copy_paste_enabled = false
file_copy_enabled = false
ip_configuration {
name = "configuration"
subnet_id = var.subnet_id
public_ip_address_id = azurerm_public_ip.pip[0].id
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}
resource "azurerm_public_ip" "pip" {
name = "pip-${var.prefix}-${var.postfix}${var.env}"
location = var.location
resource_group_name = var.rg_name
allocation_method = "Static"
sku = "Standard"
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}


@ -0,0 +1,39 @@
variable "rg_name" {
type = string
description = "Resource group name"
}
variable "location" {
type = string
description = "Location of the resource group"
}
variable "tags" {
type = map(string)
default = {}
description = "A mapping of tags which should be assigned to the deployed resource"
}
variable "prefix" {
type = string
description = "Prefix for the module name"
}
variable "postfix" {
type = string
description = "Postfix for the module name"
}
variable "env" {
type = string
description = "Environment prefix"
}
variable "subnet_id" {
type = string
description = "Subnet ID for the bastion"
}
variable "enable_aml_secure_workspace" {
description = "Variable to enable or disable AML secure workspace"
}


@ -7,8 +7,53 @@ resource "azurerm_container_registry" "cr" {
name = "cr${local.safe_prefix}${local.safe_postfix}${var.env}"
resource_group_name = var.rg_name
location = var.location
sku = "Standard"
sku = var.enable_aml_secure_workspace ? "Premium" : "Standard"
admin_enabled = true
tags = var.tags
}
# DNS Zones
resource "azurerm_private_dns_zone" "cr_zone" {
name = "privatelink.azurecr.io"
resource_group_name = var.rg_name
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Linking of DNS zones to Virtual Network
resource "azurerm_private_dns_zone_virtual_network_link" "cr_zone_link" {
name = "${var.prefix}${var.postfix}_link_acr"
resource_group_name = var.rg_name
private_dns_zone_name = azurerm_private_dns_zone.cr_zone[0].name
virtual_network_id = var.vnet_id
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Private Endpoint configuration
resource "azurerm_private_endpoint" "cr_pe" {
name = "pe-${azurerm_container_registry.cr.name}-acr"
location = var.location
resource_group_name = var.rg_name
subnet_id = var.subnet_id
private_service_connection {
name = "psc-acr-${var.prefix}-${var.postfix}${var.env}"
private_connection_resource_id = azurerm_container_registry.cr.id
subresource_names = ["registry"]
is_manual_connection = false
}
private_dns_zone_group {
name = "private-dns-zone-group-acr"
private_dns_zone_ids = [azurerm_private_dns_zone.cr_zone[0].id]
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}


@ -28,3 +28,17 @@ variable "env" {
type = string
description = "Environment prefix"
}
variable "enable_aml_secure_workspace" {
description = "Variable to enable or disable AML secure workspace"
}
variable "vnet_id" {
type = string
description = "The ID of the vnet that should be linked to the DNS zone"
}
variable "subnet_id" {
type = string
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
}


@ -8,4 +8,67 @@ resource "azurerm_key_vault" "kv" {
sku_name = "standard"
tags = var.tags
access_policy {
tenant_id = data.azurerm_client_config.current.tenant_id
object_id = data.azurerm_client_config.current.object_id
key_permissions = [
"Create",
"Get",
]
secret_permissions = [
"Set",
"Get",
"Delete",
"Purge",
"Recover"
]
}
}
# DNS Zones
resource "azurerm_private_dns_zone" "kv_zone" {
name = "privatelink.vaultcore.azure.net"
resource_group_name = var.rg_name
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Linking of DNS zones to Virtual Network
resource "azurerm_private_dns_zone_virtual_network_link" "kv_zone_link" {
name = "${var.prefix}${var.postfix}_link_kv"
resource_group_name = var.rg_name
private_dns_zone_name = azurerm_private_dns_zone.kv_zone[0].name
virtual_network_id = var.vnet_id
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Private Endpoint configuration
resource "azurerm_private_endpoint" "kv_pe" {
name = "pe-${azurerm_key_vault.kv.name}-vault"
location = var.location
resource_group_name = var.rg_name
subnet_id = var.subnet_id
private_service_connection {
name = "psc-kv-${var.prefix}-${var.postfix}${var.env}"
private_connection_resource_id = azurerm_key_vault.kv.id
subresource_names = ["vault"]
is_manual_connection = false
}
private_dns_zone_group {
name = "private-dns-zone-group-kv"
private_dns_zone_ids = [azurerm_private_dns_zone.kv_zone[0].id]
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}


@ -28,3 +28,17 @@ variable "env" {
type = string
description = "Environment prefix"
}
variable "enable_aml_secure_workspace" {
description = "Variable to enable or disable AML secure workspace"
}
variable "vnet_id" {
type = string
description = "The ID of the vnet that should be linked to the DNS zone"
}
variable "subnet_id" {
type = string
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
}


@ -32,3 +32,87 @@ resource "azurerm_storage_account_network_rules" "firewall_rules" {
virtual_network_subnet_ids = var.firewall_virtual_network_subnet_ids
bypass = var.firewall_bypass
}
# DNS Zones
resource "azurerm_private_dns_zone" "st_zone_blob" {
name = "privatelink.blob.core.windows.net"
resource_group_name = var.rg_name
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_private_dns_zone" "st_zone_file" {
name = "privatelink.file.core.windows.net"
resource_group_name = var.rg_name
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Linking of DNS zones to Virtual Network
resource "azurerm_private_dns_zone_virtual_network_link" "st_zone_link_blob" {
name = "${var.prefix}${var.postfix}_link_st_blob"
resource_group_name = var.rg_name
private_dns_zone_name = azurerm_private_dns_zone.st_zone_blob[0].name
virtual_network_id = var.vnet_id
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_private_dns_zone_virtual_network_link" "st_zone_link_file" {
name = "${var.prefix}${var.postfix}_link_st_file"
resource_group_name = var.rg_name
private_dns_zone_name = azurerm_private_dns_zone.st_zone_file[0].name
virtual_network_id = var.vnet_id
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Private Endpoint configuration
resource "azurerm_private_endpoint" "st_pe_blob" {
name = "pe-${azurerm_storage_account.st.name}-blob"
location = var.location
resource_group_name = var.rg_name
subnet_id = var.subnet_id
private_service_connection {
name = "psc-blob-${var.prefix}-${var.postfix}${var.env}"
private_connection_resource_id = azurerm_storage_account.st.id
subresource_names = ["blob"]
is_manual_connection = false
}
private_dns_zone_group {
name = "private-dns-zone-group-blob"
private_dns_zone_ids = [azurerm_private_dns_zone.st_zone_blob[0].id]
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}
resource "azurerm_private_endpoint" "st_pe_file" {
name = "pe-${azurerm_storage_account.st.name}-file"
location = var.location
resource_group_name = var.rg_name
subnet_id = var.subnet_id
private_service_connection {
name = "psc-file-${var.prefix}-${var.postfix}${var.env}"
private_connection_resource_id = azurerm_storage_account.st.id
subresource_names = ["file"]
is_manual_connection = false
}
private_dns_zone_group {
name = "private-dns-zone-group-file"
private_dns_zone_ids = [azurerm_private_dns_zone.st_zone_file[0].id]
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}


@ -42,3 +42,17 @@ variable "firewall_virtual_network_subnet_ids" {
variable "firewall_bypass" {
default = ["None"]
}
variable "enable_aml_secure_workspace" {
description = "Variable to enable or disable AML secure workspace"
}
variable "vnet_id" {
type = string
description = "The ID of the vnet that should be linked to the DNS zone"
}
variable "subnet_id" {
type = string
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
}


@ -0,0 +1,104 @@
resource "azurerm_virtual_machine" "vm" {
name = "wvm-jumphost"
location = var.location
resource_group_name = var.rg_name
network_interface_ids = [azurerm_network_interface.vm_nic[0].id]
vm_size = "Standard_DS3_v2"
delete_os_disk_on_termination = true
delete_data_disks_on_termination = true
storage_image_reference {
publisher = "microsoft-dsvm"
offer = "dsvm-win-2019"
sku = "server-2019"
version = "latest"
}
os_profile {
computer_name = var.jumphost_username
admin_username = var.jumphost_username
admin_password = var.jumphost_password
}
os_profile_windows_config {
provision_vm_agent = true
enable_automatic_upgrades = true
}
identity {
type = "SystemAssigned"
}
storage_os_disk {
name = "disk-${var.prefix}-${var.postfix}${var.env}"
caching = "ReadWrite"
create_option = "FromImage"
managed_disk_type = "StandardSSD_LRS"
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}
resource "azurerm_network_interface" "vm_nic" {
name = "nic-${var.prefix}-${var.postfix}${var.env}"
location = var.location
resource_group_name = var.rg_name
ip_configuration {
name = "configuration"
private_ip_address_allocation = "Dynamic"
subnet_id = var.subnet_id
# public_ip_address_id = azurerm_public_ip.vm_public_ip.id
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}
resource "azurerm_network_security_group" "vm_nsg" {
name = "nsg-${var.prefix}-${var.postfix}${var.env}"
location = var.location
resource_group_name = var.rg_name
security_rule {
name = "RDP"
priority = 1010
direction = "Inbound"
access = "Allow"
protocol = "Tcp"
source_port_range = "*"
destination_port_range = 3389
source_address_prefix = "*"
destination_address_prefix = "*"
}
count = var.enable_aml_secure_workspace ? 1 : 0
tags = var.tags
}
resource "azurerm_network_interface_security_group_association" "vm_nsg_association" {
network_interface_id = azurerm_network_interface.vm_nic[0].id
network_security_group_id = azurerm_network_security_group.vm_nsg[0].id
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_dev_test_global_vm_shutdown_schedule" "vm_schedule" {
virtual_machine_id = azurerm_virtual_machine.vm[0].id
location = var.location
enabled = true
daily_recurrence_time = "2000"
timezone = "W. Europe Standard Time"
notification_settings {
enabled = false
}
count = var.enable_aml_secure_workspace ? 1 : 0
}


@ -0,0 +1,49 @@
variable "rg_name" {
type = string
description = "Resource group name"
}
variable "location" {
type = string
description = "Location of the resource group"
}
variable "tags" {
type = map(string)
default = {}
description = "A mapping of tags which should be assigned to the deployed resource"
}
variable "prefix" {
type = string
description = "Prefix for the module name"
}
variable "postfix" {
type = string
description = "Postfix for the module name"
}
variable "env" {
type = string
description = "Environment prefix"
}
variable "jumphost_username" {
type = string
description = "VM username"
}
variable "jumphost_password" {
type = string
description = "VM password"
}
variable "subnet_id" {
type = string
description = "Subnet ID for the virtual machine"
}
variable "enable_aml_secure_workspace" {
description = "Variable to enable or disable AML secure workspace"
}


@ -0,0 +1,131 @@
# Virtual network
resource "azurerm_virtual_network" "vnet_default" {
name = "vnet-${var.prefix}-${var.postfix}${var.environment}"
resource_group_name = module.resource_group.name
location = module.resource_group.location
address_space = ["10.0.0.0/16"]
count = var.enable_aml_secure_workspace ? 1 : 0
tags = local.tags
}
# Subnets
resource "azurerm_subnet" "snet_default" {
name = "snet-${var.prefix}-${var.postfix}${var.environment}-default"
resource_group_name = module.resource_group.name
virtual_network_name = azurerm_virtual_network.vnet_default[0].name
address_prefixes = ["10.0.1.0/24"]
enforce_private_link_endpoint_network_policies = true
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_subnet" "snet_bastion" {
name = "AzureBastionSubnet"
resource_group_name = module.resource_group.name
virtual_network_name = azurerm_virtual_network.vnet_default[0].name
address_prefixes = ["10.0.10.0/27"]
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_subnet" "snet_training" {
name = "snet-${var.prefix}-${var.postfix}${var.environment}-training"
resource_group_name = module.resource_group.name
virtual_network_name = azurerm_virtual_network.vnet_default[0].name
address_prefixes = ["10.0.2.0/24"]
enforce_private_link_endpoint_network_policies = true
count = var.enable_aml_secure_workspace ? 1 : 0
}
# Network security groups
resource "azurerm_network_security_group" "nsg_training" {
name = "nsg-${var.prefix}-${var.postfix}${var.environment}-training"
location = module.resource_group.location
resource_group_name = module.resource_group.name
security_rule {
name = "BatchNodeManagement"
priority = 100
direction = "Inbound"
access = "Allow"
protocol = "Tcp"
source_port_range = "*"
destination_port_range = "29876-29877"
source_address_prefix = "BatchNodeManagement"
destination_address_prefix = "*"
}
security_rule {
name = "AzureMachineLearning"
priority = 110
direction = "Inbound"
access = "Allow"
protocol = "Tcp"
source_port_range = "*"
destination_port_range = "44224"
source_address_prefix = "AzureMachineLearning"
destination_address_prefix = "*"
}
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_subnet_network_security_group_association" "nsg-training-link" {
subnet_id = azurerm_subnet.snet_training[0].id
network_security_group_id = azurerm_network_security_group.nsg_training[0].id
count = var.enable_aml_secure_workspace ? 1 : 0
}
# User Defined Routes
resource "azurerm_route_table" "rt_training" {
name = "rt-${var.prefix}-${var.postfix}${var.environment}-training"
location = module.resource_group.location
resource_group_name = module.resource_group.name
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_route" "route_training_internet" {
name = "Internet"
resource_group_name = module.resource_group.name
route_table_name = azurerm_route_table.rt_training[0].name
address_prefix = "0.0.0.0/0"
next_hop_type = "Internet"
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_route" "route_training_aml" {
name = "AzureMLRoute"
resource_group_name = module.resource_group.name
route_table_name = azurerm_route_table.rt_training[0].name
address_prefix = "AzureMachineLearning"
next_hop_type = "Internet"
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_route" "route_training_batch" {
name = "BatchRoute"
resource_group_name = module.resource_group.name
route_table_name = azurerm_route_table.rt_training[0].name
address_prefix = "BatchNodeManagement"
next_hop_type = "Internet"
count = var.enable_aml_secure_workspace ? 1 : 0
}
resource "azurerm_subnet_route_table_association" "rt_training_link" {
subnet_id = azurerm_subnet.snet_training[0].id
route_table_id = azurerm_route_table.rt_training[0].id
count = var.enable_aml_secure_workspace ? 1 : 0
}


@ -9,6 +9,14 @@ variables:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
parameters:
- name: jumphost_username
type: string
default: "azureuser"
- name: jumphost_password
type: string
default: "ThisIsNotVerySecure!"
trigger:
- none
@ -51,4 +59,10 @@ stages :
- template: templates/infra/run-terraform-init.yml@mlops-templates
- template: templates/infra/run-terraform-validate.yml@mlops-templates
- template: templates/infra/run-terraform-plan.yml@mlops-templates
parameters:
jumphost_username: ${{parameters.jumphost_username}}
jumphost_password: ${{parameters.jumphost_password}}
- template: templates/infra/run-terraform-apply.yml@mlops-templates
parameters:
jumphost_username: ${{parameters.jumphost_username}}
jumphost_password: ${{parameters.jumphost_password}}


@ -21,3 +21,27 @@ variable "postfix" {
variable "enable_aml_computecluster" {
description = "Variable to enable or disable AML compute cluster"
}
variable "enable_aml_secure_workspace" {
description = "Variable to enable or disable AML secure workspace"
}
variable "jumphost_username" {
type = string
description = "VM username"
default = "azureuser"
}
variable "jumphost_password" {
type = string
description = "VM password"
default = "ThisIsNotVerySecure!"
}
variable "enable_monitoring" {
description = "Variable to enable or disable Monitoring"
}
variable "client_secret" {
description = "Service Principal Secret"
}


@ -0,0 +1,24 @@
name: nlp_inference_conda_env
channels:
- pytorch
- anaconda
- defaults
- conda-forge
dependencies:
- python=3.8
- pip=21.2.4
- pytorch=1.10.0
- torchvision=0.11.1
- torchaudio=0.10.0
- cudatoolkit=11.1.1
- nvidia-apex=0.1.0
- gxx_linux-64=8.5.0
- pip:
- azureml-defaults==1.39.0
- azureml-mlflow==1.39.0
- azureml-telemetry==1.39.0
- azureml-train-core==1.39.0
- mlflow==1.24.0
- transformers==4.17.0
- 'inference-schema[numpy-support]==1.3.0'
- applicationinsights==0.11.10

Some files were not shown because too many files changed in this commit.