Merge branch 'main' into dev
This commit is contained in:
Коммит
b4ef102154
|
@ -0,0 +1,13 @@
|
|||
blank_issues_enabled: false
|
||||
contact_links:
|
||||
- name: MLOps v2 solution accelerators discussions.
|
||||
url: https://github.com/azure/mlops-v2/discussions
|
||||
about: >-
|
||||
Please ask questions and start open-ended discussions here.
|
||||
Use issues for well-defined work in the solution accelerator repositories.
|
||||
- name: Azure ML CLI issues.
|
||||
url: https://github.com/azure/azure-cli-extensions/issues/new/choose
|
||||
about: Please open issues with the Azure ML CLI extension here.
|
||||
- name: Azure ML Python SDK issues.
|
||||
url: https://github.com/azure/azure-sdk-for-python/issues/new/choose
|
||||
about: Please open issues with the Azure ML Python SDK here.
|
|
@ -0,0 +1,21 @@
|
|||
---
|
||||
name: Suggest an enhancement for this repository.
|
||||
about: Have an idea for improvements to this repository?
|
||||
title: '[repo] <title>'
|
||||
labels: ''
|
||||
assignees: ''
|
||||
---
|
||||
|
||||
## Why?
|
||||
|
||||
<!-- What problem is this solving? -->
|
||||
|
||||
## How?
|
||||
|
||||
<!-- How are you suggesting it gets solved? -->
|
||||
|
||||
## Anything else?
|
||||
|
||||
<!--
|
||||
Links? References? Anything that will give us more context about the issue that you are encountering!
|
||||
-->
|
|
@ -0,0 +1,25 @@
|
|||
---
|
||||
name: Request or suggest a new solution accelerator.
|
||||
about: Have an idea for a new solution accelerator?
|
||||
title: '[new accelerator] <title>'
|
||||
labels: ''
|
||||
assignees: ''
|
||||
---
|
||||
|
||||
## Why doesn't an existing solution accelerator work?
|
||||
|
||||
<!-- Concisely explain why a new solution accelerator is needed. -->
|
||||
|
||||
## What work is needed?
|
||||
|
||||
<!--
|
||||
Concisely explain the infrastructure and MLOps work needed.
|
||||
Include as much detail as possible in how this would fit into the
|
||||
overall solution accelerator.
|
||||
-->
|
||||
|
||||
## Anything else?
|
||||
|
||||
<!--
|
||||
Links? References? Anything that will give us more context about the issue that you are encountering!
|
||||
-->
|
|
@ -0,0 +1,13 @@
|
|||
# PR into Azure/mlops-v2
|
||||
|
||||
## Checklist
|
||||
|
||||
I have:
|
||||
|
||||
- [ ] read and followed the contributing guidelines
|
||||
|
||||
## Changes
|
||||
|
||||
-
|
||||
|
||||
fixes #
|
|
@ -0,0 +1,14 @@
|
|||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v4.2.0
|
||||
hooks:
|
||||
- id: check-yaml
|
||||
- id: end-of-file-fixer
|
||||
- id: trailing-whitespace
|
||||
|
||||
# Opinionated code formatter to forget about formatting
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 21.12b0
|
||||
hooks:
|
||||
- id: black
|
||||
additional_dependencies: ['click==8.0.4']
|
|
@ -12,3 +12,5 @@ dependencies:
|
|||
- pandas==1.2.1
|
||||
- joblib==1.0.0
|
||||
- matplotlib==3.3.3
|
||||
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-client
|
||||
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector
|
|
@ -0,0 +1,16 @@
|
|||
channels:
|
||||
- defaults
|
||||
- anaconda
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.7.5
|
||||
- pip
|
||||
- pip:
|
||||
- azureml-mlflow==1.38.0
|
||||
- azureml-sdk==1.38.0
|
||||
- scikit-learn==0.24.1
|
||||
- pandas==1.2.1
|
||||
- joblib==1.0.0
|
||||
- matplotlib==3.3.3
|
||||
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-client
|
||||
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector
|
|
@ -53,10 +53,18 @@ def parse_args():
|
|||
parser.add_argument("--val_data", type=str, help="Path to test dataset")
|
||||
parser.add_argument("--test_data", type=str, help="Path to test dataset")
|
||||
|
||||
parser.add_argument("--enable_monitoring", type=str, help="enable logging to ADX")
|
||||
parser.add_argument("--table_name", type=str, default="mlmonitoring", help="Table name in ADX for logging")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
def log_training_data(df, table_name):
|
||||
from obs.collector import Online_Collector
|
||||
collector = Online_Collector(table_name)
|
||||
collector.batch_collect(df)
|
||||
|
||||
def main(args):
|
||||
'''Read, split, and save datasets'''
|
||||
|
||||
|
@ -93,6 +101,9 @@ def main(args):
|
|||
val.to_parquet((Path(args.val_data) / "val.parquet"))
|
||||
test.to_parquet((Path(args.test_data) / "test.parquet"))
|
||||
|
||||
if (args.enable_monitoring.lower == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower == 'yes'):
|
||||
log_training_data(data, args.table_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
|
|
|
@ -2,5 +2,5 @@ $schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.sch
|
|||
name: blue
|
||||
endpoint_name: taxi-fare-online
|
||||
model: azureml:taxi-model@latest
|
||||
instance_type: Standard_F2s_v2
|
||||
instance_type: Standard_DS2_v2
|
||||
instance_count: 1
|
||||
|
|
|
@ -8,6 +8,8 @@ inputs:
|
|||
input: #using local data, will create an anonymous data asset
|
||||
type: uri_folder
|
||||
path: ../../../data/
|
||||
enable_monitoring:
|
||||
table_name: 'taximonitoring'
|
||||
|
||||
outputs:
|
||||
train_data:
|
||||
|
@ -35,9 +37,12 @@ jobs:
|
|||
--train_data ${{outputs.train_data}}
|
||||
--val_data ${{outputs.val_data}}
|
||||
--test_data ${{outputs.test_data}}
|
||||
--enable_monitoring ${{inputs.enable_monitoring}}
|
||||
environment: azureml:taxi-train-env@latest
|
||||
inputs:
|
||||
raw_data: ${{parent.inputs.input}}
|
||||
enable_monitoring: ${{parent.inputs.enable_monitoring}}
|
||||
table_name: ${{parent.inputs.table_name}}
|
||||
outputs:
|
||||
train_data: ${{parent.outputs.train_data}}
|
||||
val_data: ${{parent.outputs.val_data}}
|
||||
|
|
|
@ -11,7 +11,7 @@ variables:
|
|||
- name: version
|
||||
value: aml-cli-v2
|
||||
- name: endpoint_name
|
||||
value: taxi-fare-batch
|
||||
value: taxi-batch-$(namespace)$(postfix)$(environment)
|
||||
- name: endpoint_type
|
||||
value: batch
|
||||
|
||||
|
@ -28,6 +28,8 @@ resources:
|
|||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
ref: main
|
||||
|
||||
|
||||
stages:
|
||||
- stage: CreateBatchEndpoint
|
||||
|
|
|
@ -25,10 +25,14 @@ resources:
|
|||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
<<<<<<< HEAD
|
||||
- repository: rai-vnext-preview # Template Repo
|
||||
name: Azure/rai-vnext-preview # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
=======
|
||||
ref: main
|
||||
>>>>>>> main
|
||||
|
||||
stages:
|
||||
- stage: DeployTrainingPipeline
|
||||
|
@ -50,6 +54,7 @@ stages:
|
|||
build_type: conda
|
||||
environment_name: taxi-train-env
|
||||
environment_file: mlops/azureml/train/train-env.yml
|
||||
enable_monitoring: $(enable_monitoring)
|
||||
- checkout: rai-vnext-preview
|
||||
path: s/
|
||||
- template: register-rai-components.yml
|
||||
|
@ -60,3 +65,6 @@ stages:
|
|||
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
|
||||
parameters:
|
||||
pipeline_file: mlops/azureml/train/pipeline.yml
|
||||
experiment_name: $(environment)_taxi_fare_train_$(Build.SourceBranchName)
|
||||
display_name: $(environment)_taxi_fare_run_$(Build.BuildID)
|
||||
enable_monitoring: $(enable_monitoring)
|
|
@ -7,11 +7,11 @@ variables:
|
|||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- template: ../../../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: aml-cli-v2
|
||||
- name: endpoint_name
|
||||
value: taxi-fare-online
|
||||
value: taxi-online-$(namespace)$(postfix)$(environment)
|
||||
- name: endpoint_type
|
||||
value: online
|
||||
|
||||
|
@ -29,6 +29,7 @@ resources:
|
|||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
ref: main
|
||||
|
||||
stages:
|
||||
- stage: CreateOnlineEndpoint
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
name: deploy-batch-endpoint-pipeline
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
jobs:
|
||||
get-config:
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: config-infra-prod.yml
|
||||
create-compute:
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
|
||||
with:
|
||||
cluster_name: batch-cluster
|
||||
size: STANDARD_DS3_V2
|
||||
min_instances: 0
|
||||
max_instances: 5
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
create-endpoint:
|
||||
needs: [get-config,create-compute]
|
||||
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/batch/batch-endpoint.yml
|
||||
endpoint_name: ${{ format('taxi-batch-{0}', needs.get-config.outputs.bep) }}
|
||||
endpoint_type: batch
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
create-deployment:
|
||||
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
|
||||
needs: [get-config,create-endpoint]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/batch/batch-deployment.yml
|
||||
endpoint_name: ${{ format('taxi-batch-{0}', needs.get-config.outputs.bep) }}
|
||||
endpoint_type: batch
|
||||
deployment_name: eptestdeploy
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
|
@ -0,0 +1,29 @@
|
|||
name: deploy-model-training-pipeline
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
jobs:
|
||||
get-config:
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: config-infra-prod.yml
|
||||
register-environment:
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/register-environment.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
environment_file: mlops/azureml/train/train-env.yml
|
||||
conda_file: data-science/environment/train-conda.yml
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
run-pipeline:
|
||||
needs: [get-config,register-environment]
|
||||
uses: Azure/mlops-templates/.github/workflows/run-pipeline.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
parameters-file: mlops/azureml/train/pipeline.yml
|
||||
job-name: test
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
|
@ -0,0 +1,42 @@
|
|||
name: deploy-online-endpoint-pipeline
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
jobs:
|
||||
get-config:
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: config-infra-prod.yml
|
||||
create-endpoint:
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
|
||||
endpoint_name: ${{ format('taxi-online-{0}', needs.get-config.outputs.oep) }}
|
||||
endpoint_type: online
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
create-deployment:
|
||||
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
|
||||
needs: [get-config,create-endpoint]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/online/online-deployment.yml
|
||||
endpoint_name: ${{ format('taxi-online-{0}', needs.get-config.outputs.oep) }}
|
||||
endpoint_type: online
|
||||
deployment_name: taxi-online-dp
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
allocate-traffic:
|
||||
uses: Azure/mlops-templates/.github/workflows/allocate-traffic.yml@main
|
||||
needs: [get-config,create-deployment]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
traffic_allocation: taxi-online-dp=100
|
||||
endpoint_name: ${{ format('taxi-online-{0}', needs.get-config.outputs.oep) }}
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
|
@ -1,3 +1,6 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
|
||||
ap_vm_image: ubuntu-20.04
|
||||
|
@ -16,7 +19,7 @@ variables:
|
|||
training_env_name: credit-training
|
||||
|
||||
# Training AzureML Environment conda yaml
|
||||
training_env_conda_yaml: data-science/environments/train.yml
|
||||
training_env_conda_yaml: data-science/environment/train.yml
|
||||
|
||||
# Name for the training pipeline
|
||||
training_pipeline_name: credit-training
|
||||
|
@ -27,8 +30,12 @@ variables:
|
|||
training_target_min_nodes: 0
|
||||
training_target_max_nodes: 4
|
||||
|
||||
# Training arguments specification; use azureml:dataset_name:version to reference an AML Dataset for --data_path
|
||||
training_arguments: --data_path azureml:uci-credit:1
|
||||
# Training arguments specification
|
||||
training_arguments: ''
|
||||
|
||||
# Training datasets specification
|
||||
# Syntax: <name>:<version>:<mode>:<steps (names separated by +)>
|
||||
training_datasets: uci-credit:1:download:prep
|
||||
|
||||
# Name under which the model will be registered
|
||||
model_name: credit-ci
|
||||
|
@ -47,7 +54,7 @@ variables:
|
|||
batch_env_name: credit-batch
|
||||
|
||||
# Batch AzureML Environment conda yaml
|
||||
batch_env_conda_yaml: data-science/environments/batch.yml
|
||||
batch_env_conda_yaml: data-science/environment/batch.yml
|
||||
|
||||
# Name for the batch scoring pipeline
|
||||
batch_pipeline_name: credit-batch-scoring
|
||||
|
@ -73,3 +80,7 @@ variables:
|
|||
batch_process_count_per_node: 1
|
||||
batch_node_count: 1
|
||||
|
||||
# Monitoring settings
|
||||
scoring_table_name: scoringdata
|
||||
training_table_name: mlmonitoring
|
||||
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
name: batch-monitoring
|
||||
channels:
|
||||
- defaults
|
||||
- anaconda
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.7.5
|
||||
- pip
|
||||
- pip:
|
||||
- azureml-defaults==1.38.0
|
||||
- azureml-mlflow==1.38.0
|
||||
- azureml-sdk==1.38.0
|
||||
- azureml-interpret==1.38.0
|
||||
- scikit-learn==0.24.1
|
||||
- pandas==1.2.1
|
||||
- joblib==1.0.0
|
||||
- matplotlib==3.3.3
|
||||
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector
|
|
@ -1,3 +1,6 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
name: mnist-train
|
||||
channels:
|
||||
- defaults
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
name: train
|
||||
channels:
|
||||
- defaults
|
||||
- anaconda
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.7.5
|
||||
- pip
|
||||
- pip:
|
||||
- azureml-mlflow==1.38.0
|
||||
- azureml-sdk==1.38.0
|
||||
- scikit-learn==0.24.1
|
||||
- pandas==1.2.1
|
||||
- joblib==1.0.0
|
||||
- matplotlib==3.3.3
|
||||
- fairlearn==0.7.0
|
||||
- azureml-contrib-fairness==1.38.0
|
||||
- interpret-community==0.24.1
|
||||
- interpret-core==0.2.7
|
||||
- azureml-interpret==1.38.0
|
||||
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-client
|
||||
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector
|
|
@ -1,3 +1,6 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
@ -40,10 +43,10 @@ def parse_args():
|
|||
def main():
|
||||
# Parse command-line arguments
|
||||
args = parse_args()
|
||||
prepared_data_path = os.path.join(args.prepared_data_path, run.parent.id)
|
||||
model_path = os.path.join(args.model_path, run.parent.id)
|
||||
explainer_path = os.path.join(args.explainer_path, run.parent.id)
|
||||
evaluation_path = os.path.join(args.evaluation_path, run.parent.id)
|
||||
prepared_data_path = args.prepared_data_path
|
||||
model_path = args.model_path
|
||||
explainer_path = args.explainer_path
|
||||
evaluation_path = args.evaluation_path
|
||||
|
||||
# Make sure evaluation output path exists
|
||||
if not os.path.exists(evaluation_path):
|
||||
|
@ -111,6 +114,10 @@ def main():
|
|||
for model_run in Model.list(ws):
|
||||
if model_run.name == args.model_name:
|
||||
mdl_path = Model.download(model_run, exist_ok=True)
|
||||
|
||||
if 'model.pkl' in mdl_path:
|
||||
mdl = joblib.load(mdl_path)
|
||||
else:
|
||||
mdl = joblib.load(os.path.join(mdl_path, 'model.pkl'))
|
||||
|
||||
test_accuracies[model_run.id] = mdl.score(X_test, y_test)
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
@ -17,14 +20,25 @@ ws = run.experiment.workspace
|
|||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="UCI Credit example")
|
||||
parser.add_argument("--data_path", type=str, default='data/', help="Directory path to training data")
|
||||
parser.add_argument("--uci-credit", type=str, default='data/', help="Directory path to training data")
|
||||
parser.add_argument("--prepared_data_path", type=str, default='prepared_data/', help="prepared data directory")
|
||||
return parser.parse_args()
|
||||
parser.add_argument("--enable_monitoring", type=str, default="false", help="enable logging to ADX")
|
||||
parser.add_argument("--table_name", type=str, default="mlmonitoring", help="Table name in ADX for logging")
|
||||
return parser.parse_known_args()
|
||||
|
||||
def log_training_data(df, table_name):
|
||||
from obs.collector import Online_Collector
|
||||
from datetime import timedelta
|
||||
print("If there is an Authorization error, check your Azure KeyVault secret named kvmonitoringspkey. Terraform might put single quotation marks around the secret. Remove the single quotes and the secret should work.")
|
||||
collector = Online_Collector(table_name)
|
||||
df["timestamp"] = [pd.to_datetime('now') - timedelta(days=x) for x in range(len(df))]
|
||||
collector.batch_collect(df)
|
||||
|
||||
|
||||
def main():
|
||||
# Parse command-line arguments
|
||||
args = parse_args()
|
||||
prepared_data_path = os.path.join(args.prepared_data_path, run.parent.id)
|
||||
args, unknown = parse_args()
|
||||
prepared_data_path = args.prepared_data_path
|
||||
|
||||
# Make sure data output path exists
|
||||
if not os.path.exists(prepared_data_path):
|
||||
|
@ -34,7 +48,7 @@ def main():
|
|||
mlflow.sklearn.autolog()
|
||||
|
||||
# Read training data
|
||||
df = pd.read_csv(os.path.join(args.data_path, 'credit.csv'))
|
||||
df = pd.read_csv(os.path.join(args.uci_credit, 'credit.csv'))
|
||||
|
||||
random_data = np.random.rand(len(df))
|
||||
|
||||
|
@ -62,5 +76,8 @@ def main():
|
|||
val.to_csv(VAL_PATH, index=False)
|
||||
test.to_csv(TEST_PATH, index=False)
|
||||
|
||||
if (args.enable_monitoring.lower() == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower() == 'yes'):
|
||||
log_training_data(df, args.table_name)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -1,3 +1,5 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import glob
|
||||
|
@ -6,33 +8,55 @@ import argparse
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
import joblib
|
||||
|
||||
from datetime import timedelta
|
||||
from azureml.core.model import Model
|
||||
|
||||
model = None
|
||||
explainer = None
|
||||
collector = None
|
||||
|
||||
|
||||
def init():
|
||||
global model, explainer
|
||||
global model, explainer, collector
|
||||
print("Started batch scoring by running init()")
|
||||
|
||||
parser = argparse.ArgumentParser('batch_scoring')
|
||||
parser.add_argument('--model_name', type=str, help='Model to use for batch scoring')
|
||||
parser = argparse.ArgumentParser("batch_scoring")
|
||||
parser.add_argument("--model_name", type=str, help="Model to use for batch scoring")
|
||||
parser.add_argument(
|
||||
"--enable_monitoring", type=str, help="Enable Monitoring", default="false"
|
||||
)
|
||||
parser.add_argument("--table_name", type=str, help="Table Name for logging data")
|
||||
args, _ = parser.parse_known_args()
|
||||
|
||||
model_path = Model.get_model_path(args.model_name)
|
||||
print(f"Model path: {model_path}")
|
||||
model = joblib.load(os.path.join(model_path, 'model.pkl'))
|
||||
|
||||
if "model.pkl" in model_path:
|
||||
model = joblib.load(model_path)
|
||||
else:
|
||||
model = joblib.load(os.path.join(model_path, "model.pkl"))
|
||||
|
||||
# load the explainer
|
||||
explainer_path = os.path.join(Model.get_model_path(args.model_name), "explainer")
|
||||
#explainer = joblib.load(explainer_path)
|
||||
# explainer = joblib.load(explainer_path)
|
||||
|
||||
if (
|
||||
args.enable_monitoring.lower() == "true"
|
||||
or args.enable_monitoring == "1"
|
||||
or args.enable_monitoring.lower() == "yes"
|
||||
):
|
||||
from obs.collector import Online_Collector
|
||||
|
||||
collector = Online_Collector(args.table_name)
|
||||
|
||||
|
||||
def run(file_list):
|
||||
|
||||
print(f"Files to process: {file_list}")
|
||||
results = pd.DataFrame(columns=["Sno", "ProbaGoodCredit", "ProbaBadCredit", "FeatureImportance"])
|
||||
|
||||
results = pd.DataFrame(
|
||||
columns=["Sno", "ProbaGoodCredit", "ProbaBadCredit", "FeatureImportance"]
|
||||
)
|
||||
all_results = []
|
||||
for filename in file_list:
|
||||
|
||||
df = pd.read_csv(filename)
|
||||
|
@ -42,18 +66,27 @@ def run(file_list):
|
|||
proba = model.predict_proba(df)
|
||||
proba = pd.DataFrame(data=proba, columns=["ProbaGoodCredit", "ProbaBadCredit"])
|
||||
|
||||
#explanation = explainer.explain_local(df)
|
||||
# explanation = explainer.explain_local(df)
|
||||
# sorted feature importance values and feature names
|
||||
#sorted_local_importance_names = explanation.get_ranked_local_names()
|
||||
#sorted_local_importance_values = explanation.get_ranked_local_values()
|
||||
# sorted_local_importance_names = explanation.get_ranked_local_names()
|
||||
# sorted_local_importance_values = explanation.get_ranked_local_values()
|
||||
# get explanations in dictionnary
|
||||
#explanations = []
|
||||
#for i, j in zip(sorted_local_importance_names[0], sorted_local_importance_values[0]):
|
||||
# explanations = []
|
||||
# for i, j in zip(sorted_local_importance_names[0], sorted_local_importance_values[0]):
|
||||
# explanations.append(dict(zip(i, j)))
|
||||
#explanation = pd.DataFrame(data=explanations, columns=["FeatureImportance"])
|
||||
# explanation = pd.DataFrame(data=explanations, columns=["FeatureImportance"])
|
||||
|
||||
#result = pd.concat([sno, proba, explanation], axis=1)
|
||||
# result = pd.concat([sno, proba, explanation], axis=1)
|
||||
result = pd.concat([sno, proba], axis=1)
|
||||
results = results.append(result)
|
||||
all_results.append(pd.concat([df, proba], axis=1))
|
||||
print(f"Batch scored: {filename}")
|
||||
|
||||
if collector:
|
||||
full_results = pd.concat(all_results)
|
||||
full_results["timestamp"] = [
|
||||
pd.to_datetime("now") - timedelta(days=x) for x in range(len(full_results))
|
||||
]
|
||||
collector.batch_collect(full_results)
|
||||
|
||||
return results
|
|
@ -1,3 +1,7 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
@ -30,8 +34,8 @@ def main():
|
|||
# Parse command-line arguments
|
||||
args = parse_args()
|
||||
|
||||
prepared_data_path = os.path.join(args.prepared_data_path, run.parent.id)
|
||||
model_path = os.path.join(args.model_path, run.parent.id)
|
||||
prepared_data_path = args.prepared_data_path
|
||||
model_path = args.model_path
|
||||
|
||||
# Make sure model output path exists
|
||||
if not os.path.exists(model_path):
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
- template: ../../config-aml.yml
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: python-sdk
|
||||
|
||||
trigger:
|
||||
- none
|
||||
|
||||
pool:
|
||||
vmImage: $(ap_vm_image)
|
||||
|
||||
stages:
|
||||
- stage: DeployDriftJob
|
||||
displayName: Deploy Drift Job
|
||||
jobs:
|
||||
- job: DeployDriftJob
|
||||
steps:
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/${{ variables.version }}/deploy-drift-detection.yml@mlops-templates
|
|
@ -24,6 +24,7 @@ resources:
|
|||
name: Azure/mlops-templates
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
ref: main
|
||||
|
||||
stages:
|
||||
- stage: DeployBatchScoringPipeline
|
||||
|
@ -41,10 +42,14 @@ stages:
|
|||
- template: templates/${{ variables.version }}/create-environment.yml@mlops-templates
|
||||
parameters:
|
||||
environment_name: $(batch_env_name)
|
||||
environment_conda_yaml: $(batch_env_conda_yaml)
|
||||
build_type: 'conda'
|
||||
environment_file: $(batch_env_conda_yaml)
|
||||
enable_monitoring: $(enable_monitoring)
|
||||
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
|
||||
parameters:
|
||||
data_type: scoring
|
||||
- template: templates/${{ variables.version }}/deploy-batch-scoring-pipeline.yml@mlops-templates
|
||||
parameters:
|
||||
enable_monitoring: $(enable_monitoring)
|
||||
- template: templates/${{ variables.version }}/add-pipeline-to-endpoint.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
|
||||
|
|
|
@ -24,6 +24,7 @@ resources:
|
|||
name: Azure/mlops-templates # need to change org name from Azure when pulling the template
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
ref: main
|
||||
|
||||
stages:
|
||||
- stage: DeployTrainingPipeline
|
||||
|
@ -41,7 +42,9 @@ stages:
|
|||
- template: templates/${{ variables.version }}/create-environment.yml@mlops-templates
|
||||
parameters:
|
||||
environment_name: $(training_env_name)
|
||||
environment_conda_yaml: $(training_env_conda_yaml)
|
||||
build_type: 'conda'
|
||||
environment_file: $(training_env_conda_yaml)
|
||||
enable_monitoring: $(enable_monitoring)
|
||||
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
|
||||
parameters:
|
||||
data_type: training
|
||||
|
@ -49,5 +52,7 @@ stages:
|
|||
parameters:
|
||||
compute_type: training
|
||||
- template: templates/${{ variables.version }}/deploy-training-pipeline.yml@mlops-templates
|
||||
parameters:
|
||||
enable_monitoring: $(enable_monitoring)
|
||||
- template: templates/${{ variables.version }}/add-pipeline-to-endpoint.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
|
||||
|
|
|
@ -6,17 +6,22 @@ variables:
|
|||
# Global
|
||||
ap_vm_image: ubuntu-20.04
|
||||
|
||||
namespace: mlopsv2
|
||||
postfix: 0621
|
||||
namespace: mlopsv2 #Note: A namespace with many characters will cause storage account creation to fail due to storage account names having a limit of 24 characters.
|
||||
postfix: 0659
|
||||
location: westus
|
||||
|
||||
environment: dev
|
||||
enable_aml_computecluster: true
|
||||
enable_aml_secure_workspace: true
|
||||
enable_monitoring: false
|
||||
|
||||
# Azure DevOps
|
||||
ado_service_connection_rg: Azure-ARM-Dev
|
||||
ado_service_connection_aml_ws: Azure-ARM-Dev
|
||||
|
||||
# IaC
|
||||
# DO NOT TOUCH
|
||||
|
||||
# For pipeline reference
|
||||
resource_group: rg-$(namespace)-$(postfix)$(environment)
|
||||
aml_workspace: mlw-$(namespace)-$(postfix)$(environment)
|
||||
application_insights: mlw-$(namespace)-$(postfix)$(environment)
|
||||
|
@ -24,10 +29,10 @@ variables:
|
|||
container_registry: cr$(namespace)$(postfix)$(environment)
|
||||
storage_account: st$(namespace)$(postfix)$(environment)
|
||||
|
||||
# Terraform
|
||||
# For terraform reference
|
||||
terraform_version: 0.14.7
|
||||
terraform_workingdir: infrastructure/terraform
|
||||
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf-state
|
||||
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tfstate
|
||||
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf
|
||||
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tf
|
||||
terraform_st_container_name: default
|
||||
terraform_st_key: mlops-tab
|
||||
|
|
|
@ -7,17 +7,22 @@ variables:
|
|||
# Global
|
||||
ap_vm_image: ubuntu-20.04
|
||||
|
||||
namespace: mlopsv2
|
||||
namespace: mlopsv2 #Note: A namespace with many characters will cause storage account creation to fail due to storage account names having a limit of 24 characters.
|
||||
postfix: 0518
|
||||
location: westus
|
||||
location: westeurope
|
||||
environment: prod
|
||||
enable_aml_computecluster: true
|
||||
enable_aml_secure_workspace: false
|
||||
enable_monitoring: true
|
||||
|
||||
|
||||
# Azure DevOps
|
||||
ado_service_connection_rg: Azure-ARM-Dev
|
||||
ado_service_connection_aml_ws: Azure-ARM-Dev
|
||||
ado_service_connection_rg: Azure-ARM-Prod
|
||||
ado_service_connection_aml_ws: Azure-ARM-Prod
|
||||
|
||||
# IaC
|
||||
# DO NOT TOUCH
|
||||
|
||||
# For pipeline reference
|
||||
resource_group: rg-$(namespace)-$(postfix)$(environment)
|
||||
aml_workspace: mlw-$(namespace)-$(postfix)$(environment)
|
||||
application_insights: mlw-$(namespace)-$(postfix)$(environment)
|
||||
|
@ -25,10 +30,10 @@ variables:
|
|||
container_registry: cr$(namespace)$(postfix)$(environment)
|
||||
storage_account: st$(namespace)$(postfix)$(environment)
|
||||
|
||||
# Terraform
|
||||
# For terraform reference
|
||||
terraform_version: 0.14.7
|
||||
terraform_workingdir: infrastructure
|
||||
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf-state
|
||||
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tfstate
|
||||
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf
|
||||
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tf
|
||||
terraform_st_container_name: default
|
||||
terraform_st_key: mlops-tab
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
# check release notes https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
|
||||
FROM nvcr.io/nvidia/pytorch:22.04-py3
|
||||
|
||||
##############################################################################
|
||||
# NCCL TESTS
|
||||
##############################################################################
|
||||
ENV NCCL_TESTS_TAG=v2.11.0
|
||||
|
||||
# NOTE: adding gencodes to support K80, M60, V100, A100
|
||||
RUN mkdir /tmp/nccltests && \
|
||||
cd /tmp/nccltests && \
|
||||
git clone -b ${NCCL_TESTS_TAG} https://github.com/NVIDIA/nccl-tests.git && \
|
||||
cd nccl-tests && \
|
||||
make \
|
||||
MPI=1 MPI_HOME=/opt/hpcx/ompi \
|
||||
NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80" \
|
||||
CUDA_HOME=/usr/local/cuda && \
|
||||
cp ./build/* /usr/local/bin && \
|
||||
rm -rf /tmp/nccltests
|
||||
|
||||
# Install dependencies missing in this container
|
||||
# NOTE: container already has matplotlib==3.5.1 tqdm==4.62.0
|
||||
COPY requirements.txt ./
|
||||
RUN pip install -r requirements.txt
|
||||
|
||||
# RUN python -m pip install azureml-defaults==1.41.0 \
|
||||
# mlflow==1.25.1 \
|
||||
# azureml-mlflow==1.41.0 \
|
||||
# transformers==4.18.0 \
|
||||
# psutil==5.9.0
|
||||
|
||||
# add ndv4-topo.xml
|
||||
RUN mkdir /opt/microsoft/
|
||||
ADD ./ndv4-topo.xml /opt/microsoft
|
||||
|
||||
# to use on A100, enable env var below in your job
|
||||
# ENV NCCL_TOPO_FILE="/opt/microsoft/ndv4-topo.xml"
|
||||
|
||||
# adjusts the level of info from NCCL tests
|
||||
ENV NCCL_DEBUG="INFO"
|
||||
ENV NCCL_DEBUG_SUBSYS="GRAPH,INIT,ENV"
|
||||
|
||||
# Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments.
|
||||
ENV NCCL_IB_PCI_RELAXED_ORDERING="1"
|
||||
ENV CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||
ENV NCCL_SOCKET_IFNAME="eth0"
|
|
@ -0,0 +1,35 @@
|
|||
<!-- This topology file was copied from https://github.com/Azure/azhpc-images/blob/master/common/network-tuning.sh -->
|
||||
<system version="1">
|
||||
<cpu numaid="0" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
|
||||
<pci busid="ffff:ff:01.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0001:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0101:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0002:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0102:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="1" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
|
||||
<pci busid="ffff:ff:02.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0003:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0103:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0004:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0104:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="2" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
|
||||
<pci busid="ffff:ff:03.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="000b:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0105:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="000c:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0106:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="3" affinity="0000ffff,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="23" modelid="49">
|
||||
<pci busid="ffff:ff:04.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="000d:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0107:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="000e:00:00.0" class="0x030200" link_speed="16 GT/s" link_width="16"/>
|
||||
<pci busid="0108:00:00.0" class="0x020700" link_speed="16 GT/s" link_width="16"/>
|
||||
</pci>
|
||||
</cpu>
|
||||
</system>
|
|
@ -0,0 +1,14 @@
|
|||
# for local testing (cpu)
|
||||
torchvision==0.12.0
|
||||
torch==1.11.0
|
||||
transformers==4.18.0
|
||||
|
||||
# for metrics reporting/plotting
|
||||
mlflow==1.25.1
|
||||
azureml-mlflow==1.41.0
|
||||
matplotlib==3.5.2
|
||||
tqdm==4.64.0
|
||||
psutil==5.9.0
|
||||
|
||||
# for unit testing
|
||||
pytest==7.1.2
|
|
@ -0,0 +1,20 @@
|
|||
# NOTE: install these requirements to run the unit tests
|
||||
|
||||
# CV packages
|
||||
torchvision==0.12.0
|
||||
torch==1.11.0
|
||||
transformers==4.18.0
|
||||
|
||||
# for metrics reporting/plotting
|
||||
mlflow==1.25.1
|
||||
azureml-mlflow==1.41.0
|
||||
matplotlib==3.5.2
|
||||
tqdm==4.64.0
|
||||
psutil==5.9.0
|
||||
|
||||
# for unit testing
|
||||
pytest==7.1.2
|
||||
pytest-cov==2.12.1
|
||||
|
||||
# Fix: force protobuf downgrade to avoid exception
|
||||
protobuf==3.20.1
|
|
@ -0,0 +1,108 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
# Original Author: Jeff Omhover (MSFT)
|
||||
|
||||
"""
|
||||
This script contains methods to hangle inputs for pytorch model training
|
||||
using the COCO dataset https://cocodataset.org/.
|
||||
"""
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
|
||||
import torchvision
|
||||
|
||||
|
||||
def find_image_subfolder(current_root):
|
||||
"""Identifies the right level of a directory
|
||||
that matches with torchvision.datasets.ImageFolder requirements.
|
||||
In particular, if images are in current_root/foo/bar/category_X/*.jpg
|
||||
we will want to feed current_root/foo/bar/ to ImageFolder.
|
||||
|
||||
Args:
|
||||
current_root (str): a given directory
|
||||
|
||||
Returns:
|
||||
image_folder (str): the subfolder containing multiple subdirs
|
||||
"""
|
||||
if not os.path.isdir(current_root):
|
||||
raise FileNotFoundError(
|
||||
f"While identifying the image folder, provided current_root={current_root} is not a directory."
|
||||
)
|
||||
|
||||
sub_directories = glob.glob(os.path.join(current_root, "*"))
|
||||
if len(sub_directories) == 1:
|
||||
# let's do it recursively
|
||||
return find_image_subfolder(sub_directories[0])
|
||||
if len(sub_directories) == 0:
|
||||
raise FileNotFoundError(
|
||||
f"While identifying image folder under {current_root}, we found no content at all. The image folder is empty."
|
||||
)
|
||||
else:
|
||||
return current_root
|
||||
|
||||
|
||||
def build_image_datasets(
|
||||
train_images_dir: str,
|
||||
valid_images_dir: str,
|
||||
input_size: int = 224,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
train_images_dir (str): path to the directory containing training images
|
||||
valid_images_dir (str): path to the directory containing validation images
|
||||
input_size (int): input size expected by the model
|
||||
|
||||
Returns:
|
||||
train_dataset (torchvision.datasets.VisionDataset): training dataset
|
||||
valid_dataset (torchvision.datasets.VisionDataset): validation dataset
|
||||
labels (Dict[str, int]): labels
|
||||
"""
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# identify the right level of sub directory
|
||||
train_images_dir = find_image_subfolder(train_images_dir)
|
||||
|
||||
logger.info(f"Creating training dataset from {train_images_dir}")
|
||||
|
||||
train_transform = torchvision.transforms.Compose(
|
||||
[
|
||||
torchvision.transforms.RandomResizedCrop(input_size),
|
||||
torchvision.transforms.RandomHorizontalFlip(),
|
||||
torchvision.transforms.ToTensor(),
|
||||
torchvision.transforms.Normalize(
|
||||
mean=[0.485, 0.456, 0.405], std=[0.229, 0.224, 0.225]
|
||||
),
|
||||
]
|
||||
)
|
||||
train_dataset = torchvision.datasets.ImageFolder(
|
||||
root=train_images_dir, transform=train_transform
|
||||
)
|
||||
logger.info(
|
||||
f"ImageFolder loaded training image from {train_images_dir}: samples={len(train_dataset)}, #classes={len(train_dataset.classes)} classes={train_dataset.classes}"
|
||||
)
|
||||
|
||||
# identify the right level of sub directory
|
||||
valid_images_dir = find_image_subfolder(valid_images_dir)
|
||||
|
||||
logger.info(f"Creating validation dataset from {valid_images_dir}")
|
||||
|
||||
valid_transform = torchvision.transforms.Compose(
|
||||
[
|
||||
torchvision.transforms.Resize(input_size),
|
||||
torchvision.transforms.CenterCrop(input_size),
|
||||
torchvision.transforms.ToTensor(),
|
||||
torchvision.transforms.Normalize(
|
||||
mean=[0.485, 0.456, 0.405], std=[0.229, 0.224, 0.225]
|
||||
),
|
||||
]
|
||||
)
|
||||
valid_dataset = torchvision.datasets.ImageFolder(
|
||||
root=valid_images_dir, transform=valid_transform
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"ImageFolder loaded validation image from {valid_images_dir}: samples={len(valid_dataset)}, #classes={len(valid_dataset.classes)} classes={valid_dataset.classes}"
|
||||
)
|
||||
|
||||
return train_dataset, valid_dataset, train_dataset.classes
|
|
@ -0,0 +1 @@
|
|||
from .model_loader import MODEL_ARCH_LIST, get_model_metadata, load_model
|
|
@ -0,0 +1,88 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
# Original Author: Jeff Omhover (MSFT)
|
||||
|
||||
"""
|
||||
This script provides code to load and setup a variety of models from multiple libraries.
|
||||
"""
|
||||
|
||||
MODEL_ARCH_MAP = {
|
||||
# TorchVision models
|
||||
"resnet18": {"input_size": 224, "library": "torchvision"},
|
||||
"resnet34": {"input_size": 224, "library": "torchvision"},
|
||||
"resnet50": {"input_size": 224, "library": "torchvision"},
|
||||
"resnet101": {"input_size": 224, "library": "torchvision"},
|
||||
"resnet152": {"input_size": 224, "library": "torchvision"},
|
||||
"alexnet": {"input_size": 224, "library": "torchvision"},
|
||||
"vgg11": {"input_size": 224, "library": "torchvision"},
|
||||
"vgg11_bn": {"input_size": 224, "library": "torchvision"},
|
||||
"vgg13": {"input_size": 224, "library": "torchvision"},
|
||||
"vgg13_bn": {"input_size": 224, "library": "torchvision"},
|
||||
"vgg16": {"input_size": 224, "library": "torchvision"},
|
||||
"vgg16_bn": {"input_size": 224, "library": "torchvision"},
|
||||
"vgg19": {"input_size": 224, "library": "torchvision"},
|
||||
"vgg19_bn": {"input_size": 224, "library": "torchvision"},
|
||||
"densenet121": {"input_size": 224, "library": "torchvision"},
|
||||
"densenet169": {"input_size": 224, "library": "torchvision"},
|
||||
"densenet201": {"input_size": 224, "library": "torchvision"},
|
||||
"densenet161": {"input_size": 224, "library": "torchvision"},
|
||||
# Swin HuggingFace models
|
||||
"microsoft/swin-tiny-patch4-window7-224": {"input_size": 224, "library": "swin"},
|
||||
"microsoft/swin-small-patch4-window7-224": {"input_size": 224, "library": "swin"},
|
||||
"microsoft/swin-base-patch4-window7-224": {"input_size": 224, "library": "swin"},
|
||||
"microsoft/swin-base-patch4-window7-224-in22k": {
|
||||
"input_size": 224,
|
||||
"library": "swin",
|
||||
},
|
||||
"microsoft/swin-large-patch4-window7-224": {"input_size": 224, "library": "swin"},
|
||||
"microsoft/swin-large-patch4-window7-224-in22k": {
|
||||
"input_size": 224,
|
||||
"library": "swin",
|
||||
},
|
||||
"microsoft/swin-base-patch4-window12-384": {"input_size": 384, "library": "swin"},
|
||||
"microsoft/swin-base-patch4-window12-384-in22k": {
|
||||
"input_size": 384,
|
||||
"library": "swin",
|
||||
},
|
||||
"microsoft/swin-large-patch4-window12-384": {"input_size": 384, "library": "swin"},
|
||||
"microsoft/swin-large-patch4-window12-384-in22k": {
|
||||
"input_size": 384,
|
||||
"library": "swin",
|
||||
},
|
||||
# test model (super small)
|
||||
"test": {"input_size": 32, "library": "test"},
|
||||
}
|
||||
|
||||
MODEL_ARCH_LIST = list(MODEL_ARCH_MAP.keys())
|
||||
|
||||
|
||||
def get_model_metadata(model_arch: str):
|
||||
"""Returns the model metadata"""
|
||||
if model_arch in MODEL_ARCH_MAP:
|
||||
return MODEL_ARCH_MAP[model_arch]
|
||||
else:
|
||||
raise NotImplementedError(f"model_arch={model_arch} is not implemented yet.")
|
||||
|
||||
|
||||
def load_model(model_arch: str, output_dimension: int = 1, pretrained: bool = True):
|
||||
"""Loads a model from a given arch and sets it up for training"""
|
||||
if model_arch not in MODEL_ARCH_MAP:
|
||||
raise NotImplementedError(f"model_arch={model_arch} is not implemented yet.")
|
||||
|
||||
if MODEL_ARCH_MAP[model_arch]["library"] == "torchvision":
|
||||
from .torchvision_models import load_torchvision_model
|
||||
|
||||
return load_torchvision_model(model_arch, output_dimension, pretrained)
|
||||
if MODEL_ARCH_MAP[model_arch]["library"] == "swin":
|
||||
from .swin_models import load_swin_model
|
||||
|
||||
return load_swin_model(model_arch, output_dimension, pretrained)
|
||||
|
||||
if MODEL_ARCH_MAP[model_arch]["library"] == "test":
|
||||
from .test_model import load_test_model
|
||||
|
||||
return load_test_model(model_arch, output_dimension, pretrained)
|
||||
|
||||
raise NotImplementedError(
|
||||
f"library {MODEL_ARCH_MAP[model_arch]['library']} is not implemented yet."
|
||||
)
|
|
@ -0,0 +1,30 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
# Original Author: Jeff Omhover (MSFT)
|
||||
|
||||
"""
|
||||
This script provides code to load and setup a variety of models from torchvision.models.
|
||||
"""
|
||||
import logging
|
||||
|
||||
import torch
|
||||
from transformers import SwinConfig, SwinForImageClassification
|
||||
|
||||
|
||||
def load_swin_model(
|
||||
model_arch: str, output_dimension: int = 1, pretrained: bool = True
|
||||
):
|
||||
"""Loads a model from a given arch and sets it up for training"""
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
logger.info(
|
||||
f"Loading model from arch={model_arch} pretrained={pretrained} output_dimension={output_dimension}"
|
||||
)
|
||||
if pretrained:
|
||||
model = SwinForImageClassification.from_pretrained(model_arch)
|
||||
else:
|
||||
model = SwinForImageClassification(config=SwinConfig())
|
||||
|
||||
model.classifier = torch.nn.Linear(model.swin.num_features, output_dimension)
|
||||
|
||||
return model
|
|
@ -0,0 +1,40 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
# Original Author: Jeff Omhover (MSFT)
|
||||
|
||||
"""
|
||||
Creates a super simple 32x32 CNN model for testing.
|
||||
From the CIFAR10 tutorial https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
|
||||
"""
|
||||
import logging
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class Net(nn.Module):
|
||||
def __init__(self, output_dimension):
|
||||
super().__init__()
|
||||
self.conv1 = nn.Conv2d(3, 6, 5)
|
||||
self.pool = nn.MaxPool2d(2, 2)
|
||||
self.conv2 = nn.Conv2d(6, 16, 5)
|
||||
self.fc1 = nn.Linear(16 * 5 * 5, 120)
|
||||
self.fc2 = nn.Linear(120, 84)
|
||||
self.fc3 = nn.Linear(84, output_dimension)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.pool(F.relu(self.conv1(x)))
|
||||
x = self.pool(F.relu(self.conv2(x)))
|
||||
x = torch.flatten(x, 1) # flatten all dimensions except batch
|
||||
x = F.relu(self.fc1(x))
|
||||
x = F.relu(self.fc2(x))
|
||||
x = self.fc3(x)
|
||||
return x
|
||||
|
||||
|
||||
def load_test_model(
|
||||
model_arch: str, output_dimension: int = 1, pretrained: bool = True
|
||||
):
|
||||
"""Loads a model from a given arch and sets it up for training"""
|
||||
return Net(output_dimension)
|
|
@ -0,0 +1,44 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
# Original Author: Jeff Omhover (MSFT)
|
||||
|
||||
"""
|
||||
This script provides code to load and setup a variety of models from torchvision.models.
|
||||
"""
|
||||
import logging
|
||||
|
||||
import torch
|
||||
import torchvision.models as models
|
||||
|
||||
|
||||
def load_torchvision_model(
|
||||
model_arch: str, output_dimension: int = 1, pretrained: bool = True
|
||||
):
|
||||
"""Loads a model from a given arch and sets it up for training"""
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
logger.info(
|
||||
f"Loading model from arch={model_arch} pretrained={pretrained} output_dimension={output_dimension}"
|
||||
)
|
||||
if hasattr(models, model_arch):
|
||||
model = getattr(models, model_arch)(pretrained=pretrained)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"model_arch={model_arch} is not implemented in torchvision model zoo."
|
||||
)
|
||||
|
||||
# see https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html
|
||||
if model_arch.startswith("resnet"):
|
||||
model.fc = torch.nn.Linear(model.fc.in_features, output_dimension)
|
||||
elif model_arch == "alexnet":
|
||||
model.classifier[6] = torch.nn.Linear(4096, output_dimension)
|
||||
elif model_arch.startswith("vgg"):
|
||||
model.classifier[6] = torch.nn.Linear(4096, output_dimension)
|
||||
elif model_arch.startswith("densenet"):
|
||||
model.classifier = torch.nn.Linear(1024, output_dimension)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"loading model_arch={model_arch} is not implemented yet in our custom code."
|
||||
)
|
||||
|
||||
return model
|
|
@ -0,0 +1,390 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
# Original Author: Jeff Omhover (MSFT)
|
||||
|
||||
"""
|
||||
This script provides some helper code to help with pytorch profiling.
|
||||
"""
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import torch
|
||||
import mlflow
|
||||
import tempfile
|
||||
from torch.profiler import ProfilerActivity
|
||||
from typing import Any
|
||||
|
||||
|
||||
def markdown_trace_handler(dir_name: str, rank: int = 0):
|
||||
"""This handler can be used inside torch.profiler call to output
|
||||
tables in markdown format"""
|
||||
|
||||
def _handler_fn(prof) -> None:
|
||||
if not os.path.isdir(dir_name):
|
||||
try:
|
||||
os.makedirs(dir_name, exist_ok=True)
|
||||
except Exception:
|
||||
raise RuntimeError("Can't create directory: " + dir_name)
|
||||
|
||||
# Note: trying to identify a unique name for the file
|
||||
file_name = os.path.join(
|
||||
dir_name,
|
||||
f"stacks_rank{rank}_step{prof.step_num}_t{int(time.time() * 1000)}.ms",
|
||||
)
|
||||
|
||||
logging.getLogger(__name__).info(
|
||||
f"Exporting profiler trace as markdown at {file_name}"
|
||||
)
|
||||
# generate report in markdown format
|
||||
markdown = ["# Pytorch Profiler report"]
|
||||
|
||||
markdown.append("## Average by cuda time")
|
||||
markdown.append("```")
|
||||
markdown.append(
|
||||
prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1)
|
||||
)
|
||||
markdown.append("```")
|
||||
|
||||
with open(file_name, "w") as out_file:
|
||||
out_file.write("\n".join(markdown))
|
||||
|
||||
return _handler_fn
|
||||
|
||||
|
||||
def composite_trace_handler(handler_list):
|
||||
"""This can call multiple trace handlers inside one"""
|
||||
|
||||
def _handler_fn(prof) -> None:
|
||||
for handler in handler_list:
|
||||
handler(prof)
|
||||
|
||||
return _handler_fn
|
||||
|
||||
|
||||
def export_stack_trace_handler(
|
||||
dir_name: str, rank: int = 0, metrics=["self_cuda_time_total"]
|
||||
):
|
||||
"""This handler can be used inside torch.profiler call to output
|
||||
tables in markdown format"""
|
||||
|
||||
def _handler_fn(prof) -> None:
|
||||
if not os.path.isdir(dir_name):
|
||||
try:
|
||||
os.makedirs(dir_name, exist_ok=True)
|
||||
except Exception:
|
||||
raise RuntimeError("Can't create directory: " + dir_name)
|
||||
|
||||
# Note: trying to identify a unique name for the file
|
||||
for metric in metrics:
|
||||
file_name = os.path.join(
|
||||
dir_name,
|
||||
f"stacks_{metric}_rank{rank}_step{prof.step_num}_t{ int(time.time() * 1000)}.txt",
|
||||
)
|
||||
|
||||
logging.getLogger(__name__).info(
|
||||
f"Exporting {metric} stacks as text at {file_name}"
|
||||
)
|
||||
|
||||
prof.export_stacks(file_name, metric)
|
||||
|
||||
return _handler_fn
|
||||
|
||||
|
||||
class PyTorchProfilerHandler:
|
||||
"""This class handles the initialization and setup of PyTorch profiler"""
|
||||
|
||||
def __init__(self, enabled=False, rank=None):
|
||||
"""Constructor.
|
||||
|
||||
Args:
|
||||
enabled (bool): is profiling enabled?
|
||||
export_format (str): generate 'markdown' or 'tensorboard' profile in mlflow artifacts
|
||||
rank (int): rank of the current process/node
|
||||
"""
|
||||
self.logger = logging.getLogger(__name__)
|
||||
self.enabled = enabled
|
||||
self.rank = rank
|
||||
self.profiler_output_tmp_dir = None
|
||||
self.profiler = None
|
||||
|
||||
def start_profiler(self):
|
||||
"""Setup and start the pytorch profiler.
|
||||
|
||||
Returns:
|
||||
profiler (torch.profiler): the profiler
|
||||
"""
|
||||
if self.enabled:
|
||||
self.profiler_output_tmp_dir = tempfile.TemporaryDirectory()
|
||||
self.logger.info(
|
||||
f"Starting profiler (enabled=True) with tmp dir {self.profiler_output_tmp_dir.name}."
|
||||
)
|
||||
|
||||
## profiler activities CPU/GPU
|
||||
activities = [ProfilerActivity.CPU]
|
||||
if torch.cuda.is_available():
|
||||
self.logger.info(f"Enabling CUDA in profiler.")
|
||||
activities.append(ProfilerActivity.CUDA)
|
||||
|
||||
## handlers for exporting profile at each step
|
||||
# we're creating a list to export in multiple formats
|
||||
trace_handlers = []
|
||||
|
||||
# export in markdown
|
||||
markdown_logs_export = os.path.join(
|
||||
self.profiler_output_tmp_dir.name, "markdown"
|
||||
)
|
||||
trace_handlers.append(
|
||||
markdown_trace_handler(markdown_logs_export, rank=self.rank)
|
||||
)
|
||||
|
||||
# export stacks in txt
|
||||
stacks_logs_export = os.path.join(
|
||||
self.profiler_output_tmp_dir.name, "stacks"
|
||||
)
|
||||
stack_metrics = ["self_cpu_time_total"]
|
||||
if torch.cuda.is_available():
|
||||
stack_metrics.append("self_cuda_time_total")
|
||||
|
||||
trace_handlers.append(
|
||||
export_stack_trace_handler(
|
||||
stacks_logs_export, rank=self.rank, metrics=stack_metrics
|
||||
)
|
||||
)
|
||||
|
||||
# export tensorboard
|
||||
# NOTE: removed due to segfault in pytorch 1.11.0
|
||||
# will need to be uncommented for pytorch 1.11.1 which has a fix
|
||||
# tensorboard_logs_export = os.path.join(
|
||||
# self.profiler_output_tmp_dir.name, "tensorboard_logs"
|
||||
# )
|
||||
# trace_handlers.append(torch.profiler.tensorboard_trace_handler(
|
||||
# tensorboard_logs_export
|
||||
# ))
|
||||
|
||||
# profiler takes 1 handler, we're composing all above in a single handler
|
||||
trace_handler = composite_trace_handler(trace_handlers)
|
||||
|
||||
# process every single step
|
||||
profiler_schedule = torch.profiler.schedule(wait=0, warmup=0, active=1)
|
||||
|
||||
# initialize profiler
|
||||
self.profiler = torch.profiler.profile(
|
||||
schedule=profiler_schedule,
|
||||
record_shapes=True,
|
||||
with_flops=True,
|
||||
profile_memory=True,
|
||||
activities=activities,
|
||||
with_stack=True, # needed to export stacks
|
||||
on_trace_ready=trace_handler,
|
||||
)
|
||||
self.profiler.start()
|
||||
|
||||
else:
|
||||
self.logger.info(f"Profiler not started (enabled=False).")
|
||||
self.profiler = None
|
||||
|
||||
return self.profiler
|
||||
|
||||
def stop_profiler(self) -> None:
|
||||
"""Stops the pytorch profiler and logs the outputs using mlflow"""
|
||||
if self.profiler:
|
||||
self.logger.info(f"Stopping profiler.")
|
||||
self.profiler.stop()
|
||||
|
||||
# log via mlflow
|
||||
self.logger.info(
|
||||
f"MLFLOW log {self.profiler_output_tmp_dir.name} as an artifact."
|
||||
)
|
||||
mlflow.log_artifacts(
|
||||
self.profiler_output_tmp_dir.name, artifact_path="profiler"
|
||||
)
|
||||
|
||||
self.logger.info(
|
||||
f"Clean up profiler temp dir {self.profiler_output_tmp_dir.name}"
|
||||
)
|
||||
self.profiler_output_tmp_dir.cleanup()
|
||||
else:
|
||||
self.logger.info(
|
||||
"Not stopping profiler as it was not started in the first place."
|
||||
)
|
||||
|
||||
|
||||
class LogTimeBlock(object):
|
||||
"""This class should be used to time a code block.
|
||||
The time diff is computed from __enter__ to __exit__.
|
||||
Example
|
||||
-------
|
||||
```python
|
||||
with LogTimeBlock("my_perf_metric_name"):
|
||||
print("(((sleeping for 1 second)))")
|
||||
time.sleep(1)
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, name, **kwargs):
|
||||
"""
|
||||
Constructs the LogTimeBlock.
|
||||
Args:
|
||||
name (str): key for the time difference (for storing as metric)
|
||||
kwargs (dict): any keyword will be added as properties to metrics for logging (work in progress)
|
||||
"""
|
||||
# kwargs
|
||||
self.step = kwargs.get("step", None)
|
||||
self.enabled = kwargs.get("enabled", True)
|
||||
|
||||
# internal variables
|
||||
self.name = name
|
||||
self.start_time = None
|
||||
self._logger = logging.getLogger(__name__)
|
||||
|
||||
def __enter__(self):
|
||||
"""Starts the timer, gets triggered at beginning of code block"""
|
||||
if not self.enabled:
|
||||
return
|
||||
self.start_time = time.time() # starts "timer"
|
||||
|
||||
def __exit__(self, exc_type, value, traceback):
|
||||
"""Stops the timer and stores accordingly
|
||||
gets triggered at beginning of code block.
|
||||
|
||||
Note:
|
||||
arguments are by design for with statements.
|
||||
"""
|
||||
if not self.enabled:
|
||||
return
|
||||
run_time = time.time() - self.start_time # stops "timer"
|
||||
|
||||
self._logger.info(
|
||||
f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]"
|
||||
)
|
||||
mlflow.log_metric(self.name + ".time", run_time)
|
||||
|
||||
|
||||
class LogDiskIOBlock(object):
|
||||
def __init__(self, name, **kwargs):
|
||||
"""
|
||||
Constructs the LogDiskUsageBlock.
|
||||
Args:
|
||||
name (str): key for the time difference (for storing as metric)
|
||||
kwargs (dict): any keyword will be added as properties to metrics for logging (work in progress)
|
||||
"""
|
||||
# kwargs
|
||||
self.step = kwargs.get("step", None)
|
||||
self.enabled = kwargs.get("enabled", True)
|
||||
|
||||
# internal variables
|
||||
self.name = name
|
||||
self.process_id = os.getpid() # focus on current process
|
||||
self.start_time = None
|
||||
self.start_disk_counters = None
|
||||
self._logger = logging.getLogger(__name__)
|
||||
|
||||
def __enter__(self):
|
||||
"""Get initial values, gets triggered at beginning of code block"""
|
||||
if not self.enabled:
|
||||
return
|
||||
try:
|
||||
import psutil
|
||||
|
||||
self.start_time = time.time()
|
||||
self.start_disk_counters = psutil.Process(self.process_id).io_counters()
|
||||
|
||||
except ModuleNotFoundError:
|
||||
self.logger.critical("import psutil failed, cannot display disk stats.")
|
||||
|
||||
def __exit__(self, exc_type, value, traceback):
|
||||
"""Stops the timer and stores accordingly
|
||||
gets triggered at beginning of code block.
|
||||
|
||||
Note:
|
||||
arguments are by design for with statements.
|
||||
"""
|
||||
if not self.enabled:
|
||||
return
|
||||
try:
|
||||
import psutil
|
||||
except ModuleNotFoundError:
|
||||
self.logger.critical("import psutil failed, cannot display disk stats.")
|
||||
return
|
||||
|
||||
run_time = time.time() - self.start_time
|
||||
|
||||
disk_io_metrics = {}
|
||||
end_disk_counters = psutil.Process(self.process_id).io_counters()
|
||||
disk_io_metrics[f"{self.name}.disk.read"] = (
|
||||
end_disk_counters.read_bytes - self.start_disk_counters.read_bytes
|
||||
) / (1024 * 1024)
|
||||
disk_io_metrics[f"{self.name}.disk.write"] = (
|
||||
end_disk_counters.write_bytes - self.start_disk_counters.write_bytes
|
||||
) / (1024 * 1024)
|
||||
|
||||
self._logger.info(
|
||||
f"--- time elapsed: {self.name} = {run_time:2f} s [step={self.step}]"
|
||||
)
|
||||
self._logger.info(f"--- disk_io_metrics: {disk_io_metrics}s [step={self.step}]")
|
||||
|
||||
mlflow.log_metrics(disk_io_metrics)
|
||||
|
||||
|
||||
class LogTimeOfIterator: # lgtm [py/iter-returns-non-self]
|
||||
"""This class is intended to "wrap" an existing Iterator
|
||||
and log metrics for each next() call"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
wrapped_sequence: Any,
|
||||
name: str,
|
||||
enabled: bool = True,
|
||||
async_collector: dict = None,
|
||||
):
|
||||
self.wrapped_sequence = wrapped_sequence
|
||||
self.wrapped_iterator = None
|
||||
|
||||
# for metrics
|
||||
self.enabled = enabled
|
||||
self.name = name
|
||||
self.iterator_times = []
|
||||
self.metrics = {}
|
||||
self.async_collector = async_collector
|
||||
|
||||
self._logger = logging.getLogger(__name__)
|
||||
|
||||
def __iter__(self):
|
||||
"""Creates the iterator"""
|
||||
if self.enabled:
|
||||
start_time = time.time()
|
||||
# if enabled, creates iterator from wrapped_sequence
|
||||
self.wrapped_iterator = self.wrapped_sequence.__iter__()
|
||||
self.metrics[f"{self.name}.init"] = time.time() - start_time
|
||||
|
||||
# return self
|
||||
return self
|
||||
else:
|
||||
# if disabled, return the iterator from wrapped_sequence
|
||||
# so that LogTimeOfIterator.__next__() will never get called
|
||||
return self.wrapped_sequence.__iter__()
|
||||
|
||||
def __next__(self):
|
||||
"""Iterates"""
|
||||
try:
|
||||
start_time = time.time()
|
||||
next_val = self.wrapped_iterator.__next__()
|
||||
self.iterator_times.append(time.time() - start_time)
|
||||
return next_val
|
||||
except StopIteration as e:
|
||||
self.log_metrics()
|
||||
raise e
|
||||
|
||||
def log_metrics(self):
|
||||
"""Logs metrics once iterator is finished"""
|
||||
self.metrics[f"{self.name}.count"] = len(self.iterator_times)
|
||||
self.metrics[f"{self.name}.time.sum"] = sum(self.iterator_times)
|
||||
self.metrics[f"{self.name}.time.first"] = self.iterator_times[0]
|
||||
|
||||
if self.async_collector is not None:
|
||||
self._logger.info(f"Async MLFLOW: {self.metrics}")
|
||||
for k in self.metrics:
|
||||
self.async_collector[k] = self.metrics[k]
|
||||
else:
|
||||
self._logger.info(f"MLFLOW: {self.metrics}")
|
||||
mlflow.log_metrics(self.metrics)
|
|
@ -0,0 +1,960 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
# Original Author: Jeff Omhover (MSFT)
|
||||
|
||||
|
||||
"""
|
||||
This script implements a Distributed PyTorch training sequence.
|
||||
|
||||
IMPORTANT: We have tagged the code with the following expressions to walk you through
|
||||
the key implementation details.
|
||||
|
||||
Using your editor, search for those strings to get an idea of how to implement:
|
||||
- DISTRIBUTED : how to implement distributed pytorch
|
||||
- MLFLOW : how to implement mlflow reporting of metrics and artifacts
|
||||
- PROFILER : how to implement pytorch profiler
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from distutils.util import strtobool
|
||||
|
||||
import mlflow
|
||||
|
||||
# the long list of torch imports
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
from torch.optim import lr_scheduler
|
||||
from torch.profiler import record_function
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm
|
||||
from transformers.utils import ModelOutput
|
||||
|
||||
# add path to here, if necessary
|
||||
COMPONENT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "."))
|
||||
if COMPONENT_ROOT not in sys.path:
|
||||
logging.info(f"Adding {COMPONENT_ROOT} to path")
|
||||
sys.path.append(str(COMPONENT_ROOT))
|
||||
|
||||
from image_io import build_image_datasets
|
||||
|
||||
# internal imports
|
||||
from model import get_model_metadata, load_model
|
||||
from profiling import (
|
||||
LogDiskIOBlock,
|
||||
LogTimeBlock,
|
||||
LogTimeOfIterator,
|
||||
PyTorchProfilerHandler,
|
||||
)
|
||||
|
||||
torch.set_default_dtype(torch.float64)
|
||||
|
||||
|
||||
class PyTorchDistributedModelTrainingSequence:
|
||||
"""Generic class to run the sequence for training a PyTorch model
|
||||
using distributed training."""
|
||||
|
||||
def __init__(self):
|
||||
"""Constructor"""
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
# DATA
|
||||
self.training_data_sampler = None
|
||||
self.training_data_loader = None
|
||||
self.validation_data_loader = None
|
||||
|
||||
# MODEL
|
||||
self.model = None
|
||||
self.labels = []
|
||||
self.model_signature = None
|
||||
|
||||
# DISTRIBUTED CONFIG
|
||||
self.world_size = 1
|
||||
self.world_rank = 0
|
||||
self.local_world_size = 1
|
||||
self.local_rank = 0
|
||||
self.multinode_available = False
|
||||
self.cpu_count = os.cpu_count()
|
||||
self.device = None
|
||||
# NOTE: if we're running multiple nodes, this indicates if we're on first node
|
||||
self.self_is_main_node = True
|
||||
|
||||
# TRAINING CONFIGS
|
||||
self.dataloading_config = None
|
||||
self.training_config = None
|
||||
|
||||
# PROFILER
|
||||
self.profiler = None
|
||||
self.profiler_output_tmp_dir = None
|
||||
|
||||
#####################
|
||||
### SETUP METHODS ###
|
||||
#####################
|
||||
|
||||
def setup_config(self, args):
|
||||
"""Sets internal variables using provided CLI arguments (see build_arguments_parser()).
|
||||
In particular, sets device(cuda) and multinode parameters."""
|
||||
self.dataloading_config = args
|
||||
self.training_config = args
|
||||
|
||||
# verify parameter default values
|
||||
if self.dataloading_config.num_workers is None:
|
||||
self.dataloading_config.num_workers = 0
|
||||
if self.dataloading_config.num_workers < 0:
|
||||
self.dataloading_config.num_workers = self.cpu_count
|
||||
if self.dataloading_config.num_workers == 0:
|
||||
self.logger.warning(
|
||||
"You specified num_workers=0, forcing prefetch_factor to be discarded."
|
||||
)
|
||||
self.dataloading_config.prefetch_factor = None
|
||||
|
||||
# NOTE: strtobool returns an int, converting to bool explicitely
|
||||
self.dataloading_config.pin_memory = bool(self.dataloading_config.pin_memory)
|
||||
self.dataloading_config.non_blocking = bool(
|
||||
self.dataloading_config.non_blocking
|
||||
)
|
||||
|
||||
# add this switch to test for different strategies
|
||||
if self.dataloading_config.multiprocessing_sharing_strategy:
|
||||
torch.multiprocessing.set_sharing_strategy(
|
||||
self.dataloading_config.multiprocessing_sharing_strategy
|
||||
)
|
||||
|
||||
# DISTRIBUTED: detect multinode config
|
||||
# depending on the Azure ML distribution.type, different environment variables will be provided
|
||||
# to configure DistributedDataParallel
|
||||
self.distributed_backend = args.distributed_backend
|
||||
if self.distributed_backend == "nccl":
|
||||
self.world_size = int(os.environ.get("WORLD_SIZE", "1"))
|
||||
self.world_rank = int(os.environ.get("RANK", "0"))
|
||||
self.local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE", "1"))
|
||||
self.local_rank = int(os.environ.get("LOCAL_RANK", "0"))
|
||||
self.multinode_available = self.world_size > 1
|
||||
self.self_is_main_node = self.world_rank == 0
|
||||
|
||||
elif self.distributed_backend == "mpi":
|
||||
# Note: Distributed pytorch package doesn't have MPI built in.
|
||||
# MPI is only included if you build PyTorch from source on a host that has MPI installed.
|
||||
self.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", "1"))
|
||||
self.world_rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", "0"))
|
||||
self.local_world_size = int(
|
||||
os.environ.get("OMPI_COMM_WORLD_LOCAL_SIZE", "1")
|
||||
)
|
||||
self.local_rank = int(os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", "0"))
|
||||
self.multinode_available = self.world_size > 1
|
||||
self.self_is_main_node = self.world_rank == 0
|
||||
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"distributed_backend={self.distributed_backend} is not implemented yet."
|
||||
)
|
||||
|
||||
# Use CUDA if it is available
|
||||
if not self.training_config.disable_cuda and torch.cuda.is_available():
|
||||
self.logger.info(
|
||||
f"Setting up torch.device for CUDA for local gpu:{self.local_rank}"
|
||||
)
|
||||
self.device = torch.device(self.local_rank)
|
||||
else:
|
||||
self.logger.info(f"Setting up torch.device for cpu")
|
||||
self.device = torch.device("cpu")
|
||||
|
||||
if self.multinode_available:
|
||||
self.logger.info(
|
||||
f"Running in multinode with backend={self.distributed_backend} local_rank={self.local_rank} rank={self.world_rank} size={self.world_size}"
|
||||
)
|
||||
# DISTRIBUTED: this is required to initialize the pytorch backend
|
||||
torch.distributed.init_process_group(
|
||||
self.distributed_backend,
|
||||
rank=self.world_rank,
|
||||
world_size=self.world_size,
|
||||
)
|
||||
else:
|
||||
self.logger.info(
|
||||
f"Not running in multinode, so not initializing process group."
|
||||
)
|
||||
|
||||
# DISTRIBUTED: in distributed mode, you want to report parameters
|
||||
# only from main process (rank==0) to avoid conflict
|
||||
if self.self_is_main_node:
|
||||
# MLFLOW: report relevant parameters using mlflow
|
||||
logged_params = {
|
||||
# log some distribution params
|
||||
"nodes": int(os.environ.get("AZUREML_NODE_COUNT", "1")),
|
||||
"instance_per_node": self.world_size
|
||||
// int(os.environ.get("AZUREML_NODE_COUNT", "1")),
|
||||
"cuda_available": torch.cuda.is_available(),
|
||||
"disable_cuda": self.training_config.disable_cuda,
|
||||
"distributed": self.multinode_available,
|
||||
"distributed_backend": self.distributed_backend,
|
||||
# data loading params
|
||||
"batch_size": self.dataloading_config.batch_size,
|
||||
"num_workers": self.dataloading_config.num_workers,
|
||||
"cpu_count": self.cpu_count,
|
||||
"prefetch_factor": self.dataloading_config.prefetch_factor,
|
||||
"persistent_workers": self.dataloading_config.persistent_workers,
|
||||
"pin_memory": self.dataloading_config.pin_memory,
|
||||
"non_blocking": self.dataloading_config.non_blocking,
|
||||
"multiprocessing_sharing_strategy": self.dataloading_config.multiprocessing_sharing_strategy,
|
||||
# training params
|
||||
"model_arch": self.training_config.model_arch,
|
||||
"model_arch_pretrained": self.training_config.model_arch_pretrained,
|
||||
"optimizer.learning_rate": self.training_config.learning_rate,
|
||||
"optimizer.momentum": self.training_config.momentum,
|
||||
# profiling params
|
||||
"enable_profiling": self.training_config.enable_profiling,
|
||||
}
|
||||
|
||||
if not self.training_config.disable_cuda and torch.cuda.is_available():
|
||||
# add some gpu properties
|
||||
logged_params["cuda_device_count"] = torch.cuda.device_count()
|
||||
cuda_device_properties = torch.cuda.get_device_properties(self.device)
|
||||
logged_params["cuda_device_name"] = cuda_device_properties.name
|
||||
logged_params["cuda_device_major"] = cuda_device_properties.major
|
||||
logged_params["cuda_device_minor"] = cuda_device_properties.minor
|
||||
logged_params[
|
||||
"cuda_device_memory"
|
||||
] = cuda_device_properties.total_memory
|
||||
logged_params[
|
||||
"cuda_device_processor_count"
|
||||
] = cuda_device_properties.multi_processor_count
|
||||
|
||||
mlflow.log_params(logged_params)
|
||||
|
||||
def setup_datasets(
|
||||
self,
|
||||
training_dataset: torch.utils.data.Dataset,
|
||||
validation_dataset: torch.utils.data.Dataset,
|
||||
labels: list,
|
||||
):
|
||||
"""Creates and sets up dataloaders for training/validation datasets."""
|
||||
self.labels = labels
|
||||
|
||||
# DISTRIBUTED: you need to use a DistributedSampler that wraps your dataset
|
||||
# it will draw a different sample on each node/process to distribute data sampling
|
||||
self.training_data_sampler = DistributedSampler(
|
||||
training_dataset, num_replicas=self.world_size, rank=self.world_rank
|
||||
)
|
||||
|
||||
# setting up DataLoader with the right arguments
|
||||
optional_data_loading_kwargs = {}
|
||||
|
||||
if self.dataloading_config.num_workers > 0:
|
||||
# NOTE: this option _ONLY_ applies if num_workers > 0
|
||||
# or else DataLoader will except
|
||||
optional_data_loading_kwargs[
|
||||
"prefetch_factor"
|
||||
] = self.dataloading_config.prefetch_factor
|
||||
optional_data_loading_kwargs[
|
||||
"persistent_workers"
|
||||
] = self.dataloading_config.persistent_workers
|
||||
|
||||
self.training_data_loader = DataLoader(
|
||||
training_dataset,
|
||||
batch_size=self.dataloading_config.batch_size,
|
||||
num_workers=self.dataloading_config.num_workers, # self.cpu_count,
|
||||
pin_memory=self.dataloading_config.pin_memory,
|
||||
# DISTRIBUTED: the sampler needs to be provided to the DataLoader
|
||||
sampler=self.training_data_sampler,
|
||||
# all other args
|
||||
**optional_data_loading_kwargs,
|
||||
)
|
||||
|
||||
# DISTRIBUTED: we don't need a sampler for validation set
|
||||
# it is used as-is in every node/process
|
||||
self.validation_data_loader = DataLoader(
|
||||
validation_dataset,
|
||||
batch_size=self.dataloading_config.batch_size,
|
||||
num_workers=self.dataloading_config.num_workers, # self.cpu_count,
|
||||
pin_memory=self.dataloading_config.pin_memory,
|
||||
)
|
||||
|
||||
if self.self_is_main_node:
|
||||
# MLFLOW: report relevant parameters using mlflow
|
||||
mlflow.log_params({"num_classes": len(labels)})
|
||||
|
||||
def setup_model(self, model):
|
||||
"""Configures a model for training."""
|
||||
self.logger.info(f"Setting up model to use device {self.device}")
|
||||
self.model = model.to(self.device)
|
||||
|
||||
# DISTRIBUTED: the model needs to be wrapped in a DistributedDataParallel class
|
||||
if self.multinode_available:
|
||||
self.logger.info(f"Setting up model to use DistributedDataParallel.")
|
||||
self.model = torch.nn.parallel.DistributedDataParallel(self.model)
|
||||
|
||||
# fun: log the number of parameters
|
||||
params_count = 0
|
||||
for param in model.parameters():
|
||||
if param.requires_grad:
|
||||
params_count += param.numel()
|
||||
self.logger.info(
|
||||
"MLFLOW: model_param_count={:.2f} (millions)".format(
|
||||
round(params_count / 1e6, 2)
|
||||
)
|
||||
)
|
||||
if self.self_is_main_node:
|
||||
mlflow.log_params({"model_param_count": round(params_count / 1e6, 2)})
|
||||
|
||||
return self.model
|
||||
|
||||
########################
|
||||
### TRAINING METHODS ###
|
||||
########################
|
||||
|
||||
def _epoch_eval(self, epoch, criterion):
|
||||
"""Called during train() for running the eval phase of one epoch."""
|
||||
with torch.no_grad():
|
||||
num_correct = 0
|
||||
num_total_images = 0
|
||||
running_loss = 0.0
|
||||
|
||||
epoch_eval_metrics = {}
|
||||
|
||||
# PROFILER: here we're introducing a layer on top of data loader to capture its performance
|
||||
# in pratice, we'd just use for images, targets in tqdm(self.training_data_loader)
|
||||
for images, targets in LogTimeOfIterator(
|
||||
tqdm(self.validation_data_loader),
|
||||
"validation_data_loader",
|
||||
async_collector=epoch_eval_metrics,
|
||||
):
|
||||
with record_function("eval.to_device"):
|
||||
images = images.to(
|
||||
self.device, non_blocking=self.dataloading_config.non_blocking
|
||||
)
|
||||
targets = targets.to(
|
||||
self.device, non_blocking=self.dataloading_config.non_blocking
|
||||
)
|
||||
|
||||
with record_function("eval.forward"):
|
||||
outputs = self.model(images)
|
||||
|
||||
if isinstance(outputs, torch.Tensor):
|
||||
# if we're training a regular pytorch model (ex: torchvision)
|
||||
loss = criterion(outputs, targets)
|
||||
_, predicted = torch.max(outputs.data, 1)
|
||||
correct = predicted == targets
|
||||
elif isinstance(outputs, ModelOutput):
|
||||
# if we're training a HuggingFace model
|
||||
loss = criterion(outputs.logits, targets)
|
||||
_, predicted = torch.max(outputs.logits.data, 1)
|
||||
correct = predicted == targets
|
||||
else:
|
||||
# if anything else, just except
|
||||
raise ValueError(
|
||||
f"outputs from model is type {type(outputs)} which is unknown."
|
||||
)
|
||||
|
||||
running_loss += loss.item() * images.size(0)
|
||||
|
||||
num_correct += torch.sum(correct).item()
|
||||
num_total_images += len(images)
|
||||
|
||||
epoch_eval_metrics["running_loss"] = running_loss
|
||||
epoch_eval_metrics["num_correct"] = num_correct
|
||||
epoch_eval_metrics["num_samples"] = num_total_images
|
||||
|
||||
return epoch_eval_metrics
|
||||
|
||||
def _epoch_train(self, epoch, optimizer, scheduler, criterion):
|
||||
"""Called during train() for running the train phase of one epoch."""
|
||||
self.model.train()
|
||||
self.training_data_sampler.set_epoch(epoch)
|
||||
|
||||
num_correct = 0
|
||||
num_total_images = 0
|
||||
running_loss = 0.0
|
||||
|
||||
epoch_train_metrics = {}
|
||||
|
||||
# PROFILER: here we're introducing a layer on top of data loader to capture its performance
|
||||
# in pratice, we'd just use for images, targets in tqdm(self.training_data_loader)
|
||||
for images, targets in LogTimeOfIterator(
|
||||
tqdm(self.training_data_loader),
|
||||
"training_data_loader",
|
||||
async_collector=epoch_train_metrics,
|
||||
):
|
||||
# PROFILER: record_function will report to the profiler (if enabled)
|
||||
# here a specific wall time for a given block of code
|
||||
with record_function("train.to_device"):
|
||||
images = images.to(
|
||||
self.device, non_blocking=self.dataloading_config.non_blocking
|
||||
)
|
||||
targets = targets.to(
|
||||
self.device, non_blocking=self.dataloading_config.non_blocking
|
||||
)
|
||||
|
||||
with record_function("train.forward"):
|
||||
# zero the parameter gradients
|
||||
optimizer.zero_grad()
|
||||
|
||||
outputs = self.model(images)
|
||||
|
||||
# if self.model_signature is None:
|
||||
# self.model_signature = infer_signature(images, outputs)
|
||||
|
||||
if isinstance(outputs, torch.Tensor):
|
||||
# if we're training a regular pytorch model (ex: torchvision)
|
||||
loss = criterion(outputs, targets)
|
||||
_, predicted = torch.max(outputs.data, 1)
|
||||
correct = predicted == targets
|
||||
elif isinstance(outputs, ModelOutput):
|
||||
# if we're training a HuggingFace model
|
||||
loss = criterion(outputs.logits, targets)
|
||||
_, predicted = torch.max(outputs.logits.data, 1)
|
||||
correct = predicted == targets
|
||||
else:
|
||||
# if anything else, just except
|
||||
raise ValueError(
|
||||
f"outputs from model is type {type(outputs)} which is unknown."
|
||||
)
|
||||
|
||||
running_loss += loss.item() * images.size(0)
|
||||
num_correct += torch.sum(correct).item()
|
||||
num_total_images += len(images)
|
||||
|
||||
# PROFILER: record_function will report to the profiler (if enabled)
|
||||
# here a specific wall time for a given block of code
|
||||
with record_function("train.backward"):
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
scheduler.step()
|
||||
|
||||
epoch_train_metrics["running_loss"] = running_loss
|
||||
epoch_train_metrics["num_correct"] = num_correct
|
||||
epoch_train_metrics["num_samples"] = num_total_images
|
||||
|
||||
return epoch_train_metrics
|
||||
|
||||
def train(self, epochs: int = None, checkpoints_dir: str = None):
|
||||
"""Trains the model.
|
||||
|
||||
Args:
|
||||
epochs (int, optional): if not provided uses internal config
|
||||
checkpoints_dir (str, optional): path to write checkpoints
|
||||
"""
|
||||
if epochs is None:
|
||||
epochs = self.training_config.num_epochs
|
||||
|
||||
# Observe that all parameters are being optimized
|
||||
optimizer = optim.SGD(
|
||||
self.model.parameters(),
|
||||
lr=self.training_config.learning_rate,
|
||||
momentum=self.training_config.momentum,
|
||||
nesterov=True,
|
||||
# weight_decay=1e-4,
|
||||
)
|
||||
|
||||
# criterion = nn.BCEWithLogitsLoss()
|
||||
criterion = nn.CrossEntropyLoss()
|
||||
|
||||
# Decay LR by a factor of 0.1 every 7 epochs
|
||||
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
|
||||
|
||||
# DISTRIBUTED: export checkpoint only from main node
|
||||
if self.self_is_main_node and checkpoints_dir is not None:
|
||||
# saving checkpoint before training
|
||||
self.checkpoint_save(
|
||||
self.model, optimizer, checkpoints_dir, epoch=-1, loss=0.0
|
||||
)
|
||||
|
||||
# DISTRIBUTED: you'll node that this loop has nothing specifically "distributed"
|
||||
# that's because most of the changes are in the backend (DistributedDataParallel)
|
||||
for epoch in range(epochs):
|
||||
self.logger.info(f"Starting epoch={epoch}")
|
||||
|
||||
# we'll collect metrics we want to report for this epoch
|
||||
epoch_metrics = {}
|
||||
|
||||
# start timer for epoch time metric
|
||||
epoch_train_start = time.time()
|
||||
|
||||
# TRAIN: loop on training set and return metrics
|
||||
epoch_train_metrics = self._epoch_train(
|
||||
epoch, optimizer, scheduler, criterion
|
||||
)
|
||||
self.logger.info(f"Epoch metrics: {epoch_train_metrics}")
|
||||
|
||||
# stop timer
|
||||
epoch_metrics["epoch_train_time"] = time.time() - epoch_train_start
|
||||
|
||||
# record metrics of interest
|
||||
epoch_metrics["training_data_loader.count"] = epoch_train_metrics[
|
||||
"training_data_loader.count"
|
||||
]
|
||||
epoch_metrics["training_data_loader.time.sum"] = epoch_train_metrics[
|
||||
"training_data_loader.time.sum"
|
||||
]
|
||||
epoch_metrics["training_data_loader.time.first"] = epoch_train_metrics[
|
||||
"training_data_loader.time.first"
|
||||
]
|
||||
epoch_metrics["epoch_train_loss"] = (
|
||||
epoch_train_metrics["running_loss"] / epoch_train_metrics["num_samples"]
|
||||
)
|
||||
epoch_metrics["epoch_train_acc"] = (
|
||||
epoch_train_metrics["num_correct"] / epoch_train_metrics["num_samples"]
|
||||
)
|
||||
|
||||
# start timer for epoch time metric
|
||||
epoch_eval_start = time.time()
|
||||
|
||||
# EVAL: run evaluation on validation set and return metrics
|
||||
epoch_eval_metrics = self._epoch_eval(epoch, criterion)
|
||||
self.logger.info(f"Epoch metrics: {epoch_train_metrics}")
|
||||
|
||||
# stop timer
|
||||
epoch_metrics["epoch_eval_time"] = time.time() - epoch_eval_start
|
||||
|
||||
# record metrics of interest
|
||||
epoch_metrics["validation_data_loader.count"] = epoch_eval_metrics[
|
||||
"validation_data_loader.count"
|
||||
]
|
||||
epoch_metrics["validation_data_loader.time.sum"] = epoch_eval_metrics[
|
||||
"validation_data_loader.time.sum"
|
||||
]
|
||||
epoch_metrics["validation_data_loader.time.first"] = epoch_eval_metrics[
|
||||
"validation_data_loader.time.first"
|
||||
]
|
||||
epoch_metrics["epoch_valid_loss"] = (
|
||||
epoch_eval_metrics["running_loss"] / epoch_eval_metrics["num_samples"]
|
||||
)
|
||||
epoch_metrics["epoch_valid_acc"] = (
|
||||
epoch_eval_metrics["num_correct"] / epoch_eval_metrics["num_samples"]
|
||||
)
|
||||
|
||||
# start timer for epoch time metric
|
||||
epoch_utility_start = time.time()
|
||||
|
||||
# PROFILER: use profiler.step() to mark a step in training
|
||||
# the pytorch profiler will use internally to trigger
|
||||
# saving the traces in different files
|
||||
if self.profiler:
|
||||
self.profiler.step()
|
||||
|
||||
# DISTRIBUTED: export checkpoint only from main node
|
||||
if self.self_is_main_node and checkpoints_dir is not None:
|
||||
self.checkpoint_save(
|
||||
self.model,
|
||||
optimizer,
|
||||
checkpoints_dir,
|
||||
epoch=epoch,
|
||||
loss=epoch_metrics["epoch_valid_loss"],
|
||||
)
|
||||
|
||||
# report metric values in stdout
|
||||
self.logger.info(f"MLFLOW: metrics={epoch_metrics} epoch={epoch}")
|
||||
|
||||
# MLFLOW / DISTRIBUTED: report metrics only from main node
|
||||
if self.self_is_main_node:
|
||||
mlflow.log_metrics(epoch_metrics)
|
||||
mlflow.log_metric(
|
||||
"epoch_utility_time", time.time() - epoch_utility_start, step=epoch
|
||||
)
|
||||
|
||||
def runtime_error_report(self, runtime_exception):
|
||||
"""Call this when catching a critical exception.
|
||||
Will print all sorts of relevant information to the log."""
|
||||
self.logger.critical(traceback.format_exc())
|
||||
try:
|
||||
import psutil
|
||||
|
||||
self.logger.critical(f"Memory: {str(psutil.virtual_memory())}")
|
||||
except ModuleNotFoundError:
|
||||
self.logger.critical(
|
||||
"import psutil failed, cannot display virtual memory stats."
|
||||
)
|
||||
|
||||
if torch.cuda.is_available():
|
||||
self.logger.critical(
|
||||
"Cuda memory summary:\n"
|
||||
+ str(torch.cuda.memory_summary(device=None, abbreviated=False))
|
||||
)
|
||||
self.logger.critical(
|
||||
"Cuda memory snapshot:\n"
|
||||
+ json.dumps(torch.cuda.memory_snapshot(), indent=" ")
|
||||
)
|
||||
else:
|
||||
self.logger.critical(
|
||||
"Cuda is not available, cannot report cuda memory allocation."
|
||||
)
|
||||
|
||||
def close(self):
|
||||
"""Tear down potential resources"""
|
||||
if self.multinode_available:
|
||||
self.logger.info(
|
||||
f"Destroying process group on local_rank={self.local_rank} rank={self.world_rank} size={self.world_size}"
|
||||
)
|
||||
# DISTRIBUTED: this will teardown the distributed process group
|
||||
torch.distributed.destroy_process_group()
|
||||
else:
|
||||
self.logger.info(
|
||||
f"Not running in multinode, so not destroying process group."
|
||||
)
|
||||
|
||||
#################
|
||||
### MODEL I/O ###
|
||||
#################
|
||||
|
||||
def checkpoint_save(
|
||||
self, model, optimizer, output_dir: str, epoch: int, loss: float
|
||||
):
|
||||
"""Saves model as checkpoint"""
|
||||
# create output directory just in case
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
model_output_path = os.path.join(
|
||||
output_dir, f"model-checkpoint-epoch{epoch}-loss{loss}.pt"
|
||||
)
|
||||
|
||||
self.logger.info(f"Exporting checkpoint to {model_output_path}")
|
||||
|
||||
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
|
||||
# DISTRIBUTED: to export model, you need to get it out of the DistributedDataParallel class
|
||||
self.logger.info(
|
||||
"Model was distributed, we will checkpoint DistributedDataParallel.module"
|
||||
)
|
||||
model_to_save = model.module
|
||||
else:
|
||||
model_to_save = model
|
||||
|
||||
with record_function("checkpoint.save"):
|
||||
torch.save(
|
||||
{
|
||||
"epoch": epoch,
|
||||
"model_state_dict": model_to_save.state_dict(),
|
||||
"optimizer_state_dict": optimizer.state_dict(),
|
||||
"loss": loss,
|
||||
},
|
||||
model_output_path,
|
||||
)
|
||||
|
||||
def save(
|
||||
self,
|
||||
output_dir: str,
|
||||
name: str = "dev",
|
||||
register_as: str = None,
|
||||
) -> None:
|
||||
# DISTRIBUTED: you want to save the model only from the main node/process
|
||||
# in data distributed mode, all models should theoretically be the same
|
||||
if self.self_is_main_node:
|
||||
self.logger.info(f"Saving model and classes in {output_dir}...")
|
||||
|
||||
# create output directory just in case
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
if isinstance(self.model, torch.nn.parallel.DistributedDataParallel):
|
||||
# DISTRIBUTED: to export model, you need to get it out of the DistributedDataParallel class
|
||||
self.logger.info(
|
||||
"Model was distributed, we will export DistributedDataParallel.module"
|
||||
)
|
||||
model_to_save = self.model.module.to("cpu")
|
||||
else:
|
||||
model_to_save = self.model.to("cpu")
|
||||
|
||||
# Save the labels to a csv file.
|
||||
# This file will be required to map the output array
|
||||
# from the API to the labels.
|
||||
with open("label-mapping.txt", "w") as f:
|
||||
f.write("\n".join(self.labels))
|
||||
mlflow.log_artifact("label-mapping.txt")
|
||||
|
||||
# MLFLOW: mlflow has a nice method to export the model automatically
|
||||
# add tags and environment for it. You can then use it in Azure ML
|
||||
# to register your model to an endpoint.
|
||||
mlflow.pytorch.log_model(
|
||||
model_to_save,
|
||||
artifact_path="final_model",
|
||||
registered_model_name=register_as, # also register it if name is provided
|
||||
signature=self.model_signature,
|
||||
)
|
||||
|
||||
# MLFLOW: Register the model with the model registry
|
||||
# This is useful for Azure ML to register your model
|
||||
# to an endpoint.
|
||||
if register_as is not None:
|
||||
mlflow.register_model(
|
||||
model_uri=f"runs:/{mlflow.active_run().info.run_id}/final_model",
|
||||
name=register_as,
|
||||
)
|
||||
|
||||
|
||||
def build_arguments_parser(parser: argparse.ArgumentParser = None):
|
||||
"""Builds the argument parser for CLI settings"""
|
||||
if parser is None:
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
group = parser.add_argument_group(f"Training Inputs")
|
||||
group.add_argument(
|
||||
"--train_images",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to folder containing training images",
|
||||
)
|
||||
group.add_argument(
|
||||
"--valid_images",
|
||||
type=str,
|
||||
required=True,
|
||||
help="path to folder containing validation images",
|
||||
)
|
||||
|
||||
group = parser.add_argument_group(f"Training Outputs")
|
||||
group.add_argument(
|
||||
"--model_output",
|
||||
type=str,
|
||||
required=False,
|
||||
default=None,
|
||||
help="Path to write final model",
|
||||
)
|
||||
group.add_argument(
|
||||
"--checkpoints",
|
||||
type=str,
|
||||
required=False,
|
||||
default=None,
|
||||
help="Path to read/write checkpoints",
|
||||
)
|
||||
group.add_argument(
|
||||
"--register_model_as",
|
||||
type=str,
|
||||
required=False,
|
||||
default=None,
|
||||
help="Name to register final model in MLFlow",
|
||||
)
|
||||
|
||||
group = parser.add_argument_group(f"Data Loading Parameters")
|
||||
group.add_argument(
|
||||
"--batch_size",
|
||||
type=int,
|
||||
required=False,
|
||||
default=64,
|
||||
help="Train/valid data loading batch size (default: 64)",
|
||||
)
|
||||
group.add_argument(
|
||||
"--num_workers",
|
||||
type=int,
|
||||
required=False,
|
||||
default=None,
|
||||
help="Num workers for data loader (default: -1 => all cpus available)",
|
||||
)
|
||||
group.add_argument(
|
||||
"--prefetch_factor",
|
||||
type=int,
|
||||
required=False,
|
||||
default=2,
|
||||
help="Data loader prefetch factor (default: 2)",
|
||||
)
|
||||
group.add_argument(
|
||||
"--persistent_workers",
|
||||
type=strtobool,
|
||||
required=False,
|
||||
default=True,
|
||||
help="Use persistent prefetching workers (default: True)",
|
||||
)
|
||||
group.add_argument(
|
||||
"--pin_memory",
|
||||
type=strtobool,
|
||||
required=False,
|
||||
default=True,
|
||||
help="Pin Data loader prefetch factor (default: True)",
|
||||
)
|
||||
group.add_argument(
|
||||
"--non_blocking",
|
||||
type=strtobool,
|
||||
required=False,
|
||||
default=False,
|
||||
help="Use non-blocking transfer to device (default: False)",
|
||||
)
|
||||
|
||||
group = parser.add_argument_group(f"Model/Training Parameters")
|
||||
group.add_argument(
|
||||
"--model_arch",
|
||||
type=str,
|
||||
required=False,
|
||||
default="resnet18",
|
||||
help="Which model architecture to use (default: resnet18)",
|
||||
)
|
||||
group.add_argument(
|
||||
"--model_arch_pretrained",
|
||||
type=strtobool,
|
||||
required=False,
|
||||
default=True,
|
||||
help="Use pretrained model (default: true)",
|
||||
)
|
||||
group.add_argument(
|
||||
"--distributed_backend",
|
||||
type=str,
|
||||
required=False,
|
||||
choices=["nccl", "mpi"],
|
||||
default="nccl",
|
||||
help="Which distributed backend to use.",
|
||||
)
|
||||
group.add_argument(
|
||||
"--disable_cuda",
|
||||
type=strtobool,
|
||||
required=False,
|
||||
default=False,
|
||||
help="set True to force use of cpu (local testing).",
|
||||
)
|
||||
# DISTRIBUTED: torch.distributed.launch is passing this argument to your script
|
||||
# it is likely to be deprecated in favor of os.environ['LOCAL_RANK']
|
||||
# see https://pytorch.org/docs/stable/distributed.html#launch-utility
|
||||
group.add_argument(
|
||||
"--local_rank",
|
||||
type=int,
|
||||
required=False,
|
||||
default=None,
|
||||
help="Passed by torch.distributed.launch utility when running from cli.",
|
||||
)
|
||||
group.add_argument(
|
||||
"--num_epochs",
|
||||
type=int,
|
||||
required=False,
|
||||
default=1,
|
||||
help="Number of epochs to train for",
|
||||
)
|
||||
group.add_argument(
|
||||
"--learning_rate",
|
||||
type=float,
|
||||
required=False,
|
||||
default=0.001,
|
||||
help="Learning rate of optimizer",
|
||||
)
|
||||
group.add_argument(
|
||||
"--momentum",
|
||||
type=float,
|
||||
required=False,
|
||||
default=0.9,
|
||||
help="Momentum of optimizer",
|
||||
)
|
||||
|
||||
group = parser.add_argument_group(f"System Parameters")
|
||||
group.add_argument(
|
||||
"--enable_profiling",
|
||||
type=strtobool,
|
||||
required=False,
|
||||
default=False,
|
||||
help="Enable pytorch profiler.",
|
||||
)
|
||||
group.add_argument(
|
||||
"--multiprocessing_sharing_strategy",
|
||||
type=str,
|
||||
choices=torch.multiprocessing.get_all_sharing_strategies(),
|
||||
required=False,
|
||||
default=None,
|
||||
help="Check https://pytorch.org/docs/stable/multiprocessing.html",
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def run(args):
|
||||
"""Run the script using CLI arguments"""
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info(f"Running with arguments: {args}")
|
||||
|
||||
# MLFLOW: initialize mlflow (once in entire script)
|
||||
mlflow.start_run()
|
||||
|
||||
# use a handler for the training sequence
|
||||
training_handler = PyTorchDistributedModelTrainingSequence()
|
||||
|
||||
# sets cuda and distributed config
|
||||
training_handler.setup_config(args)
|
||||
|
||||
# PROFILER: here we use a helper class to enable profiling
|
||||
# see profiling.py for the implementation details
|
||||
training_profiler = PyTorchProfilerHandler(
|
||||
enabled=bool(args.enable_profiling),
|
||||
rank=training_handler.world_rank,
|
||||
)
|
||||
# PROFILER: set profiler in trainer to call profiler.step() during training
|
||||
training_handler.profiler = training_profiler.start_profiler()
|
||||
|
||||
# report the time and disk usage during this code block
|
||||
with LogTimeBlock(
|
||||
"build_image_datasets", enabled=training_handler.self_is_main_node
|
||||
), LogDiskIOBlock(
|
||||
"build_image_datasets", enabled=training_handler.self_is_main_node
|
||||
):
|
||||
# build the image folder datasets
|
||||
train_dataset, valid_dataset, labels = build_image_datasets(
|
||||
train_images_dir=args.train_images,
|
||||
valid_images_dir=args.valid_images,
|
||||
input_size=get_model_metadata(args.model_arch)["input_size"],
|
||||
)
|
||||
|
||||
# creates data loaders from datasets for distributed training
|
||||
training_handler.setup_datasets(train_dataset, valid_dataset, labels)
|
||||
|
||||
with LogTimeBlock("load_model", enabled=training_handler.self_is_main_node):
|
||||
# creates the model architecture
|
||||
model = load_model(
|
||||
args.model_arch,
|
||||
output_dimension=len(labels),
|
||||
pretrained=args.model_arch_pretrained,
|
||||
)
|
||||
|
||||
# logging of labels
|
||||
logger.info(labels)
|
||||
# sets the model for distributed training
|
||||
training_handler.setup_model(model)
|
||||
|
||||
# runs training sequence
|
||||
# NOTE: num_epochs is provided in args
|
||||
try:
|
||||
training_handler.train(checkpoints_dir=args.checkpoints)
|
||||
except RuntimeError as runtime_exception: # if runtime error occurs (ex: cuda out of memory)
|
||||
# then print some runtime error report in the logs
|
||||
training_handler.runtime_error_report(runtime_exception)
|
||||
# re-raise
|
||||
raise runtime_exception
|
||||
|
||||
# stops profiling (and save in mlflow)
|
||||
training_profiler.stop_profiler()
|
||||
|
||||
# saves final model
|
||||
if args.model_output:
|
||||
training_handler.save(
|
||||
args.model_output,
|
||||
name=f"epoch-{args.num_epochs}",
|
||||
register_as=args.register_model_as,
|
||||
)
|
||||
|
||||
# properly teardown distributed resources
|
||||
training_handler.close()
|
||||
|
||||
# MLFLOW: finalize mlflow (once in entire script)
|
||||
mlflow.end_run()
|
||||
|
||||
logger.info("run() completed")
|
||||
|
||||
|
||||
def main(cli_args=None):
|
||||
"""Main function of the script."""
|
||||
# initialize root logger
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(logging.INFO)
|
||||
console_handler = logging.StreamHandler()
|
||||
formatter = logging.Formatter(
|
||||
"%(asctime)s : %(levelname)s : %(name)s : %(message)s"
|
||||
)
|
||||
console_handler.setFormatter(formatter)
|
||||
logger.addHandler(console_handler)
|
||||
|
||||
# create argument parser
|
||||
parser = build_arguments_parser()
|
||||
|
||||
# runs on cli arguments
|
||||
args = parser.parse_args(cli_args) # if None, runs on sys.argv
|
||||
|
||||
# run the run function
|
||||
run(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -0,0 +1,20 @@
|
|||
import os
|
||||
import sys
|
||||
import logging
|
||||
import pytest
|
||||
import tempfile
|
||||
from unittest.mock import Mock
|
||||
|
||||
SRC_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src"))
|
||||
|
||||
if SRC_ROOT not in sys.path:
|
||||
logging.info(f"Adding {SRC_ROOT} to path")
|
||||
sys.path.append(str(SRC_ROOT))
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def temporary_dir():
|
||||
"""Creates a temporary directory for the tests"""
|
||||
temp_directory = tempfile.TemporaryDirectory()
|
||||
yield temp_directory.name
|
||||
temp_directory.cleanup()
|
|
@ -0,0 +1,40 @@
|
|||
"""
|
||||
Tests running the model loader for every possible model in the list
|
||||
"""
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
# local imports
|
||||
from model import (
|
||||
MODEL_ARCH_LIST,
|
||||
get_model_metadata,
|
||||
load_model,
|
||||
)
|
||||
|
||||
# IMPORTANT: see conftest.py for fixtures
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_arch", MODEL_ARCH_LIST)
|
||||
def test_model_loader(model_arch):
|
||||
"""Tests src/components/pytorch_image_classifier/model/"""
|
||||
model_metadata = get_model_metadata(model_arch)
|
||||
|
||||
assert model_metadata is not None
|
||||
assert isinstance(model_metadata, dict)
|
||||
assert "library" in model_metadata
|
||||
assert "input_size" in model_metadata
|
||||
|
||||
# using pretrained=False to avoid downloading each time we unit test
|
||||
model = load_model(model_arch, output_dimension=4, pretrained=False)
|
||||
|
||||
assert model is not None
|
||||
assert isinstance(model, torch.nn.Module)
|
||||
|
||||
|
||||
def test_model_loader_failure():
|
||||
"""Test asking for a model that deosn't exist"""
|
||||
with pytest.raises(NotImplementedError):
|
||||
get_model_metadata("not_a_model")
|
||||
|
||||
with pytest.raises(NotImplementedError):
|
||||
load_model("not_a_model", output_dimension=4, pretrained=False)
|
|
@ -0,0 +1,219 @@
|
|||
"""
|
||||
Tests running the train.py script end-to-end
|
||||
on a randomly generated (small) dataset.
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import pytest
|
||||
from unittest.mock import patch
|
||||
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
# local imports
|
||||
import train
|
||||
from model import MODEL_ARCH_LIST
|
||||
|
||||
# IMPORTANT: see conftest.py for fixtures (ex: temporary_dir)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def random_image_in_folder_classes(temporary_dir):
|
||||
image_dataset_path = os.path.join(temporary_dir, "image_in_folders")
|
||||
os.makedirs(image_dataset_path, exist_ok=False)
|
||||
|
||||
n_samples = 100
|
||||
n_classes = 4
|
||||
|
||||
for i in range(n_samples):
|
||||
a = np.random.rand(300, 300, 3) * 255
|
||||
im_out = Image.fromarray(a.astype("uint8")).convert("RGB")
|
||||
|
||||
class_dir = "class_{}".format(i % n_classes)
|
||||
|
||||
image_path = os.path.join(
|
||||
image_dataset_path, class_dir, "random_image_{}.jpg".format(i)
|
||||
)
|
||||
os.makedirs(os.path.join(image_dataset_path, class_dir), exist_ok=True)
|
||||
im_out.save(image_path)
|
||||
|
||||
return image_dataset_path
|
||||
|
||||
|
||||
# IMPORTANT: we have to restrict the list of models for unit test
|
||||
# because github actions runners have 7GB RAM only and will OOM
|
||||
TEST_MODEL_ARCH_LIST = [
|
||||
"test",
|
||||
"resnet18",
|
||||
"resnet34",
|
||||
]
|
||||
|
||||
# NOTE: we only care about patching those specific mlflow methods
|
||||
# to mlflow initialization conflict between tests
|
||||
@patch("mlflow.end_run") # we can have only 1 start/end per test session
|
||||
@patch("mlflow.register_model") # patched to test model name registration
|
||||
@patch("mlflow.pytorch.log_model") # patched to test model name registration
|
||||
@patch("mlflow.log_params") # patched to avoid conflict in parameters
|
||||
@patch("mlflow.start_run") # we can have only 1 start/end per test session
|
||||
@pytest.mark.parametrize("model_arch", TEST_MODEL_ARCH_LIST)
|
||||
def test_components_pytorch_image_classifier_single_node(
|
||||
mlflow_start_run_mock,
|
||||
mlflow_log_params_mock,
|
||||
mlflow_pytorch_log_model_mock,
|
||||
mlflow_register_model_mock,
|
||||
mlflow_end_run_mock,
|
||||
model_arch,
|
||||
temporary_dir,
|
||||
random_image_in_folder_classes,
|
||||
):
|
||||
"""Tests src/components/pytorch_image_classifier/train.py"""
|
||||
model_dir = os.path.join(temporary_dir, "pytorch_image_classifier_model")
|
||||
checkpoints_dir = os.path.join(
|
||||
temporary_dir, "pytorch_image_classifier_checkpoints"
|
||||
)
|
||||
|
||||
# create test arguments for the script
|
||||
# fmt: off
|
||||
script_args = [
|
||||
"train.py",
|
||||
"--train_images", random_image_in_folder_classes,
|
||||
"--valid_images", random_image_in_folder_classes, # using same data for train/valid
|
||||
"--batch_size", "16",
|
||||
"--num_workers", "0", # single thread pre-fetching
|
||||
"--prefetch_factor", "2", # will be discarded if num_workers=0
|
||||
"--pin_memory", "True",
|
||||
"--non_blocking", "False",
|
||||
"--model_arch", model_arch,
|
||||
"--model_arch_pretrained", "True",
|
||||
"--num_epochs", "2",
|
||||
"--model_output", model_dir,
|
||||
"--checkpoints", checkpoints_dir,
|
||||
"--register_model_as", "foo",
|
||||
"--enable_profiling", "True",
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
# replaces sys.argv with test arguments and run main
|
||||
with patch.object(sys, "argv", script_args):
|
||||
train.main()
|
||||
|
||||
# those mlflow calls must be unique in the script
|
||||
mlflow_start_run_mock.assert_called_once()
|
||||
mlflow_end_run_mock.assert_called_once()
|
||||
|
||||
# test all log_params calls
|
||||
for log_params_call in mlflow_log_params_mock.call_args_list:
|
||||
args, kwargs = log_params_call
|
||||
assert isinstance(args[0], dict) # call has only 1 argument, and it's a dict
|
||||
|
||||
# test model registration with mlflow.pytorch.log_model()
|
||||
log_model_calls = mlflow_pytorch_log_model_mock.call_args_list
|
||||
assert len(log_model_calls) == 1
|
||||
args, kwargs = log_model_calls[0] # unpack arguments
|
||||
assert "artifact_path" in kwargs
|
||||
assert kwargs["artifact_path"] == "final_model"
|
||||
assert "registered_model_name" in kwargs
|
||||
assert kwargs["registered_model_name"] == "foo"
|
||||
|
||||
# test model registration with mlflow.register_model()
|
||||
register_model_calls = mlflow_register_model_mock.call_args_list
|
||||
assert len(register_model_calls) == 1 # call should happen only once
|
||||
args, kwargs = register_model_calls[0] # unpack arguments
|
||||
assert "model_uri" in kwargs
|
||||
assert kwargs["model_uri"].endswith("final_model")
|
||||
assert "name" in kwargs
|
||||
assert kwargs["name"] == "foo"
|
||||
|
||||
# test checkpoints presence
|
||||
assert len(os.listdir(checkpoints_dir)) == 3 # 1 before training loop, + 2 epochs
|
||||
|
||||
|
||||
@patch("mlflow.end_run") # we can have only 1 start/end per test session
|
||||
@patch("mlflow.register_model") # patched to test model name registration
|
||||
@patch("mlflow.pytorch.log_model") # patched to test model name registration
|
||||
@patch("mlflow.log_params") # patched to avoid conflict in parameters
|
||||
@patch("mlflow.start_run") # we can have only 1 start/end per test session
|
||||
@patch("torch.distributed.init_process_group") # to avoid calling for the actual thing
|
||||
@patch(
|
||||
"torch.distributed.destroy_process_group"
|
||||
) # to avoid calling for the actual thing
|
||||
@patch(
|
||||
"torch.nn.parallel.DistributedDataParallel"
|
||||
) # to avoid calling for the actual thing
|
||||
@pytest.mark.parametrize("backend", ["nccl", "mpi"])
|
||||
def test_components_pytorch_image_classifier_second_of_two_nodes(
|
||||
torch_ddp_mock,
|
||||
torch_dist_destroy_process_group_mock,
|
||||
torch_dist_init_process_group_mock,
|
||||
mlflow_start_run_mock,
|
||||
mlflow_log_params_mock,
|
||||
mlflow_pytorch_log_model_mock,
|
||||
mlflow_register_model_mock,
|
||||
mlflow_end_run_mock,
|
||||
backend,
|
||||
temporary_dir,
|
||||
random_image_in_folder_classes,
|
||||
):
|
||||
"""Tests src/components/pytorch_image_classifier/train.py"""
|
||||
model_dir = os.path.join(
|
||||
temporary_dir, "pytorch_image_classifier_distributed_model"
|
||||
)
|
||||
|
||||
torch_ddp_mock.side_effect = lambda model: model # ddp would return just the model
|
||||
|
||||
# create some environment variables for the backend
|
||||
if backend == "nccl":
|
||||
backend_expected_env = {
|
||||
# setup as if there were 2 nodes with 1 gpu each
|
||||
"WORLD_SIZE": "2",
|
||||
"RANK": "1",
|
||||
"LOCAL_WORLD_SIZE": "1",
|
||||
"LOCAL_RANK": "0",
|
||||
}
|
||||
elif backend == "mpi":
|
||||
backend_expected_env = {
|
||||
# setup as if there were 2 nodes with 1 gpu each
|
||||
"OMPI_COMM_WORLD_SIZE": "2",
|
||||
"OMPI_COMM_WORLD_RANK": "1",
|
||||
"OMPI_COMM_WORLD_LOCAL_SIZE": "1",
|
||||
"OMPI_COMM_WORLD_LOCAL_RANK": "0",
|
||||
}
|
||||
else:
|
||||
raise Exception("backend {} used for testing is not implemented in script.")
|
||||
|
||||
with patch.dict(os.environ, backend_expected_env, clear=False):
|
||||
# create test arguments for the script
|
||||
# fmt: off
|
||||
script_args = [
|
||||
"train.py",
|
||||
"--train_images", random_image_in_folder_classes,
|
||||
"--valid_images", random_image_in_folder_classes, # using same data for train/valid
|
||||
"--distributed_backend", backend,
|
||||
"--batch_size", "16",
|
||||
"--num_workers", "0", # single thread pre-fetching
|
||||
"--prefetch_factor", "2", # will be discarded if num_workers=0
|
||||
"--pin_memory", "True",
|
||||
"--non_blocking", "False",
|
||||
"--model_arch", "resnet18",
|
||||
"--model_arch_pretrained", "True",
|
||||
"--num_epochs", "1",
|
||||
"--register_model_as", "foo",
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
# replaces sys.argv with test arguments and run main
|
||||
with patch.object(sys, "argv", script_args):
|
||||
train.main()
|
||||
|
||||
# those mlflow calls must be unique in the script
|
||||
mlflow_start_run_mock.assert_called_once()
|
||||
mlflow_end_run_mock.assert_called_once()
|
||||
|
||||
mlflow_pytorch_log_model_mock.assert_not_called() # not saving from non-head nodes
|
||||
mlflow_register_model_mock.assert_not_called() # not registering from non-head nodes
|
||||
|
||||
torch_dist_init_process_group_mock.assert_called_once()
|
||||
torch_dist_init_process_group_mock.assert_called_with(backend, rank=1, world_size=2)
|
||||
|
||||
torch_dist_destroy_process_group_mock.assert_called_once()
|
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -0,0 +1,6 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
|
||||
name: blue
|
||||
endpoint_name: dogs-classifier-online
|
||||
model: azureml:resnet-dogs-classifier@latest
|
||||
instance_type: Standard_DS2_v2
|
||||
instance_count: 1
|
|
@ -0,0 +1,4 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineEndpoint.schema.json
|
||||
name: dogs-classifier-online
|
||||
description: Stanford Dogs Classifier
|
||||
auth_mode: key
|
|
@ -0,0 +1,19 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
|
||||
|
||||
command: |
|
||||
tar xvfm ${{inputs.archive}} --no-same-owner -C ${{outputs.images}} #TODO: Split data into Train-Validate-Test
|
||||
|
||||
inputs:
|
||||
archive:
|
||||
type: uri_file
|
||||
path: http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar
|
||||
|
||||
outputs:
|
||||
images:
|
||||
type: uri_folder
|
||||
mode: upload
|
||||
path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/
|
||||
|
||||
environment: azureml:AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest
|
||||
|
||||
compute: azureml:cpu-cluster
|
|
@ -0,0 +1,83 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
|
||||
type: pipeline
|
||||
|
||||
# <inputs_and_outputs>
|
||||
inputs:
|
||||
training_images:
|
||||
type: uri_folder
|
||||
mode: download # pick ro_mount, rw_mount or download
|
||||
path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/**
|
||||
# path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/places2/train//**
|
||||
validation_images: #TODO: Use different datasets for validation
|
||||
type: uri_folder
|
||||
mode: download # pick ro_mount, rw_mount or download
|
||||
path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/dogs/**
|
||||
# path: azureml://datastores/workspaceblobstore/paths/azureml-vision-datasets/places2/valid/**
|
||||
# </inputs_and_outputs>
|
||||
|
||||
# <jobs>
|
||||
settings:
|
||||
default_datastore: azureml:workspaceblobstore
|
||||
continue_on_step_failure: true
|
||||
|
||||
jobs:
|
||||
train:
|
||||
type: command
|
||||
component: file:train.yaml
|
||||
compute: azureml:gpu-cluster
|
||||
resources:
|
||||
instance_count: 1 # number of nodes
|
||||
distribution:
|
||||
type: pytorch
|
||||
process_count_per_instance: 1 # number of gpus
|
||||
|
||||
# NOTE: set env var if needed
|
||||
environment_variables:
|
||||
NCCL_DEBUG: "INFO" # adjusts the level of info from NCCL tests
|
||||
|
||||
# NCCL_TOPO_FILE: "/opt/microsoft/ndv4-topo.xml" # Use specific topology file for A100
|
||||
|
||||
# NCCL_IB_PCI_RELAXED_ORDERING: "1" # Relaxed Ordering can greatly help the performance of Infiniband networks in virtualized environments.
|
||||
# NCCL_IB_DISABLE: "1" # force disable infiniband (if set to "1")
|
||||
# NCCL_NET_PLUGIN: "none" # to force NET/Plugin off (no rdma/sharp plugin at all)
|
||||
# NCCL_NET: "Socket" # to force node-to-node comm to use Socket (slow)
|
||||
# NCCL_SOCKET_IFNAME: "eth0" # to force Socket comm to use eth0 (use NCCL_NET=Socket)
|
||||
|
||||
# UCX_IB_PCI_RELAXED_ORDERING: "on"
|
||||
# UCX_TLS: "tcp"
|
||||
# UCX_NET_DEVICES: "eth0" # if you have Error: Failed to resolve UCX endpoint...
|
||||
|
||||
# CUDA_DEVICE_ORDER: "PCI_BUS_ID" # ordering of gpus
|
||||
|
||||
# TORCH_DISTRIBUTED_DEBUG: "DETAIL"
|
||||
|
||||
inputs:
|
||||
# data inputs
|
||||
train_images: ${{parent.inputs.training_images}}
|
||||
valid_images: ${{parent.inputs.validation_images}}
|
||||
|
||||
# data loading
|
||||
batch_size: 64
|
||||
num_workers: 5
|
||||
prefetch_factor: 4
|
||||
persistent_workers: true
|
||||
pin_memory: true
|
||||
non_blocking: false
|
||||
|
||||
# model
|
||||
model_arch: "resnet18"
|
||||
model_arch_pretrained: true
|
||||
|
||||
# training
|
||||
num_epochs: 1
|
||||
learning_rate: 0.001
|
||||
momentum: 0.9
|
||||
|
||||
# profiling
|
||||
enable_profiling: false
|
||||
# multiprocessing_sharing_strategy: "file_system" # WARNING: this can cause hang at job completion
|
||||
|
||||
# Model Registrataion
|
||||
register_model_as: "resnet-dogs-classifier"
|
||||
|
||||
# </jobs>
|
|
@ -0,0 +1,21 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
|
||||
name: nvidia_pytorch
|
||||
build:
|
||||
path: ../../../data-science/environment/
|
||||
tags:
|
||||
os: ubuntu
|
||||
os_version: 20.04
|
||||
hpcx: 2.10
|
||||
mpi: openmpi
|
||||
mpi_version: 4.1.2rc4
|
||||
ucx: 1.12.0
|
||||
cuda: 11.6.2
|
||||
cudnn: 8.4.0.27
|
||||
nccl: 2.12.10
|
||||
rdma_core: 36.0
|
||||
nsight_compute: 2022.1.1.2
|
||||
nsight_systems: "2022.2.1.31-5fe97ab"
|
||||
nccl_test: 2.11.0
|
||||
azureml-defaults: 1.41.0
|
||||
mlflow: 1.25.1
|
||||
transformers: 4.18.0
|
|
@ -0,0 +1,123 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
|
||||
type: command
|
||||
|
||||
description: >-
|
||||
Fine-tunes a pre-trained pytorch model for image classification.
|
||||
Inputs should be provided as distinct directories containing distinct images
|
||||
as we're using [ImageFolder](http://pytorch.org/vision/main/generated/torchvision.datasets.ImageFolder.html) to load data.
|
||||
name: pytorch_image_classifier
|
||||
display_name: Image Classification Model (PyTorch)
|
||||
version: 1.0.4
|
||||
|
||||
inputs:
|
||||
# data loading
|
||||
train_images:
|
||||
type: path
|
||||
description: "Path to folder containing training images, stored in subdirectories according to their class."
|
||||
valid_images:
|
||||
type: path
|
||||
description: "Path to folder containing validation images, stored in subdirectories according to their class."
|
||||
|
||||
# data loading
|
||||
batch_size:
|
||||
type: integer
|
||||
min: 1
|
||||
optional: true
|
||||
description: "Train/valid data loading batch size (default: 64)"
|
||||
num_workers:
|
||||
type: integer
|
||||
optional: true
|
||||
description: "Num workers for data loader (default: -1 => all cpus available)"
|
||||
prefetch_factor:
|
||||
type: integer
|
||||
optional: true
|
||||
description: "Data loader prefetch factor (default: 2)"
|
||||
persistent_workers:
|
||||
type: boolean
|
||||
optional: true
|
||||
description: "Use persistent prefetching workers (default: True)"
|
||||
pin_memory:
|
||||
type: boolean
|
||||
optional: true
|
||||
description: "Pin Data loader prefetch factor (default: True)"
|
||||
non_blocking:
|
||||
type: boolean
|
||||
optional: true
|
||||
description: "Use non-blocking transfer to device (default: False)"
|
||||
|
||||
# model
|
||||
model_arch:
|
||||
type: string
|
||||
optional: true
|
||||
description: "Which model architecture to use (default: resnet18)"
|
||||
model_arch_pretrained:
|
||||
type: boolean
|
||||
optional: true
|
||||
description: "Use pretrained model (default: true)"
|
||||
|
||||
# training
|
||||
num_epochs:
|
||||
type: integer
|
||||
optional: true
|
||||
description: "Number of epochs to train for (default: 1)"
|
||||
learning_rate:
|
||||
type: number
|
||||
optional: true
|
||||
description: "Learning rate of optimizer (default: 0.001)"
|
||||
momentum:
|
||||
type: number
|
||||
optional: true
|
||||
description: "Momentum of optimizer (default: 0.9)"
|
||||
|
||||
# model registration
|
||||
register_model_as:
|
||||
type: string
|
||||
optional: true
|
||||
description: "Name to register final model in MLFlow"
|
||||
|
||||
# system parameters
|
||||
enable_profiling:
|
||||
type: boolean
|
||||
default: false
|
||||
description: "Enables profiler"
|
||||
multiprocessing_sharing_strategy:
|
||||
type: string
|
||||
optional: true
|
||||
description: "Check https://pytorch.org/docs/stable/multiprocessing.html"
|
||||
|
||||
outputs:
|
||||
checkpoints:
|
||||
type: path
|
||||
description: "Path to export checkpoints"
|
||||
trained_model:
|
||||
type: path
|
||||
description: "Path to the final model"
|
||||
|
||||
code: ../../../data-science/src
|
||||
|
||||
environment: azureml:nvidia_pytorch@latest
|
||||
|
||||
command: >-
|
||||
python train.py
|
||||
--train_images ${{inputs.train_images}}
|
||||
--valid_images ${{inputs.valid_images}}
|
||||
[--batch_size ${{inputs.batch_size}}]
|
||||
[--num_workers ${{inputs.num_workers}}]
|
||||
[--prefetch_factor ${{inputs.prefetch_factor}}]
|
||||
[--persistent_workers ${{inputs.persistent_workers}}]
|
||||
[--pin_memory ${{inputs.pin_memory}}]
|
||||
[--non_blocking ${{inputs.non_blocking}}]
|
||||
[--model_arch ${{inputs.model_arch}}]
|
||||
[--model_arch_pretrained ${{inputs.model_arch_pretrained}}]
|
||||
[--num_epochs ${{inputs.num_epochs}}]
|
||||
[--learning_rate ${{inputs.learning_rate}}]
|
||||
[--momentum ${{inputs.momentum}}]
|
||||
--model_output ${{outputs.trained_model}}
|
||||
--checkpoints ${{outputs.checkpoints}}
|
||||
[--register_model_as ${{inputs.register_model_as}}]
|
||||
--enable_profiling ${{inputs.enable_profiling}}
|
||||
[--multiprocessing_sharing_strategy ${{inputs.multiprocessing_sharing_strategy}}]
|
||||
distribution:
|
||||
# NOTE: using type:pytorch will use all the right env variables for pytorch init_process_group
|
||||
type: pytorch
|
||||
process_count_per_instance: 1
|
|
@ -0,0 +1,63 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: aml-cli-v2
|
||||
|
||||
|
||||
trigger:
|
||||
- none
|
||||
|
||||
pool:
|
||||
vmImage: ubuntu-20.04
|
||||
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: mlops-templates # Template Repo
|
||||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
ref: main
|
||||
|
||||
stages:
|
||||
- stage: DeployTrainingPipeline
|
||||
displayName: Deploy Training Pipeline
|
||||
jobs:
|
||||
- job: DeployTrainingPipeline
|
||||
steps:
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
|
||||
parameters:
|
||||
cluster_name: gpu-cluster
|
||||
size: Standard_NC6
|
||||
min_instances: 0
|
||||
max_instances: 1
|
||||
cluster_tier: dedicated
|
||||
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
|
||||
parameters:
|
||||
build_type: docker
|
||||
environment_name: nvidia_pytorch # Not used for docker builds
|
||||
environment_file: mlops/azureml/train/train-env.yaml
|
||||
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
|
||||
parameters:
|
||||
data_type: training
|
||||
environment_file: mlops/azureml/train/create_stanford_dogs_dataset.yaml
|
||||
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
|
||||
parameters:
|
||||
pipeline_file: mlops/azureml/train/pipeline.yaml
|
||||
experiment_name: $(environment)_cv_train_$(Build.SourceBranchName)
|
||||
display_name: $(environment)_cv_run_$(Build.BuildID)
|
|
@ -0,0 +1,61 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: aml-cli-v2
|
||||
- name: endpoint_name
|
||||
value: dogs-online-$(namespace)$(postfix)$(environment)
|
||||
- name: endpoint_type
|
||||
value: online
|
||||
|
||||
|
||||
trigger:
|
||||
- none
|
||||
|
||||
pool:
|
||||
vmImage: ubuntu-20.04
|
||||
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: mlops-templates # Template Repo
|
||||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
ref: main
|
||||
|
||||
stages:
|
||||
- stage: CreateOnlineEndpoint
|
||||
displayName: Create/Update Online Endpoint
|
||||
jobs:
|
||||
- job: DeployOnlineEndpoint
|
||||
steps:
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/create-endpoint.yml@mlops-templates
|
||||
parameters:
|
||||
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
|
||||
- template: templates/${{ variables.version }}/create-deployment.yml@mlops-templates
|
||||
parameters:
|
||||
deployment_name: dogs-online-dp
|
||||
deployment_file: mlops/azureml/deploy/online/online-deployment.yml
|
||||
- template: templates/${{ variables.version }}/allocate-traffic.yml@mlops-templates
|
||||
parameters:
|
||||
traffic_allocation: dogs-online-dp=100
|
||||
- template: templates/${{ variables.version }}/test-deployment.yml@mlops-templates
|
||||
parameters:
|
||||
deployment_name: dogs-online-dp
|
||||
sample_request: data/sample-request.json
|
||||
request_type: json
|
|
@ -0,0 +1,51 @@
|
|||
name: deploy-cv-model-training-pipeline
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
jobs:
|
||||
get-config:
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: config-infra-prod.yml
|
||||
create-compute:
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
|
||||
with:
|
||||
cluster_name: gpu-cluster
|
||||
size: Standard_NC6
|
||||
min_instances: 0
|
||||
max_instances: 1
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
register-environment:
|
||||
needs: [get-config,create-compute]
|
||||
uses: ./.github/workflows/register-environment.yml
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
environment_file: mlops/azureml/train/train-env.yaml
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
register-dataset:
|
||||
needs: [get-config,register-environment]
|
||||
uses: Azure/mlops-templates/.github/workflows/register-dataset.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
data_file: mlops/azureml/train/create_stanford_dogs_dataset.yaml
|
||||
file_type: Training
|
||||
name: stanford_dogs
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
run-pipeline:
|
||||
needs: [get-config,register-dataset]
|
||||
uses: Azure/mlops-templates/.github/workflows/run-pipeline.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
parameters-file: mlops/azureml/train/pipeline.yaml
|
||||
job-name: cv-train
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
|
@ -0,0 +1,42 @@
|
|||
name: deploy-online-endpoint-pipeline
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
jobs:
|
||||
get-config:
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: config-infra-prod.yml
|
||||
create-endpoint:
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
|
||||
endpoint_name: dogs-classifier-online2
|
||||
endpoint_type: online
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
create-deployment:
|
||||
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
|
||||
needs: [get-config,create-endpoint]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/online/online-deployment.yml
|
||||
endpoint_name: dogs-classifier-online2
|
||||
endpoint_type: online
|
||||
deployment_name: dogs-online-dp
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
allocate-traffic:
|
||||
uses: Azure/mlops-templates/.github/workflows/allocate-traffic.yml@main
|
||||
needs: [get-config,create-deployment]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
traffic_allocation: dogs-online-dp=100
|
||||
endpoint_name: dogs-classifier-online2
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
|
@ -0,0 +1,43 @@
|
|||
variables:
|
||||
|
||||
ap_vm_image: ubuntu-20.04
|
||||
|
||||
## Training pipeline settings
|
||||
|
||||
# Training dataset settings
|
||||
training_dataset_name: dogs-imgs
|
||||
training_dataset_description: 'Stanford Dogs Dataset (http://vision.stanford.edu/aditya86/ImageNetDogs/)'
|
||||
training_dataset_local_path: data/training-imgs/
|
||||
training_dataset_path_on_datastore: dogs-imgs/
|
||||
training_dataset_type: local
|
||||
training_dataset_storage_url: 'http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar'
|
||||
|
||||
labels_dataset_name: dogs-labels
|
||||
labels_dataset_description: 'Labels for Stanford Dogs Dataset (http://vision.stanford.edu/aditya86/ImageNetDogs/)'
|
||||
labels_dataset_local_path: data/training/
|
||||
labels_dataset_path_on_datastore: dogs-labels/
|
||||
labels_dataset_type: local
|
||||
|
||||
# Training AzureML Environment settings
|
||||
training_env_name: nvidia_pytorch
|
||||
training_env_path: data-science/environment/training/
|
||||
|
||||
# Compute target for pipeline
|
||||
training_target: gpu-cluster
|
||||
training_target_sku: Standard_NC6
|
||||
training_target_min_nodes: 0
|
||||
training_target_max_nodes: 1
|
||||
|
||||
# Name for the training pipeline
|
||||
training_pipeline_name: resnet-dogs-training-pipeline
|
||||
training_experiment_name: resnet-dogs-training
|
||||
|
||||
# Training arguments specification
|
||||
training_arguments: --epochs 2 --batch-size 64 --training-mode feature-extraction
|
||||
|
||||
# Training datasets specification
|
||||
# Syntax: <name>:<version>:<mode>:<steps (names separated by +)>
|
||||
training_datasets: dogs-labels:1:download:prep dogs-imgs:latest:mount:train+eval
|
||||
|
||||
# Name under which the model will be registered
|
||||
model_name: resnet-dogs-classifier
|
|
@ -0,0 +1,44 @@
|
|||
{
|
||||
"name": "pytorch_manual",
|
||||
"environmentVariables": {
|
||||
"EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
|
||||
},
|
||||
"python": {
|
||||
"userManagedDependencies": false,
|
||||
"interpreterPath": "python",
|
||||
"condaDependenciesFile": null,
|
||||
"baseCondaEnvironment": null
|
||||
},
|
||||
"docker": {
|
||||
"enabled": true,
|
||||
"baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04",
|
||||
"baseDockerfile": null,
|
||||
"sharedVolumes": true,
|
||||
"shmSize": "2g",
|
||||
"arguments": [],
|
||||
"baseImageRegistry": {
|
||||
"address": null,
|
||||
"username": null,
|
||||
"password": null,
|
||||
"registryIdentity": null
|
||||
},
|
||||
"platform": {
|
||||
"os": "Linux",
|
||||
"architecture": "amd64"
|
||||
}
|
||||
},
|
||||
"spark": {
|
||||
"repositories": [],
|
||||
"packages": [],
|
||||
"precachePackages": true
|
||||
},
|
||||
"databricks": {
|
||||
"mavenLibraries": [],
|
||||
"pypiLibraries": [],
|
||||
"rcranLibraries": [],
|
||||
"jarLibraries": [],
|
||||
"eggLibraries": []
|
||||
},
|
||||
"r": null,
|
||||
"inferencingStackVersion": null
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
name: pytorch_manual
|
||||
channels:
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.7
|
||||
- pip=20.2.4
|
||||
- pip:
|
||||
- pandas==1.3.5
|
||||
- scikit-learn==1.0.2
|
||||
- matplotlib==3.5.2
|
||||
- msrest==0.6.21
|
||||
- mlflow==1.27.0
|
||||
- azureml-core==1.43.0
|
||||
- azureml-defaults==1.43.0
|
||||
- azureml-mlflow==1.43.0
|
||||
- torch==1.11.0
|
||||
- torchvision==0.12.0
|
|
@ -0,0 +1,129 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import argparse
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import mlflow
|
||||
import torch
|
||||
from sklearn import metrics as sklmetrics
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
from model import CustomImageDataset, load_model
|
||||
|
||||
|
||||
def main(labels_path, images_path, model_path, model_name, output_dir, deploy_flag_output):
|
||||
|
||||
# Load model
|
||||
model_file = os.path.join(model_path, f'{model_name}.pth')
|
||||
net = load_model(path=model_file)
|
||||
|
||||
# Load test data
|
||||
labels_data = pd.read_csv(os.path.join(labels_path, 'labels_test.csv'))
|
||||
labels_data = labels_data.set_index('path').squeeze() # Convert to appropiate format for CustomImageDataset
|
||||
testset = CustomImageDataset(images_path, labels_data, mode='test')
|
||||
print(f'Test size: {len(testset)}')
|
||||
|
||||
# Generate predictions
|
||||
predictions = get_predictions(testset, net)
|
||||
predictions.to_csv(os.path.join(output_dir, 'predictions.csv'), index=False)
|
||||
|
||||
# Evaluation metrics
|
||||
metrics = evaluate(predictions.label_real, predictions.label_predicted)
|
||||
for k, v in metrics.items():
|
||||
mlflow.log_metric(k, v)
|
||||
|
||||
# Plot confusion matrix
|
||||
plt.rcParams["figure.figsize"] = [40, 40]
|
||||
sklmetrics.ConfusionMatrixDisplay.from_predictions(predictions.label_real, predictions.label_predicted, cmap='YlGnBu')
|
||||
plt.savefig("confusion_matrix.png")
|
||||
mlflow.log_artifact("confusion_matrix.png")
|
||||
|
||||
# Promote model
|
||||
deploy_flag = is_new_model_better()
|
||||
mlflow.log_metric("deploy flag", deploy_flag)
|
||||
with open(deploy_flag_output, 'w') as f:
|
||||
f.write('%d' % int(deploy_flag))
|
||||
|
||||
deploy_flag_str = 'not' if deploy_flag == False else ''
|
||||
print(f'Finished. Model will {deploy_flag_str} be registered.')
|
||||
|
||||
|
||||
def get_predictions(dataset, net, batch_size=256):
|
||||
|
||||
testloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2)
|
||||
|
||||
predictions = pd.DataFrame()
|
||||
# since we're not training, we don't need to calculate the gradients for our outputs
|
||||
with torch.no_grad():
|
||||
for data in testloader:
|
||||
images, classes, paths = data
|
||||
# calculate outputs by running images through the network
|
||||
outputs = net(images)
|
||||
# the class with the highest energy is what we choose as prediction
|
||||
_, predicted = torch.max(outputs.data, 1)
|
||||
|
||||
predictions_batch = pd.DataFrame({
|
||||
'path': paths,
|
||||
'label_real': dataset.get_labels(classes),
|
||||
'label_predicted': dataset.get_labels(predicted)
|
||||
})
|
||||
predictions = pd.concat([predictions, predictions_batch], ignore_index=True)
|
||||
|
||||
return predictions
|
||||
|
||||
|
||||
def evaluate(labels_real, labels_pred):
|
||||
|
||||
metrics = {
|
||||
'accuracy': sklmetrics.accuracy_score(labels_real, labels_pred),
|
||||
'mcc': sklmetrics.matthews_corrcoef(labels_real, labels_pred),
|
||||
|
||||
'recall_micro': sklmetrics.recall_score(labels_real, labels_pred, average='micro'),
|
||||
'recall_macro': sklmetrics.recall_score(labels_real, labels_pred, average='macro'),
|
||||
'recall_weighted': sklmetrics.recall_score(labels_real, labels_pred, average='weighted'),
|
||||
|
||||
'precison_micro': sklmetrics.precision_score(labels_real, labels_pred, average='micro'),
|
||||
'precison_macro': sklmetrics.precision_score(labels_real, labels_pred, average='macro'),
|
||||
'precison_weighted': sklmetrics.precision_score(labels_real, labels_pred, average='weighted'),
|
||||
|
||||
'f1_micro': sklmetrics.f1_score(labels_real, labels_pred, average='micro'),
|
||||
'f1_macro': sklmetrics.f1_score(labels_real, labels_pred, average='macro'),
|
||||
'f1_weighted': sklmetrics.f1_score(labels_real, labels_pred, average='weighted'),
|
||||
}
|
||||
|
||||
return metrics
|
||||
|
||||
|
||||
def is_new_model_better():
|
||||
return True # For simplicity
|
||||
|
||||
|
||||
def parse_args(args_list=None):
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--prepared_data_path', type=str, required=True, help='Directory path to test data (output from prep step)')
|
||||
parser.add_argument('--dogs-imgs', type=str, help='Directory path to images')
|
||||
parser.add_argument('--model_path', type=str, help='Model output directory')
|
||||
parser.add_argument('--evaluation_path', type=str, default='evaluation_results/', help="Evaluation results output directory")
|
||||
parser.add_argument('--deploy_flag', type=str, help='A deploy flag whether to deploy or no')
|
||||
|
||||
args_parsed, unknown = parser.parse_known_args(args_list)
|
||||
if unknown:
|
||||
print(f"Unrecognized arguments. These won't be used: {unknown}")
|
||||
|
||||
return args_parsed
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_args()
|
||||
|
||||
main(
|
||||
labels_path=args.prepared_data_path,
|
||||
images_path=args.dogs_imgs,
|
||||
model_path=args.model_path,
|
||||
model_name='model',
|
||||
output_dir=args.evaluation_path,
|
||||
deploy_flag_output=args.deploy_flag
|
||||
)
|
|
@ -0,0 +1,5 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from .dataset import CustomImageDataset
|
||||
from .net import load_model
|
|
@ -0,0 +1,49 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import PIL
|
||||
|
||||
from torch.utils.data import Dataset
|
||||
import torchvision.transforms as transforms
|
||||
|
||||
|
||||
class CustomImageDataset(Dataset):
|
||||
def __init__(self, img_dir, img_labels, mode='test'):
|
||||
self.img_dir = img_dir
|
||||
self.img_labels = img_labels
|
||||
self.classes = img_labels.unique().tolist()
|
||||
|
||||
self.mode = mode
|
||||
if self.mode == 'train':
|
||||
self.transform = transforms.Compose([
|
||||
transforms.RandomResizedCrop(224),
|
||||
transforms.RandomHorizontalFlip(),
|
||||
transforms.ToTensor()
|
||||
])
|
||||
else:
|
||||
self.transform = transforms.Compose([
|
||||
transforms.Resize(256),
|
||||
transforms.CenterCrop(224),
|
||||
transforms.ToTensor(),
|
||||
])
|
||||
|
||||
def __len__(self):
|
||||
return len(self.img_labels)
|
||||
|
||||
def __getitem__(self, idx):
|
||||
|
||||
img_path = self.img_labels.index[idx]
|
||||
image = PIL.Image.open(os.path.join(self.img_dir, img_path)).convert('RGB')
|
||||
image = self.transform(image)
|
||||
|
||||
img_label = self.img_labels[idx]
|
||||
img_class = self.classes.index(img_label)
|
||||
|
||||
return image, img_class, img_path
|
||||
|
||||
def nclasses(self):
|
||||
return len(self.classes)
|
||||
|
||||
def get_labels(self, indexes):
|
||||
return [self.classes[i] for i in indexes]
|
|
@ -0,0 +1,36 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torchvision.models as models
|
||||
import torch.optim as optim
|
||||
|
||||
|
||||
def load_model(path=None, num_classes=2, mode='finetuning', learning_rate=0.001, momentum=0.9):
|
||||
|
||||
# Load existing model
|
||||
if path:
|
||||
print('Loading existing model from path...')
|
||||
model_data = torch.load(path)
|
||||
model = models.resnet18(pretrained=False)
|
||||
model.fc = nn.Linear(model.fc.in_features, model_data['fc.weight'].shape[0])
|
||||
model.load_state_dict(model_data)
|
||||
return model
|
||||
|
||||
# Initialize new model
|
||||
assert mode in ['finetuning', 'feature-extraction']
|
||||
|
||||
model = models.resnet18(pretrained=True)
|
||||
if mode == 'feature-extraction': # Freeze layers
|
||||
for param in model.parameters():
|
||||
param.requires_grad = False
|
||||
|
||||
model.fc = nn.Linear(model.fc.in_features, num_classes)
|
||||
|
||||
criterion = nn.CrossEntropyLoss()
|
||||
|
||||
params_optim = model.parameters() if mode == 'finetuning' else model.fc.parameters() if mode == 'feature-extraction' else None
|
||||
optimizer = optim.SGD(params_optim, lr=learning_rate, momentum=momentum)
|
||||
|
||||
return model, criterion, optimizer
|
|
@ -0,0 +1,58 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import os
|
||||
import argparse
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import mlflow
|
||||
|
||||
|
||||
def main(raw_data_path, prepared_data_path):
|
||||
|
||||
print(f'Raw data path: {raw_data_path}')
|
||||
print(f'Output data path: {prepared_data_path}')
|
||||
|
||||
# Read data
|
||||
|
||||
labels_data = pd.read_csv(os.path.join(raw_data_path, 'image_labels.csv'))
|
||||
|
||||
mlflow.log_metric('total_labels', len(labels_data))
|
||||
|
||||
# Split data into train and test datasets
|
||||
|
||||
random_data = np.random.rand(len(labels_data))
|
||||
labels_train = labels_data[random_data < 0.7]
|
||||
labels_test = labels_data[random_data >= 0.7]
|
||||
|
||||
print(labels_train)
|
||||
|
||||
mlflow.log_metric('train_size', labels_train.shape[0])
|
||||
mlflow.log_metric('test_size', labels_test.shape[0])
|
||||
|
||||
labels_train.to_csv(os.path.join(prepared_data_path, 'labels_train.csv'), index=False)
|
||||
labels_test.to_csv(os.path.join(prepared_data_path, 'labels_test.csv'), index=False)
|
||||
|
||||
print('Finished.')
|
||||
|
||||
|
||||
def parse_args(args_list=None):
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--dogs-labels", type=str, required=True, help="Path to labels")
|
||||
parser.add_argument("--prepared_data_path", type=str, required=True, help="Path for prepared data")
|
||||
|
||||
args_parsed, unknown = parser.parse_known_args(args_list)
|
||||
if unknown:
|
||||
print(f"Unrecognized arguments. These won't be used: {unknown}")
|
||||
|
||||
return args_parsed
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
|
||||
main(
|
||||
raw_data_path=args.dogs_labels,
|
||||
prepared_data_path=args.prepared_data_path
|
||||
)
|
|
@ -0,0 +1,118 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
# Example adapted from:
|
||||
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
|
||||
# https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
|
||||
|
||||
import os
|
||||
import argparse
|
||||
|
||||
import pandas as pd
|
||||
import mlflow
|
||||
import torch
|
||||
|
||||
from model import CustomImageDataset, load_model
|
||||
|
||||
|
||||
def main(labels_path, images_path,
|
||||
model_name, output_dir,
|
||||
mode, epochs, batch_size, learning_rate, momentum):
|
||||
|
||||
labels_data = pd.read_csv(os.path.join(labels_path, 'labels_train.csv'))
|
||||
labels_data = labels_data.set_index('path').squeeze() # Convert to appropiate format for CustomImageDataset
|
||||
trainset = CustomImageDataset(images_path, labels_data, mode='train')
|
||||
print(f'Train size: {len(trainset)}')
|
||||
|
||||
print("Training...")
|
||||
net = train(
|
||||
trainset,
|
||||
mode=mode,
|
||||
epochs=epochs,
|
||||
batch_size=batch_size,
|
||||
learning_rate=learning_rate,
|
||||
momentum=momentum
|
||||
)
|
||||
print('Finished training')
|
||||
|
||||
print(f"Saving model in folder {output_dir}...")
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
model_path = os.path.join(output_dir, f'{model_name}.pth')
|
||||
torch.save(net.state_dict(), model_path)
|
||||
|
||||
print('Finished.')
|
||||
|
||||
|
||||
def train(dataset, mode='finetuning', epochs=2, batch_size=64, learning_rate=0.001, momentum=0.9, stats_freq=25):
|
||||
|
||||
trainloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=2)
|
||||
|
||||
net, criterion, optimizer = load_model(
|
||||
num_classes=dataset.nclasses(),
|
||||
mode=mode,
|
||||
learning_rate=learning_rate,
|
||||
momentum=momentum
|
||||
)
|
||||
|
||||
global_iter = 0
|
||||
for epoch in range(epochs): # loop over the dataset multiple times
|
||||
print(f'----\nEpoch {epoch}\n----\n')
|
||||
|
||||
running_loss = 0.0
|
||||
for i, data in enumerate(trainloader, 0):
|
||||
# get the inputs; data is a list of [inputs, classes]
|
||||
inputs, classes, paths = data
|
||||
|
||||
# zero the parameter gradients
|
||||
optimizer.zero_grad()
|
||||
|
||||
# forward + backward + optimize
|
||||
outputs = net(inputs)
|
||||
loss = criterion(outputs, classes)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
|
||||
# print statistics
|
||||
running_loss += loss.item()
|
||||
if (i + 1) % stats_freq == 0:
|
||||
mlflow.log_metric('loss', running_loss / stats_freq, step=global_iter)
|
||||
mlflow.log_metric('epoch', epoch, step=global_iter)
|
||||
running_loss = 0.0
|
||||
|
||||
global_iter += 1
|
||||
|
||||
return net
|
||||
|
||||
|
||||
def parse_args(args_list=None):
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--prepared_data_path', type=str, required=True, help='Directory path to training data (output from prep step)')
|
||||
parser.add_argument('--dogs-imgs', type=str, help='Directory path to images')
|
||||
parser.add_argument('--model_path', type=str, help='Model output directory')
|
||||
parser.add_argument('--training-mode', type=str, default='feature-extraction', choices=['finetuning', 'feature-extraction'])
|
||||
parser.add_argument('--epochs', type=int, default=2)
|
||||
parser.add_argument('--batch-size', type=int, default=64)
|
||||
parser.add_argument('--learning-rate', type=float, default=0.001)
|
||||
parser.add_argument('--momentum', type=float, default=0.9)
|
||||
|
||||
args_parsed, unknown = parser.parse_known_args(args_list)
|
||||
if unknown:
|
||||
print(f"Unrecognized arguments. These won't be used: {unknown}")
|
||||
|
||||
return args_parsed
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
args = parse_args()
|
||||
|
||||
main(
|
||||
labels_path=args.prepared_data_path,
|
||||
images_path=args.dogs_imgs,
|
||||
model_name='model',
|
||||
output_dir=args.model_path,
|
||||
mode=args.training_mode,
|
||||
epochs=args.epochs,
|
||||
batch_size=args.batch_size,
|
||||
learning_rate=args.learning_rate,
|
||||
momentum=args.momentum
|
||||
)
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,105 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
- template: ../../config-aml.yml
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: python-sdk
|
||||
|
||||
|
||||
trigger:
|
||||
- none
|
||||
|
||||
pool:
|
||||
vmImage: $(ap_vm_image)
|
||||
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: mlops-templates # Template Repo
|
||||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
|
||||
stages:
|
||||
- stage: DeployTrainingPipeline
|
||||
displayName: Deploy Training Pipeline
|
||||
jobs:
|
||||
- job: DeployTrainingPipeline
|
||||
steps:
|
||||
|
||||
# Setup
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
|
||||
|
||||
# Environment
|
||||
- template: templates/${{ variables.version }}/create-environment.yml@mlops-templates
|
||||
parameters:
|
||||
environment_name: $(training_env_name)
|
||||
build_type: folder
|
||||
environment_file: $(training_env_path)
|
||||
|
||||
# Compute
|
||||
- template: templates/${{ variables.version }}/get-compute.yml@mlops-templates
|
||||
parameters:
|
||||
compute_type: training
|
||||
|
||||
# Datasets (images + labels)
|
||||
# Images dataset
|
||||
- task: Bash@3
|
||||
displayName: 'Download data'
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
mkdir -p $(training_dataset_local_path)
|
||||
curl $(training_dataset_storage_url) | tar xvf - --no-same-owner -C $(training_dataset_local_path)
|
||||
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
|
||||
parameters:
|
||||
data_type: training
|
||||
# Labels dataset
|
||||
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
|
||||
parameters:
|
||||
data_type: training
|
||||
datasetName: $(labels_dataset_name)
|
||||
datasetDescription: $(labels_dataset_description)
|
||||
datasetLocalPath: $(labels_dataset_local_path)
|
||||
datasetPathOnDatastore: $(labels_dataset_path_on_datastore)
|
||||
datasetType: $(labels_dataset_type)
|
||||
|
||||
# Deploy training pipeline
|
||||
- template: templates/${{ variables.version }}/deploy-training-pipeline.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/add-pipeline-to-endpoint.yml@mlops-templates
|
||||
- task: Bash@3
|
||||
name: export_pipeline_id
|
||||
displayName: "Export Pipeline ID"
|
||||
inputs:
|
||||
targetType: "inline"
|
||||
script: |
|
||||
echo "##vso[task.setvariable variable=pipeline_id;isOutput=true;]$(pipeline_id)"
|
||||
|
||||
# Run training
|
||||
- job: invoke_pipeline
|
||||
displayName: 'Invoke pipeline'
|
||||
pool: server
|
||||
timeoutInMinutes: 0
|
||||
dependsOn: DeployTrainingPipeline
|
||||
variables:
|
||||
pipeline_id: $[ dependencies.DeployTrainingPipeline.outputs['export_pipeline_id.pipeline_id'] ]
|
||||
steps:
|
||||
- task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
|
||||
displayName: 'Invoke AML Pipeline'
|
||||
inputs:
|
||||
azureSubscription: '$(ado_service_connection_aml_ws)'
|
||||
PipelineId: '$(PIPELINE_ID)'
|
||||
ExperimentName: '$(training_experiment_name)'
|
Двоичный файл не отображается.
После Ширина: | Высота: | Размер: 144 KiB |
Двоичный файл не отображается.
Двоичный файл не отображается.
До Ширина: | Высота: | Размер: 323 KiB |
Двоичные данные
documentation/architecturepattern/AzureML_NLP_Classification_Architecture.png
Normal file
Двоичные данные
documentation/architecturepattern/AzureML_NLP_Classification_Architecture.png
Normal file
Двоичный файл не отображается.
После Ширина: | Высота: | Размер: 147 KiB |
Двоичные данные
documentation/architecturepattern/AzureML_NLP_Classification_Architecture.vsdx
Normal file
Двоичные данные
documentation/architecturepattern/AzureML_NLP_Classification_Architecture.vsdx
Normal file
Двоичный файл не отображается.
Двоичный файл не отображается.
После Ширина: | Высота: | Размер: 145 KiB |
Двоичные данные
documentation/architecturepattern/AzureML_SupervisedCV_Architecture.vsdx
Normal file
Двоичные данные
documentation/architecturepattern/AzureML_SupervisedCV_Architecture.vsdx
Normal file
Двоичный файл не отображается.
Двоичный файл не отображается.
До Ширина: | Высота: | Размер: 339 KiB |
|
@ -8,7 +8,7 @@ param env string
|
|||
param tags object = {
|
||||
Owner: 'mlops-v2'
|
||||
Project: 'mlops-v2'
|
||||
Environment: 'dev'
|
||||
Environment: env
|
||||
Toolkit: 'bicep'
|
||||
Name: prefix
|
||||
}
|
||||
|
@ -28,7 +28,7 @@ module st './modules/storage_account.bicep' = {
|
|||
name: 'st'
|
||||
scope: resourceGroup(rg.name)
|
||||
params: {
|
||||
baseName: '${prefix}${postfix}${env}'
|
||||
baseName: '${uniqueString(rg.id)}${env}'
|
||||
location: location
|
||||
tags: tags
|
||||
}
|
||||
|
@ -61,7 +61,7 @@ module cr './modules/container_registry.bicep' = {
|
|||
name: 'cr'
|
||||
scope: resourceGroup(rg.name)
|
||||
params: {
|
||||
baseName: '${prefix}${postfix}${env}'
|
||||
baseName: '${uniqueString(rg.id)}${env}'
|
||||
location: location
|
||||
tags: tags
|
||||
}
|
||||
|
|
|
@ -4,10 +4,10 @@
|
|||
variables:
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../../config-infra-prod.yml
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../../config-infra-dev.yml
|
||||
- template: ../../config-infra-dev.yml
|
||||
|
||||
trigger:
|
||||
- none
|
||||
|
@ -24,7 +24,7 @@ stages :
|
|||
steps:
|
||||
- checkout: self
|
||||
- script: |
|
||||
az bicep build --file ./infrastructure/bicep/main.bicep
|
||||
az bicep build --file ./infrastructure/main.bicep
|
||||
name: LintBicepCode
|
||||
displayName: Run Bicep Linter
|
||||
|
||||
|
@ -43,7 +43,7 @@ stages :
|
|||
inlineScript: |
|
||||
az deployment sub validate \
|
||||
--name $(Build.DefinitionName) \
|
||||
--template-file ./infrastructure/bicep/main.bicep \
|
||||
--template-file ./infrastructure/main.bicep \
|
||||
--location $(location) \
|
||||
--parameters location=$(location) prefix=$(namespace) postfix=$(postfix) env=$(environment)
|
||||
|
||||
|
@ -54,7 +54,7 @@ stages :
|
|||
displayName: Deploy Bicep
|
||||
pool:
|
||||
vmImage: $(ap_vm_image)
|
||||
environment: dev
|
||||
environment: $(environment)
|
||||
strategy:
|
||||
runOnce:
|
||||
deploy:
|
||||
|
@ -72,5 +72,5 @@ stages :
|
|||
az deployment sub create \
|
||||
--name $(Build.DefinitionName) \
|
||||
--location $(location) \
|
||||
--template-file ./infrastructure/bicep/main.bicep \
|
||||
--template-file ./infrastructure/main.bicep \
|
||||
--parameters location=$(location) prefix=$(namespace) postfix=$(postfix) env=$(environment)
|
|
@ -32,6 +32,11 @@ module "aml_workspace" {
|
|||
enable_aml_computecluster = var.enable_aml_computecluster
|
||||
storage_account_name = module.storage_account_aml.name
|
||||
|
||||
enable_aml_secure_workspace = var.enable_aml_secure_workspace
|
||||
vnet_id = var.enable_aml_secure_workspace ? azurerm_virtual_network.vnet_default[0].id : ""
|
||||
subnet_default_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
|
||||
subnet_training_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_training[0].id : ""
|
||||
|
||||
tags = local.tags
|
||||
}
|
||||
|
||||
|
@ -51,6 +56,10 @@ module "storage_account_aml" {
|
|||
firewall_bypass = ["AzureServices"]
|
||||
firewall_virtual_network_subnet_ids = []
|
||||
|
||||
enable_aml_secure_workspace = var.enable_aml_secure_workspace
|
||||
vnet_id = var.enable_aml_secure_workspace ? azurerm_virtual_network.vnet_default[0].id : ""
|
||||
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
|
||||
|
||||
tags = local.tags
|
||||
}
|
||||
|
||||
|
@ -66,6 +75,10 @@ module "key_vault" {
|
|||
postfix = var.postfix
|
||||
env = var.environment
|
||||
|
||||
enable_aml_secure_workspace = var.enable_aml_secure_workspace
|
||||
vnet_id = var.enable_aml_secure_workspace ? azurerm_virtual_network.vnet_default[0].id : ""
|
||||
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
|
||||
|
||||
tags = local.tags
|
||||
}
|
||||
|
||||
|
@ -96,5 +109,26 @@ module "container_registry" {
|
|||
postfix = var.postfix
|
||||
env = var.environment
|
||||
|
||||
enable_aml_secure_workspace = var.enable_aml_secure_workspace
|
||||
vnet_id = var.enable_aml_secure_workspace ? azurerm_virtual_network.vnet_default[0].id : ""
|
||||
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
|
||||
|
||||
tags = local.tags
|
||||
}
|
||||
|
||||
module "data_explorer" {
|
||||
source = "./modules/data-explorer"
|
||||
|
||||
rg_name = module.resource_group.name
|
||||
location = module.resource_group.location
|
||||
|
||||
prefix = var.prefix
|
||||
postfix = var.postfix
|
||||
env = var.environment
|
||||
key_vault_id = module.key_vault.id
|
||||
enable_monitoring = var.enable_monitoring
|
||||
|
||||
client_secret = var.client_secret
|
||||
|
||||
tags = local.tags
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
# Bastion
|
||||
|
||||
module "bastion" {
|
||||
source = "./modules/bastion-host"
|
||||
|
||||
prefix = var.prefix
|
||||
postfix = var.postfix
|
||||
env = var.environment
|
||||
|
||||
rg_name = module.resource_group.name
|
||||
location = module.resource_group.location
|
||||
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_bastion[0].id : ""
|
||||
|
||||
enable_aml_secure_workspace = var.enable_aml_secure_workspace
|
||||
|
||||
tags = local.tags
|
||||
}
|
||||
|
||||
# Virtual machine
|
||||
|
||||
module "virtual_machine_jumphost" {
|
||||
source = "./modules/virtual-machine"
|
||||
|
||||
prefix = var.prefix
|
||||
postfix = var.postfix
|
||||
env = var.environment
|
||||
|
||||
rg_name = module.resource_group.name
|
||||
location = module.resource_group.location
|
||||
subnet_id = var.enable_aml_secure_workspace ? azurerm_subnet.snet_default[0].id : ""
|
||||
jumphost_username = var.jumphost_username
|
||||
jumphost_password = var.jumphost_password
|
||||
|
||||
enable_aml_secure_workspace = var.enable_aml_secure_workspace
|
||||
|
||||
tags = local.tags
|
||||
}
|
|
@ -18,12 +18,14 @@ resource "azurerm_machine_learning_workspace" "mlw" {
|
|||
|
||||
# Compute cluster
|
||||
|
||||
resource "azurerm_machine_learning_compute_cluster" "adl_aml_ws_compute_cluster" {
|
||||
resource "azurerm_machine_learning_compute_cluster" "mlw_compute_cluster" {
|
||||
name = "cpu-cluster"
|
||||
location = var.location
|
||||
vm_priority = "LowPriority"
|
||||
vm_size = "Standard_DS3_v2"
|
||||
machine_learning_workspace_id = azurerm_machine_learning_workspace.mlw.id
|
||||
subnet_resource_id = var.enable_aml_secure_workspace ? var.subnet_training_id : ""
|
||||
|
||||
count = var.enable_aml_computecluster ? 1 : 0
|
||||
|
||||
scale_settings {
|
||||
|
@ -33,70 +35,63 @@ resource "azurerm_machine_learning_compute_cluster" "adl_aml_ws_compute_cluster"
|
|||
}
|
||||
}
|
||||
|
||||
# # Datastore
|
||||
# DNS Zones
|
||||
|
||||
# resource "azurerm_resource_group_template_deployment" "arm_aml_create_datastore" {
|
||||
# name = "arm_aml_create_datastore"
|
||||
# resource_group_name = var.rg_name
|
||||
# deployment_mode = "Incremental"
|
||||
# parameters_content = jsonencode({
|
||||
# "WorkspaceName" = {
|
||||
# value = azurerm_machine_learning_workspace.mlw.name
|
||||
# },
|
||||
# "StorageAccountName" = {
|
||||
# value = var.storage_account_name
|
||||
# }
|
||||
# })
|
||||
resource "azurerm_private_dns_zone" "mlw_zone_api" {
|
||||
name = "privatelink.api.azureml.ms"
|
||||
resource_group_name = var.rg_name
|
||||
|
||||
# depends_on = [time_sleep.wait_30_seconds]
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
# template_content = <<TEMPLATE
|
||||
# {
|
||||
# "$schema": "http://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#",
|
||||
# "contentVersion": "1.0.0.0",
|
||||
# "parameters": {
|
||||
# "WorkspaceName": {
|
||||
# "type": "String"
|
||||
# },
|
||||
# "StorageAccountName": {
|
||||
# "type": "String"
|
||||
# }
|
||||
# },
|
||||
# "resources": [
|
||||
# {
|
||||
# "type": "Microsoft.MachineLearningServices/workspaces/datastores",
|
||||
# "apiVersion": "2021-03-01-preview",
|
||||
# "name": "[concat(parameters('WorkspaceName'), '/default')]",
|
||||
# "dependsOn": [],
|
||||
# "properties": {
|
||||
# "contents": {
|
||||
# "accountName": "[parameters('StorageAccountName')]",
|
||||
# "containerName": "default",
|
||||
# "contentsType": "AzureBlob",
|
||||
# "credentials": {
|
||||
# "credentialsType": "None"
|
||||
# },
|
||||
# "endpoint": "core.windows.net",
|
||||
# "protocol": "https"
|
||||
# },
|
||||
# "description": "Default datastore for mlops-tabular",
|
||||
# "isDefault": false,
|
||||
# "properties": {
|
||||
# "ServiceDataAccessAuthIdentity": "None"
|
||||
# },
|
||||
# "tags": {}
|
||||
# }
|
||||
# }
|
||||
# ]
|
||||
# }
|
||||
# TEMPLATE
|
||||
# }
|
||||
resource "azurerm_private_dns_zone" "mlw_zone_notebooks" {
|
||||
name = "privatelink.notebooks.azure.net"
|
||||
resource_group_name = var.rg_name
|
||||
|
||||
# resource "time_sleep" "wait_30_seconds" {
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
# depends_on = [
|
||||
# azurerm_machine_learning_workspace.mlw
|
||||
# ]
|
||||
# Linking of DNS zones to Virtual Network
|
||||
|
||||
# create_duration = "30s"
|
||||
# }
|
||||
resource "azurerm_private_dns_zone_virtual_network_link" "mlw_zone_api_link" {
|
||||
name = "${var.prefix}${var.postfix}_link_api"
|
||||
resource_group_name = var.rg_name
|
||||
private_dns_zone_name = azurerm_private_dns_zone.mlw_zone_api[0].name
|
||||
virtual_network_id = var.vnet_id
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
resource "azurerm_private_dns_zone_virtual_network_link" "mlw_zone_notebooks_link" {
|
||||
name = "${var.prefix}${var.postfix}_link_notebooks"
|
||||
resource_group_name = var.rg_name
|
||||
private_dns_zone_name = azurerm_private_dns_zone.mlw_zone_notebooks[0].name
|
||||
virtual_network_id = var.vnet_id
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
# Private Endpoint configuration
|
||||
|
||||
resource "azurerm_private_endpoint" "mlw_pe" {
|
||||
name = "pe-${azurerm_machine_learning_workspace.mlw.name}-amlw"
|
||||
location = var.location
|
||||
resource_group_name = var.rg_name
|
||||
subnet_id = var.subnet_default_id
|
||||
|
||||
private_service_connection {
|
||||
name = "psc-aml-${var.prefix}-${var.postfix}${var.env}"
|
||||
private_connection_resource_id = azurerm_machine_learning_workspace.mlw.id
|
||||
subresource_names = ["amlworkspace"]
|
||||
is_manual_connection = false
|
||||
}
|
||||
|
||||
private_dns_zone_group {
|
||||
name = "private-dns-zone-group-ws"
|
||||
private_dns_zone_ids = [azurerm_private_dns_zone.mlw_zone_api[0].id, azurerm_private_dns_zone.mlw_zone_notebooks[0].id]
|
||||
}
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
|
||||
tags = var.tags
|
||||
}
|
||||
|
|
|
@ -58,3 +58,22 @@ variable "storage_account_name" {
|
|||
type = string
|
||||
description = "The Name of the Storage Account linked to AML workspace"
|
||||
}
|
||||
|
||||
variable "enable_aml_secure_workspace" {
|
||||
description = "Variable to enable or disable AML secure workspace"
|
||||
}
|
||||
|
||||
variable "vnet_id" {
|
||||
type = string
|
||||
description = "The ID of the vnet that should be linked to the DNS zone"
|
||||
}
|
||||
|
||||
variable "subnet_default_id" {
|
||||
type = string
|
||||
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
|
||||
}
|
||||
|
||||
variable "subnet_training_id" {
|
||||
type = string
|
||||
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
|
||||
}
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
resource "azurerm_bastion_host" "bas" {
|
||||
name = "bas-${var.prefix}-${var.postfix}${var.env}"
|
||||
location = var.location
|
||||
resource_group_name = var.rg_name
|
||||
|
||||
sku = "Standard"
|
||||
copy_paste_enabled = false
|
||||
file_copy_enabled = false
|
||||
|
||||
ip_configuration {
|
||||
name = "configuration"
|
||||
subnet_id = var.subnet_id
|
||||
public_ip_address_id = azurerm_public_ip.pip[0].id
|
||||
}
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
|
||||
tags = var.tags
|
||||
}
|
||||
|
||||
resource "azurerm_public_ip" "pip" {
|
||||
name = "pip-${var.prefix}-${var.postfix}${var.env}"
|
||||
location = var.location
|
||||
resource_group_name = var.rg_name
|
||||
allocation_method = "Static"
|
||||
sku = "Standard"
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
|
||||
tags = var.tags
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
variable "rg_name" {
|
||||
type = string
|
||||
description = "Resource group name"
|
||||
}
|
||||
|
||||
variable "location" {
|
||||
type = string
|
||||
description = "Location of the resource group"
|
||||
}
|
||||
|
||||
variable "tags" {
|
||||
type = map(string)
|
||||
default = {}
|
||||
description = "A mapping of tags which should be assigned to the deployed resource"
|
||||
}
|
||||
|
||||
variable "prefix" {
|
||||
type = string
|
||||
description = "Prefix for the module name"
|
||||
}
|
||||
|
||||
variable "postfix" {
|
||||
type = string
|
||||
description = "Postfix for the module name"
|
||||
}
|
||||
|
||||
variable "env" {
|
||||
type = string
|
||||
description = "Environment prefix"
|
||||
}
|
||||
|
||||
variable "subnet_id" {
|
||||
type = string
|
||||
description = "Subnet ID for the bastion"
|
||||
}
|
||||
|
||||
variable "enable_aml_secure_workspace" {
|
||||
description = "Variable to enable or disable AML secure workspace"
|
||||
}
|
|
@ -7,8 +7,53 @@ resource "azurerm_container_registry" "cr" {
|
|||
name = "cr${local.safe_prefix}${local.safe_postfix}${var.env}"
|
||||
resource_group_name = var.rg_name
|
||||
location = var.location
|
||||
sku = "Standard"
|
||||
sku = var.enable_aml_secure_workspace ? "Premium" : "Standard"
|
||||
admin_enabled = true
|
||||
|
||||
tags = var.tags
|
||||
}
|
||||
|
||||
# DNS Zones
|
||||
|
||||
resource "azurerm_private_dns_zone" "cr_zone" {
|
||||
name = "privatelink.azurecr.io"
|
||||
resource_group_name = var.rg_name
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
# Linking of DNS zones to Virtual Network
|
||||
|
||||
resource "azurerm_private_dns_zone_virtual_network_link" "cr_zone_link" {
|
||||
name = "${var.prefix}${var.postfix}_link_acr"
|
||||
resource_group_name = var.rg_name
|
||||
private_dns_zone_name = azurerm_private_dns_zone.cr_zone[0].name
|
||||
virtual_network_id = var.vnet_id
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
# Private Endpoint configuration
|
||||
|
||||
resource "azurerm_private_endpoint" "cr_pe" {
|
||||
name = "pe-${azurerm_container_registry.cr.name}-acr"
|
||||
location = var.location
|
||||
resource_group_name = var.rg_name
|
||||
subnet_id = var.subnet_id
|
||||
|
||||
private_service_connection {
|
||||
name = "psc-acr-${var.prefix}-${var.postfix}${var.env}"
|
||||
private_connection_resource_id = azurerm_container_registry.cr.id
|
||||
subresource_names = ["registry"]
|
||||
is_manual_connection = false
|
||||
}
|
||||
|
||||
private_dns_zone_group {
|
||||
name = "private-dns-zone-group-acr"
|
||||
private_dns_zone_ids = [azurerm_private_dns_zone.cr_zone[0].id]
|
||||
}
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
|
||||
tags = var.tags
|
||||
}
|
|
@ -28,3 +28,17 @@ variable "env" {
|
|||
type = string
|
||||
description = "Environment prefix"
|
||||
}
|
||||
|
||||
variable "enable_aml_secure_workspace" {
|
||||
description = "Variable to enable or disable AML secure workspace"
|
||||
}
|
||||
|
||||
variable "vnet_id" {
|
||||
type = string
|
||||
description = "The ID of the vnet that should be linked to the DNS zone"
|
||||
}
|
||||
|
||||
variable "subnet_id" {
|
||||
type = string
|
||||
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
|
||||
}
|
|
@ -8,4 +8,67 @@ resource "azurerm_key_vault" "kv" {
|
|||
sku_name = "standard"
|
||||
|
||||
tags = var.tags
|
||||
|
||||
access_policy {
|
||||
tenant_id = data.azurerm_client_config.current.tenant_id
|
||||
object_id = data.azurerm_client_config.current.object_id
|
||||
|
||||
key_permissions = [
|
||||
"Create",
|
||||
"Get",
|
||||
]
|
||||
|
||||
secret_permissions = [
|
||||
"Set",
|
||||
"Get",
|
||||
"Delete",
|
||||
"Purge",
|
||||
"Recover"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
# DNS Zones
|
||||
|
||||
resource "azurerm_private_dns_zone" "kv_zone" {
|
||||
name = "privatelink.vaultcore.azure.net"
|
||||
resource_group_name = var.rg_name
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
# Linking of DNS zones to Virtual Network
|
||||
|
||||
resource "azurerm_private_dns_zone_virtual_network_link" "kv_zone_link" {
|
||||
name = "${var.prefix}${var.postfix}_link_kv"
|
||||
resource_group_name = var.rg_name
|
||||
private_dns_zone_name = azurerm_private_dns_zone.kv_zone[0].name
|
||||
virtual_network_id = var.vnet_id
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
# Private Endpoint configuration
|
||||
|
||||
resource "azurerm_private_endpoint" "kv_pe" {
|
||||
name = "pe-${azurerm_key_vault.kv.name}-vault"
|
||||
location = var.location
|
||||
resource_group_name = var.rg_name
|
||||
subnet_id = var.subnet_id
|
||||
|
||||
private_service_connection {
|
||||
name = "psc-kv-${var.prefix}-${var.postfix}${var.env}"
|
||||
private_connection_resource_id = azurerm_key_vault.kv.id
|
||||
subresource_names = ["vault"]
|
||||
is_manual_connection = false
|
||||
}
|
||||
|
||||
private_dns_zone_group {
|
||||
name = "private-dns-zone-group-kv"
|
||||
private_dns_zone_ids = [azurerm_private_dns_zone.kv_zone[0].id]
|
||||
}
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
|
||||
tags = var.tags
|
||||
}
|
|
@ -28,3 +28,17 @@ variable "env" {
|
|||
type = string
|
||||
description = "Environment prefix"
|
||||
}
|
||||
|
||||
variable "enable_aml_secure_workspace" {
|
||||
description = "Variable to enable or disable AML secure workspace"
|
||||
}
|
||||
|
||||
variable "vnet_id" {
|
||||
type = string
|
||||
description = "The ID of the vnet that should be linked to the DNS zone"
|
||||
}
|
||||
|
||||
variable "subnet_id" {
|
||||
type = string
|
||||
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
|
||||
}
|
|
@ -32,3 +32,87 @@ resource "azurerm_storage_account_network_rules" "firewall_rules" {
|
|||
virtual_network_subnet_ids = var.firewall_virtual_network_subnet_ids
|
||||
bypass = var.firewall_bypass
|
||||
}
|
||||
|
||||
# DNS Zones
|
||||
|
||||
resource "azurerm_private_dns_zone" "st_zone_blob" {
|
||||
name = "privatelink.blob.core.windows.net"
|
||||
resource_group_name = var.rg_name
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
resource "azurerm_private_dns_zone" "st_zone_file" {
|
||||
name = "privatelink.file.core.windows.net"
|
||||
resource_group_name = var.rg_name
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
# Linking of DNS zones to Virtual Network
|
||||
|
||||
resource "azurerm_private_dns_zone_virtual_network_link" "st_zone_link_blob" {
|
||||
name = "${var.prefix}${var.postfix}_link_st_blob"
|
||||
resource_group_name = var.rg_name
|
||||
private_dns_zone_name = azurerm_private_dns_zone.st_zone_blob[0].name
|
||||
virtual_network_id = var.vnet_id
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
resource "azurerm_private_dns_zone_virtual_network_link" "st_zone_link_file" {
|
||||
name = "${var.prefix}${var.postfix}_link_st_file"
|
||||
resource_group_name = var.rg_name
|
||||
private_dns_zone_name = azurerm_private_dns_zone.st_zone_file[0].name
|
||||
virtual_network_id = var.vnet_id
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
# Private Endpoint configuration
|
||||
|
||||
resource "azurerm_private_endpoint" "st_pe_blob" {
|
||||
name = "pe-${azurerm_storage_account.st.name}-blob"
|
||||
location = var.location
|
||||
resource_group_name = var.rg_name
|
||||
subnet_id = var.subnet_id
|
||||
|
||||
private_service_connection {
|
||||
name = "psc-blob-${var.prefix}-${var.postfix}${var.env}"
|
||||
private_connection_resource_id = azurerm_storage_account.st.id
|
||||
subresource_names = ["blob"]
|
||||
is_manual_connection = false
|
||||
}
|
||||
|
||||
private_dns_zone_group {
|
||||
name = "private-dns-zone-group-blob"
|
||||
private_dns_zone_ids = [azurerm_private_dns_zone.st_zone_blob[0].id]
|
||||
}
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
|
||||
tags = var.tags
|
||||
}
|
||||
|
||||
resource "azurerm_private_endpoint" "st_pe_file" {
|
||||
name = "pe-${azurerm_storage_account.st.name}-file"
|
||||
location = var.location
|
||||
resource_group_name = var.rg_name
|
||||
subnet_id = var.subnet_id
|
||||
|
||||
private_service_connection {
|
||||
name = "psc-file-${var.prefix}-${var.postfix}${var.env}"
|
||||
private_connection_resource_id = azurerm_storage_account.st.id
|
||||
subresource_names = ["file"]
|
||||
is_manual_connection = false
|
||||
}
|
||||
|
||||
private_dns_zone_group {
|
||||
name = "private-dns-zone-group-file"
|
||||
private_dns_zone_ids = [azurerm_private_dns_zone.st_zone_file[0].id]
|
||||
}
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
|
||||
tags = var.tags
|
||||
}
|
|
@ -42,3 +42,17 @@ variable "firewall_virtual_network_subnet_ids" {
|
|||
variable "firewall_bypass" {
|
||||
default = ["None"]
|
||||
}
|
||||
|
||||
variable "enable_aml_secure_workspace" {
|
||||
description = "Variable to enable or disable AML secure workspace"
|
||||
}
|
||||
|
||||
variable "vnet_id" {
|
||||
type = string
|
||||
description = "The ID of the vnet that should be linked to the DNS zone"
|
||||
}
|
||||
|
||||
variable "subnet_id" {
|
||||
type = string
|
||||
description = "The ID of the subnet from which private IP addresses will be allocated for this Private Endpoint"
|
||||
}
|
|
@ -0,0 +1,104 @@
|
|||
resource "azurerm_virtual_machine" "vm" {
|
||||
name = "wvm-jumphost"
|
||||
location = var.location
|
||||
resource_group_name = var.rg_name
|
||||
network_interface_ids = [azurerm_network_interface.vm_nic[0].id]
|
||||
vm_size = "Standard_DS3_v2"
|
||||
|
||||
delete_os_disk_on_termination = true
|
||||
delete_data_disks_on_termination = true
|
||||
|
||||
storage_image_reference {
|
||||
publisher = "microsoft-dsvm"
|
||||
offer = "dsvm-win-2019"
|
||||
sku = "server-2019"
|
||||
version = "latest"
|
||||
}
|
||||
|
||||
os_profile {
|
||||
computer_name = var.jumphost_username
|
||||
admin_username = var.jumphost_username
|
||||
admin_password = var.jumphost_password
|
||||
}
|
||||
|
||||
os_profile_windows_config {
|
||||
provision_vm_agent = true
|
||||
enable_automatic_upgrades = true
|
||||
}
|
||||
|
||||
identity {
|
||||
type = "SystemAssigned"
|
||||
}
|
||||
|
||||
storage_os_disk {
|
||||
name = "disk-${var.prefix}-${var.postfix}${var.env}"
|
||||
caching = "ReadWrite"
|
||||
create_option = "FromImage"
|
||||
managed_disk_type = "StandardSSD_LRS"
|
||||
}
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
|
||||
tags = var.tags
|
||||
}
|
||||
|
||||
resource "azurerm_network_interface" "vm_nic" {
|
||||
name = "nic-${var.prefix}-${var.postfix}${var.env}"
|
||||
location = var.location
|
||||
resource_group_name = var.rg_name
|
||||
|
||||
ip_configuration {
|
||||
name = "configuration"
|
||||
private_ip_address_allocation = "Dynamic"
|
||||
subnet_id = var.subnet_id
|
||||
# public_ip_address_id = azurerm_public_ip.vm_public_ip.id
|
||||
}
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
|
||||
tags = var.tags
|
||||
}
|
||||
|
||||
resource "azurerm_network_security_group" "vm_nsg" {
|
||||
name = "nsg-${var.prefix}-${var.postfix}${var.env}"
|
||||
location = var.location
|
||||
resource_group_name = var.rg_name
|
||||
|
||||
security_rule {
|
||||
name = "RDP"
|
||||
priority = 1010
|
||||
direction = "Inbound"
|
||||
access = "Allow"
|
||||
protocol = "Tcp"
|
||||
source_port_range = "*"
|
||||
destination_port_range = 3389
|
||||
source_address_prefix = "*"
|
||||
destination_address_prefix = "*"
|
||||
}
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
|
||||
tags = var.tags
|
||||
}
|
||||
|
||||
resource "azurerm_network_interface_security_group_association" "vm_nsg_association" {
|
||||
network_interface_id = azurerm_network_interface.vm_nic[0].id
|
||||
network_security_group_id = azurerm_network_security_group.vm_nsg[0].id
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
resource "azurerm_dev_test_global_vm_shutdown_schedule" "vm_schedule" {
|
||||
virtual_machine_id = azurerm_virtual_machine.vm[0].id
|
||||
location = var.location
|
||||
enabled = true
|
||||
|
||||
daily_recurrence_time = "2000"
|
||||
timezone = "W. Europe Standard Time"
|
||||
|
||||
notification_settings {
|
||||
enabled = false
|
||||
}
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
variable "rg_name" {
|
||||
type = string
|
||||
description = "Resource group name"
|
||||
}
|
||||
|
||||
variable "location" {
|
||||
type = string
|
||||
description = "Location of the resource group"
|
||||
}
|
||||
|
||||
variable "tags" {
|
||||
type = map(string)
|
||||
default = {}
|
||||
description = "A mapping of tags which should be assigned to the deployed resource"
|
||||
}
|
||||
|
||||
variable "prefix" {
|
||||
type = string
|
||||
description = "Prefix for the module name"
|
||||
}
|
||||
|
||||
variable "postfix" {
|
||||
type = string
|
||||
description = "Postfix for the module name"
|
||||
}
|
||||
|
||||
variable "env" {
|
||||
type = string
|
||||
description = "Environment prefix"
|
||||
}
|
||||
|
||||
variable "jumphost_username" {
|
||||
type = string
|
||||
description = "VM username"
|
||||
}
|
||||
|
||||
variable "jumphost_password" {
|
||||
type = string
|
||||
description = "VM password"
|
||||
}
|
||||
|
||||
variable "subnet_id" {
|
||||
type = string
|
||||
description = "Subnet ID for the virtual machine"
|
||||
}
|
||||
|
||||
variable "enable_aml_secure_workspace" {
|
||||
description = "Variable to enable or disable AML secure workspace"
|
||||
}
|
|
@ -0,0 +1,131 @@
|
|||
# Virtual network
|
||||
|
||||
resource "azurerm_virtual_network" "vnet_default" {
|
||||
name = "vnet-${var.prefix}-${var.postfix}${var.environment}"
|
||||
resource_group_name = module.resource_group.name
|
||||
location = module.resource_group.location
|
||||
address_space = ["10.0.0.0/16"]
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
|
||||
tags = local.tags
|
||||
}
|
||||
|
||||
# Subnets
|
||||
|
||||
resource "azurerm_subnet" "snet_default" {
|
||||
name = "snet-${var.prefix}-${var.postfix}${var.environment}-default"
|
||||
resource_group_name = module.resource_group.name
|
||||
virtual_network_name = azurerm_virtual_network.vnet_default[0].name
|
||||
address_prefixes = ["10.0.1.0/24"]
|
||||
enforce_private_link_endpoint_network_policies = true
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
resource "azurerm_subnet" "snet_bastion" {
|
||||
name = "AzureBastionSubnet"
|
||||
resource_group_name = module.resource_group.name
|
||||
virtual_network_name = azurerm_virtual_network.vnet_default[0].name
|
||||
address_prefixes = ["10.0.10.0/27"]
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
resource "azurerm_subnet" "snet_training" {
|
||||
name = "snet-${var.prefix}-${var.postfix}${var.environment}-training"
|
||||
resource_group_name = module.resource_group.name
|
||||
virtual_network_name = azurerm_virtual_network.vnet_default[0].name
|
||||
address_prefixes = ["10.0.2.0/24"]
|
||||
enforce_private_link_endpoint_network_policies = true
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
# Network security groups
|
||||
|
||||
resource "azurerm_network_security_group" "nsg_training" {
|
||||
name = "nsg-${var.prefix}-${var.postfix}${var.environment}-training"
|
||||
location = module.resource_group.location
|
||||
resource_group_name = module.resource_group.name
|
||||
|
||||
security_rule {
|
||||
name = "BatchNodeManagement"
|
||||
priority = 100
|
||||
direction = "Inbound"
|
||||
access = "Allow"
|
||||
protocol = "Tcp"
|
||||
source_port_range = "*"
|
||||
destination_port_range = "29876-29877"
|
||||
source_address_prefix = "BatchNodeManagement"
|
||||
destination_address_prefix = "*"
|
||||
}
|
||||
|
||||
security_rule {
|
||||
name = "AzureMachineLearning"
|
||||
priority = 110
|
||||
direction = "Inbound"
|
||||
access = "Allow"
|
||||
protocol = "Tcp"
|
||||
source_port_range = "*"
|
||||
destination_port_range = "44224"
|
||||
source_address_prefix = "AzureMachineLearning"
|
||||
destination_address_prefix = "*"
|
||||
}
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
resource "azurerm_subnet_network_security_group_association" "nsg-training-link" {
|
||||
subnet_id = azurerm_subnet.snet_training[0].id
|
||||
network_security_group_id = azurerm_network_security_group.nsg_training[0].id
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
# User Defined Routes
|
||||
|
||||
resource "azurerm_route_table" "rt_training" {
|
||||
name = "rt-${var.prefix}-${var.postfix}${var.environment}-training"
|
||||
location = module.resource_group.location
|
||||
resource_group_name = module.resource_group.name
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
resource "azurerm_route" "route_training_internet" {
|
||||
name = "Internet"
|
||||
resource_group_name = module.resource_group.name
|
||||
route_table_name = azurerm_route_table.rt_training[0].name
|
||||
address_prefix = "0.0.0.0/0"
|
||||
next_hop_type = "Internet"
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
resource "azurerm_route" "route_training_aml" {
|
||||
name = "AzureMLRoute"
|
||||
resource_group_name = module.resource_group.name
|
||||
route_table_name = azurerm_route_table.rt_training[0].name
|
||||
address_prefix = "AzureMachineLearning"
|
||||
next_hop_type = "Internet"
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
resource "azurerm_route" "route_training_batch" {
|
||||
name = "BatchRoute"
|
||||
resource_group_name = module.resource_group.name
|
||||
route_table_name = azurerm_route_table.rt_training[0].name
|
||||
address_prefix = "BatchNodeManagement"
|
||||
next_hop_type = "Internet"
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
||||
|
||||
resource "azurerm_subnet_route_table_association" "rt_training_link" {
|
||||
subnet_id = azurerm_subnet.snet_training[0].id
|
||||
route_table_id = azurerm_route_table.rt_training[0].id
|
||||
|
||||
count = var.enable_aml_secure_workspace ? 1 : 0
|
||||
}
|
|
@ -9,6 +9,14 @@ variables:
|
|||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
|
||||
parameters:
|
||||
- name: jumphost_username
|
||||
type: string
|
||||
default: "azureuser"
|
||||
- name: jumphost_password
|
||||
type: string
|
||||
default: "ThisIsNotVerySecure!"
|
||||
|
||||
trigger:
|
||||
- none
|
||||
|
||||
|
@ -51,4 +59,10 @@ stages :
|
|||
- template: templates/infra/run-terraform-init.yml@mlops-templates
|
||||
- template: templates/infra/run-terraform-validate.yml@mlops-templates
|
||||
- template: templates/infra/run-terraform-plan.yml@mlops-templates
|
||||
parameters:
|
||||
jumphost_username: ${{parameters.jumphost_username}}
|
||||
jumphost_password: ${{parameters.jumphost_password}}
|
||||
- template: templates/infra/run-terraform-apply.yml@mlops-templates
|
||||
parameters:
|
||||
jumphost_username: ${{parameters.jumphost_username}}
|
||||
jumphost_password: ${{parameters.jumphost_password}}
|
||||
|
|
|
@ -21,3 +21,27 @@ variable "postfix" {
|
|||
variable "enable_aml_computecluster" {
|
||||
description = "Variable to enable or disable AML compute cluster"
|
||||
}
|
||||
|
||||
variable "enable_aml_secure_workspace" {
|
||||
description = "Variable to enable or disable AML secure workspace"
|
||||
}
|
||||
|
||||
variable "jumphost_username" {
|
||||
type = string
|
||||
description = "VM username"
|
||||
default = "azureuser"
|
||||
}
|
||||
|
||||
variable "jumphost_password" {
|
||||
type = string
|
||||
description = "VM password"
|
||||
default = "ThisIsNotVerySecure!"
|
||||
}
|
||||
|
||||
variable "enable_monitoring" {
|
||||
description = "Variable to enable or disable Monitoring"
|
||||
}
|
||||
|
||||
variable "client_secret" {
|
||||
description = "Service Principal Secret"
|
||||
}
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
name: nlp_inference_conda_env
|
||||
channels:
|
||||
- pytorch
|
||||
- anaconda
|
||||
- defaults
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.8
|
||||
- pip=21.2.4
|
||||
- pytorch=1.10.0
|
||||
- torchvision=0.11.1
|
||||
- torchaudio=0.10.0
|
||||
- cudatoolkit=11.1.1
|
||||
- nvidia-apex=0.1.0
|
||||
- gxx_linux-64=8.5.0
|
||||
- pip:
|
||||
- azureml-defaults==1.39.0
|
||||
- azureml-mlflow==1.39.0
|
||||
- azureml-telemetry==1.39.0
|
||||
- azureml-train-core==1.39.0
|
||||
- mlflow==1.24.0
|
||||
- transformers==4.17.0
|
||||
- 'inference-schema[numpy-support]==1.3.0'
|
||||
- applicationinsights==0.11.10
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче