Merge pull request #121 from Azure/main-dec31

Merge main-dec31 into main
This commit is contained in:
Scott Donohoo 2023-01-05 09:24:12 -06:00 коммит произвёл GitHub
Родитель 15d5a58521 52efca0aca
Коммит 09470468e1
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
76 изменённых файлов: 13999 добавлений и 444 удалений

Просмотреть файл

@ -7,7 +7,8 @@ dependencies:
- pip
- pip:
- azureml-mlflow==1.38.0
- azureml-sdk==1.38.0
- azure-ai-ml==1.0.0
- pyarrow==10.0.0
- scikit-learn==0.24.1
- pandas==1.2.1
- joblib==1.0.0

Просмотреть файл

@ -4,6 +4,12 @@
Prepares raw data and provides training, validation and test datasets
"""
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""
Prepares raw data and provides training, validation and test datasets
"""
import argparse
from pathlib import Path
@ -41,21 +47,52 @@ CAT_NOM_COLS = [
"vendor",
]
CAT_ORD_COLS = [
]
TARGET_COL = "cost"
NUMERIC_COLS = [
"distance",
"dropoff_latitude",
"dropoff_longitude",
"passengers",
"pickup_latitude",
"pickup_longitude",
"pickup_weekday",
"pickup_month",
"pickup_monthday",
"pickup_hour",
"pickup_minute",
"pickup_second",
"dropoff_weekday",
"dropoff_month",
"dropoff_monthday",
"dropoff_hour",
"dropoff_minute",
"dropoff_second",
]
CAT_NOM_COLS = [
"store_forward",
"vendor",
]
CAT_ORD_COLS = [
]
def parse_args():
'''Parse input arguments'''
'''Parse input arguments'''
parser = argparse.ArgumentParser("prep")
parser.add_argument("--raw_data", type=str, help="Path to raw data")
parser.add_argument("--train_data", type=str, help="Path to train dataset")
parser.add_argument("--val_data", type=str, help="Path to test dataset")
parser.add_argument("--test_data", type=str, help="Path to test dataset")
parser.add_argument("--enable_monitoring", type=str, help="enable logging to ADX")
parser.add_argument("--table_name", type=str, default="mlmonitoring", help="Table name in ADX for logging")
args = parser.parse_args()
return args
@ -65,17 +102,15 @@ def log_training_data(df, table_name):
collector = Online_Collector(table_name)
collector.batch_collect(df)
def main(args):
'''Read, split, and save datasets'''
def main(args):
'''Read, split, and save datasets'''
# ------------ Reading Data ------------ #
# -------------------------------------- #
print("mounted_path files: ")
arr = os.listdir(args.raw_data)
print(arr)
data = pd.read_csv((Path(args.raw_data) / 'taxi-data.csv'))
data = pd.read_csv((Path(args.raw_data)))
data = data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS + [TARGET_COL]]
# ------------- Split Data ------------- #
@ -92,19 +127,24 @@ def main(args):
train = data[msk_train]
val = data[msk_val]
test = data[msk_test]
test = data[msk_test]
mlflow.log_metric('train size', train.shape[0])
mlflow.log_metric('val size', val.shape[0])
mlflow.log_metric('test size', test.shape[0])
train.to_parquet((Path(args.train_data) / "train.parquet"))
val.to_parquet((Path(args.val_data) / "val.parquet"))
test.to_parquet((Path(args.test_data) / "test.parquet"))
train.to_parquet((Path(args.train_data) / "train.parquet"))
val.to_parquet((Path(args.val_data) / "val.parquet"))
test.to_parquet((Path(args.test_data) / "test.parquet"))
if (args.enable_monitoring.lower == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower == 'yes'):
if (args.enable_monitoring.lower() == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower() == 'yes'):
log_training_data(data, args.table_name)
if __name__ == "__main__":
mlflow.start_run()
@ -124,9 +164,9 @@ if __name__ == "__main__":
for line in lines:
print(line)
main(args)
mlflow.end_run()

Просмотреть файл

@ -0,0 +1,11 @@
$schema: https://azuremlschemas.azureedge.net/latest/data.schema.json
# Supported paths include:
# local: ./<path>
# blob: https://<account_name>.blob.core.windows.net/<container_name>/<path>
# ADLS gen2: abfss://<file_system>@<account_name>.dfs.core.windows.net/<path>/
# Datastore: azureml://datastores/<data_store_name>/paths/<path>
type: uri_file
name: taxi-data
description: taxi dataset
path: ../../../data/taxi-data.csv

Просмотреть файл

@ -1,17 +1,18 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline
experiment_name: taxi-fare-training
experiment_name: taxi-fare-training
description: Training Pipeline to train a model that predicts taxi fare price
# <inputs_and_outputs>
inputs:
input: #using local data, will create an anonymous data asset
type: uri_folder
path: ../../../data/
input:
type: uri_file
path: azureml:taxi-data@latest
enable_monitoring: 'false'
table_name: 'taximonitoring'
outputs:
outputs:
train_data:
val_data:
test_data:
@ -32,8 +33,8 @@ jobs:
display_name: prep-data
code: ../../../data-science/src
command: >-
python prep.py
--raw_data ${{inputs.raw_data}}
python prep.py
--raw_data ${{inputs.raw_data}}
--train_data ${{outputs.train_data}}
--val_data ${{outputs.val_data}}
--test_data ${{outputs.test_data}}
@ -54,24 +55,24 @@ jobs:
display_name: train-model
code: ../../../data-science/src
command: >-
python train.py
--train_data ${{inputs.train_data}}
python train.py
--prepared_data ${{inputs.prepared_data}}
--model_output ${{outputs.model_output}}
environment: azureml:taxi-train-env@latest
inputs:
train_data: ${{parent.jobs.prep_data.outputs.train_data}}
outputs:
model_output: ${{parent.outputs.trained_model}}
evaluate_model:
name: evaluate_model
display_name: evaluate-model
code: ../../../data-science/src
command: >-
python evaluate.py
--model_name ${{inputs.model_name}}
--model_input ${{inputs.model_input}}
--test_data ${{inputs.test_data}}
python evaluate.py
--model_name ${{inputs.model_name}}
--model_input ${{inputs.model_input}}
--test_data ${{inputs.test_data}}
--evaluation_output ${{outputs.evaluation_output}}
environment: azureml:taxi-train-env@latest
inputs:
@ -86,9 +87,9 @@ jobs:
display_name: register-model
code: ../../../data-science/src
command: >-
python register.py
--model_name ${{inputs.model_name}}
--model_path ${{inputs.model_path}}
python register.py
--model_name ${{inputs.model_name}}
--model_path ${{inputs.model_path}}
--evaluation_output ${{inputs.evaluation_output}}
--model_info_output_path ${{outputs.model_info_output_path}}
environment: azureml:taxi-train-env@latest

Просмотреть файл

@ -2,61 +2,62 @@
# Licensed under the MIT License.
variables:
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: aml-cli-v2
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: aml-cli-v2
trigger:
- none
- none
pool:
vmImage: ubuntu-20.04
resources:
repositories:
- repository: mlops-templates # Template Repo
- repository: mlops-templates # Template Repo
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: DeployTrainingPipeline
displayName: Deploy Training Pipeline
jobs:
- job: DeployTrainingPipeline
timeoutInMinutes: 120 # how long to run the job before automatically cancelling
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
parameters:
build_type: conda
environment_name: taxi-train-env
environment_file: mlops/azureml/train/train-env.yml
enable_monitoring: $(enable_monitoring)
- template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
parameters:
cluster_name: cpu-cluster
size: Standard_DS3_v2
min_instances: 0
max_instances: 4
cluster_tier: LowPriority
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
parameters:
pipeline_file: mlops/azureml/train/pipeline.yml
experiment_name: $(environment)_taxi_fare_train_$(Build.SourceBranchName)
display_name: $(environment)_taxi_fare_run_$(Build.BuildID)
enable_monitoring: $(enable_monitoring)
- stage: DeployTrainingPipeline
displayName: Deploy Training Pipeline
jobs:
- job: DeployTrainingPipeline
timeoutInMinutes: 120 # how long to run the job before automatically cancelling
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
parameters:
environment_name: taxi-train-env
environment_file: mlops/azureml/train/train-env.yml
- template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
parameters:
cluster_name: cpu-cluster
size: Standard_DS3_v2
min_instances: 0
max_instances: 4
cluster_tier: low_priority
- template: templates/${{ variables.version }}/register-data.yml@mlops-templates
parameters:
data_type: uri_file
data_name: taxi-data
data_file: mlops/azureml/train/data.yml
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
parameters:
pipeline_file: mlops/azureml/train/pipeline.yml
experiment_name: $(environment)_taxi_fare_train_$(Build.SourceBranchName)
display_name: $(environment)_taxi_fare_run_$(Build.BuildID)
enable_monitoring: $(enable_monitoring)

Просмотреть файл

@ -1,44 +1,63 @@
name: deploy-batch-endpoint-pipeline
on:
on:
workflow_dispatch:
jobs:
set-env-branch:
runs-on: ubuntu-latest
outputs:
config-file: ${{ steps.set-output-defaults.outputs.config-file }}
steps:
- id: set-prod-branch
name: set-prod-branch
if: ${{ github.ref == 'refs/heads/main'}}
run: echo "config_env=config-infra-prod.yml" >> $GITHUB_ENV;
- id: set-dev-branch
name: setdevbranch
if: ${{ github.ref != 'refs/heads/main'}}
run: echo "config_env=config-infra-dev.yml" >> $GITHUB_ENV;
- id: set-output-defaults
name: set-output-defaults
run: |
echo "config-file=$config_env" >> $GITHUB_OUTPUT;
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
needs: set-env-branch
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: ${{ needs.set-env-branch.outputs.config-file}}
create-compute:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
with:
cluster_name: batch-cluster
size: STANDARD_DS3_V2
min_instances: 0
max_instances: 5
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
with:
cluster_name: batch-cluster
size: STANDARD_DS3_V2
min_instances: 0
max_instances: 5
cluster_tier: low_priority
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-endpoint:
needs: [get-config,create-compute]
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/batch/batch-endpoint.yml
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
endpoint_type: batch
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
needs: [get-config, create-compute]
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/batch/batch-endpoint.yml
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
endpoint_type: batch
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-deployment:
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
needs: [get-config,create-endpoint]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/batch/batch-deployment.yml
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
endpoint_type: batch
deployment_name: eptestdeploy
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
needs: [get-config, create-endpoint]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/batch/batch-deployment.yml
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
endpoint_type: batch
deployment_name: eptestdeploy
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}

Просмотреть файл

@ -1,42 +1,70 @@
name: deploy-model-training-pipeline
on:
on:
workflow_dispatch:
jobs:
set-env-branch:
runs-on: ubuntu-latest
outputs:
config-file: ${{ steps.set-output-defaults.outputs.config-file }}
steps:
- id: set-prod-branch
name: set-prod-branch
if: ${{ github.ref == 'refs/heads/main'}}
run: echo "config_env=config-infra-prod.yml" >> $GITHUB_ENV;
- id: set-dev-branch
name: setdevbranch
if: ${{ github.ref != 'refs/heads/main'}}
run: echo "config_env=config-infra-dev.yml" >> $GITHUB_ENV;
- id: set-output-defaults
name: set-output-defaults
run: |
echo "config-file=$config_env" >> $GITHUB_OUTPUT;
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
needs: set-env-branch
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: ${{ needs.set-env-branch.outputs.config-file}}
register-environment:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/register-environment.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
environment_file: mlops/azureml/train/train-env.yml
conda_file: data-science/environment/train-conda.yml
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
needs: get-config
uses: Azure/mlops-templates/.github/workflows/register-environment.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
environment_file: mlops/azureml/train/train-env.yml
conda_file: data-science/environment/train-conda.yml
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
register-dataset:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/register-dataset.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
name: taxi-data
data_file: mlops/azureml/train/data.yml
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-compute:
needs: [get-config,register-environment]
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
with:
cluster_name: cpu-cluster
size: Standard_DS3_v2
min_instances: 0
max_instances: 4
cluster_tier: LowPriority
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
run-pipeline:
needs: [get-config,register-environment,create-compute]
uses: Azure/mlops-templates/.github/workflows/run-pipeline.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
parameters-file: mlops/azureml/train/pipeline.yml
job-name: test
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
needs: [get-config]
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
with:
cluster_name: cpu-cluster
size: Standard_DS3_v2
min_instances: 0
max_instances: 4
cluster_tier: low_priority
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
run-model-training-pipeline:
needs: [get-config, register-environment, register-dataset, create-compute]
uses: Azure/mlops-templates/.github/workflows/run-pipeline.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
parameters-file: mlops/azureml/train/pipeline.yml
job-name: test
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}

Просмотреть файл

@ -1,42 +1,60 @@
name: deploy-online-endpoint-pipeline
on:
on:
workflow_dispatch:
jobs:
set-env-branch:
runs-on: ubuntu-latest
outputs:
config-file: ${{ steps.set-output-defaults.outputs.config-file }}
steps:
- id: set-prod-branch
name: set-prod-branch
if: ${{ github.ref == 'refs/heads/main'}}
run: echo "config_env=config-infra-prod.yml" >> $GITHUB_ENV;
- id: set-dev-branch
name: setdevbranch
if: ${{ github.ref != 'refs/heads/main'}}
run: echo "config_env=config-infra-dev.yml" >> $GITHUB_ENV;
- id: set-output-defaults
name: set-output-defaults
run: |
echo "config-file=$config_env" >> $GITHUB_OUTPUT;
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
needs: set-env-branch
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: ${{ needs.set-env-branch.outputs.config-file}}
create-endpoint:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.oep) }}
endpoint_type: online
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.oep) }}
endpoint_type: online
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-deployment:
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
needs: [get-config,create-endpoint]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-deployment.yml
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.oep) }}
endpoint_type: online
deployment_name: taxi-online-dp
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
needs: [get-config, create-endpoint]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-deployment.yml
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.oep) }}
endpoint_type: online
deployment_name: taxi-online-dp
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
allocate-traffic:
uses: Azure/mlops-templates/.github/workflows/allocate-traffic.yml@main
needs: [get-config,create-deployment]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
traffic_allocation: taxi-online-dp=100
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.oep) }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
uses: Azure/mlops-templates/.github/workflows/allocate-traffic.yml@main
needs: [get-config, create-deployment]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
traffic_allocation: taxi-online-dp=100
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.oep) }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}

Просмотреть файл

@ -10,7 +10,7 @@ variables:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: python-sdk
value: python-sdk-v1
trigger:
- none

Просмотреть файл

@ -10,7 +10,7 @@ variables:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: python-sdk
value: python-sdk-v1
trigger:
- none

Просмотреть файл

@ -10,7 +10,7 @@ variables:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: python-sdk
value: python-sdk-v1
trigger:
- none

Просмотреть файл

@ -7,7 +7,8 @@ dependencies:
- pip
- pip:
- azureml-mlflow==1.38.0
- azureml-sdk==1.38.0
- azure-ai-ml==1.0.0
- pyarrow==10.0.0
- scikit-learn==0.24.1
- pandas==1.2.1
- joblib==1.0.0

Просмотреть файл

@ -0,0 +1,174 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""
Evaluates trained ML model using test dataset.
Saves predictions, evaluation results and deploy flag.
"""
import argparse
from pathlib import Path
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import mlflow
import mlflow.sklearn
import mlflow.pyfunc
from mlflow.tracking import MlflowClient
# Regression target: the taxi trip fare.
TARGET_COL = "cost"

# Numeric feature columns: trip geometry plus pickup/dropoff timestamp parts.
NUMERIC_COLS = [
    "distance", "dropoff_latitude", "dropoff_longitude", "passengers", "pickup_latitude",
    "pickup_longitude", "pickup_weekday", "pickup_month", "pickup_monthday", "pickup_hour",
    "pickup_minute", "pickup_second", "dropoff_weekday", "dropoff_month", "dropoff_monthday",
    "dropoff_hour", "dropoff_minute", "dropoff_second"
]

# Nominal (unordered) categorical features.
CAT_NOM_COLS = [
    "store_forward", "vendor"
]

# Ordinal categorical features (none for this dataset; kept for symmetry).
CAT_ORD_COLS = [
]
def parse_args():
    """Build the CLI parser for the evaluation step and return parsed args."""
    cli = argparse.ArgumentParser("predict")
    cli.add_argument("--model_name", type=str, help="Name of registered model")
    cli.add_argument("--model_input", type=str, help="Path of input model")
    cli.add_argument("--test_data", type=str, help="Path to test dataset")
    cli.add_argument("--evaluation_output", type=str, help="Path of eval results")
    return cli.parse_args()
def main(args):
    '''Read trained model and test dataset, evaluate model and save result'''

    # Load the test data (parquet produced by the prep step)
    test_data = pd.read_parquet(Path(args.test_data))

    # Split the data into inputs and outputs
    y_test = test_data[TARGET_COL]
    X_test = test_data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS]

    # Load the model from input port
    model = mlflow.sklearn.load_model(args.model_input)

    # ---------------- Model Evaluation ---------------- #
    yhat_test, score = model_evaluation(X_test, y_test, model, args.evaluation_output)

    # ----------------- Model Promotion ---------------- #
    # NOTE(review): the returned values are unused here; model_promotion
    # persists the deploy flag to evaluation_output itself.
    predictions, deploy_flag = model_promotion(args.model_name, args.evaluation_output, X_test, y_test, yhat_test, score)
def model_evaluation(X_test, y_test, model, evaluation_output):
    """Score `model` on the test set and persist predictions and metrics.

    Writes predictions.csv and score.txt under `evaluation_output`, logs
    r2/mse/rmse/mae and a scatter plot to mlflow.

    Returns:
        (yhat_test, r2): predictions array and the r2 score on the test set.
    """
    # Get predictions for the test inputs
    yhat_test = model.predict(X_test)

    # Save feature columns with predicted and actual cost as csv
    output_data = X_test.copy()
    output_data["real_label"] = y_test
    output_data["predicted_label"] = yhat_test
    output_data.to_csv(Path(evaluation_output) / "predictions.csv")

    # Evaluate model performance with the test set
    r2 = r2_score(y_test, yhat_test)
    mse = mean_squared_error(y_test, yhat_test)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, yhat_test)

    # Print score report to a text file
    (Path(evaluation_output) / "score.txt").write_text(
        f"Scored with the following model:\n{format(model)}"
    )
    # BUG FIX: these were plain strings (no f-prefix) with an invalid format
    # spec "{mse.2f}", so the literal placeholders were written to the file
    # instead of the metric values.
    with open(Path(evaluation_output) / "score.txt", "a") as outfile:
        outfile.write(f"Mean squared error: {mse:.2f} \n")
        outfile.write(f"Root mean squared error: {rmse:.2f} \n")
        outfile.write(f"Mean absolute error: {mae:.2f} \n")
        outfile.write(f"Coefficient of determination: {r2:.2f} \n")

    mlflow.log_metric("test r2", r2)
    mlflow.log_metric("test mse", mse)
    mlflow.log_metric("test rmse", rmse)
    mlflow.log_metric("test mae", mae)

    # Visualize predicted vs real values
    plt.scatter(y_test, yhat_test, color='black')
    plt.plot(y_test, y_test, color='blue', linewidth=3)
    plt.xlabel("Real value")
    plt.ylabel("Predicted value")
    plt.title("Comparing Model Predictions to Real values - Test Data")
    plt.savefig("predictions.png")
    mlflow.log_artifact("predictions.png")

    return yhat_test, r2
def model_promotion(model_name, evaluation_output, X_test, y_test, yhat_test, score):
    """Compare the current model against registered versions and set deploy flag.

    Scores every registered version of `model_name` on the test set. The
    current model is promoted (deploy_flag=1) only when its r2 `score` is at
    least as good as every registered version, or when none exist yet.
    Writes the flag to `<evaluation_output>/deploy_flag` and saves a bar chart
    comparing r2 scores.

    Returns:
        (predictions, deploy_flag)
    """
    scores = {}
    predictions = {}

    client = MlflowClient()

    # Score every previously registered version of this model on the same data
    for model_run in client.search_model_versions(f"name='{model_name}'"):
        model_version = model_run.version
        mdl = mlflow.pyfunc.load_model(
            model_uri=f"models:/{model_name}/{model_version}")
        predictions[f"{model_name}:{model_version}"] = mdl.predict(X_test)
        scores[f"{model_name}:{model_version}"] = r2_score(
            y_test, predictions[f"{model_name}:{model_version}"])

    # Promote when at least as good as every registered version (or first model)
    if scores:
        if score >= max(list(scores.values())):
            deploy_flag = 1
        else:
            deploy_flag = 0
    else:
        deploy_flag = 1
    print(f"Deploy flag: {deploy_flag}")

    with open((Path(evaluation_output) / "deploy_flag"), 'w') as outfile:
        outfile.write(f"{int(deploy_flag)}")

    # add current model score and predictions
    scores["current model"] = score
    # BUG FIX: key was misspelled "currrent model", mislabeling the chart entry
    predictions["current model"] = yhat_test

    perf_comparison_plot = pd.DataFrame(
        scores, index=["r2 score"]).plot(kind='bar', figsize=(15, 10))
    perf_comparison_plot.figure.savefig("perf_comparison.png")
    perf_comparison_plot.figure.savefig(Path(evaluation_output) / "perf_comparison.png")

    mlflow.log_metric("deploy flag", bool(deploy_flag))
    mlflow.log_artifact("perf_comparison.png")

    return predictions, deploy_flag
if __name__ == "__main__":

    mlflow.start_run()

    args = parse_args()

    # Echo resolved arguments for traceability in the job logs
    lines = [
        f"Model name: {args.model_name}",
        f"Model path: {args.model_input}",
        f"Test data path: {args.test_data}",
        f"Evaluation output path: {args.evaluation_output}",
    ]

    for line in lines:
        print(line)

    main(args)

    mlflow.end_run()

Просмотреть файл

@ -0,0 +1,113 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""
Prepares raw data and provides training, validation and test datasets
"""
import argparse
from pathlib import Path
import os
import numpy as np
import pandas as pd
import mlflow
# Regression target: the taxi trip fare.
TARGET_COL = "cost"

# Numeric feature columns: trip geometry plus pickup/dropoff timestamp parts.
NUMERIC_COLS = [
    "distance", "dropoff_latitude", "dropoff_longitude", "passengers", "pickup_latitude",
    "pickup_longitude", "pickup_weekday", "pickup_month", "pickup_monthday", "pickup_hour",
    "pickup_minute", "pickup_second", "dropoff_weekday", "dropoff_month", "dropoff_monthday",
    "dropoff_hour", "dropoff_minute", "dropoff_second"
]

# Nominal (unordered) categorical features.
CAT_NOM_COLS = [
    "store_forward", "vendor"
]

# Ordinal categorical features (none for this dataset; kept for symmetry).
CAT_ORD_COLS = [
]
def parse_args():
    """Build the CLI parser for the prep step and return the parsed namespace."""
    arg_parser = argparse.ArgumentParser("prep")
    arg_parser.add_argument("--raw_data", type=str, help="Path to raw data")
    arg_parser.add_argument("--train_data", type=str, help="Path to train dataset")
    arg_parser.add_argument("--val_data", type=str, help="Path to test dataset")
    arg_parser.add_argument("--test_data", type=str, help="Path to test dataset")
    arg_parser.add_argument("--enable_monitoring", type=str, help="enable logging to ADX")
    arg_parser.add_argument("--table_name", type=str, default="mlmonitoring", help="Table name in ADX for logging")
    return arg_parser.parse_args()
def log_training_data(df, table_name):
    """Push the full training dataframe to the ADX table via the collector.

    Imported lazily because the obs package is only present when monitoring
    is enabled in the environment.
    """
    from obs.collector import Online_Collector
    collector = Online_Collector(table_name)
    collector.batch_collect(df)
def main(args):
    """Read the raw dataset, split into train/val/test, and save as parquet.

    Logs the split sizes to mlflow and, when --enable_monitoring is truthy,
    ships the full dataset to ADX via log_training_data.
    """
    # ------------ Reading Data ------------ #
    # -------------------------------------- #
    data = pd.read_csv(Path(args.raw_data))
    data = data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS + [TARGET_COL]]

    # ------------- Split Data ------------- #
    # -------------------------------------- #
    # Random 70/15/15 split into train, val and test datasets
    random_data = np.random.rand(len(data))
    msk_train = random_data < 0.7
    msk_val = (random_data >= 0.7) & (random_data < 0.85)
    msk_test = random_data >= 0.85

    train = data[msk_train]
    val = data[msk_val]
    test = data[msk_test]

    mlflow.log_metric('train size', train.shape[0])
    mlflow.log_metric('val size', val.shape[0])
    mlflow.log_metric('test size', test.shape[0])

    train.to_parquet(Path(args.train_data) / "train.parquet")
    val.to_parquet(Path(args.val_data) / "val.parquet")
    test.to_parquet(Path(args.test_data) / "test.parquet")

    # BUG FIX: the original compared the bound method `.lower` (never equal to
    # a string) so monitoring only fired for the literal '1'; the parallel
    # copy of prep.py in this commit already carries this exact fix.
    if (args.enable_monitoring.lower() == 'true'
            or args.enable_monitoring == '1'
            or args.enable_monitoring.lower() == 'yes'):
        log_training_data(data, args.table_name)
if __name__ == "__main__":

    mlflow.start_run()

    # ---------- Parse Arguments ----------- #
    # -------------------------------------- #
    args = parse_args()

    # Echo resolved arguments for traceability in the job logs
    lines = [
        f"Raw data path: {args.raw_data}",
        f"Train dataset output path: {args.train_data}",
        f"Val dataset output path: {args.val_data}",
        f"Test dataset path: {args.test_data}",
    ]

    for line in lines:
        print(line)

    main(args)

    mlflow.end_run()

Просмотреть файл

@ -0,0 +1,85 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""
Registers trained ML model if deploy flag is True.
"""
import argparse
from pathlib import Path
import pickle
import mlflow
import os
import json
def parse_args():
    """Build the CLI parser for the registration step and return parsed args."""
    cli = argparse.ArgumentParser()
    cli.add_argument('--model_name', type=str, help='Name under which model will be registered')
    cli.add_argument('--model_path', type=str, help='Model directory')
    cli.add_argument('--evaluation_output', type=str, help='Path of eval results')
    cli.add_argument(
        "--model_info_output_path", type=str, help="Path to write model info JSON"
    )
    # parse_known_args: tolerate extra flags injected by the pipeline runner
    parsed, _ = cli.parse_known_args()
    print(f'Arguments: {parsed}')
    return parsed
def main(args):
    """Load the trained model and register it when the deploy flag is set.

    Reads the deploy_flag file produced by the evaluation step; when it is 1,
    logs and registers the model with mlflow and writes model_info.json
    (containing "name:version") to model_info_output_path.
    """
    with open(Path(args.evaluation_output) / "deploy_flag", 'rb') as infile:
        deploy_flag = int(infile.read())

    mlflow.log_metric("deploy flag", int(deploy_flag))
    # BUG FIX: removed the unconditional `deploy_flag = 1` override (debug
    # leftover) that forced registration even when evaluation said not to deploy.

    if deploy_flag == 1:
        print("Registering ", args.model_name)

        # load model
        model = mlflow.sklearn.load_model(args.model_path)

        # log model using mlflow
        mlflow.sklearn.log_model(model, args.model_name)

        # register logged model using mlflow
        run_id = mlflow.active_run().info.run_id
        model_uri = f'runs:/{run_id}/{args.model_name}'
        mlflow_model = mlflow.register_model(model_uri, args.model_name)
        model_version = mlflow_model.version

        # write model info (renamed from `dict`, which shadowed the builtin)
        print("Writing JSON")
        model_info = {"id": "{0}:{1}".format(args.model_name, model_version)}
        output_path = os.path.join(args.model_info_output_path, "model_info.json")
        with open(output_path, "w") as of:
            json.dump(model_info, fp=of)

    else:
        print("Model will not be registered!")
if __name__ == "__main__":

    mlflow.start_run()

    # ---------- Parse Arguments ----------- #
    # -------------------------------------- #
    args = parse_args()

    # Echo resolved arguments for traceability in the job logs
    lines = [
        f"Model name: {args.model_name}",
        f"Model path: {args.model_path}",
        f"Evaluation output path: {args.evaluation_output}",
    ]

    for line in lines:
        print(line)

    main(args)

    mlflow.end_run()

Просмотреть файл

@ -0,0 +1,147 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""
Trains ML model using training dataset. Saves trained model.
"""
import argparse
from pathlib import Path
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import mlflow
import mlflow.sklearn
# Name of the column the model predicts (trip cost).
TARGET_COL = "cost"

# Numeric feature columns fed to the regressor.
NUMERIC_COLS = [
    "distance", "dropoff_latitude", "dropoff_longitude", "passengers", "pickup_latitude",
    "pickup_longitude", "pickup_weekday", "pickup_month", "pickup_monthday", "pickup_hour",
    "pickup_minute", "pickup_second", "dropoff_weekday", "dropoff_month", "dropoff_monthday",
    "dropoff_hour", "dropoff_minute", "dropoff_second"
]

# Nominal (unordered) categorical feature columns.
CAT_NOM_COLS = [
    "store_forward", "vendor"
]

# Ordinal (ordered) categorical feature columns; none for this dataset.
CAT_ORD_COLS = [
]
def parse_args():
    '''Parse input arguments'''

    parser = argparse.ArgumentParser("train")
    parser.add_argument("--train_data", type=str, help="Path to train dataset")
    parser.add_argument("--model_output", type=str, help="Path of output model")

    # classifier specific arguments: (flag, value type, default, help text)
    regressor_options = [
        ('--regressor__n_estimators', int, 500,
         'Number of trees'),
        ('--regressor__bootstrap', int, 1,
         'Method of selecting samples for training each tree'),
        ('--regressor__max_depth', int, 10,
         ' Maximum number of levels in tree'),
        ('--regressor__max_features', str, 'auto',
         'Number of features to consider at every split'),
        ('--regressor__min_samples_leaf', int, 4,
         'Minimum number of samples required at each leaf node'),
        ('--regressor__min_samples_split', int, 5,
         'Minimum number of samples required to split a node'),
    ]
    for flag, value_type, default, help_text in regressor_options:
        parser.add_argument(flag, type=value_type, default=default, help=help_text)

    return parser.parse_args()
def main(args):
    '''Read train dataset, train model, save trained model'''

    # Load the training split written by the prep step
    train_df = pd.read_parquet(Path(args.train_data))

    # Separate the target column (y) from the feature columns (X)
    y_train = train_df[TARGET_COL]
    X_train = train_df[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS]

    # Collect the regressor hyperparameters passed on the command line
    hyperparams = {
        "n_estimators": args.regressor__n_estimators,
        "bootstrap": args.regressor__bootstrap,
        "max_depth": args.regressor__max_depth,
        "max_features": args.regressor__max_features,
        "min_samples_leaf": args.regressor__min_samples_leaf,
        "min_samples_split": args.regressor__min_samples_split,
    }

    # Train a Random Forest Regression Model with the training set
    model = RandomForestRegressor(random_state=0, **hyperparams)

    # log model hyperparameters
    mlflow.log_param("model", "RandomForestRegressor")
    for param_name, param_value in hyperparams.items():
        mlflow.log_param(param_name, param_value)

    # Train model with the train set
    model.fit(X_train, y_train)

    # Predict using the Regression Model
    train_preds = model.predict(X_train)

    # Evaluate Regression performance with the train set
    r2 = r2_score(y_train, train_preds)
    mse = mean_squared_error(y_train, train_preds)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_train, train_preds)

    # log model performance metrics
    mlflow.log_metric("train r2", r2)
    mlflow.log_metric("train mse", mse)
    mlflow.log_metric("train rmse", rmse)
    mlflow.log_metric("train mae", mae)

    # Visualize results: predicted vs. actual cost on the training set
    plt.scatter(y_train, train_preds, color='black')
    plt.plot(y_train, y_train, color='blue', linewidth=3)
    plt.xlabel("Real value")
    plt.ylabel("Predicted value")
    plt.savefig("regression_results.png")
    mlflow.log_artifact("regression_results.png")

    # Save the model
    mlflow.sklearn.save_model(sk_model=model, path=args.model_output)
if __name__ == "__main__":

    mlflow.start_run()

    # ---------- Parse Arguments ----------- #
    # -------------------------------------- #

    args = parse_args()

    # Echo the resolved inputs and hyperparameters into the job logs.
    print(f"Train dataset input path: {args.train_data}")
    print(f"Model output path: {args.model_output}")
    print(f"n_estimators: {args.regressor__n_estimators}")
    print(f"bootstrap: {args.regressor__bootstrap}")
    print(f"max_depth: {args.regressor__max_depth}")
    print(f"max_features: {args.regressor__max_features}")
    print(f"min_samples_leaf: {args.regressor__min_samples_leaf}")
    print(f"min_samples_split: {args.regressor__min_samples_split}")

    main(args)

    mlflow.end_run()

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,3 @@
{"input_data": [[2.86,40.66551971,-73.98258972,1,40.69801331,-73.97357178,0,2,1,1,19,21,3,56,1,1,19,21,21,57],
[3.98,40.68072128,-73.931633,1,40.6909523,-73.99185181,0,2,0,1,4,21,44,11,0,1,4,21,59,35]]
}

Просмотреть файл

@ -0,0 +1,195 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import argparse
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient
from azure.ai.ml.entities import AmlCompute
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Environment
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import Input, Output, command
from azure.ai.ml.constants import AssetTypes, InputOutputModes
import json
import yaml
import os
def parse_args():
    """Parse command-line arguments for deploying the training pipeline."""
    parser = argparse.ArgumentParser("Deploy Training Pipeline")
    parser.add_argument("--experiment_name", type=str, help="Experiment Name")
    parser.add_argument("--compute_name", type=str, help="Compute Cluster Name")
    parser.add_argument("--data_name", type=str, help="Data Asset Name")
    parser.add_argument("--environment_name", type=str, help="Registered Environment Name")
    parser.add_argument("--enable_monitoring", type=str, help="Enable Monitoring", default="false")
    parser.add_argument("--table_name", type=str, help="ADX Monitoring Table Name", default="taximonitoring")
    # Parse once and return the result (the original parsed argv twice:
    # once into an unused local and again inside the return statement).
    return parser.parse_args()
def main():
    """Build the taxi-fare training pipeline and submit it to Azure ML.

    Defines four command components (prep, train, evaluate, register),
    wires them into a DSL pipeline, then creates the pipeline job on the
    workspace and streams its logs until completion.
    """
    args = parse_args()
    print(args)

    credential = DefaultAzureCredential()
    try:
        ml_client = MLClient.from_config(credential, path='config.json')
    except Exception as ex:
        # NOTE(review): execution continues with `ml_client` unbound when
        # config.json cannot be loaded; the failure then surfaces later as
        # a NameError. Consider re-raising here instead.
        print("HERE IN THE EXCEPTION BLOCK")
        print(ex)

    try:
        print(ml_client.compute.get(args.compute_name))
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # SystemExit and KeyboardInterrupt.
        print("No compute found")

    print(os.getcwd())
    print('current', os.listdir())

    # Create pipeline job

    # 1. Define components
    parent_dir = "data-science/src"

    prep_data = command(
        name="prep_data",
        display_name="prep-data",
        code=os.path.join(parent_dir, "prep"),
        command="python prep.py \
                --raw_data ${{inputs.raw_data}} \
                --train_data ${{outputs.train_data}} \
                --val_data ${{outputs.val_data}} \
                --test_data ${{outputs.test_data}} \
                --enable_monitoring ${{inputs.enable_monitoring}} \
                --table_name ${{inputs.table_name}}",
        environment=args.environment_name+"@latest",
        inputs={
            "raw_data": Input(type="uri_file"),
            "enable_monitoring": Input(type="string"),
            "table_name": Input(type="string")
        },
        outputs={
            "train_data": Output(type="uri_folder"),
            "val_data": Output(type="uri_folder"),
            "test_data": Output(type="uri_folder"),
        }
    )

    train_model = command(
        name="train_model",
        display_name="train-model",
        code=os.path.join(parent_dir, "train"),
        command="python train.py \
                --train_data ${{inputs.train_data}} \
                --model_output ${{outputs.model_output}}",
        environment=args.environment_name+"@latest",
        inputs={"train_data": Input(type="uri_folder")},
        outputs={"model_output": Output(type="uri_folder")}
    )

    evaluate_model = command(
        name="evaluate_model",
        display_name="evaluate-model",
        code=os.path.join(parent_dir, "evaluate"),
        command="python evaluate.py \
                --model_name ${{inputs.model_name}} \
                --model_input ${{inputs.model_input}} \
                --test_data ${{inputs.test_data}} \
                --evaluation_output ${{outputs.evaluation_output}}",
        environment=args.environment_name+"@latest",
        inputs={
            "model_name": Input(type="string"),
            "model_input": Input(type="uri_folder"),
            "test_data": Input(type="uri_folder")
        },
        outputs={
            "evaluation_output": Output(type="uri_folder")
        }
    )

    register_model = command(
        name="register_model",
        display_name="register-model",
        code=os.path.join(parent_dir, "register"),
        command="python register.py \
                --model_name ${{inputs.model_name}} \
                --model_path ${{inputs.model_path}} \
                --evaluation_output ${{inputs.evaluation_output}} \
                --model_info_output_path ${{outputs.model_info_output_path}}",
        environment=args.environment_name+"@latest",
        inputs={
            "model_name": Input(type="string"),
            "model_path": Input(type="uri_folder"),
            "evaluation_output": Input(type="uri_folder")
        },
        outputs={
            "model_info_output_path": Output(type="uri_folder")
        }
    )

    # 2. Construct pipeline
    @pipeline()
    def taxi_training_pipeline(raw_data, enable_monitoring, table_name):
        """prep -> train -> evaluate -> register, chained by step outputs."""
        prep = prep_data(
            raw_data=raw_data,
            enable_monitoring=enable_monitoring,
            table_name=table_name
        )

        train = train_model(
            train_data=prep.outputs.train_data
        )

        evaluate = evaluate_model(
            model_name="taxi-model",
            model_input=train.outputs.model_output,
            test_data=prep.outputs.test_data
        )

        register = register_model(
            model_name="taxi-model",
            model_path=train.outputs.model_output,
            evaluation_output=evaluate.outputs.evaluation_output
        )

        return {
            "pipeline_job_train_data": prep.outputs.train_data,
            "pipeline_job_test_data": prep.outputs.test_data,
            "pipeline_job_trained_model": train.outputs.model_output,
            "pipeline_job_score_report": evaluate.outputs.evaluation_output,
        }

    pipeline_job = taxi_training_pipeline(
        Input(path=args.data_name + "@latest", type="uri_file"), args.enable_monitoring, args.table_name
    )

    # set pipeline level compute
    pipeline_job.settings.default_compute = args.compute_name
    # set pipeline level datastore
    pipeline_job.settings.default_datastore = "workspaceblobstore"

    pipeline_job = ml_client.jobs.create_or_update(
        pipeline_job, experiment_name=args.experiment_name
    )
    # (removed a bare `pipeline_job` expression statement here — it was a no-op)

    # Stream the job logs until the pipeline run completes
    ml_client.jobs.stream(pipeline_job.name)

Просмотреть файл

@ -0,0 +1,66 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: python-sdk-v2
- name: endpoint_name
value: taxi-batch-$(namespace)$(postfix)$(environment)
trigger:
- none
pool:
vmImage: ubuntu-20.04
resources:
repositories:
- repository: mlops-templates # Template Repo
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: CreateBatchEndpoint
displayName: Create/Update Batch Endpoint
jobs:
- job: DeployBatchEndpoint
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/aml-cli-v2/install-az-cli.yml@mlops-templates
- template: templates/aml-cli-v2/install-aml-cli.yml@mlops-templates
- template: templates/aml-cli-v2/connect-to-workspace.yml@mlops-templates
- template: templates/aml-cli-v2/create-compute.yml@mlops-templates
parameters:
cluster_name: batch-cluster # name must match cluster name in deployment file below
size: STANDARD_DS3_V2
min_instances: 0
max_instances: 5
cluster_tier: dedicated
- template: templates/${{ variables.version }}/create-batch-endpoint.yml@mlops-templates
parameters:
endpoint_name: "${{ variables.endpoint_name }}"
endpoint_description: "Taxi batch endpoint"
auth_mode: "aad_token"
- template: templates/${{ variables.version }}/create-batch-deployment.yml@mlops-templates
parameters:
deployment_name: taxi-batch-dp
deployment_description: "Taxi batch deployment"
endpoint_name: "${{ variables.endpoint_name }}"
model_path: "taxi-model@latest"
compute: batch-cluster
- template: templates/${{ variables.version }}/test-batch-endpoint.yml@mlops-templates
parameters:
endpoint_name: "${{ variables.endpoint_name }}"
sample_request: data/taxi-batch.csv
request_type: uri_file #either uri_folder or uri_file

Просмотреть файл

@ -0,0 +1,70 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: python-sdk-v2
trigger:
- none
pool:
vmImage: ubuntu-20.04
resources:
repositories:
- repository: mlops-templates # Template Repo
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: DeployTrainingPipeline
displayName: Deploy Training Pipeline
jobs:
- job: DeployTrainingPipeline
timeoutInMinutes: 120 # how long to run the job before automatically cancelling
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/aml-cli-v2/install-az-cli.yml@mlops-templates
- template: templates/aml-cli-v2/install-aml-cli.yml@mlops-templates
- template: templates/python-sdk-v2/install-requirements.yml@mlops-templates
- template: templates/aml-cli-v2/connect-to-workspace.yml@mlops-templates
- template: templates/aml-cli-v2/create-compute.yml@mlops-templates
parameters:
cluster_name: cpu-cluster
size: Standard_DS3_v2
min_instances: 0
max_instances: 4
cluster_tier: low_priority
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
parameters:
environment_name: taxi-train-env
environment_description: "Training Environment for Taxi Pipeline"
environment_path: data-science/environment/train-conda.yml
build_type: conda
- template: templates/${{ variables.version }}/register-data-asset.yml@mlops-templates
parameters:
data_name: taxi-data
data_description: taxi-training-dataset
data_path: data/taxi-data.csv
data_type: uri_file
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
parameters:
pipeline_path: mlops/azureml/train/run_pipeline.py
experiment_name: taxi-train-pipeline
data_name: taxi-data
environment_name: taxi-train-env
compute_name: cpu-cluster
enable_monitoring: $(enable_monitoring)
table_name: "taximonitoring"

Просмотреть файл

@ -0,0 +1,57 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: python-sdk-v2
- name: endpoint_name
value: taxi-online-$(namespace)$(postfix)$(environment)
trigger:
- none
pool:
vmImage: ubuntu-20.04
resources:
repositories:
- repository: mlops-templates # Template Repo
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: CreateOnlineEndpoint
displayName: Create/Update Online Endpoint
jobs:
- job: DeployOnlineEndpoint
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/aml-cli-v2/install-az-cli.yml@mlops-templates
- template: templates/aml-cli-v2/install-aml-cli.yml@mlops-templates
- template: templates/aml-cli-v2/connect-to-workspace.yml@mlops-templates
- template: templates/${{ variables.version }}/create-online-endpoint.yml@mlops-templates
parameters:
endpoint_name: "${{ variables.endpoint_name }}"
endpoint_description: "Taxi Online Endpoint"
auth_mode: "aml_token"
- template: templates/${{ variables.version }}/create-online-deployment.yml@mlops-templates
parameters:
deployment_name: taxi-online-dp
endpoint_name: "${{ variables.endpoint_name }}"
model_path: "taxi-model@latest"
traffic_allocation: 100
- template: templates/${{ variables.version }}/test-online-endpoint.yml@mlops-templates
parameters:
endpoint_name: "${{ variables.endpoint_name }}"
sample_request: data/taxi-request.json

Просмотреть файл

@ -1,16 +0,0 @@
channels:
- defaults
- anaconda
- conda-forge
dependencies:
- python=3.7.5
- pip
- pip:
- azureml-mlflow==1.38.0
- azureml-sdk==1.38.0
- scikit-learn==0.24.1
- pandas==1.2.1
- joblib==1.0.0
- matplotlib==3.3.3
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-client
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector

Просмотреть файл

@ -1,6 +1,6 @@
azureml-mlflow==1.38.0
azureml-sdk==1.38.0
pyarrow==10.0.0
scikit-learn==0.24.1
pandas==1.2.1
joblib==1.2.0
matplotlib==3.3.3
matplotlib==3.3.3

Просмотреть файл

@ -71,11 +71,7 @@ def main(args):
# ------------ Reading Data ------------ #
# -------------------------------------- #
print("mounted_path files: ")
arr = os.listdir(args.raw_data)
print(arr)
data = pd.read_csv((Path(args.raw_data) / 'taxi-data.csv'))
data = pd.read_csv((Path(args.raw_data)))
data = data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS + [TARGET_COL]]
# ------------- Split Data ------------- #
@ -101,7 +97,7 @@ def main(args):
val.to_parquet((Path(args.val_data) / "val.parquet"))
test.to_parquet((Path(args.test_data) / "test.parquet"))
if (args.enable_monitoring.lower == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower == 'yes'):
if (args.enable_monitoring.lower() == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower() == 'yes'):
log_training_data(data, args.table_name)
@ -129,4 +125,4 @@ if __name__ == "__main__":
mlflow.end_run()

Просмотреть файл

@ -82,7 +82,9 @@ def test_prep_data():
df = pd.DataFrame(data)
df.to_csv(os.path.join(raw_data, "taxi-data.csv"))
raw_data= os.path.join(raw_data, "taxi-data.csv")
cmd = f"python data-science/src/prep/prep.py --raw_data={raw_data} --train_data={train_data} --val_data={val_data} --test_data={test_data}"
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
out, err = p.communicate()
result = str(out).split('\\n')

Просмотреть файл

@ -2,63 +2,75 @@
# Licensed under the MIT License.
variables:
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: aml-cli-v2
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: aml-cli-v2
trigger:
- none
- none
pool:
vmImage: ubuntu-20.04
resources:
repositories:
- repository: mlops-templates # Template Repo
- repository: mlops-templates # Template Repo
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
- repository: rai-vnext-preview # Template Repo
- repository: rai-vnext-preview # Template Repo
name: Azure/rai-vnext-preview # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: DeployTrainingPipeline
displayName: Deploy Training Pipeline
jobs:
- job: DeployTrainingPipeline
timeoutInMinutes: 120 # how long to run the job before automatically cancelling
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- checkout: rai-vnext-preview
path: s/
- template: templates/tests/unit-tests.yml@mlops-templates
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
parameters:
build_type: conda
environment_name: taxi-train-env
environment_file: mlops/azureml/train/train-env.yml
enable_monitoring: $(enable_monitoring)
- template: register-rai-components.yml
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
parameters:
pipeline_file: mlops/azureml/train/pipeline.yml
experiment_name: $(environment)_taxi_fare_train_$(Build.SourceBranchName)
display_name: $(environment)_taxi_fare_run_$(Build.BuildID)
enable_monitoring: $(enable_monitoring)
- stage: DeployTrainingPipeline
displayName: Deploy Training Pipeline
jobs:
- job: DeployTrainingPipeline
timeoutInMinutes: 120 # how long to run the job before automatically cancelling
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/tests/unit-tests.yml@mlops-templates
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
parameters:
environment_name: taxi-train-env
environment_file: mlops/azureml/train/train-env.yml
- template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
parameters:
cluster_name: cpu-cluster
size: Standard_DS3_v2
min_instances: 0
max_instances: 4
cluster_tier: low_priority
- template: templates/${{ variables.version }}/register-data.yml@mlops-templates
parameters:
data_type: uri_file
data_name: taxi-data
data_file: mlops/azureml/train/data.yml
- checkout: rai-vnext-preview
path: s/
- template: register-rai-components.yml
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
parameters:
pipeline_file: mlops/azureml/train/pipeline.yml
experiment_name: $(environment)_taxi_fare_train_$(Build.SourceBranchName)
display_name: $(environment)_taxi_fare_run_$(Build.BuildID)
enable_monitoring: $(enable_monitoring)

Просмотреть файл

@ -1,15 +1,14 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# Dev environment
variables:
# Global
ap_vm_image: ubuntu-20.04
namespace: mlopsv2 #Note: A namespace with many characters will cause storage account creation to fail due to storage account names having a limit of 24 characters.
postfix: 0659
location: westus
postfix: 0001
location: eastus
environment: dev
enable_aml_computecluster: true
enable_monitoring: false
@ -29,8 +28,9 @@ variables:
storage_account: st$(namespace)$(postfix)$(environment)
# For terraform reference
terraform_version: 0.14.7
terraform_version: 1.3.6
terraform_workingdir: infrastructure
terraform_st_location: $(location)
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tf
terraform_st_container_name: default

Просмотреть файл

@ -3,18 +3,16 @@
# Prod environment
variables:
# Global
ap_vm_image: ubuntu-20.04
namespace: mlopsv2 #Note: A namespace with many characters will cause storage account creation to fail due to storage account names having a limit of 24 characters.
postfix: 0518
location: westeurope
postfix: 0001
location: eastus
environment: prod
enable_aml_computecluster: true
enable_monitoring: false
# Azure DevOps
ado_service_connection_rg: Azure-ARM-Prod
ado_service_connection_aml_ws: Azure-ARM-Prod
@ -30,8 +28,9 @@ variables:
storage_account: st$(namespace)$(postfix)$(environment)
# For terraform reference
terraform_version: 0.14.7
terraform_version: 1.3.6
terraform_workingdir: infrastructure
terraform_st_location: $(location)
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tf
terraform_st_container_name: default

Просмотреть файл

@ -1,51 +1,90 @@
name: deploy-cv-model-training-pipeline
on:
on:
workflow_dispatch:
jobs:
set-env-branch:
runs-on: ubuntu-latest
outputs:
config-file: ${{ steps.set-output-defaults.outputs.config-file }}
steps:
- id: set-prod-branch
name: set-prod-branch
if: ${{ github.ref == 'refs/heads/main'}}
run: echo "config_env=config-infra-prod.yml" >> $GITHUB_ENV;
- id: set-dev-branch
name: setdevbranch
if: ${{ github.ref != 'refs/heads/main'}}
run: echo "config_env=config-infra-dev.yml" >> $GITHUB_ENV;
- id: set-output-defaults
name: set-output-defaults
run: |
echo "config-file=$config_env" >> $GITHUB_OUTPUT;
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
create-compute:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
with:
cluster_name: gpu-cluster
size: Standard_NC6
min_instances: 0
max_instances: 1
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
needs: set-env-branch
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: ${{ needs.set-env-branch.outputs.config-file}}
create-dataprep-compute:
needs: [get-config]
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
with:
cluster_name: cpu-cluster
size: Standard_DS3_v2
min_instances: 0
max_instances: 4
cluster_tier: low_priority
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-training-compute:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
with:
cluster_name: gpu-cluster
size: Standard_NC6
min_instances: 0
max_instances: 1
cluster_tier: low_priority
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
register-environment:
needs: [get-config,create-compute]
uses: ./.github/workflows/register-environment.yml
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
environment_file: mlops/azureml/train/train-env.yaml
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
needs: [get-config, create-dataprep-compute, create-training-compute]
uses: Azure/mlops-templates/.github/workflows/register-environment.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
environment_file: mlops/azureml/train/train-env.yaml
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
register-dataset:
needs: [get-config,register-environment]
uses: Azure/mlops-templates/.github/workflows/register-dataset.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
data_file: mlops/azureml/train/create_stanford_dogs_dataset.yaml
file_type: Training
name: stanford_dogs
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
run-pipeline:
needs: [get-config,register-dataset]
uses: Azure/mlops-templates/.github/workflows/run-pipeline.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
parameters-file: mlops/azureml/train/pipeline.yaml
job-name: cv-train
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
needs: [get-config, register-environment]
uses: Azure/mlops-templates/.github/workflows/register-dataset.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
data_file: mlops/azureml/train/create_stanford_dogs_dataset.yaml
file_type: Training
name: stanford_dogs
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
run-model-training-pipeline:
needs:
[
get-config,
create-dataprep-compute,
create-training-compute,
register-environment,
register-dataset,
]
uses: Azure/mlops-templates/.github/workflows/run-pipeline.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
parameters-file: mlops/azureml/train/pipeline.yaml
job-name: cv-train
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}

Просмотреть файл

@ -1,42 +1,60 @@
name: deploy-online-endpoint-pipeline
on:
on:
workflow_dispatch:
jobs:
set-env-branch:
runs-on: ubuntu-latest
outputs:
config-file: ${{ steps.set-output-defaults.outputs.config-file }}
steps:
- id: set-prod-branch
name: set-prod-branch
if: ${{ github.ref == 'refs/heads/main'}}
run: echo "config_env=config-infra-prod.yml" >> $GITHUB_ENV;
- id: set-dev-branch
name: setdevbranch
if: ${{ github.ref != 'refs/heads/main'}}
run: echo "config_env=config-infra-dev.yml" >> $GITHUB_ENV;
- id: set-output-defaults
name: set-output-defaults
run: |
echo "config-file=$config_env" >> $GITHUB_OUTPUT;
get-config:
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: config-infra-prod.yml
needs: set-env-branch
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: ${{ needs.set-env-branch.outputs.config-file}}
create-endpoint:
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
endpoint_name: dogs-classifier-online2
endpoint_type: online
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
needs: get-config
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
endpoint_name: dogs-classifier-online2
endpoint_type: online
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
create-deployment:
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
needs: [get-config,create-endpoint]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-deployment.yml
endpoint_name: dogs-classifier-online2
endpoint_type: online
deployment_name: dogs-online-dp
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
needs: [get-config, create-endpoint]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
endpoint_file: mlops/azureml/deploy/online/online-deployment.yml
endpoint_name: dogs-classifier-online2
endpoint_type: online
deployment_name: dogs-online-dp
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
allocate-traffic:
uses: Azure/mlops-templates/.github/workflows/allocate-traffic.yml@main
needs: [get-config,create-deployment]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
traffic_allocation: dogs-online-dp=100
endpoint_name: dogs-classifier-online2
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}
uses: Azure/mlops-templates/.github/workflows/allocate-traffic.yml@main
needs: [get-config, create-deployment]
with:
resource_group: ${{ needs.get-config.outputs.resource_group }}
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
traffic_allocation: dogs-online-dp=100
endpoint_name: dogs-classifier-online2
secrets:
creds: ${{secrets.AZURE_CREDENTIALS}}

Просмотреть файл

Просмотреть файл

Не удается отобразить этот файл, потому что он слишком большой.

Просмотреть файл

@ -0,0 +1,103 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
- template: ../../config-aml.yml
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: python-sdk-v1
trigger:
- none
pool:
vmImage: $(ap_vm_image)
resources:
repositories:
- repository: mlops-templates # Template Repo
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
ref: main
stages:
- stage: DeployTrainingPipeline
displayName: Deploy Training Pipeline
jobs:
- job: DeployTrainingPipeline
steps:
# Setup
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
# Environment
- template: templates/${{ variables.version }}/create-environment.yml@mlops-templates
parameters:
environment_name: $(training_env_name)
build_type: folder
environment_file: $(training_env_path)
# Compute
- template: templates/${{ variables.version }}/get-compute.yml@mlops-templates
parameters:
compute_type: training
# Datasets (images + labels)
# Images dataset
- task: Bash@3
displayName: "Download data"
inputs:
targetType: inline
script: |
mkdir -p $(training_dataset_local_path)
curl $(training_dataset_storage_url) | tar xvf - --no-same-owner -C $(training_dataset_local_path)
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
parameters:
data_type: training
# Labels dataset
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
parameters:
data_type: training
datasetName: $(labels_dataset_name)
datasetDescription: $(labels_dataset_description)
datasetLocalPath: $(labels_dataset_local_path)
datasetPathOnDatastore: $(labels_dataset_path_on_datastore)
datasetType: $(labels_dataset_type)
# Deploy training pipeline
- template: templates/${{ variables.version }}/deploy-training-pipeline.yml@mlops-templates
- template: templates/${{ variables.version }}/add-pipeline-to-endpoint.yml@mlops-templates
- task: Bash@3
name: export_pipeline_id
displayName: "Export Pipeline ID"
inputs:
targetType: "inline"
script: |
echo "##vso[task.setvariable variable=pipeline_id;isOutput=true;]$(pipeline_id)"
# Run training
- job: invoke_pipeline
displayName: "Invoke pipeline"
pool: server
timeoutInMinutes: 0
dependsOn: DeployTrainingPipeline
variables:
pipeline_id: $[ dependencies.DeployTrainingPipeline.outputs['export_pipeline_id.pipeline_id'] ]
steps:
- task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
displayName: "Invoke AML Pipeline"
inputs:
azureSubscription: "$(ado_service_connection_aml_ws)"
PipelineId: "$(PIPELINE_ID)"
ExperimentName: "$(training_experiment_name)"

Просмотреть файл

@ -1,105 +0,0 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
- template: ../../config-aml.yml
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
# 'main' branch: PRD environment
- template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
# 'develop' or feature branches: DEV environment
- template: ../../config-infra-dev.yml
- name: version
value: python-sdk
trigger:
- none
pool:
vmImage: $(ap_vm_image)
resources:
repositories:
- repository: mlops-templates # Template Repo
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
endpoint: github-connection # need to set up and hardcode
type: github
stages:
- stage: DeployTrainingPipeline
displayName: Deploy Training Pipeline
jobs:
- job: DeployTrainingPipeline
steps:
# Setup
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
# Environment
- template: templates/${{ variables.version }}/create-environment.yml@mlops-templates
parameters:
environment_name: $(training_env_name)
build_type: folder
environment_file: $(training_env_path)
# Compute
- template: templates/${{ variables.version }}/get-compute.yml@mlops-templates
parameters:
compute_type: training
# Datasets (images + labels)
# Images dataset
- task: Bash@3
displayName: 'Download data'
inputs:
targetType: inline
script: |
mkdir -p $(training_dataset_local_path)
curl $(training_dataset_storage_url) | tar xvf - --no-same-owner -C $(training_dataset_local_path)
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
parameters:
data_type: training
# Labels dataset
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
parameters:
data_type: training
datasetName: $(labels_dataset_name)
datasetDescription: $(labels_dataset_description)
datasetLocalPath: $(labels_dataset_local_path)
datasetPathOnDatastore: $(labels_dataset_path_on_datastore)
datasetType: $(labels_dataset_type)
# Deploy training pipeline
- template: templates/${{ variables.version }}/deploy-training-pipeline.yml@mlops-templates
- template: templates/${{ variables.version }}/add-pipeline-to-endpoint.yml@mlops-templates
- task: Bash@3
name: export_pipeline_id
displayName: "Export Pipeline ID"
inputs:
targetType: "inline"
script: |
echo "##vso[task.setvariable variable=pipeline_id;isOutput=true;]$(pipeline_id)"
# Run training
- job: invoke_pipeline
displayName: 'Invoke pipeline'
pool: server
timeoutInMinutes: 0
dependsOn: DeployTrainingPipeline
variables:
pipeline_id: $[ dependencies.DeployTrainingPipeline.outputs['export_pipeline_id.pipeline_id'] ]
steps:
- task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
displayName: 'Invoke AML Pipeline'
inputs:
azureSubscription: '$(ado_service_connection_aml_ws)'
PipelineId: '$(PIPELINE_ID)'
ExperimentName: '$(training_experiment_name)'

Просмотреть файл

@ -0,0 +1,63 @@
name: tf-gha-deploy-infra.yml
on:
#push:
workflow_dispatch:
env:
config_env: "none"
jobs:
set-env-branch:
runs-on: ubuntu-latest
outputs:
config-file: ${{ steps.set-output-defaults.outputs.config-file }}
steps:
- id: set-prod-branch
name: set-prod-branch
if: ${{ github.ref == 'refs/heads/main'}}
run: echo "config_env=config-infra-prod.yml" >> $GITHUB_ENV;
- id: set-dev-branch
name: setdevbranch
if: ${{ github.ref != 'refs/heads/main'}}
run: echo "config_env=config-infra-dev.yml" >> $GITHUB_ENV;
- id: set-output-defaults
name: set-output-defaults
run: |
echo "config-file=$config_env" >> $GITHUB_OUTPUT;
get-config:
needs: set-env-branch
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
with:
file_name: ${{ needs.set-env-branch.outputs.config-file}}
test-terraform-state-deployment:
needs: [get-config, set-env-branch]
uses: Azure/mlops-templates/.github/workflows/tf-gha-install-terraform.yml@main
with:
TFAction: "apply"
dply_environment: ${{ needs.set-env-branch.outputs.config-file }}
location: ${{ needs.get-config.outputs.location }}
namespace: ${{ needs.get-config.outputs.namespace }}
postfix: ${{ needs.get-config.outputs.postfix }}
environment: ${{ needs.get-config.outputs.environment }}
enable_aml_computecluster: ${{ needs.get-config.outputs.enable_aml_computecluster == true }} ## TODO review the evaluation of boolean
enable_monitoring: ${{ needs.get-config.outputs.enable_monitoring == true }} ## TODO review the evaluation of boolean
terraform_version: ${{ needs.get-config.outputs.terraform_version }}
terraform_workingdir: ${{ needs.get-config.outputs.terraform_workingdir }}
terraform_st_location: ${{ needs.get-config.outputs.terraform_st_location }}
terraform_st_storage_account: ${{ needs.get-config.outputs.terraform_st_storage_account }}
terraform_st_resource_group: ${{ needs.get-config.outputs.terraform_st_resource_group }}
terraform_st_container_name: ${{ needs.get-config.outputs.terraform_st_container_name }}
terraform_st_key: ${{ needs.get-config.outputs.terraform_st_key }}
terraform_plan_location: ${{ needs.get-config.outputs.location }}
terraform_plan_vnet: "TBD" # TBD
secrets:
azure_creds: ${{ secrets.AZURE_CREDENTIALS }}
clientId: ${{ secrets.ARM_CLIENT_ID }}
clientSecret: ${{ secrets.ARM_CLIENT_SECRET }}
subscriptionId: ${{ secrets.ARM_SUBSCRIPTION_ID }}
tenantId: ${{ secrets.ARM_TENANT_ID }}
deploy-azureml-resources:
runs-on: ubuntu-latest
steps:
- id: deploy-aml-workspace
name: deploy-aml-workspace
run: echo "OK"

Просмотреть файл

@ -56,4 +56,4 @@ resource "azurerm_key_vault_secret" "ADX_DB" {
value = azurerm_kusto_database.database[0].name
key_vault_id = var.key_vault_id
count = var.enable_monitoring ? 1 : 0
}
}

Просмотреть файл

@ -19,13 +19,13 @@ resource "azurerm_storage_account" "st" {
is_hns_enabled = var.hns_enabled
tags = var.tags
}
# Virtual Network & Firewall configuration
resource "azurerm_storage_account_network_rules" "firewall_rules" {
resource_group_name = var.rg_name
storage_account_name = azurerm_storage_account.st.name
storage_account_id = azurerm_storage_account.st.id
default_action = "Allow"
ip_rules = [] # [data.http.ip.body]

Просмотреть файл

@ -16,3 +16,13 @@ psutil==5.9.0
# for unit testing
pytest==7.1.2
# for azure ml SDK v2
azure-ai-ml==1.1.0
azure-common==1.1.28
azure-core==1.26.1
azure-identity==1.10.0
azure-mgmt-core==1.3.0
azure-storage-blob==12.14.1
azure-storage-file-datalake==12.9.1
azure-storage-file-share==12.7.0

Просмотреть файл

@ -63,7 +63,6 @@ stages:
cluster_tier: dedicated
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
parameters:
build_type: conda
environment_name: nlp_summarization_train
environment_file: mlops/azureml/train/train-env.yml
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates

Просмотреть файл

@ -0,0 +1,24 @@
name: nlp_inference_conda_env
channels:
- pytorch
- anaconda
- defaults
- conda-forge
dependencies:
- python=3.8
- pip=21.2.4
- pytorch=1.10.0
- torchvision=0.11.1
- torchaudio=0.10.0
- cudatoolkit=11.1.1
- nvidia-apex=0.1.0
- gxx_linux-64=8.5.0
- pip:
- azureml-defaults==1.39.0
- azureml-mlflow==1.39.0
- azureml-telemetry==1.39.0
- azureml-train-core==1.39.0
- mlflow==1.24.0
- transformers==4.17.0
- 'inference-schema[numpy-support]==1.3.0'
- applicationinsights==0.11.10

Просмотреть файл

@ -0,0 +1,7 @@
# check release notes https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
FROM nvcr.io/nvidia/pytorch:22.04-py3
# Install dependencies missing in this container
# NOTE: container already has matplotlib==3.5.1 tqdm==4.62.0
COPY requirements.txt ./
RUN pip install -r requirements.txt

Просмотреть файл

@ -0,0 +1,21 @@
# data science requirements
# torchvision==0.12.0
# torch==1.11.0
pytorch_lightning==1.6.4
transformers==4.18.0
datasets==2.3.2
rouge_score==0.0.4
sentencepiece==0.1.96
# for metrics reporting/plotting
mlflow==1.25.1
azureml-mlflow==1.41.0
# matplotlib==3.5.2
# tqdm==4.64.0
psutil==5.9.0
# for unit testing
pytest==7.1.2
# for azure ml SDK v2
azure-ai-ml==1.1.0

Просмотреть файл

@ -0,0 +1,88 @@
import os
import argparse
import logging
import mlflow
import json
from distutils.util import strtobool
def _strtobool(value: str) -> int:
    """Convert a truthy/falsy string to 1/0.

    Drop-in replacement for ``distutils.util.strtobool`` (distutils was
    removed in Python 3.12). Accepts the same spellings; raises
    ValueError for anything else, which argparse turns into a CLI error.
    """
    normalized = value.strip().lower()
    if normalized in ("y", "yes", "t", "true", "on", "1"):
        return 1
    if normalized in ("n", "no", "f", "false", "off", "0"):
        return 0
    raise ValueError(f"invalid truth value {value!r}")


def main():
    """Compare candidate metrics against baseline metrics and emit a deploy flag.

    Reads ``all_results.json`` from both metric folders, compares the
    reference metric (candidate must be strictly greater to ship), and
    writes a ``deploy_flag`` file containing ``1`` or ``0`` into the
    folder given by ``--deploy_flag``.
    """
    # initialize root logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    console_handler = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s : %(levelname)s : %(name)s : %(message)s"
    )
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--baseline_metrics",
        type=str,
        required=True,
        help="path to baseline metrics folder containing all_results.json",
    )
    parser.add_argument(
        "--candidate_metrics",
        type=str,
        required=True,
        help="path to candidate metrics folder containing all_results.json",
    )
    parser.add_argument(
        "--reference_metric",
        type=str,
        default="predict_rougeLsum",
        help="name of reference metric for shipping flag (default: predict_rougeLsum)",
    )
    parser.add_argument(
        # _strtobool replaces the deprecated distutils.util.strtobool (removed in py3.12)
        "--force_comparison", type=_strtobool, default=False, help="set to True to bypass comparison and set --deploy_flag to True"
    )
    parser.add_argument(
        "--deploy_flag", type=str, help="a deploy flag whether to deploy or not"
    )
    args = parser.parse_args()

    # Start Logging
    mlflow.start_run()

    logger.info(f"Running with arguments: {args}")

    # open metrics on both sides (json.load avoids an intermediate string)
    with open(os.path.join(args.baseline_metrics, "all_results.json")) as in_file:
        baseline_metrics = json.load(in_file)
    with open(os.path.join(args.candidate_metrics, "all_results.json")) as in_file:
        candidate_metrics = json.load(in_file)

    # should we ship or not? --force_comparison short-circuits the metric check
    if args.force_comparison:
        deploy_flag = True
    else:
        deploy_flag = (
            candidate_metrics[args.reference_metric]
            > baseline_metrics[args.reference_metric]
        )

    logger.info("baseline_metrics[{}]={}, candidate_metrics[{}]={}, deploy_flag={} (force_comparison={})".format(
        args.reference_metric,
        baseline_metrics[args.reference_metric],
        args.reference_metric,
        candidate_metrics[args.reference_metric],
        deploy_flag,
        args.force_comparison
    ))

    # save deploy_flag as a file (downstream steps read it back as "0"/"1")
    os.makedirs(args.deploy_flag, exist_ok=True)
    with open(os.path.join(args.deploy_flag, "deploy_flag"), "w") as out_file:
        out_file.write("%d" % int(deploy_flag))

    # Stop Logging
    mlflow.end_run()


if __name__ == "__main__":
    main()

Просмотреть файл

@ -0,0 +1,151 @@
"""
This scripts prepares a HuggingFace dataset to be used
for fine-tuning. It encodes the train/val/test tests and
outputs as JSONL files.
"""
import os
import argparse
import logging
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
import mlflow
def main():
    """Tokenize a HuggingFace summarization dataset and save it to disk.

    Loads the raw dataset, logs split sizes, optionally subsamples for
    faster runs, tokenizes the text/summary columns for the requested
    model architecture, and writes the encoded DatasetDict to the
    ``--encodings`` output folder.
    """
    # initialize root logger
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    console_handler = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s : %(levelname)s : %(name)s : %(message)s"
    )
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset_name", type=str, help="name of dataset or path to input dataset_name"
    )
    parser.add_argument(
        "--dataset_config", type=str, help="config for huggingface dataset"
    )
    parser.add_argument("--text_column", type=str, help="name of text_column")
    parser.add_argument("--summary_column", type=str, help="name of summary_column")
    parser.add_argument(
        "--max_input_length", type=int, default=512, help="max_input_length"
    )
    parser.add_argument(
        "--max_target_length", type=int, default=40, help="max_target_length"
    )
    parser.add_argument(
        "--padding", type=str, default="max_length", help="padding type"
    )
    parser.add_argument(
        "--model_arch",
        type=str,
        help="name of the model to prepare for in HF model library",
    )
    parser.add_argument(
        "--limit_samples", type=int, default=-1, help="sample size from input dataset"
    )
    parser.add_argument("--encodings", type=str, help="path to tokenized dataset")
    parser.add_argument(
        "--source_prefix",
        type=str,
        help="A prefix to add before every source text (useful for T5 models).",
    )
    args = parser.parse_args()

    # Start Logging
    mlflow.start_run()

    logger.info(f"Running with arguments: {args}")

    # get tokenizer ready
    tokenizer = AutoTokenizer.from_pretrained(args.model_arch)
    logger.info(f"tokenizer: {tokenizer}")

    # T5-style models expect a task prefix (e.g. "summarize: ") before each input
    prefix = args.source_prefix if args.source_prefix is not None else ""
    if args.source_prefix is None and "t5" in args.model_arch.lower():
        logger.warning(
            "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
            "`--source_prefix 'summarize: ' `"
        )

    # Load HuggingFace dataset
    raw_dataset = load_dataset(args.dataset_name, args.dataset_config)
    logger.info(f"raw dataset length: {raw_dataset.num_rows}")
    # record split sizes; assumes the dataset exposes train/test/validation splits
    mlflow.log_metric("train_samples", raw_dataset["train"].shape[0])
    mlflow.log_metric("test_samples", raw_dataset["test"].shape[0])
    mlflow.log_metric("validation_samples", raw_dataset["validation"].shape[0])

    # optional subsampling for faster runs (limit_samples <= 0 keeps everything)
    if args.limit_samples > 0:
        sample_sizes = {
            k: min(len(raw_dataset[k]), args.limit_samples) for k in raw_dataset.keys()
        }
        raw_dataset = DatasetDict(
            {
                k: raw_dataset[k].select(range(sample_sizes[k]))
                for k in raw_dataset.keys()
            }
        )
        logger.info("sampled raw dataset:")
        logger.info(raw_dataset.num_rows)

    def preprocess_function(examples):
        """Tokenize one batch: filtered inputs and their paired summaries."""
        # remove pairs where at least one record is None
        inputs, targets = [], []
        for i in range(len(examples[args.text_column])):
            if (
                examples[args.text_column][i] is not None
                and examples[args.summary_column][i] is not None
            ):
                inputs.append(examples[args.text_column][i])
                targets.append(examples[args.summary_column][i])
        inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(
            inputs,
            max_length=args.max_input_length,
            padding=args.padding,
            truncation=True,
        )
        # Set up the tokenizer for targets.
        # BUGFIX: tokenize the None-filtered `targets` (not the raw summary
        # column) so labels stay aligned 1:1 with the filtered inputs.
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(
                targets,
                max_length=args.max_target_length,
                padding=args.padding,
                truncation=True,
            )
        # replace all tokenizer.pad_token_id in the labels by -100 to ignore padding in the loss.
        if args.padding == "max_length":
            labels["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label]
                for label in labels["input_ids"]
            ]
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    preprocessed_datasets = raw_dataset.map(preprocess_function, batched=True)
    logger.info(f"preprocessed_datasets: {preprocessed_datasets}")

    output_path = os.path.join(args.encodings)
    os.makedirs(output_path, exist_ok=True)
    preprocessed_datasets.save_to_disk(output_path)
    logger.info(f"tokenized data is saved to {output_path}")

    # Stop Logging
    mlflow.end_run()


if __name__ == "__main__":
    main()

Просмотреть файл

@ -0,0 +1,82 @@
from azureml.core import Run
from azureml.core.model import Model
import os
import argparse
import logging
import mlflow
def main():
    """Register a trained model in AzureML when the deploy flag file says so."""
    # root logger echoes formatted records to the console
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter("%(asctime)s : %(levelname)s : %(name)s : %(message)s")
    )
    root_logger.addHandler(handler)

    # command-line interface
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_folder",
        type=str,
        required=True,
        help="folder containing model",
    )
    parser.add_argument(
        "--register_as",
        type=str,
        required=True,
        help="name to use for model registration in AzureML",
    )
    parser.add_argument(
        "--deploy_flag", type=str, required=True, help="a deploy flag whether to deploy or not"
    )
    args = parser.parse_args()

    root_logger.info(f"Running with arguments: {args}")

    # Start Logging
    mlflow.start_run()

    # --deploy_flag may point at the flag file itself or at its parent folder
    flag_path = (
        args.deploy_flag
        if os.path.isfile(args.deploy_flag)
        else os.path.join(args.deploy_flag, "deploy_flag")
    )
    root_logger.info(f"Opening deploy_flag file from {flag_path}")
    with open(flag_path, 'rb') as flag_file:
        deploy_flag = bool(int(flag_file.read()))

    if not deploy_flag:
        root_logger.info("Deploy flag is False, pass.")
    else:
        root_logger.info(f"Deploy flag is True, registering model as {args.register_as}...")
        run = Run.get_context()
        # if we're running locally, except
        if run.__class__.__name__ == "_OfflineRun":
            raise Exception("You can't run this script locally, you will need to run it as an AzureML job.")
        _ = Model.register(
            run.experiment.workspace,
            model_name=args.register_as,
            model_path=args.model_folder,
            tags={
                "type": "huggingface",
                "task": "summarization"
            },
            description="Huggingface model finetuned for summarization",
        )

    # Stop Logging
    mlflow.end_run()


if __name__ == "__main__":
    main()

Просмотреть файл

@ -0,0 +1,269 @@
import logging
import os
from datasets import load_metric, load_from_disk
from transformers import (
AutoModelForSeq2SeqLM,
AutoTokenizer,
DataCollatorForSeq2Seq,
Seq2SeqTrainingArguments,
Seq2SeqTrainer,
HfArgumentParser,
IntervalStrategy,
)
from transformers.trainer_callback import TrainerCallback
import torch
import nltk
from dataclasses import dataclass, field
from typing import Optional
import numpy as np
import mlflow
from pynvml import *
import time
# Input arguments are set with dataclass. Huggingface library stores the default training args in TrainingArguments dataclass
# user args are also defined in dataclasses, we will then load arguments from a tuple of user defined and built-in dataclasses.
@dataclass
class DataArgs:
    """Data inputs and processing parameters, parsed via HfArgumentParser."""

    # Inputs
    # NOTE(review): declared str but defaults to None — effectively required; confirm callers always pass it
    preprocessed_datasets: str = field(
        default=None, metadata={"help": "path to preprocesed datasets"}
    )
    # Processing parameters
    max_target_length: Optional[int] = field(
        default=128,
        metadata={"help": "maxi sequence length for target text after tokenization."},
    )
    # values <= 0 disable subsampling (see the limit checks in main)
    limit_samples: Optional[int] = field(
        default=-1,
        metadata={"help": "limit the number of samples for faster run."},
    )
@dataclass
class ModelArgs:
    """Model selection and output location, parsed via HfArgumentParser."""

    # name of a model in the HF hub (used when model_path is not given)
    model_name: Optional[str] = field(default=None, metadata={"help": "model name"})
    # when set, takes precedence over model_name in main()
    model_path: Optional[str] = field(
        default=None, metadata={"help": "path to existing model file to load"}
    )
    # folder where the trained model is saved (main node only)
    model_output: Optional[str] = field(
        default=None, metadata={"help": "path to save the model"}
    )
# sentence tokenizer used by postprocess_text; downloaded at import time
nltk.download("punkt")


def print_gpu_utilization():
    """Print the current memory usage of GPU 0, queried through NVML."""
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    """Print runtime and throughput from a Trainer train() result, then GPU usage."""
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()
def postprocess_text(preds, labels):
    """Strip predictions/references and put each sentence on its own line.

    rougeLSum expects a newline after each sentence, so every text is
    re-joined from its sentence tokenization.
    """
    def _one_sentence_per_line(text):
        # strip first, then break into sentences
        return "\n".join(nltk.sent_tokenize(text.strip()))

    cleaned_preds = [_one_sentence_per_line(p) for p in preds]
    cleaned_labels = [_one_sentence_per_line(l) for l in labels]
    return cleaned_preds, cleaned_labels
def compute_metrics(eval_preds, tokenizer, metric):
    """Decode generated ids and references, then compute ROUGE scores.

    Returns a dict of ROUGE mid f-measures scaled to 0-100 plus the mean
    generated length under "gen_len", all rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # labels carry -100 for ignored positions; swap back so they can be decoded
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    rouge_scores = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract a few results from ROUGE: keep the mid f-measure, as a percentage
    scores = {name: value.mid.fmeasure * 100 for name, value in rouge_scores.items()}
    scores["gen_len"] = np.mean(
        [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    )
    return {key: round(val, 4) for key, val in scores.items()}
class CustomCallback(TrainerCallback):
    """A [`TrainerCallback`] that sends the logs to [AzureML](https://pypi.org/project/azureml-sdk/).

    This is a hotfix for the issue raised here:
    https://github.com/huggingface/transformers/issues/18870
    """

    def on_log(self, args, state, control, logs=None, **kwargs):
        # only the world-process-zero rank reports, and only numeric values go to mlflow
        if not state.is_world_process_zero:
            return
        numeric_logs = {
            key: val for key, val in logs.items() if isinstance(val, (int, float))
        }
        mlflow.log_metrics(metrics=numeric_logs, step=state.global_step)
def main():
    """Fine-tune a seq2seq summarization model and optionally run predictions.

    Parses (ModelArgs, DataArgs, Seq2SeqTrainingArguments), loads the
    pre-tokenized dataset, trains/evaluates/predicts depending on the
    --do_train/--do_eval/--do_predict flags, and saves the model on the
    main node when --model_output is set.
    """
    # Setup logging
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    console_handler = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s : %(levelname)s : %(name)s : %(message)s"
    )
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # initialize the mlflow session
    mlflow.start_run()

    parser = HfArgumentParser((ModelArgs, DataArgs, Seq2SeqTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    logger.info(f"Running with arguments: {model_args}, {data_args}, {training_args}")

    # Check if this is the main node (RANK is set by distributed launchers; absent means rank 0)
    is_this_main_node = int(os.environ.get("RANK", "0")) == 0
    if is_this_main_node:
        logger.info("This is the main Node")

    # dataset was tokenized/saved by the prepare step (load_from_disk)
    input_datasets = load_from_disk(data_args.preprocessed_datasets)
    logger.info(f"preprocessed dataset is loaded")

    # model_path (a saved checkpoint) takes precedence over model_name (HF hub)
    if model_args.model_path:
        logger.info("using a saved model")
        model = AutoModelForSeq2SeqLM.from_pretrained(model_args.model_path)
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_path)
    else:
        logger.info("using a model from model library")
        model = AutoModelForSeq2SeqLM.from_pretrained(model_args.model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name)

    # Artificially limit the number of samples (for testing)
    if training_args.do_train:  # if using --do-train from Seq2SeqTrainingArguments
        if data_args.limit_samples > 0:
            max_train_samples = min(len(input_datasets["train"]), data_args.limit_samples)
            train_dataset = input_datasets["train"].select(range(max_train_samples))
            logger.info(f"train: making a {max_train_samples} sample of the data")
        else:
            train_dataset = input_datasets["train"]

    if training_args.do_eval:
        if data_args.limit_samples > 0:
            max_eval_samples = min(
                len(input_datasets["validation"]), data_args.limit_samples
            )
            eval_dataset = input_datasets["validation"].select(range(max_eval_samples))
            logger.info(f"eval: making a {max_eval_samples} sample of the data")
        else:
            eval_dataset = input_datasets["validation"]

    if training_args.do_predict:
        if data_args.limit_samples > 0:
            max_predict_samples = min(
                len(input_datasets["test"]), data_args.limit_samples
            )
            predict_dataset = input_datasets["test"].select(range(max_predict_samples))
            logger.info(f"predict: making a {max_predict_samples} sample of the data")
        else:
            predict_dataset = input_datasets["test"]

    # Data collator (-100 marks label positions ignored by the loss)
    label_pad_token_id = -100
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
    )

    # Metric
    metric = load_metric("rouge")

    # log once per epoch-equivalent number of steps
    if training_args.do_train:
        logging_steps = len(train_dataset) // training_args.per_device_train_batch_size
        training_args.logging_steps = logging_steps
    #training_args.output_dir = "outputs"
    training_args.save_strategy = "epoch"
    training_args.evaluation_strategy = IntervalStrategy.EPOCH
    training_args.predict_with_generate = True
    training_args.report_to = []  # use our own callback
    logger.info(f"training args: {training_args}")

    # Initialize our Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=lambda preds : compute_metrics(preds, tokenizer, metric),
        callbacks=[CustomCallback]
    )

    # Start the actual training (to include evaluation use --do-eval)
    if training_args.do_train:
        logger.info("Start training")
        start = time.time()
        train_result = trainer.train()
        # wall-clock minutes per epoch
        mlflow.log_metric(
            "time/epoch", (time.time() - start) / 60 / training_args.num_train_epochs
        )
        logger.info(
            "training is done"
        )  # Only print gpu utilization if gpu is available
        if torch.cuda.is_available():
            print_summary(train_result)

    # Save the model as an output (main node only, to avoid concurrent writes)
    if model_args.model_output and is_this_main_node:
        logger.info(f"Saving the model at {model_args.model_output}")
        os.makedirs(model_args.model_output, exist_ok=True)
        trainer.save_model(model_args.model_output)

    # Just run the predictions
    if training_args.do_predict:
        logger.info("*** Predict ***")
        # prefer the explicit generation length, fall back to max_target_length
        max_length = (
            training_args.generation_max_length
            if training_args.generation_max_length is not None
            else data_args.max_target_length
        )
        predict_results = trainer.predict(
            predict_dataset, metric_key_prefix="predict", max_length=max_length
        )
        metrics = predict_results.metrics
        metrics["predict_samples"] = len(predict_dataset)
        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

    # Stop Logging
    mlflow.end_run()


if __name__ == "__main__":
    main()

Просмотреть файл

@ -0,0 +1,44 @@
import os
import logging
import json
from transformers import (
AutoModelForSeq2SeqLM,
AutoTokenizer,
)
def init():
    """
    This function is called when the container is initialized/started, typically after create/update of the deployment.
    You can write the logic here to perform init operations like caching the model in memory
    """
    # model and tokenizer are cached as module globals for use in run()
    global model, tokenizer
    # AZUREML_MODEL_DIR is an environment variable created during deployment.
    # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION)
    # assumes the registered model is the single subfolder under AZUREML_MODEL_DIR — TODO confirm
    model_path = os.path.join(os.getenv("AZUREML_MODEL_DIR"), os.listdir(os.getenv("AZUREML_MODEL_DIR"))[0])
    print("model_path")
    print(os.listdir(model_path))
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    print("Init complete")
def run(raw_data):
    """
    This function is called for every invocation of the endpoint to perform the actual scoring/prediction.
    In the example we extract the data from the json input and call the scikit-learn model's predict()
    method and return the result back
    """
    global model, tokenizer
    logging.info("Request received")
    # request payload is JSON: {"data": "<article text>"}
    article = json.loads(raw_data)["data"]
    # T5 checkpoints expect a task prefix before the input text.
    # BUGFIX: use "summarize: " (with trailing space) to match the prefix
    # convention used by the training/prep step (--source_prefix 'summarize: ').
    if "t5" in model.config.architectures[0].lower():
        article = "summarize: " + article
    inputs = tokenizer(article, return_tensors="pt", max_length=512, truncation=True)
    # beam-search generation bounded to 40-150 tokens
    outputs = model.generate(
        inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
    )
    result = tokenizer.decode(outputs[0])
    print(result)
    logging.info("Request processed")
    return result

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@ -0,0 +1,405 @@
"""MLOps v2 NLP Python SDK training submission script."""
import os
import argparse
# Azure ML sdk v2 imports
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient
from azure.ai.ml import command
from azure.ai.ml import Input, Output
from azure.ai.ml import dsl, Input, Output
def get_config_parger(parser: argparse.ArgumentParser = None):
    """Builds the argument parser for the script.

    Creates a new parser when none is given, then registers three option
    groups (Azure ML references, training parameters, compute parameters)
    and returns the parser.
    """
    if parser is None:
        parser = argparse.ArgumentParser(description=__doc__)

    # references to the Azure ML workspace (optional: config.json may be used instead)
    aml_group = parser.add_argument_group("Azure ML references")
    aml_group.add_argument("--config_location", type=str, required=False, help="Subscription ID")
    aml_group.add_argument("--subscription_id", type=str, required=False, help="Subscription ID")
    aml_group.add_argument("--resource_group", type=str, required=False, help="Resource group name")
    aml_group.add_argument("--workspace_name", type=str, required=False, help="Workspace name")
    # Experiment Name
    aml_group.add_argument(
        "-n",
        type=str,
        required=True,
        default="nlp_summarization_train",
        help="Experiment name",
    )
    parser.add_argument(
        "--wait",
        default=False,
        action="store_true",
        help="wait for the job to finish",
    )

    train_group = parser.add_argument_group("Training parameters")
    train_group.add_argument("--limit_samples", type=int, default=1000)
    train_group.add_argument("--pretrained_model_name", type=str, default="t5-small")
    train_group.add_argument("--num_train_epochs", type=int, default=5)
    train_group.add_argument("--batch_size", type=int, default=8)
    train_group.add_argument("--learning_rate", type=float, default=0.00005)
    train_group.add_argument("--model_registration_name", type=str, default="pubmed-summarization")

    compute_group = parser.add_argument_group("Compute parameters")
    compute_group.add_argument("--cpu_compute", type=str, default="cpu-cluster")
    compute_group.add_argument("--cpu_compute_large", type=str, default="cpu-cluster-lg")
    compute_group.add_argument("--gpu_compute", type=str, default="gpu-cluster")
    compute_group.add_argument("--training_nodes", type=int, default=1)
    compute_group.add_argument("--gpus_per_node", type=int, default=1)

    return parser
def connect_to_aml(args):
    """Connect to Azure ML workspace using provided cli arguments."""
    # Prefer the default credential chain; verify it can actually mint a
    # management-plane token before committing to it.
    try:
        credential = DefaultAzureCredential()
        credential.get_token("https://management.azure.com/.default")
    except Exception:
        # Fall back to an interactive browser login when the default chain fails.
        credential = InteractiveBrowserCredential()

    # First attempt: a local config.json pointing at the workspace.
    try:
        return MLClient.from_config(credential, path="config.json")
    except Exception:
        print(
            "Could not find config.json, using config.yaml refs to Azure ML workspace instead."
        )

    # Second attempt: build the client from the CLI-provided workspace refs.
    return MLClient(
        subscription_id=args.subscription_id,
        resource_group_name=args.resource_group,
        workspace_name=args.workspace_name,
        credential=credential,
    )
def build_components(args):
    """Builds the command components used by the training pipeline.

    Args:
        args: parsed CLI namespace; only the distribution settings
            (args.gpus_per_node, args.training_nodes) are read here.

    Returns:
        dict: maps component name -> azure.ai.ml command component for
            prep_finetuning_dataset, finetune_model, evaluate_model,
            compare_models and register_model.
    """
    # Folder holding the data-science scripts, resolved relative to this file.
    DATA_SCIENCE_FOLDER = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "..","..", "..", "data-science", "src"
    )

    # Component: encode the raw dataset into model-ready encodings
    # (consumed by both the fine-tuning and evaluation steps).
    prep_finetuning_dataset = command(
        name="prep_finetuning_dataset",
        display_name="Prepare dataset for training",
        inputs={
            "dataset_name": Input(type="string"),
            "dataset_config": Input(type="string"),
            "text_column": Input(type="string"),
            "summary_column": Input(type="string"),
            "limit_samples": Input(type="integer"),
            "max_input_length": Input(type="integer"),
            "max_target_length": Input(type="integer"),
            "padding": Input(type="string"),
            "pretrained_model_name": Input(type="string"),
        },
        outputs=dict(
            encodings=Output(type="uri_folder", mode="rw_mount"),
        ),
        code=DATA_SCIENCE_FOLDER,
        command="""python summarization/prepare.py \
            --dataset_name ${{inputs.dataset_name}} \
            --dataset_config ${{inputs.dataset_config}} \
            --text_column ${{inputs.text_column}} \
            --summary_column ${{inputs.summary_column}} \
            --limit_samples ${{inputs.limit_samples}} \
            --model_arch ${{inputs.pretrained_model_name}} \
            --max_input_length ${{inputs.max_input_length}} \
            --max_target_length ${{inputs.max_target_length}} \
            --padding ${{inputs.padding}} \
            --encodings ${{outputs.encodings}}\
            """,
        environment="nlp_summarization_train@latest",
    )

    # Component: fine-tune the pretrained model, distributed with PyTorch
    # across args.training_nodes nodes x args.gpus_per_node processes.
    finetune_model = command(
        name="finetune_model",
        display_name="Fine-tune summarization model",
        inputs={
            "preprocessed_datasets": Input(type="uri_folder"),
            "pretrained_model_name": Input(type="string"),
            "limit_samples": Input(type="integer"),
            "learning_rate": Input(type="number"),
            "num_train_epochs": Input(type="integer"),
            "per_device_train_batch_size": Input(type="integer"),
            "per_device_eval_batch_size": Input(type="integer"),
        },
        outputs=dict(
            finetuned_model=Output(type="uri_folder", mode="rw_mount"),
        ),
        code=DATA_SCIENCE_FOLDER,
        command="""python summarization/run.py \
            --preprocessed_datasets ${{inputs.preprocessed_datasets}} \
            --learning_rate ${{inputs.learning_rate}} \
            --per_device_train_batch_size ${{inputs.per_device_train_batch_size}} \
            --per_device_eval_batch_size ${{inputs.per_device_eval_batch_size}} \
            --limit_samples ${{inputs.limit_samples}} \
            --model_name ${{inputs.pretrained_model_name}} \
            --model_output ${{outputs.finetuned_model}}\
            --output_dir outputs \
            --num_train_epochs ${{inputs.num_train_epochs}} \
            --do_train --do_eval \
            """,
        environment="nlp_summarization_train@latest",
        distribution={
            "type": "PyTorch",
            # set process count to the number of gpus on the node
            "process_count_per_instance": args.gpus_per_node,
        },
        # set instance count to the number of nodes you want to use
        instance_count=args.training_nodes,
    )

    # Component: evaluate either a local model (model_path) or a hub model
    # (model_name). The $[[...]] wrapper emits the enclosed flag only when
    # the corresponding optional input is actually provided.
    evaluate_model = command(
        name="evaluate_model",
        display_name="Run eval on a model",
        inputs={
            "preprocessed_datasets": Input(type="uri_folder"),
            "model_path": Input(type="uri_folder", optional=True),
            "model_name": Input(type="string", optional=True),
            "limit_samples": Input(type="integer"),
            "max_target_length": Input(type="integer"),
        },
        outputs=dict(
            metrics=Output(type="uri_folder", mode="rw_mount"),
        ),
        code=DATA_SCIENCE_FOLDER,
        command="""python summarization/run.py \
            --preprocessed_datasets ${{inputs.preprocessed_datasets}} \
            --limit_samples ${{inputs.limit_samples}} \
            --output_dir ${{outputs.metrics}} \
            $[[--model_path ${{inputs.model_path}}]] \
            $[[--model_name ${{inputs.model_name}}]] \
            --max_target_length ${{inputs.max_target_length}} \
            --do_predict \
            """,
        environment="nlp_summarization_train@latest",
    )

    # Component: compare candidate metrics against baseline metrics and emit
    # a deploy_flag. NOTE(review): --force_comparison True presumably makes
    # compare.py set the flag regardless of the comparison outcome — confirm
    # against summarization/compare.py.
    compare_models = command(
        name="compare_models",
        display_name="Compare finetuned to baseline",
        inputs={
            "baseline_metrics": Input(type="uri_folder"),
            "candidate_metrics": Input(type="uri_folder"),
            "reference_metric": Input(type="string"),
        },
        outputs=dict(
            deploy_flag=Output(type="uri_folder", mode="rw_mount"),
        ),
        code=DATA_SCIENCE_FOLDER,
        command="""python summarization/compare.py \
            --baseline_metrics ${{inputs.baseline_metrics}} \
            --candidate_metrics ${{inputs.candidate_metrics}} \
            --reference_metric ${{inputs.reference_metric}} \
            --deploy_flag ${{outputs.deploy_flag}} \
            --force_comparison True\
            """,
        environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest",
    )

    # Component: register the model in the workspace under the given name
    # (gated by deploy_flag). No declared outputs — register.py's side effect
    # is the registration itself.
    register_model = command(
        name="register_model",
        display_name="Register model",
        inputs={
            "model": Input(type="uri_folder"),
            "deploy_flag": Input(type="uri_folder"),
            "model_registration_name": Input(type="string"),
        },
        code=DATA_SCIENCE_FOLDER,
        command="""python summarization/register.py \
            --model_folder ${{inputs.model}} \
            --deploy_flag ${{inputs.deploy_flag}} \
            --register_as ${{inputs.model_registration_name}} \
            """,
        environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest",
    )

    return {
        "prep_finetuning_dataset": prep_finetuning_dataset,
        "finetune_model": finetune_model,
        "evaluate_model": evaluate_model,
        "compare_models": compare_models,
        "register_model": register_model,
    }
def main():
    """Main entry point: builds and submits the NLP training pipeline."""
    parser = get_config_parger()
    args, _ = parser.parse_known_args()
    ml_client = connect_to_aml(args)

    # get components from build function
    components_dict = build_components(args)
    prep_finetuning_dataset = components_dict["prep_finetuning_dataset"]
    finetune_model = components_dict["finetune_model"]
    evaluate_model = components_dict["evaluate_model"]
    compare_models = components_dict["compare_models"]
    register_model = components_dict["register_model"]

    # build the pipeline using Azure ML SDK v2
    @dsl.pipeline(
        name="NLP Training Pipeline",
        description="NLP Training Pipeline",
    )
    def nlp_training_pipeline(
        limit_samples: int,
        pretrained_model_name: str,
        num_train_epochs: int,
        batch_size: int,
        learning_rate: float,
        model_registration_name: str,
    ):
        # Encode the PubMed summarization dataset for training/eval.
        prep_finetuning_dataset_step = prep_finetuning_dataset(
            dataset_name="ccdv/pubmed-summarization",
            dataset_config="section",
            text_column="article",
            summary_column="abstract",
            limit_samples=limit_samples,
            max_input_length=512,
            max_target_length=40,
            padding="max_length",
            pretrained_model_name=pretrained_model_name,
        )
        prep_finetuning_dataset_step.compute = args.cpu_compute_large

        # Fine-tune on the GPU cluster.
        finetune_model_step = finetune_model(
            preprocessed_datasets=prep_finetuning_dataset_step.outputs.encodings,
            pretrained_model_name=pretrained_model_name,
            limit_samples=limit_samples,
            learning_rate=learning_rate,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
        )
        finetune_model_step.compute = args.gpu_compute

        # Evaluate the fine-tuned model (candidate).
        evaluate_finetuned_model_step = evaluate_model(
            preprocessed_datasets=prep_finetuning_dataset_step.outputs.encodings,
            model_path=finetune_model_step.outputs.finetuned_model,
            limit_samples=limit_samples,
            max_target_length=40,
        )
        evaluate_finetuned_model_step.compute = args.gpu_compute

        # Evaluate the pretrained model as-is (baseline).
        evaluate_baseline_model_step = evaluate_model(
            preprocessed_datasets=prep_finetuning_dataset_step.outputs.encodings,
            model_name=pretrained_model_name,
            limit_samples=limit_samples,
            max_target_length=40,
        )
        evaluate_baseline_model_step.compute = args.gpu_compute

        # BUGFIX: the two metrics inputs were swapped — baseline_metrics was
        # fed the fine-tuned model's metrics and candidate_metrics the
        # baseline's, inverting the comparison. Baseline must come from the
        # pretrained-model evaluation, candidate from the fine-tuned one.
        compare_models_step = compare_models(
            baseline_metrics=evaluate_baseline_model_step.outputs.metrics,
            candidate_metrics=evaluate_finetuned_model_step.outputs.metrics,
            reference_metric="predict_rougeLsum",
        )
        compare_models_step.compute = args.cpu_compute

        # Register the fine-tuned model, gated by the comparison's deploy flag.
        register_model_step = register_model(
            model=finetune_model_step.outputs.finetuned_model,
            deploy_flag=compare_models_step.outputs.deploy_flag,
            model_registration_name=model_registration_name,
        )
        register_model_step.compute = args.cpu_compute

    # instantiates the job
    pipeline_job = nlp_training_pipeline(
        limit_samples=args.limit_samples,
        pretrained_model_name=args.pretrained_model_name,
        num_train_epochs=args.num_train_epochs,
        batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        model_registration_name=args.model_registration_name,
    )

    # submits the job
    print("Submitting the pipeline job to your AzureML workspace...")
    pipeline_job = ml_client.jobs.create_or_update(
        pipeline_job, experiment_name=args.n
    )
    print("The url to see your live job running is returned by the sdk:")
    print(pipeline_job.services["Studio"].endpoint)

    if args.wait:
        ml_client.jobs.stream(pipeline_job.name)
# Standard script entry-point guard: submit the pipeline only when run directly.
if __name__ == "__main__":
    main()

Просмотреть файл

@ -0,0 +1,71 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
variables:
  # Select the infra config from the branch being built:
  - ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
      # 'main' branch: PRD environment
      - template: ../../config-infra-prod.yml
  - ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
      # 'develop' or feature branches: DEV environment
      - template: ../../config-infra-dev.yml
  # Chooses which template flavor is used for the parameterized steps below.
  - name: version
    value: python-sdk-v2

# No CI trigger: this pipeline is run manually (or from another pipeline).
trigger:
  - none

pool:
  vmImage: ubuntu-20.04

resources:
  repositories:
    - repository: mlops-templates # Template Repo
      name: Azure/mlops-templates # need to change org name from "Azure" to your own org
      endpoint: github-connection # need to set up and hardcode
      type: github
      ref: main

stages:
  - stage: DeployTrainingPipeline
    displayName: Deploy Training Pipeline
    jobs:
      - job: DeployTrainingPipeline
        steps:
          # Check out this repo and the shared template repo side by side.
          - checkout: self
            path: s/
          - checkout: mlops-templates
            path: s/templates/
          - template: templates/aml-cli-v2/install-az-cli.yml@mlops-templates
          - template: templates/aml-cli-v2/install-aml-cli.yml@mlops-templates
          - template: templates/python-sdk-v2/install-requirements.yml@mlops-templates
          - template: templates/aml-cli-v2/connect-to-workspace.yml@mlops-templates
          # General-purpose CPU cluster for light pipeline steps.
          - template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
            parameters:
              cluster_name: cpu-cluster
              size: STANDARD_DS3_V2
              min_instances: 0
              max_instances: 1
              cluster_tier: dedicated
          # Larger CPU cluster (presumably for the dataset-prep step — the
          # training script targets it via --cpu_compute_large).
          - template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
            parameters:
              cluster_name: cpu-cluster-lg
              size: Standard_D14_v2
              min_instances: 0
              max_instances: 1
              cluster_tier: dedicated
          # GPU cluster used for fine-tuning and evaluation.
          - template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
            parameters:
              cluster_name: gpu-cluster
              size: Standard_NV6
              min_instances: 0
              max_instances: 1
              cluster_tier: dedicated
          # Build and register the docker training environment in the workspace.
          - template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
            parameters:
              build_type: docker
              environment_name: nlp_summarization_train
              environment_path: data-science/environments/training
          # Submit the Azure ML training pipeline defined in Python.
          - template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
            parameters:
              pipeline_path: mlops/azureml/train/pipeline-train.py
              experiment_name: $(environment)_nlp_summarization_$(Build.SourceBranchName)