Merge pull request #121 from Azure/main-dec31
Merge main-dec31 into main
This commit is contained in:
Коммит
09470468e1
|
@ -7,7 +7,8 @@ dependencies:
|
|||
- pip
|
||||
- pip:
|
||||
- azureml-mlflow==1.38.0
|
||||
- azureml-sdk==1.38.0
|
||||
- azure-ai-ml==1.0.0
|
||||
- pyarrow==10.0.0
|
||||
- scikit-learn==0.24.1
|
||||
- pandas==1.2.1
|
||||
- joblib==1.0.0
|
||||
|
|
|
@ -4,6 +4,12 @@
|
|||
Prepares raw data and provides training, validation and test datasets
|
||||
"""
|
||||
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
"""
|
||||
Prepares raw data and provides training, validation and test datasets
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
||||
from pathlib import Path
|
||||
|
@ -41,21 +47,52 @@ CAT_NOM_COLS = [
|
|||
"vendor",
|
||||
]
|
||||
|
||||
CAT_ORD_COLS = [
|
||||
]
|
||||
TARGET_COL = "cost"
|
||||
|
||||
NUMERIC_COLS = [
|
||||
"distance",
|
||||
"dropoff_latitude",
|
||||
"dropoff_longitude",
|
||||
"passengers",
|
||||
"pickup_latitude",
|
||||
"pickup_longitude",
|
||||
"pickup_weekday",
|
||||
"pickup_month",
|
||||
"pickup_monthday",
|
||||
"pickup_hour",
|
||||
"pickup_minute",
|
||||
"pickup_second",
|
||||
"dropoff_weekday",
|
||||
"dropoff_month",
|
||||
"dropoff_monthday",
|
||||
"dropoff_hour",
|
||||
"dropoff_minute",
|
||||
"dropoff_second",
|
||||
]
|
||||
|
||||
CAT_NOM_COLS = [
|
||||
"store_forward",
|
||||
"vendor",
|
||||
]
|
||||
|
||||
CAT_ORD_COLS = [
|
||||
]
|
||||
|
||||
def parse_args():
|
||||
'''Parse input arguments'''
|
||||
'''Parse input arguments'''
|
||||
|
||||
parser = argparse.ArgumentParser("prep")
|
||||
parser.add_argument("--raw_data", type=str, help="Path to raw data")
|
||||
parser.add_argument("--train_data", type=str, help="Path to train dataset")
|
||||
parser.add_argument("--val_data", type=str, help="Path to test dataset")
|
||||
parser.add_argument("--test_data", type=str, help="Path to test dataset")
|
||||
|
||||
|
||||
parser.add_argument("--enable_monitoring", type=str, help="enable logging to ADX")
|
||||
parser.add_argument("--table_name", type=str, default="mlmonitoring", help="Table name in ADX for logging")
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
@ -65,17 +102,15 @@ def log_training_data(df, table_name):
|
|||
collector = Online_Collector(table_name)
|
||||
collector.batch_collect(df)
|
||||
|
||||
def main(args):
|
||||
'''Read, split, and save datasets'''
|
||||
def main(args):
|
||||
'''Read, split, and save datasets'''
|
||||
|
||||
# ------------ Reading Data ------------ #
|
||||
# -------------------------------------- #
|
||||
|
||||
print("mounted_path files: ")
|
||||
arr = os.listdir(args.raw_data)
|
||||
print(arr)
|
||||
|
||||
data = pd.read_csv((Path(args.raw_data) / 'taxi-data.csv'))
|
||||
data = pd.read_csv((Path(args.raw_data)))
|
||||
data = data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS + [TARGET_COL]]
|
||||
|
||||
# ------------- Split Data ------------- #
|
||||
|
@ -92,19 +127,24 @@ def main(args):
|
|||
train = data[msk_train]
|
||||
val = data[msk_val]
|
||||
test = data[msk_test]
|
||||
test = data[msk_test]
|
||||
|
||||
mlflow.log_metric('train size', train.shape[0])
|
||||
mlflow.log_metric('val size', val.shape[0])
|
||||
mlflow.log_metric('test size', test.shape[0])
|
||||
|
||||
train.to_parquet((Path(args.train_data) / "train.parquet"))
|
||||
val.to_parquet((Path(args.val_data) / "val.parquet"))
|
||||
test.to_parquet((Path(args.test_data) / "test.parquet"))
|
||||
train.to_parquet((Path(args.train_data) / "train.parquet"))
|
||||
val.to_parquet((Path(args.val_data) / "val.parquet"))
|
||||
test.to_parquet((Path(args.test_data) / "test.parquet"))
|
||||
|
||||
if (args.enable_monitoring.lower == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower == 'yes'):
|
||||
if (args.enable_monitoring.lower() == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower() == 'yes'):
|
||||
log_training_data(data, args.table_name)
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
mlflow.start_run()
|
||||
|
@ -124,9 +164,9 @@ if __name__ == "__main__":
|
|||
|
||||
for line in lines:
|
||||
print(line)
|
||||
|
||||
|
||||
main(args)
|
||||
|
||||
mlflow.end_run()
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/latest/data.schema.json
|
||||
|
||||
# Supported paths include:
|
||||
# local: ./<path>
|
||||
# blob: https://<account_name>.blob.core.windows.net/<container_name>/<path>
|
||||
# ADLS gen2: abfss://<file_system>@<account_name>.dfs.core.windows.net/<path>/
|
||||
# Datastore: azureml://datastores/<data_store_name>/paths/<path>
|
||||
type: uri_file
|
||||
name: taxi-data
|
||||
description: taxi dataset
|
||||
path: ../../../data/taxi-data.csv
|
|
@ -1,17 +1,18 @@
|
|||
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
|
||||
type: pipeline
|
||||
experiment_name: taxi-fare-training
|
||||
experiment_name: taxi-fare-training
|
||||
description: Training Pipeline to train a model that predicts taxi fare price
|
||||
|
||||
# <inputs_and_outputs>
|
||||
inputs:
|
||||
input: #using local data, will create an anonymous data asset
|
||||
type: uri_folder
|
||||
path: ../../../data/
|
||||
input:
|
||||
type: uri_file
|
||||
path: azureml:taxi-data@latest
|
||||
enable_monitoring: 'false'
|
||||
table_name: 'taximonitoring'
|
||||
|
||||
outputs:
|
||||
outputs:
|
||||
train_data:
|
||||
val_data:
|
||||
test_data:
|
||||
|
@ -32,8 +33,8 @@ jobs:
|
|||
display_name: prep-data
|
||||
code: ../../../data-science/src
|
||||
command: >-
|
||||
python prep.py
|
||||
--raw_data ${{inputs.raw_data}}
|
||||
python prep.py
|
||||
--raw_data ${{inputs.raw_data}}
|
||||
--train_data ${{outputs.train_data}}
|
||||
--val_data ${{outputs.val_data}}
|
||||
--test_data ${{outputs.test_data}}
|
||||
|
@ -54,24 +55,24 @@ jobs:
|
|||
display_name: train-model
|
||||
code: ../../../data-science/src
|
||||
command: >-
|
||||
python train.py
|
||||
--train_data ${{inputs.train_data}}
|
||||
python train.py
|
||||
--prepared_data ${{inputs.prepared_data}}
|
||||
--model_output ${{outputs.model_output}}
|
||||
environment: azureml:taxi-train-env@latest
|
||||
inputs:
|
||||
train_data: ${{parent.jobs.prep_data.outputs.train_data}}
|
||||
outputs:
|
||||
model_output: ${{parent.outputs.trained_model}}
|
||||
|
||||
|
||||
evaluate_model:
|
||||
name: evaluate_model
|
||||
display_name: evaluate-model
|
||||
code: ../../../data-science/src
|
||||
command: >-
|
||||
python evaluate.py
|
||||
--model_name ${{inputs.model_name}}
|
||||
--model_input ${{inputs.model_input}}
|
||||
--test_data ${{inputs.test_data}}
|
||||
python evaluate.py
|
||||
--model_name ${{inputs.model_name}}
|
||||
--model_input ${{inputs.model_input}}
|
||||
--test_data ${{inputs.test_data}}
|
||||
--evaluation_output ${{outputs.evaluation_output}}
|
||||
environment: azureml:taxi-train-env@latest
|
||||
inputs:
|
||||
|
@ -86,9 +87,9 @@ jobs:
|
|||
display_name: register-model
|
||||
code: ../../../data-science/src
|
||||
command: >-
|
||||
python register.py
|
||||
--model_name ${{inputs.model_name}}
|
||||
--model_path ${{inputs.model_path}}
|
||||
python register.py
|
||||
--model_name ${{inputs.model_name}}
|
||||
--model_path ${{inputs.model_path}}
|
||||
--evaluation_output ${{inputs.evaluation_output}}
|
||||
--model_info_output_path ${{outputs.model_info_output_path}}
|
||||
environment: azureml:taxi-train-env@latest
|
||||
|
|
|
@ -2,61 +2,62 @@
|
|||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: aml-cli-v2
|
||||
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: aml-cli-v2
|
||||
|
||||
trigger:
|
||||
- none
|
||||
- none
|
||||
|
||||
pool:
|
||||
vmImage: ubuntu-20.04
|
||||
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: mlops-templates # Template Repo
|
||||
- repository: mlops-templates # Template Repo
|
||||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
ref: main
|
||||
|
||||
stages:
|
||||
- stage: DeployTrainingPipeline
|
||||
displayName: Deploy Training Pipeline
|
||||
jobs:
|
||||
- job: DeployTrainingPipeline
|
||||
timeoutInMinutes: 120 # how long to run the job before automatically cancelling
|
||||
steps:
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
|
||||
parameters:
|
||||
build_type: conda
|
||||
environment_name: taxi-train-env
|
||||
environment_file: mlops/azureml/train/train-env.yml
|
||||
enable_monitoring: $(enable_monitoring)
|
||||
- template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
|
||||
parameters:
|
||||
cluster_name: cpu-cluster
|
||||
size: Standard_DS3_v2
|
||||
min_instances: 0
|
||||
max_instances: 4
|
||||
cluster_tier: LowPriority
|
||||
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
|
||||
parameters:
|
||||
pipeline_file: mlops/azureml/train/pipeline.yml
|
||||
experiment_name: $(environment)_taxi_fare_train_$(Build.SourceBranchName)
|
||||
display_name: $(environment)_taxi_fare_run_$(Build.BuildID)
|
||||
enable_monitoring: $(enable_monitoring)
|
||||
- stage: DeployTrainingPipeline
|
||||
displayName: Deploy Training Pipeline
|
||||
jobs:
|
||||
- job: DeployTrainingPipeline
|
||||
timeoutInMinutes: 120 # how long to run the job before automatically cancelling
|
||||
steps:
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
|
||||
parameters:
|
||||
environment_name: taxi-train-env
|
||||
environment_file: mlops/azureml/train/train-env.yml
|
||||
- template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
|
||||
parameters:
|
||||
cluster_name: cpu-cluster
|
||||
size: Standard_DS3_v2
|
||||
min_instances: 0
|
||||
max_instances: 4
|
||||
cluster_tier: low_priority
|
||||
- template: templates/${{ variables.version }}/register-data.yml@mlops-templates
|
||||
parameters:
|
||||
data_type: uri_file
|
||||
data_name: taxi-data
|
||||
data_file: mlops/azureml/train/data.yml
|
||||
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
|
||||
parameters:
|
||||
pipeline_file: mlops/azureml/train/pipeline.yml
|
||||
experiment_name: $(environment)_taxi_fare_train_$(Build.SourceBranchName)
|
||||
display_name: $(environment)_taxi_fare_run_$(Build.BuildID)
|
||||
enable_monitoring: $(enable_monitoring)
|
||||
|
|
|
@ -1,44 +1,63 @@
|
|||
name: deploy-batch-endpoint-pipeline
|
||||
|
||||
on:
|
||||
on:
|
||||
workflow_dispatch:
|
||||
jobs:
|
||||
set-env-branch:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
config-file: ${{ steps.set-output-defaults.outputs.config-file }}
|
||||
steps:
|
||||
- id: set-prod-branch
|
||||
name: set-prod-branch
|
||||
if: ${{ github.ref == 'refs/heads/main'}}
|
||||
run: echo "config_env=config-infra-prod.yml" >> $GITHUB_ENV;
|
||||
- id: set-dev-branch
|
||||
name: setdevbranch
|
||||
if: ${{ github.ref != 'refs/heads/main'}}
|
||||
run: echo "config_env=config-infra-dev.yml" >> $GITHUB_ENV;
|
||||
- id: set-output-defaults
|
||||
name: set-output-defaults
|
||||
run: |
|
||||
echo "config-file=$config_env" >> $GITHUB_OUTPUT;
|
||||
get-config:
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: config-infra-prod.yml
|
||||
needs: set-env-branch
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: ${{ needs.set-env-branch.outputs.config-file}}
|
||||
create-compute:
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
|
||||
with:
|
||||
cluster_name: batch-cluster
|
||||
size: STANDARD_DS3_V2
|
||||
min_instances: 0
|
||||
max_instances: 5
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
|
||||
with:
|
||||
cluster_name: batch-cluster
|
||||
size: STANDARD_DS3_V2
|
||||
min_instances: 0
|
||||
max_instances: 5
|
||||
cluster_tier: low_priority
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
create-endpoint:
|
||||
needs: [get-config,create-compute]
|
||||
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/batch/batch-endpoint.yml
|
||||
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
|
||||
endpoint_type: batch
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
needs: [get-config, create-compute]
|
||||
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/batch/batch-endpoint.yml
|
||||
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
|
||||
endpoint_type: batch
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
create-deployment:
|
||||
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
|
||||
needs: [get-config,create-endpoint]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/batch/batch-deployment.yml
|
||||
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
|
||||
endpoint_type: batch
|
||||
deployment_name: eptestdeploy
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
|
||||
needs: [get-config, create-endpoint]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/batch/batch-deployment.yml
|
||||
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.bep) }}
|
||||
endpoint_type: batch
|
||||
deployment_name: eptestdeploy
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
|
|
|
@ -1,42 +1,70 @@
|
|||
name: deploy-model-training-pipeline
|
||||
|
||||
on:
|
||||
on:
|
||||
workflow_dispatch:
|
||||
jobs:
|
||||
set-env-branch:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
config-file: ${{ steps.set-output-defaults.outputs.config-file }}
|
||||
steps:
|
||||
- id: set-prod-branch
|
||||
name: set-prod-branch
|
||||
if: ${{ github.ref == 'refs/heads/main'}}
|
||||
run: echo "config_env=config-infra-prod.yml" >> $GITHUB_ENV;
|
||||
- id: set-dev-branch
|
||||
name: setdevbranch
|
||||
if: ${{ github.ref != 'refs/heads/main'}}
|
||||
run: echo "config_env=config-infra-dev.yml" >> $GITHUB_ENV;
|
||||
- id: set-output-defaults
|
||||
name: set-output-defaults
|
||||
run: |
|
||||
echo "config-file=$config_env" >> $GITHUB_OUTPUT;
|
||||
get-config:
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: config-infra-prod.yml
|
||||
needs: set-env-branch
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: ${{ needs.set-env-branch.outputs.config-file}}
|
||||
register-environment:
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/register-environment.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
environment_file: mlops/azureml/train/train-env.yml
|
||||
conda_file: data-science/environment/train-conda.yml
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/register-environment.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
environment_file: mlops/azureml/train/train-env.yml
|
||||
conda_file: data-science/environment/train-conda.yml
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
register-dataset:
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/register-dataset.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
name: taxi-data
|
||||
data_file: mlops/azureml/train/data.yml
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
create-compute:
|
||||
needs: [get-config,register-environment]
|
||||
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
|
||||
with:
|
||||
cluster_name: cpu-cluster
|
||||
size: Standard_DS3_v2
|
||||
min_instances: 0
|
||||
max_instances: 4
|
||||
cluster_tier: LowPriority
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
run-pipeline:
|
||||
needs: [get-config,register-environment,create-compute]
|
||||
uses: Azure/mlops-templates/.github/workflows/run-pipeline.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
parameters-file: mlops/azureml/train/pipeline.yml
|
||||
job-name: test
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
needs: [get-config]
|
||||
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
|
||||
with:
|
||||
cluster_name: cpu-cluster
|
||||
size: Standard_DS3_v2
|
||||
min_instances: 0
|
||||
max_instances: 4
|
||||
cluster_tier: low_priority
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
run-model-training-pipeline:
|
||||
needs: [get-config, register-environment, register-dataset, create-compute]
|
||||
uses: Azure/mlops-templates/.github/workflows/run-pipeline.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
parameters-file: mlops/azureml/train/pipeline.yml
|
||||
job-name: test
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
|
|
|
@ -1,42 +1,60 @@
|
|||
name: deploy-online-endpoint-pipeline
|
||||
|
||||
on:
|
||||
on:
|
||||
workflow_dispatch:
|
||||
jobs:
|
||||
set-env-branch:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
config-file: ${{ steps.set-output-defaults.outputs.config-file }}
|
||||
steps:
|
||||
- id: set-prod-branch
|
||||
name: set-prod-branch
|
||||
if: ${{ github.ref == 'refs/heads/main'}}
|
||||
run: echo "config_env=config-infra-prod.yml" >> $GITHUB_ENV;
|
||||
- id: set-dev-branch
|
||||
name: setdevbranch
|
||||
if: ${{ github.ref != 'refs/heads/main'}}
|
||||
run: echo "config_env=config-infra-dev.yml" >> $GITHUB_ENV;
|
||||
- id: set-output-defaults
|
||||
name: set-output-defaults
|
||||
run: |
|
||||
echo "config-file=$config_env" >> $GITHUB_OUTPUT;
|
||||
get-config:
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: config-infra-prod.yml
|
||||
needs: set-env-branch
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: ${{ needs.set-env-branch.outputs.config-file}}
|
||||
create-endpoint:
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
|
||||
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.oep) }}
|
||||
endpoint_type: online
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
|
||||
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.oep) }}
|
||||
endpoint_type: online
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
create-deployment:
|
||||
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
|
||||
needs: [get-config,create-endpoint]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/online/online-deployment.yml
|
||||
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.oep) }}
|
||||
endpoint_type: online
|
||||
deployment_name: taxi-online-dp
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
|
||||
needs: [get-config, create-endpoint]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/online/online-deployment.yml
|
||||
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.oep) }}
|
||||
endpoint_type: online
|
||||
deployment_name: taxi-online-dp
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
allocate-traffic:
|
||||
uses: Azure/mlops-templates/.github/workflows/allocate-traffic.yml@main
|
||||
needs: [get-config,create-deployment]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
traffic_allocation: taxi-online-dp=100
|
||||
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.oep) }}
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
uses: Azure/mlops-templates/.github/workflows/allocate-traffic.yml@main
|
||||
needs: [get-config, create-deployment]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
traffic_allocation: taxi-online-dp=100
|
||||
endpoint_name: ${{ format('taxi-gha-{0}', needs.get-config.outputs.oep) }}
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
|
|
|
@ -10,7 +10,7 @@ variables:
|
|||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: python-sdk
|
||||
value: python-sdk-v1
|
||||
|
||||
trigger:
|
||||
- none
|
|
@ -10,7 +10,7 @@ variables:
|
|||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: python-sdk
|
||||
value: python-sdk-v1
|
||||
|
||||
trigger:
|
||||
- none
|
|
@ -10,7 +10,7 @@ variables:
|
|||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: python-sdk
|
||||
value: python-sdk-v1
|
||||
|
||||
trigger:
|
||||
- none
|
|
@ -7,7 +7,8 @@ dependencies:
|
|||
- pip
|
||||
- pip:
|
||||
- azureml-mlflow==1.38.0
|
||||
- azureml-sdk==1.38.0
|
||||
- azure-ai-ml==1.0.0
|
||||
- pyarrow==10.0.0
|
||||
- scikit-learn==0.24.1
|
||||
- pandas==1.2.1
|
||||
- joblib==1.0.0
|
|
@ -0,0 +1,174 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
"""
|
||||
Evaluates trained ML model using test dataset.
|
||||
Saves predictions, evaluation results and deploy flag.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
|
||||
|
||||
import mlflow
|
||||
import mlflow.sklearn
|
||||
import mlflow.pyfunc
|
||||
from mlflow.tracking import MlflowClient
|
||||
|
||||
TARGET_COL = "cost"
|
||||
|
||||
NUMERIC_COLS = [
|
||||
"distance", "dropoff_latitude", "dropoff_longitude", "passengers", "pickup_latitude",
|
||||
"pickup_longitude", "pickup_weekday", "pickup_month", "pickup_monthday", "pickup_hour",
|
||||
"pickup_minute", "pickup_second", "dropoff_weekday", "dropoff_month", "dropoff_monthday",
|
||||
"dropoff_hour", "dropoff_minute", "dropoff_second"
|
||||
]
|
||||
|
||||
CAT_NOM_COLS = [
|
||||
"store_forward", "vendor"
|
||||
]
|
||||
|
||||
CAT_ORD_COLS = [
|
||||
]
|
||||
|
||||
def parse_args():
|
||||
'''Parse input arguments'''
|
||||
|
||||
parser = argparse.ArgumentParser("predict")
|
||||
parser.add_argument("--model_name", type=str, help="Name of registered model")
|
||||
parser.add_argument("--model_input", type=str, help="Path of input model")
|
||||
parser.add_argument("--test_data", type=str, help="Path to test dataset")
|
||||
parser.add_argument("--evaluation_output", type=str, help="Path of eval results")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
def main(args):
|
||||
'''Read trained model and test dataset, evaluate model and save result'''
|
||||
|
||||
# Load the test data
|
||||
test_data = pd.read_parquet(Path(args.test_data))
|
||||
|
||||
# Split the data into inputs and outputs
|
||||
y_test = test_data[TARGET_COL]
|
||||
X_test = test_data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS]
|
||||
|
||||
# Load the model from input port
|
||||
model = mlflow.sklearn.load_model(args.model_input)
|
||||
|
||||
# ---------------- Model Evaluation ---------------- #
|
||||
yhat_test, score = model_evaluation(X_test, y_test, model, args.evaluation_output)
|
||||
|
||||
# ----------------- Model Promotion ---------------- #
|
||||
predictions, deploy_flag = model_promotion(args.model_name, args.evaluation_output, X_test, y_test, yhat_test, score)
|
||||
|
||||
|
||||
|
||||
def model_evaluation(X_test, y_test, model, evaluation_output):
|
||||
|
||||
# Get predictions to y_test (y_test)
|
||||
yhat_test = model.predict(X_test)
|
||||
|
||||
# Save the output data with feature columns, predicted cost, and actual cost in csv file
|
||||
output_data = X_test.copy()
|
||||
output_data["real_label"] = y_test
|
||||
output_data["predicted_label"] = yhat_test
|
||||
output_data.to_csv((Path(evaluation_output) / "predictions.csv"))
|
||||
|
||||
# Evaluate Model performance with the test set
|
||||
r2 = r2_score(y_test, yhat_test)
|
||||
mse = mean_squared_error(y_test, yhat_test)
|
||||
rmse = np.sqrt(mse)
|
||||
mae = mean_absolute_error(y_test, yhat_test)
|
||||
|
||||
# Print score report to a text file
|
||||
(Path(evaluation_output) / "score.txt").write_text(
|
||||
f"Scored with the following model:\n{format(model)}"
|
||||
)
|
||||
with open((Path(evaluation_output) / "score.txt"), "a") as outfile:
|
||||
outfile.write("Mean squared error: {mse.2f} \n")
|
||||
outfile.write("Root mean squared error: {rmse.2f} \n")
|
||||
outfile.write("Mean absolute error: {mae.2f} \n")
|
||||
outfile.write("Coefficient of determination: {r2.2f} \n")
|
||||
|
||||
mlflow.log_metric("test r2", r2)
|
||||
mlflow.log_metric("test mse", mse)
|
||||
mlflow.log_metric("test rmse", rmse)
|
||||
mlflow.log_metric("test mae", mae)
|
||||
|
||||
# Visualize results
|
||||
plt.scatter(y_test, yhat_test, color='black')
|
||||
plt.plot(y_test, y_test, color='blue', linewidth=3)
|
||||
plt.xlabel("Real value")
|
||||
plt.ylabel("Predicted value")
|
||||
plt.title("Comparing Model Predictions to Real values - Test Data")
|
||||
plt.savefig("predictions.png")
|
||||
mlflow.log_artifact("predictions.png")
|
||||
|
||||
return yhat_test, r2
|
||||
|
||||
def model_promotion(model_name, evaluation_output, X_test, y_test, yhat_test, score):
|
||||
|
||||
scores = {}
|
||||
predictions = {}
|
||||
|
||||
client = MlflowClient()
|
||||
|
||||
for model_run in client.search_model_versions(f"name='{model_name}'"):
|
||||
model_version = model_run.version
|
||||
mdl = mlflow.pyfunc.load_model(
|
||||
model_uri=f"models:/{model_name}/{model_version}")
|
||||
predictions[f"{model_name}:{model_version}"] = mdl.predict(X_test)
|
||||
scores[f"{model_name}:{model_version}"] = r2_score(
|
||||
y_test, predictions[f"{model_name}:{model_version}"])
|
||||
|
||||
if scores:
|
||||
if score >= max(list(scores.values())):
|
||||
deploy_flag = 1
|
||||
else:
|
||||
deploy_flag = 0
|
||||
else:
|
||||
deploy_flag = 1
|
||||
print(f"Deploy flag: {deploy_flag}")
|
||||
|
||||
with open((Path(evaluation_output) / "deploy_flag"), 'w') as outfile:
|
||||
outfile.write(f"{int(deploy_flag)}")
|
||||
|
||||
# add current model score and predictions
|
||||
scores["current model"] = score
|
||||
predictions["currrent model"] = yhat_test
|
||||
|
||||
perf_comparison_plot = pd.DataFrame(
|
||||
scores, index=["r2 score"]).plot(kind='bar', figsize=(15, 10))
|
||||
perf_comparison_plot.figure.savefig("perf_comparison.png")
|
||||
perf_comparison_plot.figure.savefig(Path(evaluation_output) / "perf_comparison.png")
|
||||
|
||||
mlflow.log_metric("deploy flag", bool(deploy_flag))
|
||||
mlflow.log_artifact("perf_comparison.png")
|
||||
|
||||
return predictions, deploy_flag
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
mlflow.start_run()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
lines = [
|
||||
f"Model name: {args.model_name}",
|
||||
f"Model path: {args.model_input}",
|
||||
f"Test data path: {args.test_data}",
|
||||
f"Evaluation output path: {args.evaluation_output}",
|
||||
]
|
||||
|
||||
for line in lines:
|
||||
print(line)
|
||||
|
||||
main(args)
|
||||
|
||||
mlflow.end_run()
|
|
@ -0,0 +1,113 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
"""
|
||||
Prepares raw data and provides training, validation and test datasets
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
||||
from pathlib import Path
|
||||
import os
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import mlflow
|
||||
|
||||
# Label column the model predicts.
TARGET_COL = "cost"

# Numeric feature columns (distances, coordinates, and time components).
NUMERIC_COLS = [
    "distance",
    "dropoff_latitude",
    "dropoff_longitude",
    "passengers",
    "pickup_latitude",
    "pickup_longitude",
    "pickup_weekday",
    "pickup_month",
    "pickup_monthday",
    "pickup_hour",
    "pickup_minute",
    "pickup_second",
    "dropoff_weekday",
    "dropoff_month",
    "dropoff_monthday",
    "dropoff_hour",
    "dropoff_minute",
    "dropoff_second",
]

# Nominal (unordered) categorical feature columns.
CAT_NOM_COLS = [
    "store_forward",
    "vendor",
]

# Ordinal categorical feature columns (none for this dataset).
CAT_ORD_COLS = [
]
|
||||
|
||||
def parse_args():
    '''Parse and return command-line arguments for the data-prep step.'''

    parser = argparse.ArgumentParser("prep")
    parser.add_argument("--raw_data", type=str, help="Path to raw data")
    parser.add_argument("--train_data", type=str, help="Path to train dataset")
    # Fix: help text previously said "test dataset" for the validation output.
    parser.add_argument("--val_data", type=str, help="Path to validation dataset")
    parser.add_argument("--test_data", type=str, help="Path to test dataset")

    parser.add_argument("--enable_monitoring", type=str, help="enable logging to ADX")
    parser.add_argument("--table_name", type=str, default="mlmonitoring",
                        help="Table name in ADX for logging")

    args = parser.parse_args()

    return args
|
||||
|
||||
def log_training_data(df, table_name):
    '''Send the prepared dataframe to the ADX monitoring table.'''
    # Imported lazily so the observability client is only required
    # when monitoring is actually enabled.
    from obs.collector import Online_Collector

    Online_Collector(table_name).batch_collect(df)
|
||||
|
||||
def main(args):
    '''Read raw data, split it into train/val/test, and save the splits.

    Reads the raw CSV, keeps only the modeled columns, randomly splits rows
    70/15/15 into train/validation/test, logs split sizes to MLflow, writes
    each split as parquet, and optionally pushes the full dataset to ADX.
    '''

    # ------------ Reading Data ------------ #
    # -------------------------------------- #

    data = pd.read_csv((Path(args.raw_data)))
    data = data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS + [TARGET_COL]]

    # ------------- Split Data ------------- #
    # -------------------------------------- #

    # Random 70/15/15 split. NOTE(review): the RNG is not seeded, so the
    # split differs between runs — confirm this is intended.
    random_data = np.random.rand(len(data))

    msk_train = random_data < 0.7
    msk_val = (random_data >= 0.7) & (random_data < 0.85)
    msk_test = random_data >= 0.85

    train = data[msk_train]
    val = data[msk_val]
    test = data[msk_test]

    mlflow.log_metric('train size', train.shape[0])
    mlflow.log_metric('val size', val.shape[0])
    mlflow.log_metric('test size', test.shape[0])

    train.to_parquet((Path(args.train_data) / "train.parquet"))
    val.to_parquet((Path(args.val_data) / "val.parquet"))
    test.to_parquet((Path(args.test_data) / "test.parquet"))

    # Fix: `.lower` was missing call parentheses, so the comparison was
    # against the bound method object and 'true'/'yes' never enabled
    # monitoring (only '1' worked).
    if (args.enable_monitoring.lower() == 'true'
            or args.enable_monitoring == '1'
            or args.enable_monitoring.lower() == 'yes'):
        log_training_data(data, args.table_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":

    mlflow.start_run()

    # ---------- Parse Arguments ----------- #
    # -------------------------------------- #

    args = parse_args()

    # Echo the resolved arguments so the job logs show the exact inputs.
    for line in (
        f"Raw data path: {args.raw_data}",
        f"Train dataset output path: {args.train_data}",
        f"Val dataset output path: {args.val_data}",
        f"Test dataset path: {args.test_data}",
    ):
        print(line)

    main(args)

    mlflow.end_run()
|
||||
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
"""
|
||||
Registers trained ML model if deploy flag is True.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import pickle
|
||||
import mlflow
|
||||
|
||||
import os
|
||||
import json
|
||||
|
||||
def parse_args():
    '''Parse and return the known command-line arguments.'''

    parser = argparse.ArgumentParser()
    for flag, help_text in (
        ('--model_name', 'Name under which model will be registered'),
        ('--model_path', 'Model directory'),
        ('--evaluation_output', 'Path of eval results'),
        ('--model_info_output_path', 'Path to write model info JSON'),
    ):
        parser.add_argument(flag, type=str, help=help_text)

    # Unknown extra arguments (e.g. injected by the runtime) are ignored.
    args, _ = parser.parse_known_args()
    print(f'Arguments: {args}')

    return args
|
||||
|
||||
|
||||
def main(args):
    '''Load the trained model and register it when the deploy flag is set.

    Reads the deploy flag produced by the evaluation step; when it is 1,
    logs the model to the active MLflow run, registers it in the model
    registry, and writes model_info.json containing "<name>:<version>".
    Otherwise the model is not registered.
    '''

    with open((Path(args.evaluation_output) / "deploy_flag"), 'rb') as infile:
        deploy_flag = int(infile.read())

    mlflow.log_metric("deploy flag", int(deploy_flag))
    # Fix: removed leftover `deploy_flag=1` debug override that forced
    # registration regardless of the evaluation outcome, contradicting
    # this function's documented contract.
    if deploy_flag == 1:

        print("Registering ", args.model_name)

        # load model
        model = mlflow.sklearn.load_model(args.model_path)

        # log model using mlflow
        mlflow.sklearn.log_model(model, args.model_name)

        # register logged model using mlflow
        run_id = mlflow.active_run().info.run_id
        model_uri = f'runs:/{run_id}/{args.model_name}'
        mlflow_model = mlflow.register_model(model_uri, args.model_name)
        model_version = mlflow_model.version

        # Write model info (renamed from `dict`, which shadowed the builtin).
        print("Writing JSON")
        model_info = {"id": "{0}:{1}".format(args.model_name, model_version)}
        output_path = os.path.join(args.model_info_output_path, "model_info.json")
        with open(output_path, "w") as of:
            json.dump(model_info, fp=of)

    else:
        print("Model will not be registered!")
|
||||
|
||||
if __name__ == "__main__":

    mlflow.start_run()

    # ---------- Parse Arguments ----------- #
    # -------------------------------------- #

    args = parse_args()

    # Echo the resolved arguments so the job logs show the exact inputs.
    for line in (
        f"Model name: {args.model_name}",
        f"Model path: {args.model_path}",
        f"Evaluation output path: {args.evaluation_output}",
    ):
        print(line)

    main(args)

    mlflow.end_run()
|
|
@ -0,0 +1,147 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
"""
|
||||
Trains ML model using training dataset. Saves trained model.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
|
||||
|
||||
import mlflow
|
||||
import mlflow.sklearn
|
||||
|
||||
# Label column the model predicts.
TARGET_COL = "cost"

# Numeric feature columns (distances, coordinates, and time components).
NUMERIC_COLS = [
    "distance",
    "dropoff_latitude",
    "dropoff_longitude",
    "passengers",
    "pickup_latitude",
    "pickup_longitude",
    "pickup_weekday",
    "pickup_month",
    "pickup_monthday",
    "pickup_hour",
    "pickup_minute",
    "pickup_second",
    "dropoff_weekday",
    "dropoff_month",
    "dropoff_monthday",
    "dropoff_hour",
    "dropoff_minute",
    "dropoff_second",
]

# Nominal (unordered) categorical feature columns.
CAT_NOM_COLS = [
    "store_forward",
    "vendor",
]

# Ordinal categorical feature columns (none for this dataset).
CAT_ORD_COLS = [
]
|
||||
|
||||
|
||||
def parse_args():
    '''Parse and return command-line arguments for the training step.'''

    parser = argparse.ArgumentParser("train")
    parser.add_argument("--train_data", type=str, help="Path to train dataset")
    parser.add_argument("--model_output", type=str, help="Path of output model")

    # Random-forest hyperparameters; flag names mirror sklearn's
    # RandomForestRegressor constructor arguments.
    regressor_params = (
        ('--regressor__n_estimators', int, 500,
         'Number of trees'),
        ('--regressor__bootstrap', int, 1,
         'Method of selecting samples for training each tree'),
        ('--regressor__max_depth', int, 10,
         ' Maximum number of levels in tree'),
        ('--regressor__max_features', str, 'auto',
         'Number of features to consider at every split'),
        ('--regressor__min_samples_leaf', int, 4,
         'Minimum number of samples required at each leaf node'),
        ('--regressor__min_samples_split', int, 5,
         'Minimum number of samples required to split a node'),
    )
    for flag, flag_type, default, help_text in regressor_params:
        parser.add_argument(flag, type=flag_type, default=default, help=help_text)

    args = parser.parse_args()

    return args
|
||||
|
||||
def main(args):
    '''Read the train split, fit a random-forest regressor, and save it.

    Logs hyperparameters and train-set metrics (r2/mse/rmse/mae) to MLflow,
    saves a predicted-vs-real scatter plot as an artifact, and persists the
    fitted model to ``args.model_output``.
    '''

    # Load the training split.
    train_data = pd.read_parquet(Path(args.train_data))

    # Separate the label from the feature columns.
    y_train = train_data[TARGET_COL]
    X_train = train_data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS]

    # Hyperparameters come straight from the CLI arguments.
    hyperparams = {
        "n_estimators": args.regressor__n_estimators,
        "bootstrap": args.regressor__bootstrap,
        "max_depth": args.regressor__max_depth,
        "max_features": args.regressor__max_features,
        "min_samples_leaf": args.regressor__min_samples_leaf,
        "min_samples_split": args.regressor__min_samples_split,
    }
    model = RandomForestRegressor(random_state=0, **hyperparams)

    # Log model hyperparameters.
    mlflow.log_param("model", "RandomForestRegressor")
    for param_name, param_value in hyperparams.items():
        mlflow.log_param(param_name, param_value)

    # Fit on the training split, then score predictions on the same split.
    model.fit(X_train, y_train)
    yhat_train = model.predict(X_train)

    r2 = r2_score(y_train, yhat_train)
    mse = mean_squared_error(y_train, yhat_train)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_train, yhat_train)

    # Log model performance metrics.
    mlflow.log_metric("train r2", r2)
    mlflow.log_metric("train mse", mse)
    mlflow.log_metric("train rmse", rmse)
    mlflow.log_metric("train mae", mae)

    # Visualize predicted vs. real values and log the plot as an artifact.
    plt.scatter(y_train, yhat_train, color='black')
    plt.plot(y_train, y_train, color='blue', linewidth=3)
    plt.xlabel("Real value")
    plt.ylabel("Predicted value")
    plt.savefig("regression_results.png")
    mlflow.log_artifact("regression_results.png")

    # Persist the trained model.
    mlflow.sklearn.save_model(sk_model=model, path=args.model_output)
|
||||
|
||||
|
||||
if __name__ == "__main__":

    mlflow.start_run()

    # ---------- Parse Arguments ----------- #
    # -------------------------------------- #

    args = parse_args()

    # Echo the resolved arguments so the job logs show the exact inputs.
    for line in (
        f"Train dataset input path: {args.train_data}",
        f"Model output path: {args.model_output}",
        f"n_estimators: {args.regressor__n_estimators}",
        f"bootstrap: {args.regressor__bootstrap}",
        f"max_depth: {args.regressor__max_depth}",
        f"max_features: {args.regressor__max_features}",
        f"min_samples_leaf: {args.regressor__min_samples_leaf}",
        f"min_samples_split: {args.regressor__min_samples_split}",
    ):
        print(line)

    main(args)

    mlflow.end_run()
|
||||
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,3 @@
|
|||
{"input_data": [[2.86,40.66551971,-73.98258972,1,40.69801331,-73.97357178,0,2,1,1,19,21,3,56,1,1,19,21,21,57],
|
||||
[3.98,40.68072128,-73.931633,1,40.6909523,-73.99185181,0,2,0,1,4,21,44,11,0,1,4,21,59,35]]
|
||||
}
|
|
@ -0,0 +1,195 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import argparse
|
||||
|
||||
from azure.ai.ml.entities import Data
|
||||
from azure.ai.ml.constants import AssetTypes
|
||||
|
||||
from azure.identity import DefaultAzureCredential
|
||||
from azure.ai.ml import MLClient
|
||||
|
||||
from azure.ai.ml.entities import AmlCompute
|
||||
from azure.ai.ml.entities import Data
|
||||
from azure.ai.ml.constants import AssetTypes
|
||||
from azure.ai.ml.entities import Environment
|
||||
from azure.ai.ml.dsl import pipeline
|
||||
from azure.ai.ml import Input, Output, command
|
||||
from azure.ai.ml.constants import AssetTypes, InputOutputModes
|
||||
|
||||
import json
|
||||
import yaml
|
||||
import os
|
||||
|
||||
|
||||
def parse_args():
    '''Parse and return command-line arguments for pipeline deployment.'''

    parser = argparse.ArgumentParser("Deploy Training Pipeline")
    parser.add_argument("--experiment_name", type=str, help="Experiment Name")
    parser.add_argument("--compute_name", type=str, help="Compute Cluster Name")
    parser.add_argument("--data_name", type=str, help="Data Asset Name")
    parser.add_argument("--environment_name", type=str, help="Registered Environment Name")
    parser.add_argument("--enable_monitoring", type=str, help="Enable Monitoring", default="false")
    parser.add_argument("--table_name", type=str, help="ADX Monitoring Table Name", default="taximonitoring")

    # Fix: arguments were parsed twice — once into a discarded local and
    # again in the return statement. Parse once and return the result.
    return parser.parse_args()
|
||||
|
||||
|
||||
def main():
    '''Build the taxi training pipeline and submit it to Azure ML.

    Connects to the workspace from config.json, defines the four pipeline
    components (prep, train, evaluate, register), assembles them into a
    pipeline job, submits it under the configured experiment, and streams
    the job output until completion.
    '''
    args = parse_args()
    print(args)

    # Connect to the workspace described by config.json.
    # NOTE(review): a connection failure is only printed, and `ml_client`
    # is then unbound — later use would raise NameError. Kept as-is to
    # preserve behavior; consider re-raising here.
    credential = DefaultAzureCredential()
    try:
        ml_client = MLClient.from_config(credential, path='config.json')

    except Exception as ex:
        print("HERE IN THE EXCEPTION BLOCK")
        print(ex)

    try:
        print(ml_client.compute.get(args.compute_name))
    except Exception:  # Fix: was a bare `except:` (also caught SystemExit etc.)
        print("No compute found")

    print(os.getcwd())
    print('current', os.listdir())

    # Create pipeline job

    # 1. Define components
    parent_dir = "data-science/src"

    prep_data = command(
        name="prep_data",
        display_name="prep-data",
        code=os.path.join(parent_dir, "prep"),
        command="python prep.py \
                --raw_data ${{inputs.raw_data}} \
                --train_data ${{outputs.train_data}} \
                --val_data ${{outputs.val_data}} \
                --test_data ${{outputs.test_data}} \
                --enable_monitoring ${{inputs.enable_monitoring}} \
                --table_name ${{inputs.table_name}}",
        environment=args.environment_name+"@latest",
        inputs={
            "raw_data": Input(type="uri_file"),
            "enable_monitoring": Input(type="string"),
            "table_name": Input(type="string")
        },
        outputs={
            "train_data": Output(type="uri_folder"),
            "val_data": Output(type="uri_folder"),
            "test_data": Output(type="uri_folder"),
        }
    )

    train_model = command(
        name="train_model",
        display_name="train-model",
        code=os.path.join(parent_dir, "train"),
        command="python train.py \
                --train_data ${{inputs.train_data}} \
                --model_output ${{outputs.model_output}}",
        environment=args.environment_name+"@latest",
        inputs={"train_data": Input(type="uri_folder")},
        outputs={"model_output": Output(type="uri_folder")}
    )

    evaluate_model = command(
        name="evaluate_model",
        display_name="evaluate-model",
        code=os.path.join(parent_dir, "evaluate"),
        command="python evaluate.py \
                --model_name ${{inputs.model_name}} \
                --model_input ${{inputs.model_input}} \
                --test_data ${{inputs.test_data}} \
                --evaluation_output ${{outputs.evaluation_output}}",
        environment=args.environment_name+"@latest",
        inputs={
            "model_name": Input(type="string"),
            "model_input": Input(type="uri_folder"),
            "test_data": Input(type="uri_folder")
        },
        outputs={
            "evaluation_output": Output(type="uri_folder")
        }
    )

    register_model = command(
        name="register_model",
        display_name="register-model",
        code=os.path.join(parent_dir, "register"),
        command="python register.py \
                --model_name ${{inputs.model_name}} \
                --model_path ${{inputs.model_path}} \
                --evaluation_output ${{inputs.evaluation_output}} \
                --model_info_output_path ${{outputs.model_info_output_path}}",
        environment=args.environment_name+"@latest",
        inputs={
            "model_name": Input(type="string"),
            "model_path": Input(type="uri_folder"),
            "evaluation_output": Input(type="uri_folder")
        },
        outputs={
            "model_info_output_path": Output(type="uri_folder")
        }
    )

    # 2. Construct pipeline
    @pipeline()
    def taxi_training_pipeline(raw_data, enable_monitoring, table_name):
        '''Wire prep -> train -> evaluate -> register into one pipeline.'''

        prep = prep_data(
            raw_data=raw_data,
            enable_monitoring=enable_monitoring,
            table_name=table_name
        )

        train = train_model(
            train_data=prep.outputs.train_data
        )

        evaluate = evaluate_model(
            model_name="taxi-model",
            model_input=train.outputs.model_output,
            test_data=prep.outputs.test_data
        )

        register = register_model(
            model_name="taxi-model",
            model_path=train.outputs.model_output,
            evaluation_output=evaluate.outputs.evaluation_output
        )

        return {
            "pipeline_job_train_data": prep.outputs.train_data,
            "pipeline_job_test_data": prep.outputs.test_data,
            "pipeline_job_trained_model": train.outputs.model_output,
            "pipeline_job_score_report": evaluate.outputs.evaluation_output,
        }

    pipeline_job = taxi_training_pipeline(
        Input(path=args.data_name + "@latest", type="uri_file"), args.enable_monitoring, args.table_name
    )

    # set pipeline level compute
    pipeline_job.settings.default_compute = args.compute_name
    # set pipeline level datastore
    pipeline_job.settings.default_datastore = "workspaceblobstore"

    pipeline_job = ml_client.jobs.create_or_update(
        pipeline_job, experiment_name=args.experiment_name
    )

    # Fix: removed a bare `pipeline_job` expression statement that had no effect.
    ml_client.jobs.stream(pipeline_job.name)


if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,66 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: python-sdk-v2
|
||||
- name: endpoint_name
|
||||
value: taxi-batch-$(namespace)$(postfix)$(environment)
|
||||
|
||||
trigger:
|
||||
- none
|
||||
|
||||
pool:
|
||||
vmImage: ubuntu-20.04
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: mlops-templates # Template Repo
|
||||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
ref: main
|
||||
|
||||
stages:
|
||||
- stage: CreateBatchEndpoint
|
||||
displayName: Create/Update Batch Endpoint
|
||||
jobs:
|
||||
- job: DeployBatchEndpoint
|
||||
steps:
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/aml-cli-v2/install-az-cli.yml@mlops-templates
|
||||
- template: templates/aml-cli-v2/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/aml-cli-v2/connect-to-workspace.yml@mlops-templates
|
||||
- template: templates/aml-cli-v2/create-compute.yml@mlops-templates
|
||||
parameters:
|
||||
cluster_name: batch-cluster # name must match cluster name in deployment file below
|
||||
size: STANDARD_DS3_V2
|
||||
min_instances: 0
|
||||
max_instances: 5
|
||||
cluster_tier: dedicated
|
||||
- template: templates/${{ variables.version }}/create-batch-endpoint.yml@mlops-templates
|
||||
parameters:
|
||||
endpoint_name: "${{ variables.endpoint_name }}"
|
||||
endpoint_description: "Taxi batch endpoint"
|
||||
auth_mode: "aad_token"
|
||||
- template: templates/${{ variables.version }}/create-batch-deployment.yml@mlops-templates
|
||||
parameters:
|
||||
deployment_name: taxi-batch-dp
|
||||
deployment_description: "Taxi batch deployment"
|
||||
endpoint_name: "${{ variables.endpoint_name }}"
|
||||
model_path: "taxi-model@latest"
|
||||
compute: batch-cluster
|
||||
- template: templates/${{ variables.version }}/test-batch-endpoint.yml@mlops-templates
|
||||
parameters:
|
||||
endpoint_name: "${{ variables.endpoint_name }}"
|
||||
sample_request: data/taxi-batch.csv
|
||||
request_type: uri_file #either uri_folder or uri_file
|
|
@ -0,0 +1,70 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: python-sdk-v2
|
||||
|
||||
trigger:
|
||||
- none
|
||||
|
||||
pool:
|
||||
vmImage: ubuntu-20.04
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: mlops-templates # Template Repo
|
||||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
ref: main
|
||||
|
||||
stages:
|
||||
- stage: DeployTrainingPipeline
|
||||
displayName: Deploy Training Pipeline
|
||||
jobs:
|
||||
- job: DeployTrainingPipeline
|
||||
timeoutInMinutes: 120 # how long to run the job before automatically cancelling
|
||||
steps:
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/aml-cli-v2/install-az-cli.yml@mlops-templates
|
||||
- template: templates/aml-cli-v2/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/python-sdk-v2/install-requirements.yml@mlops-templates
|
||||
- template: templates/aml-cli-v2/connect-to-workspace.yml@mlops-templates
|
||||
- template: templates/aml-cli-v2/create-compute.yml@mlops-templates
|
||||
parameters:
|
||||
cluster_name: cpu-cluster
|
||||
size: Standard_DS3_v2
|
||||
min_instances: 0
|
||||
max_instances: 4
|
||||
cluster_tier: low_priority
|
||||
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
|
||||
parameters:
|
||||
environment_name: taxi-train-env
|
||||
environment_description: "Training Environment for Taxi Pipeline"
|
||||
environment_path: data-science/environment/train-conda.yml
|
||||
build_type: conda
|
||||
- template: templates/${{ variables.version }}/register-data-asset.yml@mlops-templates
|
||||
parameters:
|
||||
data_name: taxi-data
|
||||
data_description: taxi-training-dataset
|
||||
data_path: data/taxi-data.csv
|
||||
data_type: uri_file
|
||||
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
|
||||
parameters:
|
||||
pipeline_path: mlops/azureml/train/run_pipeline.py
|
||||
experiment_name: taxi-train-pipeline
|
||||
data_name: taxi-data
|
||||
environment_name: taxi-train-env
|
||||
compute_name: cpu-cluster
|
||||
enable_monitoring: $(enable_monitoring)
|
||||
table_name: "taximonitoring"
|
|
@ -0,0 +1,57 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: python-sdk-v2
|
||||
- name: endpoint_name
|
||||
value: taxi-online-$(namespace)$(postfix)$(environment)
|
||||
|
||||
trigger:
|
||||
- none
|
||||
|
||||
pool:
|
||||
vmImage: ubuntu-20.04
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: mlops-templates # Template Repo
|
||||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
ref: main
|
||||
|
||||
stages:
|
||||
- stage: CreateOnlineEndpoint
|
||||
displayName: Create/Update Online Endpoint
|
||||
jobs:
|
||||
- job: DeployOnlineEndpoint
|
||||
steps:
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/aml-cli-v2/install-az-cli.yml@mlops-templates
|
||||
- template: templates/aml-cli-v2/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/aml-cli-v2/connect-to-workspace.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/create-online-endpoint.yml@mlops-templates
|
||||
parameters:
|
||||
endpoint_name: "${{ variables.endpoint_name }}"
|
||||
endpoint_description: "Taxi Online Endpoint"
|
||||
auth_mode: "aml_token"
|
||||
- template: templates/${{ variables.version }}/create-online-deployment.yml@mlops-templates
|
||||
parameters:
|
||||
deployment_name: taxi-online-dp
|
||||
endpoint_name: "${{ variables.endpoint_name }}"
|
||||
model_path: "taxi-model@latest"
|
||||
traffic_allocation: 100
|
||||
- template: templates/${{ variables.version }}/test-online-endpoint.yml@mlops-templates
|
||||
parameters:
|
||||
endpoint_name: "${{ variables.endpoint_name }}"
|
||||
sample_request: data/taxi-request.json
|
|
@ -1,16 +0,0 @@
|
|||
channels:
|
||||
- defaults
|
||||
- anaconda
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.7.5
|
||||
- pip
|
||||
- pip:
|
||||
- azureml-mlflow==1.38.0
|
||||
- azureml-sdk==1.38.0
|
||||
- scikit-learn==0.24.1
|
||||
- pandas==1.2.1
|
||||
- joblib==1.0.0
|
||||
- matplotlib==3.3.3
|
||||
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-client
|
||||
- git+https://github.com/microsoft/AzureML-Observability#subdirectory=aml-obs-collector
|
|
@ -1,6 +1,6 @@
|
|||
azureml-mlflow==1.38.0
|
||||
azureml-sdk==1.38.0
|
||||
pyarrow==10.0.0
|
||||
scikit-learn==0.24.1
|
||||
pandas==1.2.1
|
||||
joblib==1.2.0
|
||||
matplotlib==3.3.3
|
|
@ -71,11 +71,7 @@ def main(args):
|
|||
# ------------ Reading Data ------------ #
|
||||
# -------------------------------------- #
|
||||
|
||||
print("mounted_path files: ")
|
||||
arr = os.listdir(args.raw_data)
|
||||
print(arr)
|
||||
|
||||
data = pd.read_csv((Path(args.raw_data) / 'taxi-data.csv'))
|
||||
data = pd.read_csv((Path(args.raw_data)))
|
||||
data = data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS + [TARGET_COL]]
|
||||
|
||||
# ------------- Split Data ------------- #
|
||||
|
@ -101,7 +97,7 @@ def main(args):
|
|||
val.to_parquet((Path(args.val_data) / "val.parquet"))
|
||||
test.to_parquet((Path(args.test_data) / "test.parquet"))
|
||||
|
||||
if (args.enable_monitoring.lower == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower == 'yes'):
|
||||
if (args.enable_monitoring.lower() == 'true' or args.enable_monitoring == '1' or args.enable_monitoring.lower() == 'yes'):
|
||||
log_training_data(data, args.table_name)
|
||||
|
||||
|
||||
|
@ -129,4 +125,4 @@ if __name__ == "__main__":
|
|||
|
||||
mlflow.end_run()
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -82,7 +82,9 @@ def test_prep_data():
|
|||
df = pd.DataFrame(data)
|
||||
df.to_csv(os.path.join(raw_data, "taxi-data.csv"))
|
||||
|
||||
raw_data= os.path.join(raw_data, "taxi-data.csv")
|
||||
cmd = f"python data-science/src/prep/prep.py --raw_data={raw_data} --train_data={train_data} --val_data={val_data} --test_data={test_data}"
|
||||
|
||||
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
|
||||
out, err = p.communicate()
|
||||
result = str(out).split('\\n')
|
||||
|
|
|
@ -2,63 +2,75 @@
|
|||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: aml-cli-v2
|
||||
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: aml-cli-v2
|
||||
|
||||
trigger:
|
||||
- none
|
||||
- none
|
||||
|
||||
pool:
|
||||
vmImage: ubuntu-20.04
|
||||
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: mlops-templates # Template Repo
|
||||
- repository: mlops-templates # Template Repo
|
||||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
ref: main
|
||||
- repository: rai-vnext-preview # Template Repo
|
||||
- repository: rai-vnext-preview # Template Repo
|
||||
name: Azure/rai-vnext-preview # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
ref: main
|
||||
|
||||
stages:
|
||||
- stage: DeployTrainingPipeline
|
||||
displayName: Deploy Training Pipeline
|
||||
jobs:
|
||||
- job: DeployTrainingPipeline
|
||||
timeoutInMinutes: 120 # how long to run the job before automatically cancelling
|
||||
steps:
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- checkout: rai-vnext-preview
|
||||
path: s/
|
||||
- template: templates/tests/unit-tests.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
|
||||
parameters:
|
||||
build_type: conda
|
||||
environment_name: taxi-train-env
|
||||
environment_file: mlops/azureml/train/train-env.yml
|
||||
enable_monitoring: $(enable_monitoring)
|
||||
- template: register-rai-components.yml
|
||||
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
|
||||
parameters:
|
||||
pipeline_file: mlops/azureml/train/pipeline.yml
|
||||
experiment_name: $(environment)_taxi_fare_train_$(Build.SourceBranchName)
|
||||
display_name: $(environment)_taxi_fare_run_$(Build.BuildID)
|
||||
enable_monitoring: $(enable_monitoring)
|
||||
- stage: DeployTrainingPipeline
|
||||
displayName: Deploy Training Pipeline
|
||||
jobs:
|
||||
- job: DeployTrainingPipeline
|
||||
timeoutInMinutes: 120 # how long to run the job before automatically cancelling
|
||||
steps:
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/tests/unit-tests.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
|
||||
parameters:
|
||||
environment_name: taxi-train-env
|
||||
environment_file: mlops/azureml/train/train-env.yml
|
||||
- template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
|
||||
parameters:
|
||||
cluster_name: cpu-cluster
|
||||
size: Standard_DS3_v2
|
||||
min_instances: 0
|
||||
max_instances: 4
|
||||
cluster_tier: low_priority
|
||||
- template: templates/${{ variables.version }}/register-data.yml@mlops-templates
|
||||
parameters:
|
||||
data_type: uri_file
|
||||
data_name: taxi-data
|
||||
data_file: mlops/azureml/train/data.yml
|
||||
- checkout: rai-vnext-preview
|
||||
path: s/
|
||||
- template: register-rai-components.yml
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
|
||||
parameters:
|
||||
pipeline_file: mlops/azureml/train/pipeline.yml
|
||||
experiment_name: $(environment)_taxi_fare_train_$(Build.SourceBranchName)
|
||||
display_name: $(environment)_taxi_fare_run_$(Build.BuildID)
|
||||
enable_monitoring: $(enable_monitoring)
|
||||
|
|
|
@ -1,15 +1,14 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
# Dev environment
|
||||
variables:
|
||||
|
||||
# Global
|
||||
ap_vm_image: ubuntu-20.04
|
||||
|
||||
namespace: mlopsv2 #Note: A namespace with many characters will cause storage account creation to fail due to storage account names having a limit of 24 characters.
|
||||
postfix: 0659
|
||||
location: westus
|
||||
|
||||
postfix: 0001
|
||||
location: eastus
|
||||
environment: dev
|
||||
enable_aml_computecluster: true
|
||||
enable_monitoring: false
|
||||
|
@ -29,8 +28,9 @@ variables:
|
|||
storage_account: st$(namespace)$(postfix)$(environment)
|
||||
|
||||
# For terraform reference
|
||||
terraform_version: 0.14.7
|
||||
terraform_version: 1.3.6
|
||||
terraform_workingdir: infrastructure
|
||||
terraform_st_location: $(location)
|
||||
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf
|
||||
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tf
|
||||
terraform_st_container_name: default
|
||||
|
|
|
@ -3,18 +3,16 @@
|
|||
|
||||
# Prod environment
|
||||
variables:
|
||||
|
||||
# Global
|
||||
ap_vm_image: ubuntu-20.04
|
||||
|
||||
namespace: mlopsv2 #Note: A namespace with many characters will cause storage account creation to fail due to storage account names having a limit of 24 characters.
|
||||
postfix: 0518
|
||||
location: westeurope
|
||||
postfix: 0001
|
||||
location: eastus
|
||||
environment: prod
|
||||
enable_aml_computecluster: true
|
||||
enable_monitoring: false
|
||||
|
||||
|
||||
# Azure DevOps
|
||||
ado_service_connection_rg: Azure-ARM-Prod
|
||||
ado_service_connection_aml_ws: Azure-ARM-Prod
|
||||
|
@ -30,8 +28,9 @@ variables:
|
|||
storage_account: st$(namespace)$(postfix)$(environment)
|
||||
|
||||
# For terraform reference
|
||||
terraform_version: 0.14.7
|
||||
terraform_version: 1.3.6
|
||||
terraform_workingdir: infrastructure
|
||||
terraform_st_location: $(location)
|
||||
terraform_st_resource_group: rg-$(namespace)-$(postfix)$(environment)-tf
|
||||
terraform_st_storage_account: st$(namespace)$(postfix)$(environment)tf
|
||||
terraform_st_container_name: default
|
||||
|
|
|
@ -1,51 +1,90 @@
|
|||
name: deploy-cv-model-training-pipeline
|
||||
|
||||
on:
|
||||
on:
|
||||
workflow_dispatch:
|
||||
jobs:
|
||||
set-env-branch:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
config-file: ${{ steps.set-output-defaults.outputs.config-file }}
|
||||
steps:
|
||||
- id: set-prod-branch
|
||||
name: set-prod-branch
|
||||
if: ${{ github.ref == 'refs/heads/main'}}
|
||||
run: echo "config_env=config-infra-prod.yml" >> $GITHUB_ENV;
|
||||
- id: set-dev-branch
|
||||
name: setdevbranch
|
||||
if: ${{ github.ref != 'refs/heads/main'}}
|
||||
run: echo "config_env=config-infra-dev.yml" >> $GITHUB_ENV;
|
||||
- id: set-output-defaults
|
||||
name: set-output-defaults
|
||||
run: |
|
||||
echo "config-file=$config_env" >> $GITHUB_OUTPUT;
|
||||
get-config:
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: config-infra-prod.yml
|
||||
create-compute:
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
|
||||
with:
|
||||
cluster_name: gpu-cluster
|
||||
size: Standard_NC6
|
||||
min_instances: 0
|
||||
max_instances: 1
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
needs: set-env-branch
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: ${{ needs.set-env-branch.outputs.config-file}}
|
||||
create-dataprep-compute:
|
||||
needs: [get-config]
|
||||
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
|
||||
with:
|
||||
cluster_name: cpu-cluster
|
||||
size: Standard_DS3_v2
|
||||
min_instances: 0
|
||||
max_instances: 4
|
||||
cluster_tier: low_priority
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
create-training-compute:
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/create-compute.yml@main
|
||||
with:
|
||||
cluster_name: gpu-cluster
|
||||
size: Standard_NC6
|
||||
min_instances: 0
|
||||
max_instances: 1
|
||||
cluster_tier: low_priority
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
register-environment:
|
||||
needs: [get-config,create-compute]
|
||||
uses: ./.github/workflows/register-environment.yml
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
environment_file: mlops/azureml/train/train-env.yaml
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
needs: [get-config, create-dataprep-compute, create-training-compute]
|
||||
uses: Azure/mlops-templates/.github/workflows/register-environment.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
environment_file: mlops/azureml/train/train-env.yaml
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
register-dataset:
|
||||
needs: [get-config,register-environment]
|
||||
uses: Azure/mlops-templates/.github/workflows/register-dataset.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
data_file: mlops/azureml/train/create_stanford_dogs_dataset.yaml
|
||||
file_type: Training
|
||||
name: stanford_dogs
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
run-pipeline:
|
||||
needs: [get-config,register-dataset]
|
||||
uses: Azure/mlops-templates/.github/workflows/run-pipeline.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
parameters-file: mlops/azureml/train/pipeline.yaml
|
||||
job-name: cv-train
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
needs: [get-config, register-environment]
|
||||
uses: Azure/mlops-templates/.github/workflows/register-dataset.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
data_file: mlops/azureml/train/create_stanford_dogs_dataset.yaml
|
||||
file_type: Training
|
||||
name: stanford_dogs
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
run-model-training-pipeline:
|
||||
needs:
|
||||
[
|
||||
get-config,
|
||||
create-dataprep-compute,
|
||||
create-training-compute,
|
||||
register-environment,
|
||||
register-dataset,
|
||||
]
|
||||
uses: Azure/mlops-templates/.github/workflows/run-pipeline.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
parameters-file: mlops/azureml/train/pipeline.yaml
|
||||
job-name: cv-train
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
|
|
|
@ -1,42 +1,60 @@
|
|||
name: deploy-online-endpoint-pipeline
|
||||
|
||||
on:
|
||||
on:
|
||||
workflow_dispatch:
|
||||
jobs:
|
||||
set-env-branch:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
config-file: ${{ steps.set-output-defaults.outputs.config-file }}
|
||||
steps:
|
||||
- id: set-prod-branch
|
||||
name: set-prod-branch
|
||||
if: ${{ github.ref == 'refs/heads/main'}}
|
||||
run: echo "config_env=config-infra-prod.yml" >> $GITHUB_ENV;
|
||||
- id: set-dev-branch
|
||||
name: setdevbranch
|
||||
if: ${{ github.ref != 'refs/heads/main'}}
|
||||
run: echo "config_env=config-infra-dev.yml" >> $GITHUB_ENV;
|
||||
- id: set-output-defaults
|
||||
name: set-output-defaults
|
||||
run: |
|
||||
echo "config-file=$config_env" >> $GITHUB_OUTPUT;
|
||||
get-config:
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: config-infra-prod.yml
|
||||
needs: set-env-branch
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: ${{ needs.set-env-branch.outputs.config-file}}
|
||||
create-endpoint:
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
|
||||
endpoint_name: dogs-classifier-online2
|
||||
endpoint_type: online
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
needs: get-config
|
||||
uses: Azure/mlops-templates/.github/workflows/create-endpoint.yml@main
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/online/online-endpoint.yml
|
||||
endpoint_name: dogs-classifier-online2
|
||||
endpoint_type: online
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
create-deployment:
|
||||
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
|
||||
needs: [get-config,create-endpoint]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/online/online-deployment.yml
|
||||
endpoint_name: dogs-classifier-online2
|
||||
endpoint_type: online
|
||||
deployment_name: dogs-online-dp
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
uses: Azure/mlops-templates/.github/workflows/create-deployment.yml@main
|
||||
needs: [get-config, create-endpoint]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
endpoint_file: mlops/azureml/deploy/online/online-deployment.yml
|
||||
endpoint_name: dogs-classifier-online2
|
||||
endpoint_type: online
|
||||
deployment_name: dogs-online-dp
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
allocate-traffic:
|
||||
uses: Azure/mlops-templates/.github/workflows/allocate-traffic.yml@main
|
||||
needs: [get-config,create-deployment]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
traffic_allocation: dogs-online-dp=100
|
||||
endpoint_name: dogs-classifier-online2
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
uses: Azure/mlops-templates/.github/workflows/allocate-traffic.yml@main
|
||||
needs: [get-config, create-deployment]
|
||||
with:
|
||||
resource_group: ${{ needs.get-config.outputs.resource_group }}
|
||||
workspace_name: ${{ needs.get-config.outputs.aml_workspace }}
|
||||
traffic_allocation: dogs-online-dp=100
|
||||
endpoint_name: dogs-classifier-online2
|
||||
secrets:
|
||||
creds: ${{secrets.AZURE_CREDENTIALS}}
|
||||
|
|
Не удается отобразить этот файл, потому что он слишком большой.
|
|
@ -0,0 +1,103 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
- template: ../../config-aml.yml
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: python-sdk-v1
|
||||
|
||||
trigger:
|
||||
- none
|
||||
|
||||
pool:
|
||||
vmImage: $(ap_vm_image)
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: mlops-templates # Template Repo
|
||||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
ref: main
|
||||
|
||||
stages:
|
||||
- stage: DeployTrainingPipeline
|
||||
displayName: Deploy Training Pipeline
|
||||
jobs:
|
||||
- job: DeployTrainingPipeline
|
||||
steps:
|
||||
# Setup
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
|
||||
|
||||
# Environment
|
||||
- template: templates/${{ variables.version }}/create-environment.yml@mlops-templates
|
||||
parameters:
|
||||
environment_name: $(training_env_name)
|
||||
build_type: folder
|
||||
environment_file: $(training_env_path)
|
||||
|
||||
# Compute
|
||||
- template: templates/${{ variables.version }}/get-compute.yml@mlops-templates
|
||||
parameters:
|
||||
compute_type: training
|
||||
|
||||
# Datasets (images + labels)
|
||||
# Images dataset
|
||||
- task: Bash@3
|
||||
displayName: "Download data"
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
mkdir -p $(training_dataset_local_path)
|
||||
curl $(training_dataset_storage_url) | tar xvf - --no-same-owner -C $(training_dataset_local_path)
|
||||
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
|
||||
parameters:
|
||||
data_type: training
|
||||
# Labels dataset
|
||||
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
|
||||
parameters:
|
||||
data_type: training
|
||||
datasetName: $(labels_dataset_name)
|
||||
datasetDescription: $(labels_dataset_description)
|
||||
datasetLocalPath: $(labels_dataset_local_path)
|
||||
datasetPathOnDatastore: $(labels_dataset_path_on_datastore)
|
||||
datasetType: $(labels_dataset_type)
|
||||
|
||||
# Deploy training pipeline
|
||||
- template: templates/${{ variables.version }}/deploy-training-pipeline.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/add-pipeline-to-endpoint.yml@mlops-templates
|
||||
- task: Bash@3
|
||||
name: export_pipeline_id
|
||||
displayName: "Export Pipeline ID"
|
||||
inputs:
|
||||
targetType: "inline"
|
||||
script: |
|
||||
echo "##vso[task.setvariable variable=pipeline_id;isOutput=true;]$(pipeline_id)"
|
||||
|
||||
# Run training
|
||||
- job: invoke_pipeline
|
||||
displayName: "Invoke pipeline"
|
||||
pool: server
|
||||
timeoutInMinutes: 0
|
||||
dependsOn: DeployTrainingPipeline
|
||||
variables:
|
||||
pipeline_id: $[ dependencies.DeployTrainingPipeline.outputs['export_pipeline_id.pipeline_id'] ]
|
||||
steps:
|
||||
- task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
|
||||
displayName: "Invoke AML Pipeline"
|
||||
inputs:
|
||||
azureSubscription: "$(ado_service_connection_aml_ws)"
|
||||
PipelineId: "$(PIPELINE_ID)"
|
||||
ExperimentName: "$(training_experiment_name)"
|
|
@ -1,105 +0,0 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
variables:
|
||||
- template: ../../config-aml.yml
|
||||
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'main' branch: PRD environment
|
||||
- template: ../../config-infra-prod.yml
|
||||
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
|
||||
# 'develop' or feature branches: DEV environment
|
||||
- template: ../../config-infra-dev.yml
|
||||
- name: version
|
||||
value: python-sdk
|
||||
|
||||
|
||||
trigger:
|
||||
- none
|
||||
|
||||
pool:
|
||||
vmImage: $(ap_vm_image)
|
||||
|
||||
|
||||
resources:
|
||||
repositories:
|
||||
- repository: mlops-templates # Template Repo
|
||||
name: Azure/mlops-templates # need to change org name from "Azure" to your own org
|
||||
endpoint: github-connection # need to set up and hardcode
|
||||
type: github
|
||||
|
||||
stages:
|
||||
- stage: DeployTrainingPipeline
|
||||
displayName: Deploy Training Pipeline
|
||||
jobs:
|
||||
- job: DeployTrainingPipeline
|
||||
steps:
|
||||
|
||||
# Setup
|
||||
- checkout: self
|
||||
path: s/
|
||||
- checkout: mlops-templates
|
||||
path: s/templates/
|
||||
- template: templates/${{ variables.version }}/install-az-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/install-aml-cli.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/connect-to-workspace.yml@mlops-templates
|
||||
|
||||
# Environment
|
||||
- template: templates/${{ variables.version }}/create-environment.yml@mlops-templates
|
||||
parameters:
|
||||
environment_name: $(training_env_name)
|
||||
build_type: folder
|
||||
environment_file: $(training_env_path)
|
||||
|
||||
# Compute
|
||||
- template: templates/${{ variables.version }}/get-compute.yml@mlops-templates
|
||||
parameters:
|
||||
compute_type: training
|
||||
|
||||
# Datasets (images + labels)
|
||||
# Images dataset
|
||||
- task: Bash@3
|
||||
displayName: 'Download data'
|
||||
inputs:
|
||||
targetType: inline
|
||||
script: |
|
||||
mkdir -p $(training_dataset_local_path)
|
||||
curl $(training_dataset_storage_url) | tar xvf - --no-same-owner -C $(training_dataset_local_path)
|
||||
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
|
||||
parameters:
|
||||
data_type: training
|
||||
# Labels dataset
|
||||
- template: templates/${{ variables.version }}/register-dataset.yml@mlops-templates
|
||||
parameters:
|
||||
data_type: training
|
||||
datasetName: $(labels_dataset_name)
|
||||
datasetDescription: $(labels_dataset_description)
|
||||
datasetLocalPath: $(labels_dataset_local_path)
|
||||
datasetPathOnDatastore: $(labels_dataset_path_on_datastore)
|
||||
datasetType: $(labels_dataset_type)
|
||||
|
||||
# Deploy training pipeline
|
||||
- template: templates/${{ variables.version }}/deploy-training-pipeline.yml@mlops-templates
|
||||
- template: templates/${{ variables.version }}/add-pipeline-to-endpoint.yml@mlops-templates
|
||||
- task: Bash@3
|
||||
name: export_pipeline_id
|
||||
displayName: "Export Pipeline ID"
|
||||
inputs:
|
||||
targetType: "inline"
|
||||
script: |
|
||||
echo "##vso[task.setvariable variable=pipeline_id;isOutput=true;]$(pipeline_id)"
|
||||
|
||||
# Run training
|
||||
- job: invoke_pipeline
|
||||
displayName: 'Invoke pipeline'
|
||||
pool: server
|
||||
timeoutInMinutes: 0
|
||||
dependsOn: DeployTrainingPipeline
|
||||
variables:
|
||||
pipeline_id: $[ dependencies.DeployTrainingPipeline.outputs['export_pipeline_id.pipeline_id'] ]
|
||||
steps:
|
||||
- task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
|
||||
displayName: 'Invoke AML Pipeline'
|
||||
inputs:
|
||||
azureSubscription: '$(ado_service_connection_aml_ws)'
|
||||
PipelineId: '$(PIPELINE_ID)'
|
||||
ExperimentName: '$(training_experiment_name)'
|
|
@ -0,0 +1,63 @@
|
|||
name: tf-gha-deploy-infra.yml
|
||||
|
||||
on:
|
||||
#push:
|
||||
workflow_dispatch:
|
||||
env:
|
||||
config_env: "none"
|
||||
jobs:
|
||||
set-env-branch:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
config-file: ${{ steps.set-output-defaults.outputs.config-file }}
|
||||
steps:
|
||||
- id: set-prod-branch
|
||||
name: set-prod-branch
|
||||
if: ${{ github.ref == 'refs/heads/main'}}
|
||||
run: echo "config_env=config-infra-prod.yml" >> $GITHUB_ENV;
|
||||
- id: set-dev-branch
|
||||
name: setdevbranch
|
||||
if: ${{ github.ref != 'refs/heads/main'}}
|
||||
run: echo "config_env=config-infra-dev.yml" >> $GITHUB_ENV;
|
||||
- id: set-output-defaults
|
||||
name: set-output-defaults
|
||||
run: |
|
||||
echo "config-file=$config_env" >> $GITHUB_OUTPUT;
|
||||
get-config:
|
||||
needs: set-env-branch
|
||||
uses: Azure/mlops-templates/.github/workflows/read-yaml.yml@main
|
||||
with:
|
||||
file_name: ${{ needs.set-env-branch.outputs.config-file}}
|
||||
test-terraform-state-deployment:
|
||||
needs: [get-config, set-env-branch]
|
||||
uses: Azure/mlops-templates/.github/workflows/tf-gha-install-terraform.yml@main
|
||||
with:
|
||||
TFAction: "apply"
|
||||
dply_environment: ${{ needs.set-env-branch.outputs.config-file }}
|
||||
location: ${{ needs.get-config.outputs.location }}
|
||||
namespace: ${{ needs.get-config.outputs.namespace }}
|
||||
postfix: ${{ needs.get-config.outputs.postfix }}
|
||||
environment: ${{ needs.get-config.outputs.environment }}
|
||||
enable_aml_computecluster: ${{ needs.get-config.outputs.enable_aml_computecluster == true }} ## TODO review the evaluation of boolean
|
||||
enable_monitoring: ${{ needs.get-config.outputs.enable_monitoring == true }} ## TODO review the evaluation of boolean
|
||||
terraform_version: ${{ needs.get-config.outputs.terraform_version }}
|
||||
terraform_workingdir: ${{ needs.get-config.outputs.terraform_workingdir }}
|
||||
terraform_st_location: ${{ needs.get-config.outputs.terraform_st_location }}
|
||||
terraform_st_storage_account: ${{ needs.get-config.outputs.terraform_st_storage_account }}
|
||||
terraform_st_resource_group: ${{ needs.get-config.outputs.terraform_st_resource_group }}
|
||||
terraform_st_container_name: ${{ needs.get-config.outputs.terraform_st_container_name }}
|
||||
terraform_st_key: ${{ needs.get-config.outputs.terraform_st_key }}
|
||||
terraform_plan_location: ${{ needs.get-config.outputs.location }}
|
||||
terraform_plan_vnet: "TBD" # TBD
|
||||
secrets:
|
||||
azure_creds: ${{ secrets.AZURE_CREDENTIALS }}
|
||||
clientId: ${{ secrets.ARM_CLIENT_ID }}
|
||||
clientSecret: ${{ secrets.ARM_CLIENT_SECRET }}
|
||||
subscriptionId: ${{ secrets.ARM_SUBSCRIPTION_ID }}
|
||||
tenantId: ${{ secrets.ARM_TENANT_ID }}
|
||||
deploy-azureml-resources:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- id: deploy-aml-workspace
|
||||
name: deploy-aml-workspace
|
||||
run: echo "OK"
|
|
@ -56,4 +56,4 @@ resource "azurerm_key_vault_secret" "ADX_DB" {
|
|||
value = azurerm_kusto_database.database[0].name
|
||||
key_vault_id = var.key_vault_id
|
||||
count = var.enable_monitoring ? 1 : 0
|
||||
}
|
||||
}
|
|
@ -19,13 +19,13 @@ resource "azurerm_storage_account" "st" {
|
|||
is_hns_enabled = var.hns_enabled
|
||||
|
||||
tags = var.tags
|
||||
|
||||
}
|
||||
|
||||
# Virtual Network & Firewall configuration
|
||||
|
||||
resource "azurerm_storage_account_network_rules" "firewall_rules" {
|
||||
resource_group_name = var.rg_name
|
||||
storage_account_name = azurerm_storage_account.st.name
|
||||
storage_account_id = azurerm_storage_account.st.id
|
||||
|
||||
default_action = "Allow"
|
||||
ip_rules = [] # [data.http.ip.body]
|
||||
|
|
|
@ -16,3 +16,13 @@ psutil==5.9.0
|
|||
|
||||
# for unit testing
|
||||
pytest==7.1.2
|
||||
|
||||
# for azure ml SDK v2
|
||||
azure-ai-ml==1.1.0
|
||||
azure-common==1.1.28
|
||||
azure-core==1.26.1
|
||||
azure-identity==1.10.0
|
||||
azure-mgmt-core==1.3.0
|
||||
azure-storage-blob==12.14.1
|
||||
azure-storage-file-datalake==12.9.1
|
||||
azure-storage-file-share==12.7.0
|
|
@ -63,7 +63,6 @@ stages:
|
|||
cluster_tier: dedicated
|
||||
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
|
||||
parameters:
|
||||
build_type: conda
|
||||
environment_name: nlp_summarization_train
|
||||
environment_file: mlops/azureml/train/train-env.yml
|
||||
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
name: nlp_inference_conda_env
|
||||
channels:
|
||||
- pytorch
|
||||
- anaconda
|
||||
- defaults
|
||||
- conda-forge
|
||||
dependencies:
|
||||
- python=3.8
|
||||
- pip=21.2.4
|
||||
- pytorch=1.10.0
|
||||
- torchvision=0.11.1
|
||||
- torchaudio=0.10.0
|
||||
- cudatoolkit=11.1.1
|
||||
- nvidia-apex=0.1.0
|
||||
- gxx_linux-64=8.5.0
|
||||
- pip:
|
||||
- azureml-defaults==1.39.0
|
||||
- azureml-mlflow==1.39.0
|
||||
- azureml-telemetry==1.39.0
|
||||
- azureml-train-core==1.39.0
|
||||
- mlflow==1.24.0
|
||||
- transformers==4.17.0
|
||||
- 'inference-schema[numpy-support]==1.3.0'
|
||||
- applicationinsights==0.11.10
|
|
@ -0,0 +1,7 @@
|
|||
# check release notes https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html
|
||||
FROM nvcr.io/nvidia/pytorch:22.04-py3
|
||||
|
||||
# Install dependencies missing in this container
|
||||
# NOTE: container already has matplotlib==3.5.1 tqdm==4.62.0
|
||||
COPY requirements.txt ./
|
||||
RUN pip install -r requirements.txt
|
|
@ -0,0 +1,21 @@
|
|||
# data science requirements
|
||||
# torchvision==0.12.0
|
||||
# torch==1.11.0
|
||||
pytorch_lightning==1.6.4
|
||||
transformers==4.18.0
|
||||
datasets==2.3.2
|
||||
rouge_score==0.0.4
|
||||
sentencepiece==0.1.96
|
||||
|
||||
# for metrics reporting/plotting
|
||||
mlflow==1.25.1
|
||||
azureml-mlflow==1.41.0
|
||||
# matplotlib==3.5.2
|
||||
# tqdm==4.64.0
|
||||
psutil==5.9.0
|
||||
|
||||
# for unit testing
|
||||
pytest==7.1.2
|
||||
|
||||
# for azure ml SDK v2
|
||||
azure-ai-ml==1.1.0
|
|
@ -0,0 +1,88 @@
|
|||
import os
|
||||
import argparse
|
||||
import logging
|
||||
import mlflow
|
||||
import json
|
||||
from distutils.util import strtobool
|
||||
|
||||
def main():
|
||||
"""Main function of the script."""
|
||||
# initialize root logger
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(logging.INFO)
|
||||
console_handler = logging.StreamHandler()
|
||||
formatter = logging.Formatter(
|
||||
"%(asctime)s : %(levelname)s : %(name)s : %(message)s"
|
||||
)
|
||||
console_handler.setFormatter(formatter)
|
||||
logger.addHandler(console_handler)
|
||||
|
||||
# input and output arguments
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--baseline_metrics",
|
||||
type=str,
|
||||
required=True,
|
||||
help="path to baseline metrics folder containing all_results.json",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--candidate_metrics",
|
||||
type=str,
|
||||
required=True,
|
||||
help="path to candidate metrics folder containing all_results.json",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--reference_metric",
|
||||
type=str,
|
||||
default="predict_rougeLsum",
|
||||
help="name of reference metric for shipping flag (default: predict_rougeLsum)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--force_comparison", type=strtobool, default=False, help="set to True to bypass comparison and set --deploy_flag to True"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--deploy_flag", type=str, help="a deploy flag whether to deploy or not"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Start Logging
|
||||
mlflow.start_run()
|
||||
|
||||
logger.info(f"Running with arguments: {args}")
|
||||
|
||||
# open metrics on both sides
|
||||
with open(os.path.join(args.baseline_metrics, "all_results.json")) as in_file:
|
||||
baseline_metrics = json.loads(in_file.read())
|
||||
with open(os.path.join(args.candidate_metrics, "all_results.json")) as in_file:
|
||||
candidate_metrics = json.loads(in_file.read())
|
||||
|
||||
# should we ship or not?
|
||||
if args.force_comparison:
|
||||
deploy_flag = True
|
||||
else:
|
||||
deploy_flag = (
|
||||
candidate_metrics[args.reference_metric]
|
||||
> baseline_metrics[args.reference_metric]
|
||||
)
|
||||
|
||||
logger.info("baseline_metrics[{}]={}, candidate_metrics[{}]={}, deploy_flag={} (force_comparison={})".format(
|
||||
args.reference_metric,
|
||||
baseline_metrics[args.reference_metric],
|
||||
args.reference_metric,
|
||||
candidate_metrics[args.reference_metric],
|
||||
deploy_flag,
|
||||
args.force_comparison
|
||||
))
|
||||
|
||||
# save deploy_flag as a file
|
||||
os.makedirs(args.deploy_flag, exist_ok=True)
|
||||
with open(os.path.join(args.deploy_flag, "deploy_flag"), "w") as out_file:
|
||||
out_file.write("%d" % int(deploy_flag))
|
||||
|
||||
# Stop Logging
|
||||
mlflow.end_run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -0,0 +1,151 @@
|
|||
"""
|
||||
This scripts prepares a HuggingFace dataset to be used
|
||||
for fine-tuning. It encodes the train/val/test tests and
|
||||
outputs as JSONL files.
|
||||
"""
|
||||
import os
|
||||
import argparse
|
||||
import logging
|
||||
from datasets import load_dataset, DatasetDict
|
||||
from transformers import AutoTokenizer
|
||||
import mlflow
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function of the script."""
|
||||
# initialize root logger
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(logging.INFO)
|
||||
console_handler = logging.StreamHandler()
|
||||
formatter = logging.Formatter(
|
||||
"%(asctime)s : %(levelname)s : %(name)s : %(message)s"
|
||||
)
|
||||
console_handler.setFormatter(formatter)
|
||||
logger.addHandler(console_handler)
|
||||
|
||||
# input and output arguments
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--dataset_name", type=str, help="name of dataset or path to input dataset_name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset_config", type=str, help="config for huggingface dataset"
|
||||
)
|
||||
parser.add_argument("--text_column", type=str, help="name of text_column")
|
||||
parser.add_argument("--summary_column", type=str, help="name of summary_column")
|
||||
parser.add_argument(
|
||||
"--max_input_length", type=int, default=512, help="max_input_length"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_target_length", type=int, default=40, help="max_target_length"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--padding", type=str, default="max_length", help="padding type"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_arch",
|
||||
type=str,
|
||||
help="name of the model to prepare for in HF model library",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--limit_samples", type=int, default=-1, help="sample size from input dataset"
|
||||
)
|
||||
parser.add_argument("--encodings", type=str, help="path to tokenized dataset")
|
||||
parser.add_argument(
|
||||
"--source_prefix",
|
||||
type=str,
|
||||
help="A prefix to add before every source text (useful for T5 models).",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Start Logging
|
||||
mlflow.start_run()
|
||||
|
||||
logger.info(f"Running with arguments: {args}")
|
||||
|
||||
# get tokenizer ready
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.model_arch)
|
||||
logger.info(f"tokenizer: {tokenizer}")
|
||||
|
||||
prefix = args.source_prefix if args.source_prefix is not None else ""
|
||||
if args.source_prefix is None and "t5" in args.model_arch.lower():
|
||||
logger.warning(
|
||||
"You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
|
||||
"`--source_prefix 'summarize: ' `"
|
||||
)
|
||||
|
||||
# Load HuggingFace dataset
|
||||
raw_dataset = load_dataset(args.dataset_name, args.dataset_config)
|
||||
|
||||
logger.info(f"raw dataset length: {raw_dataset.num_rows}")
|
||||
mlflow.log_metric("train_samples", raw_dataset["train"].shape[0])
|
||||
mlflow.log_metric("test_samples", raw_dataset["test"].shape[0])
|
||||
mlflow.log_metric("validation_samples", raw_dataset["validation"].shape[0])
|
||||
|
||||
if args.limit_samples > 0:
|
||||
sample_sizes = {
|
||||
k: min(len(raw_dataset[k]), args.limit_samples) for k in raw_dataset.keys()
|
||||
}
|
||||
raw_dataset = DatasetDict(
|
||||
{
|
||||
k: raw_dataset[k].select([i for i in range(sample_sizes[k])])
|
||||
for k in raw_dataset.keys()
|
||||
}
|
||||
)
|
||||
logger.info("sampled raw dataset:")
|
||||
logger.info(raw_dataset.num_rows)
|
||||
|
||||
def preprocess_function(examples):
|
||||
# remove pairs where at least one record is None
|
||||
|
||||
inputs, targets = [], []
|
||||
for i in range(len(examples[args.text_column])):
|
||||
if (
|
||||
examples[args.text_column][i] is not None
|
||||
and examples[args.summary_column][i] is not None
|
||||
):
|
||||
inputs.append(examples[args.text_column][i])
|
||||
targets.append(examples[args.summary_column][i])
|
||||
|
||||
inputs = [prefix + inp for inp in inputs]
|
||||
model_inputs = tokenizer(
|
||||
inputs,
|
||||
max_length=args.max_input_length,
|
||||
padding=args.padding,
|
||||
truncation=True,
|
||||
)
|
||||
# Set up the tokenizer for targets
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer(
|
||||
examples[args.summary_column],
|
||||
max_length=args.max_target_length,
|
||||
padding=args.padding,
|
||||
truncation=True,
|
||||
)
|
||||
|
||||
# replace all tokenizer.pad_token_id in the labels by -100 to ignore padding in the loss.
|
||||
if args.padding == "max_length":
|
||||
labels["input_ids"] = [
|
||||
[(l if l != tokenizer.pad_token_id else -100) for l in label]
|
||||
for label in labels["input_ids"]
|
||||
]
|
||||
model_inputs["labels"] = labels["input_ids"]
|
||||
return model_inputs
|
||||
|
||||
preprocessed_datasets = raw_dataset.map(preprocess_function, batched=True)
|
||||
logger.info(f"preprocessed_datasets: {preprocessed_datasets}")
|
||||
|
||||
output_path = os.path.join(args.encodings)
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
|
||||
preprocessed_datasets.save_to_disk(output_path)
|
||||
|
||||
logger.info(f"tokenized data is saved to {output_path}")
|
||||
|
||||
# Stop Logging
|
||||
mlflow.end_run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -0,0 +1,82 @@
|
|||
from azureml.core import Run
|
||||
from azureml.core.model import Model
|
||||
|
||||
import os
|
||||
import argparse
|
||||
import logging
|
||||
import mlflow
|
||||
|
||||
|
||||
def main():
    """Register a trained model in AzureML, gated on a deploy flag.

    Reads a 0/1 deploy flag (written by the upstream comparison step) from
    ``--deploy_flag`` — either the flag file itself or a folder containing a
    ``deploy_flag`` file. When the flag is truthy, registers the model found
    in ``--model_folder`` under the name given by ``--register_as``.

    Raises:
        Exception: when executed outside an AzureML job (offline run),
            because registration needs a workspace context.
    """
    # initialize root logger so all module logs reach the console
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    console_handler = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s : %(levelname)s : %(name)s : %(message)s"
    )
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # input and output arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_folder",
        type=str,
        required=True,
        help="folder containing model",
    )
    parser.add_argument(
        "--register_as",
        type=str,
        required=True,
        help="name to use for model registration in AzureML",
    )
    parser.add_argument(
        "--deploy_flag", type=str, required=True, help="a deploy flag whether to deploy or not"
    )

    args = parser.parse_args()
    logger.info(f"Running with arguments: {args}")

    # Start Logging
    mlflow.start_run()

    # the upstream step may hand us either the flag file itself or its folder
    if os.path.isfile(args.deploy_flag):
        deploy_flag_file_path = args.deploy_flag
    else:
        deploy_flag_file_path = os.path.join(args.deploy_flag, "deploy_flag")

    logger.info(f"Opening deploy_flag file from {deploy_flag_file_path}")
    with open(deploy_flag_file_path, 'rb') as in_file:
        deploy_flag = bool(int(in_file.read()))

    if deploy_flag:
        logger.info(f"Deploy flag is True, registering model as {args.register_as}...")
        run = Run.get_context()

        # registration requires a workspace; a local (offline) run has none
        if run.__class__.__name__ == "_OfflineRun":
            raise Exception("You can't run this script locally, you will need to run it as an AzureML job.")

        _ = Model.register(
            run.experiment.workspace,
            model_name=args.register_as,
            model_path=args.model_folder,
            tags={
                "type": "huggingface",
                "task": "summarization"
            },
            description="Huggingface model finetuned for summarization",
        )
    else:
        # fixed: was an f-string with no placeholders (F541)
        logger.info("Deploy flag is False, pass.")

    # Stop Logging
    mlflow.end_run()


if __name__ == "__main__":
    main()
|
||||
|
|
@ -0,0 +1,269 @@
|
|||
import logging
|
||||
import os
|
||||
from datasets import load_metric, load_from_disk
|
||||
from transformers import (
|
||||
AutoModelForSeq2SeqLM,
|
||||
AutoTokenizer,
|
||||
DataCollatorForSeq2Seq,
|
||||
Seq2SeqTrainingArguments,
|
||||
Seq2SeqTrainer,
|
||||
HfArgumentParser,
|
||||
IntervalStrategy,
|
||||
)
|
||||
from transformers.trainer_callback import TrainerCallback
|
||||
|
||||
import torch
|
||||
import nltk
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
import numpy as np
|
||||
import mlflow
|
||||
from pynvml import *
|
||||
import time
|
||||
|
||||
|
||||
# Input arguments are set with dataclass. Huggingface library stores the default training args in TrainingArguments dataclass
|
||||
# user args are also defined in dataclasses, we will then load arguments from a tuple of user defined and built-in dataclasses.
|
||||
@dataclass
class DataArgs:
    """Data-related arguments consumed via HfArgumentParser."""

    # Inputs
    preprocessed_datasets: str = field(
        default=None, metadata={"help": "path to preprocessed datasets"}  # fixed typo: "preprocesed"
    )

    # Processing parameters
    max_target_length: Optional[int] = field(
        default=128,
        # fixed typo: "maxi sequence length"
        metadata={"help": "maximum sequence length for target text after tokenization."},
    )
    limit_samples: Optional[int] = field(
        default=-1,
        metadata={"help": "limit the number of samples for faster run."},
    )
|
||||
|
||||
|
||||
@dataclass
class ModelArgs:
    """Model-related arguments: which model to load and where to save output."""

    # name of a model from the HF hub (used when no model_path is given)
    model_name: Optional[str] = field(default=None, metadata={"help": "model name"})
    # path to a previously saved model (takes precedence over model_name in run.py)
    model_path: Optional[str] = field(default=None, metadata={"help": "path to existing model file to load"})
    # output folder where the finetuned model is written
    model_output: Optional[str] = field(default=None, metadata={"help": "path to save the model"})
|
||||
|
||||
|
||||
nltk.download("punkt")
|
||||
|
||||
|
||||
def print_gpu_utilization():
    """Print current memory usage of GPU 0, queried through NVML (pynvml)."""
    nvmlInit()
    device_handle = nvmlDeviceGetHandleByIndex(0)
    mem_info = nvmlDeviceGetMemoryInfo(device_handle)
    print(f"GPU memory occupied: {mem_info.used//1024**2} MB.")
|
||||
|
||||
|
||||
def print_summary(result):
    """Print training runtime/throughput from a Trainer result, then GPU memory usage."""
    run_metrics = result.metrics
    print(f"Time: {run_metrics['train_runtime']:.2f}")
    print(f"Samples/second: {run_metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()
|
||||
|
||||
|
||||
def postprocess_text(preds, labels):
    """Strip whitespace and join sentences with newlines, as rougeLSum expects
    newline-separated sentences. Returns (cleaned_preds, cleaned_labels)."""

    def _sentence_split(texts):
        # strip first, then re-join the tokenized sentences with newlines
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    return _sentence_split(preds), _sentence_split(labels)
|
||||
|
||||
|
||||
def compute_metrics(eval_preds, tokenizer, metric):
    """Compute ROUGE scores (plus mean generated length) from evaluation predictions."""
    raw_preds, raw_labels = eval_preds
    if isinstance(raw_preds, tuple):
        raw_preds = raw_preds[0]

    text_preds = tokenizer.batch_decode(raw_preds, skip_special_tokens=True)

    # -100 marks ignored label positions and cannot be decoded; map it back to the pad id
    raw_labels = np.where(raw_labels != -100, raw_labels, tokenizer.pad_token_id)
    text_labels = tokenizer.batch_decode(raw_labels, skip_special_tokens=True)

    # strip + newline-per-sentence post-processing for rougeLSum
    text_preds, text_labels = postprocess_text(text_preds, text_labels)

    scores = metric.compute(
        predictions=text_preds, references=text_labels, use_stemmer=True
    )
    # keep only the mid f-measure of each ROUGE aggregate, as a percentage
    scores = {name: score.mid.fmeasure * 100 for name, score in scores.items()}

    # average number of non-pad tokens per generated prediction
    scores["gen_len"] = np.mean(
        [np.count_nonzero(sequence != tokenizer.pad_token_id) for sequence in raw_preds]
    )
    return {name: round(value, 4) for name, value in scores.items()}
|
||||
|
||||
|
||||
class CustomCallback(TrainerCallback):
    """A [`TrainerCallback`] that sends the logs to [AzureML](https://pypi.org/project/azureml-sdk/).

    This is a hotfix for the issue raised here:
    https://github.com/huggingface/transformers/issues/18870
    """

    def on_log(self, args, state, control, logs=None, **kwargs):
        # only the rank-0 process reports metrics, to avoid duplicates
        if not state.is_world_process_zero:
            return
        numeric_logs = {
            key: value for key, value in logs.items() if isinstance(value, (int, float))
        }
        mlflow.log_metrics(metrics=numeric_logs, step=state.global_step)
|
||||
|
||||
|
||||
def main():
    """Finetune and/or evaluate a seq2seq summarization model.

    Phases are selected with the built-in Seq2SeqTrainingArguments flags
    (``--do_train``, ``--do_eval``, ``--do_predict``); model and data
    locations come from the ModelArgs / DataArgs dataclasses above.
    """
    # Setup logging
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    console_handler = logging.StreamHandler()
    formatter = logging.Formatter(
        "%(asctime)s : %(levelname)s : %(name)s : %(message)s"
    )
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # initialize the mlflow session
    mlflow.start_run()

    # parse CLI args straight into the three dataclasses
    parser = HfArgumentParser((ModelArgs, DataArgs, Seq2SeqTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    logger.info(f"Running with arguments: {model_args}, {data_args}, {training_args}")

    # Check if this is the main node (RANK is set by the distributed launcher;
    # absent RANK means single-process, which counts as main)
    is_this_main_node = int(os.environ.get("RANK", "0")) == 0
    if is_this_main_node:
        logger.info("This is the main Node")

    # datasets were tokenized and saved to disk by the preparation step
    input_datasets = load_from_disk(data_args.preprocessed_datasets)
    logger.info(f"preprocessed dataset is loaded")

    # a saved model path takes precedence over a hub model name
    if model_args.model_path:
        logger.info("using a saved model")
        model = AutoModelForSeq2SeqLM.from_pretrained(model_args.model_path)
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_path)
    else:
        logger.info("using a model from model library")
        model = AutoModelForSeq2SeqLM.from_pretrained(model_args.model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name)

    # Artificially limit the number of samples (for testing)
    if training_args.do_train:  # if using --do-train from Seq2SeqTrainingArguments
        if data_args.limit_samples > 0:
            max_train_samples = min(len(input_datasets["train"]), data_args.limit_samples)
            train_dataset = input_datasets["train"].select(range(max_train_samples))
            logger.info(f"train: making a {max_train_samples} sample of the data")
        else:
            train_dataset = input_datasets["train"]

    if training_args.do_eval:
        if data_args.limit_samples > 0:
            max_eval_samples = min(
                len(input_datasets["validation"]), data_args.limit_samples
            )
            eval_dataset = input_datasets["validation"].select(range(max_eval_samples))
            logger.info(f"eval: making a {max_eval_samples} sample of the data")
        else:
            eval_dataset = input_datasets["validation"]

    if training_args.do_predict:
        if data_args.limit_samples > 0:
            max_predict_samples = min(
                len(input_datasets["test"]), data_args.limit_samples
            )
            predict_dataset = input_datasets["test"].select(range(max_predict_samples))
            logger.info(f"predict: making a {max_predict_samples} sample of the data")
        else:
            predict_dataset = input_datasets["test"]

    # Data collator
    # -100 is ignored by the loss, so padded label positions do not contribute
    label_pad_token_id = -100

    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
    )

    # Metric
    metric = load_metric("rouge")

    if training_args.do_train:
        # log once per epoch: steps per epoch = samples // per-device batch size
        logging_steps = len(train_dataset) // training_args.per_device_train_batch_size
        training_args.logging_steps = logging_steps
    #training_args.output_dir = "outputs"
    training_args.save_strategy = "epoch"
    training_args.evaluation_strategy = IntervalStrategy.EPOCH
    # predict_with_generate is needed so predict/eval produce generated token
    # ids that compute_metrics can decode
    training_args.predict_with_generate = True
    training_args.report_to = []  # use our own callback
    logger.info(f"training args: {training_args}")

    # Initialize our Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=lambda preds : compute_metrics(preds, tokenizer, metric),
        callbacks=[CustomCallback]
    )

    # Start the actual training (to include evaluation use --do-eval)
    if training_args.do_train:
        logger.info("Start training")
        start = time.time()
        train_result = trainer.train()

        # wall-clock minutes per epoch, logged to mlflow
        mlflow.log_metric(
            "time/epoch", (time.time() - start) / 60 / training_args.num_train_epochs
        )
        logger.info(
            "training is done"
        )  # Only print gpu utilization if gpu is available
        if torch.cuda.is_available():
            print_summary(train_result)

    # Save the model as an output (only the main node writes, to avoid
    # concurrent writes in distributed runs)
    if model_args.model_output and is_this_main_node:
        logger.info(f"Saving the model at {model_args.model_output}")
        os.makedirs(model_args.model_output, exist_ok=True)
        trainer.save_model(model_args.model_output)

    # Just run the predictions
    if training_args.do_predict:
        logger.info("*** Predict ***")
        # explicit generation_max_length wins over the data-arg default
        max_length = (
            training_args.generation_max_length
            if training_args.generation_max_length is not None
            else data_args.max_target_length
        )

        predict_results = trainer.predict(
            predict_dataset, metric_key_prefix="predict", max_length=max_length
        )
        metrics = predict_results.metrics
        metrics["predict_samples"] = len(predict_dataset)

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

    # Stop Logging
    mlflow.end_run()


if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,44 @@
|
|||
import os
|
||||
import logging
|
||||
import json
|
||||
from transformers import (
|
||||
AutoModelForSeq2SeqLM,
|
||||
AutoTokenizer,
|
||||
)
|
||||
|
||||
def init():
    """
    This function is called when the container is initialized/started, typically after create/update of the deployment.
    You can write the logic here to perform init operations like caching the model in memory
    """
    global model, tokenizer
    # AZUREML_MODEL_DIR is an environment variable created during deployment.
    # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION)
    model_dir = os.getenv("AZUREML_MODEL_DIR")
    # the registered model lives in the single sub-folder under AZUREML_MODEL_DIR
    model_path = os.path.join(model_dir, os.listdir(model_dir)[0])
    # fixed: the original printed the literal string "model_path" instead of the value
    print("model_path:", model_path)
    print(os.listdir(model_path))
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    print("Init complete")
|
||||
|
||||
|
||||
def run(raw_data):
    """
    This function is called for every invocation of the endpoint to perform the actual scoring/prediction.
    It extracts the article text from the json input, summarizes it with the
    loaded model and returns the decoded summary string.
    """
    # fixed: the docstring used to sit after the `global` statement, where it
    # was an inert string expression rather than a docstring
    global model, tokenizer
    logging.info("Request received")
    article = json.loads(raw_data)["data"]
    if "t5" in model.config.architectures[0].lower():
        # fixed: use "summarize: " (with trailing space) to match the T5 task
        # prefix recommended at data-preparation time
        article = "summarize: " + article

    inputs = tokenizer(article, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(
        inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
    )
    # fixed: skip special tokens so <pad>/</s> markers don't leak into the response
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(result)
    logging.info("Request processed")
    return result
|
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -0,0 +1,405 @@
|
|||
"""MLOps v2 NLP Python SDK training submission script."""
|
||||
import os
|
||||
import argparse
|
||||
|
||||
# Azure ML sdk v2 imports
|
||||
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
|
||||
from azure.ai.ml import MLClient
|
||||
from azure.ai.ml import command
|
||||
from azure.ai.ml import Input, Output
|
||||
from azure.ai.ml import dsl, Input, Output
|
||||
|
||||
def get_config_parger(parser: argparse.ArgumentParser = None):
    """Builds the argument parser for the script.

    (The function name keeps its historical typo — "parger" — because callers
    reference it by this name.)

    Args:
        parser: an existing parser to extend; a new one is created when None.

    Returns:
        argparse.ArgumentParser: parser with workspace, experiment, training
        and compute argument groups added.
    """
    if parser is None:
        parser = argparse.ArgumentParser(description=__doc__)

    group = parser.add_argument_group("Azure ML references")
    group.add_argument(
        "--config_location",
        type=str,
        required=False,
        # fixed: help text was a copy-paste of "Subscription ID"
        help="Path to a config file referencing the Azure ML workspace",
    )
    group.add_argument(
        "--subscription_id",
        type=str,
        required=False,
        help="Subscription ID",
    )
    group.add_argument(
        "--resource_group",
        type=str,
        required=False,
        help="Resource group name",
    )
    group.add_argument(
        "--workspace_name",
        type=str,
        required=False,
        help="Workspace name",
    )
    # Experiment Name
    group.add_argument(
        "-n",
        type=str,
        # fixed: required=True together with a default is contradictory
        # (argparse ignores the default of a required argument); making it
        # optional lets the documented default actually apply, while callers
        # that pass -n are unaffected.
        required=False,
        default="nlp_summarization_train",
        help="Experiment name",
    )
    parser.add_argument(
        "--wait",
        default=False,
        action="store_true",
        help="wait for the job to finish",
    )

    group = parser.add_argument_group("Training parameters")
    group.add_argument(
        "--limit_samples",
        type=int,
        default=1000,
        help="sample size taken from each dataset split",
    )
    group.add_argument(
        "--pretrained_model_name",
        type=str,
        default="t5-small",
        help="HuggingFace model to finetune",
    )
    group.add_argument(
        "--num_train_epochs",
        type=int,
        default=5,
        help="number of training epochs",
    )
    group.add_argument(
        "--batch_size",
        type=int,
        default=8,
        help="per-device train/eval batch size",
    )
    group.add_argument(
        "--learning_rate",
        type=float,
        default=0.00005,
        help="optimizer learning rate",
    )
    group.add_argument(
        "--model_registration_name",
        type=str,
        default="pubmed-summarization",
        help="name under which to register the finetuned model",
    )

    group = parser.add_argument_group("Compute parameters")
    group.add_argument(
        "--cpu_compute",
        type=str,
        default="cpu-cluster",
        help="compute cluster for light CPU steps",
    )
    group.add_argument(
        "--cpu_compute_large",
        type=str,
        default="cpu-cluster-lg",
        help="compute cluster for memory-heavy data preparation",
    )
    group.add_argument(
        "--gpu_compute",
        type=str,
        default="gpu-cluster",
        help="compute cluster for training and evaluation",
    )
    group.add_argument(
        "--training_nodes",
        type=int,
        default=1,
        help="number of nodes for distributed training",
    )
    group.add_argument(
        "--gpus_per_node",
        type=int,
        default=1,
        help="number of GPUs per training node",
    )

    return parser
|
||||
|
||||
|
||||
def connect_to_aml(args):
    """Connect to Azure ML workspace using provided cli arguments."""
    try:
        credential = DefaultAzureCredential()
        # Check if given credential can get token successfully.
        credential.get_token("https://management.azure.com/.default")
    except Exception:
        # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
        credential = InteractiveBrowserCredential()

    # Get a handle to workspace: prefer a local config.json, otherwise fall
    # back to the explicit workspace references from the cli arguments.
    try:
        return MLClient.from_config(credential, path='config.json')
    except Exception:
        print(
            "Could not find config.json, using config.yaml refs to Azure ML workspace instead."
        )
        return MLClient(
            subscription_id=args.subscription_id,
            resource_group_name=args.resource_group,
            workspace_name=args.workspace_name,
            credential=credential,
        )
|
||||
|
||||
|
||||
def build_components(args):
    """Builds the components for the pipeline.

    Each component wraps one of the data-science scripts (prepare.py, run.py,
    compare.py, register.py) as an Azure ML v2 `command`. Returns a dict of
    component name -> command, consumed by main() to wire the pipeline.
    """
    # the component scripts live in the repo's data-science/src folder,
    # resolved relative to this file
    DATA_SCIENCE_FOLDER = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "..","..", "..", "data-science", "src"
    )

    # downloads the HF dataset and tokenizes it (prepare.py)
    prep_finetuning_dataset = command(
        name="prep_finetuning_dataset",
        display_name="Prepare dataset for training",
        inputs={
            "dataset_name": Input(type="string"),
            "dataset_config": Input(type="string"),
            "text_column": Input(type="string"),
            "summary_column": Input(type="string"),
            "limit_samples": Input(type="integer"),
            "max_input_length": Input(type="integer"),
            "max_target_length": Input(type="integer"),
            "padding": Input(type="string"),
            "pretrained_model_name": Input(type="string"),
        },
        outputs=dict(
            encodings=Output(type="uri_folder", mode="rw_mount"),
        ),
        code=DATA_SCIENCE_FOLDER,
        command="""python summarization/prepare.py \
            --dataset_name ${{inputs.dataset_name}} \
            --dataset_config ${{inputs.dataset_config}} \
            --text_column ${{inputs.text_column}} \
            --summary_column ${{inputs.summary_column}} \
            --limit_samples ${{inputs.limit_samples}} \
            --model_arch ${{inputs.pretrained_model_name}} \
            --max_input_length ${{inputs.max_input_length}} \
            --max_target_length ${{inputs.max_target_length}} \
            --padding ${{inputs.padding}} \
            --encodings ${{outputs.encodings}}\
            """,
        environment="nlp_summarization_train@latest",
    )

    # finetunes the model on the tokenized dataset (run.py with --do_train)
    finetune_model = command(
        name="finetune_model",
        display_name="Fine-tune summarization model",
        inputs={
            "preprocessed_datasets": Input(type="uri_folder"),
            "pretrained_model_name": Input(type="string"),
            "limit_samples": Input(type="integer"),
            "learning_rate": Input(type="number"),
            "num_train_epochs": Input(type="integer"),
            "per_device_train_batch_size": Input(type="integer"),
            "per_device_eval_batch_size": Input(type="integer"),
        },
        outputs=dict(
            finetuned_model=Output(type="uri_folder", mode="rw_mount"),
        ),
        code=DATA_SCIENCE_FOLDER,
        command="""python summarization/run.py \
            --preprocessed_datasets ${{inputs.preprocessed_datasets}} \
            --learning_rate ${{inputs.learning_rate}} \
            --per_device_train_batch_size ${{inputs.per_device_train_batch_size}} \
            --per_device_eval_batch_size ${{inputs.per_device_eval_batch_size}} \
            --limit_samples ${{inputs.limit_samples}} \
            --model_name ${{inputs.pretrained_model_name}} \
            --model_output ${{outputs.finetuned_model}}\
            --output_dir outputs \
            --num_train_epochs ${{inputs.num_train_epochs}} \
            --do_train --do_eval \
            """,
        environment="nlp_summarization_train@latest",
        distribution={
            "type": "PyTorch",
            # set process count to the number of gpus on the node
            "process_count_per_instance": args.gpus_per_node,
        },
        # set instance count to the number of nodes you want to use
        instance_count=args.training_nodes,
    )

    # evaluates either a saved model ($[[--model_path]]) or a hub model
    # ($[[--model_name]]) on the test split (run.py with --do_predict);
    # $[[...]] marks optional argument segments in AML v2 command syntax
    evaluate_model = command(
        name="evaluate_model",
        display_name="Run eval on a model",
        inputs={
            "preprocessed_datasets": Input(type="uri_folder"),
            "model_path": Input(type="uri_folder", optional=True),
            "model_name": Input(type="string", optional=True),
            "limit_samples": Input(type="integer"),
            "max_target_length": Input(type="integer"),
        },
        outputs=dict(
            metrics=Output(type="uri_folder", mode="rw_mount"),
        ),
        code=DATA_SCIENCE_FOLDER,
        command="""python summarization/run.py \
            --preprocessed_datasets ${{inputs.preprocessed_datasets}} \
            --limit_samples ${{inputs.limit_samples}} \
            --output_dir ${{outputs.metrics}} \
            $[[--model_path ${{inputs.model_path}}]] \
            $[[--model_name ${{inputs.model_name}}]] \
            --max_target_length ${{inputs.max_target_length}} \
            --do_predict \
            """,
        environment="nlp_summarization_train@latest",
    )

    # compares two metric folders and emits a deploy_flag (compare.py);
    # NOTE: --force_comparison True forces a positive deploy flag regardless
    # of the comparison outcome
    compare_models = command(
        name="compare_models",
        display_name="Compare finetuned to baseline",
        inputs={
            "baseline_metrics": Input(type="uri_folder"),
            "candidate_metrics": Input(type="uri_folder"),
            "reference_metric": Input(type="string"),
        },
        outputs=dict(
            deploy_flag=Output(type="uri_folder", mode="rw_mount"),
        ),
        code=DATA_SCIENCE_FOLDER,
        command="""python summarization/compare.py \
            --baseline_metrics ${{inputs.baseline_metrics}} \
            --candidate_metrics ${{inputs.candidate_metrics}} \
            --reference_metric ${{inputs.reference_metric}} \
            --deploy_flag ${{outputs.deploy_flag}} \
            --force_comparison True\
            """,
        environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest",
    )

    # registers the model in AzureML when the deploy flag is set (register.py)
    register_model = command(
        name="register_model",
        display_name="Register model",
        inputs={
            "model": Input(type="uri_folder"),
            "deploy_flag": Input(type="uri_folder"),
            "model_registration_name": Input(type="string"),
        },
        code=DATA_SCIENCE_FOLDER,
        command="""python summarization/register.py \
            --model_folder ${{inputs.model}} \
            --deploy_flag ${{inputs.deploy_flag}} \
            --register_as ${{inputs.model_registration_name}} \
            """,
        environment="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu@latest",
    )

    return {
        "prep_finetuning_dataset": prep_finetuning_dataset,
        "finetune_model": finetune_model,
        "evaluate_model": evaluate_model,
        "compare_models": compare_models,
        "register_model": register_model,
    }
|
||||
|
||||
|
||||
def main():
    """Main entry point for the script.

    Parses cli arguments, connects to the Azure ML workspace, wires the five
    components (prep -> finetune -> eval x2 -> compare -> register) into a
    dsl pipeline, submits it and optionally streams the job logs.
    """
    parser = get_config_parger()
    args, _ = parser.parse_known_args()
    ml_client = connect_to_aml(args)

    # get components from build function
    components_dict = build_components(args)
    prep_finetuning_dataset = components_dict["prep_finetuning_dataset"]
    finetune_model = components_dict["finetune_model"]
    evaluate_model = components_dict["evaluate_model"]
    compare_models = components_dict["compare_models"]
    register_model = components_dict["register_model"]

    # build the pipeline using Azure ML SDK v2
    @dsl.pipeline(
        name="NLP Training Pipeline",
        description="NLP Training Pipeline",
    )
    def nlp_training_pipeline(
        limit_samples: int,
        pretrained_model_name: str,
        num_train_epochs: int,
        batch_size: int,
        learning_rate: float,
        model_registration_name: str,
    ):
        # tokenize the raw HF dataset
        prep_finetuning_dataset_step = prep_finetuning_dataset(
            dataset_name="ccdv/pubmed-summarization",
            dataset_config="section",
            text_column="article",
            summary_column="abstract",
            limit_samples=limit_samples,
            max_input_length=512,
            max_target_length=40,
            padding="max_length",
            pretrained_model_name=pretrained_model_name,
        )
        prep_finetuning_dataset_step.compute = args.cpu_compute_large

        # finetune on the tokenized data (distributed GPU step)
        finetune_model_step = finetune_model(
            preprocessed_datasets=prep_finetuning_dataset_step.outputs.encodings,
            pretrained_model_name=pretrained_model_name,
            limit_samples=limit_samples,
            learning_rate=learning_rate,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
        )
        finetune_model_step.compute = args.gpu_compute

        # evaluate the finetuned model on the test split
        evaluate_finetuned_model_step = evaluate_model(
            preprocessed_datasets=prep_finetuning_dataset_step.outputs.encodings,
            model_path=finetune_model_step.outputs.finetuned_model,
            limit_samples=limit_samples,
            max_target_length=40,
        )
        evaluate_finetuned_model_step.compute = args.gpu_compute

        # evaluate the untouched pretrained model as the baseline
        evaluate_baseline_model_step = evaluate_model(
            preprocessed_datasets=prep_finetuning_dataset_step.outputs.encodings,
            model_name=pretrained_model_name,
            limit_samples=limit_samples,
            max_target_length=40,
        )
        evaluate_baseline_model_step.compute = args.gpu_compute

        compare_models_step = compare_models(
            # fixed: baseline/candidate were swapped — the pretrained model's
            # metrics are the baseline and the finetuned model is the
            # candidate. (The bug was masked by compare.py's
            # --force_comparison True.)
            baseline_metrics=evaluate_baseline_model_step.outputs.metrics,
            candidate_metrics=evaluate_finetuned_model_step.outputs.metrics,
            reference_metric="predict_rougeLsum",
        )
        compare_models_step.compute = args.cpu_compute

        # register the finetuned model when the deploy flag allows it
        register_model_step = register_model(
            model=finetune_model_step.outputs.finetuned_model,
            deploy_flag=compare_models_step.outputs.deploy_flag,
            model_registration_name=model_registration_name,
        )
        register_model_step.compute = args.cpu_compute

    # instanciates the job
    pipeline_job = nlp_training_pipeline(
        limit_samples=args.limit_samples,
        pretrained_model_name=args.pretrained_model_name,
        num_train_epochs=args.num_train_epochs,
        batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        model_registration_name=args.model_registration_name,
    )

    # submits the job
    print("Submitting the pipeline job to your AzureML workspace...")
    pipeline_job = ml_client.jobs.create_or_update(
        pipeline_job, experiment_name=args.n
    )

    print("The url to see your live job running is returned by the sdk:")
    print(pipeline_job.services["Studio"].endpoint)

    if args.wait:
        ml_client.jobs.stream(pipeline_job.name)


if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,71 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

# Azure DevOps pipeline: provisions AzureML compute, registers the training
# environment, and submits the NLP summarization training pipeline.
# (Indentation reconstructed per Azure DevOps YAML schema; all keys and
# values preserved verbatim.)

variables:
- ${{ if eq(variables['Build.SourceBranchName'], 'main') }}:
    # 'main' branch: PRD environment
    - template: ../../config-infra-prod.yml
- ${{ if ne(variables['Build.SourceBranchName'], 'main') }}:
    # 'develop' or feature branches: DEV environment
    - template: ../../config-infra-dev.yml
- name: version
  value: python-sdk-v2

trigger:
- none

pool:
  # NOTE(review): the ubuntu-20.04 hosted image is deprecated by Azure
  # DevOps — consider a newer image; confirm template compatibility first.
  vmImage: ubuntu-20.04

resources:
  repositories:
    - repository: mlops-templates # Template Repo
      name: Azure/mlops-templates # need to change org name from "Azure" to your own org
      endpoint: github-connection # need to set up and hardcode
      type: github
      ref: main

stages:
- stage: DeployTrainingPipeline
  displayName: Deploy Training Pipeline
  jobs:
    - job: DeployTrainingPipeline
      steps:
        - checkout: self
          path: s/
        - checkout: mlops-templates
          path: s/templates/
        - template: templates/aml-cli-v2/install-az-cli.yml@mlops-templates
        - template: templates/aml-cli-v2/install-aml-cli.yml@mlops-templates
        - template: templates/python-sdk-v2/install-requirements.yml@mlops-templates
        - template: templates/aml-cli-v2/connect-to-workspace.yml@mlops-templates
        # Small CPU cluster for lightweight steps (compare, register).
        - template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
          parameters:
            cluster_name: cpu-cluster
            size: STANDARD_DS3_V2
            min_instances: 0
            max_instances: 1
            cluster_tier: dedicated
        # Larger CPU cluster for memory-heavy data preparation.
        - template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
          parameters:
            cluster_name: cpu-cluster-lg
            size: Standard_D14_v2
            min_instances: 0
            max_instances: 1
            cluster_tier: dedicated
        # GPU cluster for finetuning and evaluation.
        - template: templates/${{ variables.version }}/create-compute.yml@mlops-templates
          parameters:
            cluster_name: gpu-cluster
            size: Standard_NV6
            min_instances: 0
            max_instances: 1
            cluster_tier: dedicated
        # Build and register the docker training environment.
        - template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
          parameters:
            build_type: docker
            environment_name: nlp_summarization_train
            environment_path: data-science/environments/training
        # Submit the training pipeline via the Python SDK v2 script.
        - template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
          parameters:
            pipeline_path: mlops/azureml/train/pipeline-train.py
            experiment_name: $(environment)_nlp_summarization_$(Build.SourceBranchName)
Загрузка…
Ссылка в новой задаче