Merge branch 'main-dec31' into main-dec31

This commit is contained in:
Scott Donohoo 2022-12-21 09:31:15 -06:00 коммит произвёл GitHub
Родитель 19fb79dd95 00d5cdfa70
Коммит 703be110a6
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
10 изменённых файлов: 151 добавлений и 224 удалений

Просмотреть файл

@@ -22,29 +22,14 @@ from mlflow.tracking import MlflowClient
TARGET_COL = "cost"
NUMERIC_COLS = [
"distance",
"dropoff_latitude",
"dropoff_longitude",
"passengers",
"pickup_latitude",
"pickup_longitude",
"pickup_weekday",
"pickup_month",
"pickup_monthday",
"pickup_hour",
"pickup_minute",
"pickup_second",
"dropoff_weekday",
"dropoff_month",
"dropoff_monthday",
"dropoff_hour",
"dropoff_minute",
"dropoff_second",
"distance", "dropoff_latitude", "dropoff_longitude", "passengers", "pickup_latitude",
"pickup_longitude", "pickup_weekday", "pickup_month", "pickup_monthday", "pickup_hour",
"pickup_minute", "pickup_second", "dropoff_weekday", "dropoff_month", "dropoff_monthday",
"dropoff_hour", "dropoff_minute", "dropoff_second"
]
CAT_NOM_COLS = [
"store_forward",
"vendor",
"store_forward", "vendor"
]
CAT_ORD_COLS = [
@@ -58,7 +43,6 @@ def parse_args():
parser.add_argument("--model_input", type=str, help="Path of input model")
parser.add_argument("--test_data", type=str, help="Path to test dataset")
parser.add_argument("--evaluation_output", type=str, help="Path of eval results")
parser.add_argument("--runner", type=str, help="Local or Cloud Runner", default="CloudRunner")
args = parser.parse_args()
@@ -81,8 +65,7 @@ def main(args):
yhat_test, score = model_evaluation(X_test, y_test, model, args.evaluation_output)
# ----------------- Model Promotion ---------------- #
if args.runner == "CloudRunner":
predictions, deploy_flag = model_promotion(args.model_name, args.evaluation_output, X_test, y_test, yhat_test, score)
predictions, deploy_flag = model_promotion(args.model_name, args.evaluation_output, X_test, y_test, yhat_test, score)

Просмотреть файл

@@ -16,29 +16,14 @@ import mlflow
TARGET_COL = "cost"
NUMERIC_COLS = [
"distance",
"dropoff_latitude",
"dropoff_longitude",
"passengers",
"pickup_latitude",
"pickup_longitude",
"pickup_weekday",
"pickup_month",
"pickup_monthday",
"pickup_hour",
"pickup_minute",
"pickup_second",
"dropoff_weekday",
"dropoff_month",
"dropoff_monthday",
"dropoff_hour",
"dropoff_minute",
"dropoff_second",
"distance", "dropoff_latitude", "dropoff_longitude", "passengers", "pickup_latitude",
"pickup_longitude", "pickup_weekday", "pickup_month", "pickup_monthday", "pickup_hour",
"pickup_minute", "pickup_second", "dropoff_weekday", "dropoff_month", "dropoff_monthday",
"dropoff_hour", "dropoff_minute", "dropoff_second"
]
CAT_NOM_COLS = [
"store_forward",
"vendor",
"store_forward", "vendor"
]
CAT_ORD_COLS = [
@@ -70,11 +55,8 @@ def main(args):
# ------------ Reading Data ------------ #
# -------------------------------------- #
print("mounted_path files: ")
arr = os.listdir(args.raw_data)
print(arr)
data = pd.read_csv((Path(args.raw_data) / 'taxi-data.csv'))
data = pd.read_csv((Path(args.raw_data)))
data = data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS + [TARGET_COL]]
# ------------- Split Data ------------- #

Просмотреть файл

@@ -21,29 +21,14 @@ import mlflow.sklearn
TARGET_COL = "cost"
NUMERIC_COLS = [
"distance",
"dropoff_latitude",
"dropoff_longitude",
"passengers",
"pickup_latitude",
"pickup_longitude",
"pickup_weekday",
"pickup_month",
"pickup_monthday",
"pickup_hour",
"pickup_minute",
"pickup_second",
"dropoff_weekday",
"dropoff_month",
"dropoff_monthday",
"dropoff_hour",
"dropoff_minute",
"dropoff_second",
"distance", "dropoff_latitude", "dropoff_longitude", "passengers", "pickup_latitude",
"pickup_longitude", "pickup_weekday", "pickup_month", "pickup_monthday", "pickup_hour",
"pickup_minute", "pickup_second", "dropoff_weekday", "dropoff_month", "dropoff_monthday",
"dropoff_hour", "dropoff_minute", "dropoff_second"
]
CAT_NOM_COLS = [
"store_forward",
"vendor",
"store_forward", "vendor"
]
CAT_ORD_COLS = [

Просмотреть файл

@@ -1,24 +0,0 @@
# <component>
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: evaluate_model
display_name: evaluate-model
type: command
inputs:
model_name:
type: string
model_input:
type: uri_folder
test_data:
type: uri_folder
outputs:
evaluation_output:
type: uri_folder
code: ../../../../data-science/src
environment: azureml:taxi-train-env@latest
command: >-
python evaluate.py
--model_name ${{inputs.model_name}}
--model_input ${{inputs.model_input}}
--test_data ${{inputs.test_data}}
--evaluation_output ${{outputs.evaluation_output}}
# </component>

Просмотреть файл

@@ -1,30 +0,0 @@
# <component>
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: prep_data
display_name: prep-data
type: command
inputs:
raw_data:
type: uri_file
enable_monitoring:
type: string
table_name:
type: string
outputs:
train_data:
type: uri_folder
val_data:
type: uri_folder
test_data:
type: uri_folder
code: ../../../../data-science/src
environment: azureml:taxi-train-env@latest
command: >-
python prep.py
--raw_data ${{inputs.raw_data}}
--train_data ${{outputs.train_data}}
--val_data ${{outputs.val_data}}
--test_data ${{outputs.test_data}}
--enable_monitoring ${{inputs.enable_monitoring}}
--table_name ${{inputs.table_name}}
# </component>

Просмотреть файл

@@ -1,24 +0,0 @@
# <component>
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: register_model
display_name: register-model
type: command
inputs:
model_name:
type: string
model_path:
type: uri_folder
evaluation_output:
type: uri_folder
outputs:
model_info_output_path:
type: uri_folder
code: ../../../../data-science/src
environment: azureml:taxi-train-env@latest
command: >-
python register.py
--model_name ${{inputs.model_name}}
--model_path ${{inputs.model_path}}
--evaluation_output ${{inputs.evaluation_output}}
--model_info_output_path ${{outputs.model_info_output_path}}
# </component>

Просмотреть файл

@@ -1,18 +0,0 @@
# <component>
$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
name: train_model
display_name: train-model
type: command
inputs:
train_data:
type: uri_folder
outputs:
model_output:
type: uri_folder
code: ../../../../data-science/src
environment: azureml:taxi-train-env@latest
command: >-
python train.py
--train_data ${{inputs.train_data}}
--model_output ${{outputs.model_output}}
# </component>

Просмотреть файл

@@ -14,7 +14,7 @@ from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Environment
from azure.ai.ml.dsl import pipeline
from azure.ai.ml import Input, Output, load_component
from azure.ai.ml import Input, Output, command
from azure.ai.ml.constants import AssetTypes, InputOutputModes
import json
@@ -24,10 +24,12 @@ import os
def parse_args():
parser = argparse.ArgumentParser("Deploy Training Pipeline")
parser.add_argument("-c", type=str, help="Compute Cluster Name")
parser.add_argument("-m", type=str, help="Enable Monitoring", default="false")
parser.add_argument("-d", type=str, help="Data Asset Name")
parser.add_argument("-n", type=str, help="Experiment Name")
parser.add_argument("--experiment_name", type=str, help="Experiment Name")
parser.add_argument("--compute_name", type=str, help="Compute Cluster Name")
parser.add_argument("--data_name", type=str, help="Data Asset Name")
parser.add_argument("--environment_name", type=str, help="Registered Environment Name")
parser.add_argument("--enable_monitoring", type=str, help="Enable Monitoring", default="false")
parser.add_argument("--table_name", type=str, help="ADX Monitoring Table Name", default="taximonitoring")
args = parser.parse_args()
@@ -47,8 +49,7 @@ def main():
print(ex)
try:
compute_target = args.c
print(ml_client.compute.get(compute_target))
print(ml_client.compute.get(args.compute_name))
except:
print("No compute found")
@@ -58,34 +59,104 @@ def main():
print('current', os.listdir())
# Create pipeline job
parent_dir = "mlops/azureml/train/components"
# 1. Define components
parent_dir = "data-science/src"
prep_data = command(
name="prep_data",
display_name="prep-data",
code=os.path.join(parent_dir, "prep"),
command="python prep.py \
--raw_data ${{inputs.raw_data}} \
--train_data ${{outputs.train_data}} \
--val_data ${{outputs.val_data}} \
--test_data ${{outputs.test_data}} \
--enable_monitoring ${{inputs.enable_monitoring}} \
--table_name ${{inputs.table_name}}",
environment=args.environment_name+"@latest",
inputs={
"raw_data": Input(type="uri_file"),
"enable_monitoring": Input(type="string"),
"table_name": Input(type="string")
},
outputs={
"train_data": Output(type="uri_folder"),
"val_data": Output(type="uri_folder"),
"test_data": Output(type="uri_folder"),
}
)
train_model = command(
name="train_model",
display_name="train-model",
code=os.path.join(parent_dir, "train"),
command="python train.py \
--train_data ${{inputs.train_data}} \
--model_output ${{outputs.model_output}}",
environment=args.environment_name+"@latest",
inputs={"train_data": Input(type="uri_folder")},
outputs={"model_output": Output(type="uri_folder")}
)
# 1. Load components
prepare_data = load_component(source=os.path.join(parent_dir , "prep.yml"))
train_model = load_component(source=os.path.join(parent_dir, "train.yml"))
evaluate_model = load_component(source=os.path.join(parent_dir, "evaluate.yml"))
register_model = load_component(source=os.path.join(parent_dir, "register.yml"))
evaluate_model = command(
name="evaluate_model",
display_name="evaluate-model",
code=os.path.join(parent_dir, "evaluate"),
command="python evaluate.py \
--model_name ${{inputs.model_name}} \
--model_input ${{inputs.model_input}} \
--test_data ${{inputs.test_data}} \
--evaluation_output ${{outputs.evaluation_output}}",
environment=args.environment_name+"@latest",
inputs={
"model_name": Input(type="string"),
"model_input": Input(type="uri_folder"),
"test_data": Input(type="uri_folder")
},
outputs={
"evaluation_output": Output(type="uri_folder")
}
)
register_model = command(
name="register_model",
display_name="register-model",
code=os.path.join(parent_dir, "register"),
command="python register.py \
--model_name ${{inputs.model_name}} \
--model_path ${{inputs.model_path}} \
--evaluation_output ${{inputs.evaluation_output}} \
--model_info_output_path ${{outputs.model_info_output_path}}",
environment=args.environment_name+"@latest",
inputs={
"model_name": Input(type="string"),
"model_path": Input(type="uri_folder"),
"evaluation_output": Input(type="uri_folder")
},
outputs={
"model_info_output_path": Output(type="uri_folder")
}
)
# 2. Construct pipeline
@pipeline()
def taxi_training_pipeline(raw_data, enable_monitoring, table_name):
prepare = prepare_data(
prep = prep_data(
raw_data=raw_data,
enable_monitoring=enable_monitoring,
table_name=table_name
)
train = train_model(
train_data=prepare.outputs.train_data
train_data=prep.outputs.train_data
)
evaluate = evaluate_model(
model_name="taxi-model",
model_input=train.outputs.model_output,
test_data=prepare.outputs.test_data
test_data=prep.outputs.test_data
)
@@ -96,24 +167,24 @@ def main():
)
return {
"pipeline_job_train_data": prepare.outputs.train_data,
"pipeline_job_test_data": prepare.outputs.test_data,
"pipeline_job_train_data": prep.outputs.train_data,
"pipeline_job_test_data": prep.outputs.test_data,
"pipeline_job_trained_model": train.outputs.model_output,
"pipeline_job_score_report": evaluate.outputs.evaluation_output,
}
pipeline_job = taxi_training_pipeline(
Input(path=args.d + "@latest"), args.m, "taximonitoring"
Input(path=args.data_name + "@latest", type="uri_file"), args.enable_monitoring, args.table_name
)
# set pipeline level compute
pipeline_job.settings.default_compute = args.c
pipeline_job.settings.default_compute = args.compute_name
# set pipeline level datastore
pipeline_job.settings.default_datastore = "workspaceblobstore"
pipeline_job = ml_client.jobs.create_or_update(
pipeline_job, experiment_name=args.n
pipeline_job, experiment_name=args.experiment_name
)
pipeline_job

Просмотреть файл

@@ -26,43 +26,45 @@ resources:
ref: main-dec31
stages:
- stage: DeployTrainingPipeline
displayName: Deploy Training Pipeline
jobs:
- job: DeployTrainingPipeline
timeoutInMinutes: 120 # how long to run the job before automatically cancelling
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/aml-cli-v2/install-az-cli.yml@mlops-templates
- template: templates/aml-cli-v2/install-aml-cli.yml@mlops-templates
- template: templates/python-sdk-v2/install-requirements.yml@mlops-templates
- template: templates/aml-cli-v2/connect-to-workspace.yml@mlops-templates
- template: templates/aml-cli-v2/create-compute.yml@mlops-templates
parameters:
cluster_name: cpu-cluster
size: Standard_DS3_v2
min_instances: 0
max_instances: 4
cluster_tier: low_priority
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
parameters:
environment_name: taxi-train-env
environment_description: "Training Environment for Taxi Pipeline"
environment_path: data-science/environment/train-conda.yml
build_type: conda
- template: templates/${{ variables.version }}/register-data-asset.yml@mlops-templates
parameters:
data_name: taxi-data
data_description: taxi-training-dataset
data_path: data/taxi-data.csv
data_type: uri_file
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
parameters:
pipeline_path: mlops/azureml/train/run_pipeline.py
data_name: taxi-data
cluster_name: cpu-cluster
enable_monitoring: $(enable_monitoring)
experiment_name: taxi-train-pipeline
- stage: DeployTrainingPipeline
displayName: Deploy Training Pipeline
jobs:
- job: DeployTrainingPipeline
timeoutInMinutes: 120 # how long to run the job before automatically cancelling
steps:
- checkout: self
path: s/
- checkout: mlops-templates
path: s/templates/
- template: templates/aml-cli-v2/install-az-cli.yml@mlops-templates
- template: templates/aml-cli-v2/install-aml-cli.yml@mlops-templates
- template: templates/python-sdk-v2/install-requirements.yml@mlops-templates
- template: templates/aml-cli-v2/connect-to-workspace.yml@mlops-templates
- template: templates/aml-cli-v2/create-compute.yml@mlops-templates
parameters:
cluster_name: cpu-cluster
size: Standard_DS3_v2
min_instances: 0
max_instances: 4
cluster_tier: low_priority
- template: templates/${{ variables.version }}/register-environment.yml@mlops-templates
parameters:
environment_name: taxi-train-env
environment_description: "Training Environment for Taxi Pipeline"
environment_path: data-science/environment/train-conda.yml
build_type: conda
- template: templates/${{ variables.version }}/register-data-asset.yml@mlops-templates
parameters:
data_name: taxi-data
data_description: taxi-training-dataset
data_path: data/taxi-data.csv
data_type: uri_file
- template: templates/${{ variables.version }}/run-pipeline.yml@mlops-templates
parameters:
pipeline_path: mlops/azureml/train/run_pipeline.py
experiment_name: taxi-train-pipeline
data_name: taxi-data
environment_name: taxi-train-env
compute_name: cpu-cluster
enable_monitoring: $(enable_monitoring)
table_name: 'taximonitoring'