azureml-examples/cli/readme.py

# imports
import os
import json
import glob
import argparse
import hashlib
import random
import yaml
# define constants
EXCLUDED_JOBS = ["java", "spark-job-component", "storage_pe", "user-assigned-identity"]
# TODO: Re-include these below endpoints and deployments when the workflow generation code supports substituting vars in .yaml files.
EXCLUDED_ENDPOINTS = [
"1-uai-create-endpoint",
"1-sai-create-endpoint",
"tfserving-endpoint",
]
EXCLUDED_DEPLOYMENTS = [
"minimal-multimodel-deployment",
"minimal-single-model-conda-in-dockerfile-deployment",
"mlflow-deployment",
"r-deployment",
"torchserve-deployment",
"triton-cc-deployment",
"2-sai-deployment",
"kubernetes-green-deployment",
]
EXCLUDED_RESOURCES = [
"workspace",
"datastore",
"vm-attach",
"instance",
"connections",
"compute/cluster-user-identity",
"compute/attached-spark",
"compute/attached-spark-system-identity",
"compute/attached-spark-user-identity",
"registry",
]
EXCLUDED_ASSETS = ["conda-yamls", "mlflow-models"]
EXCLUDED_SCHEDULES = []
EXCLUDED_SCRIPTS = [
"setup",
"cleanup",
"run-job",
"run-pipeline-job-with-registry-components",
"deploy-custom-container-multimodel-minimal",
"run-pipeline-jobs",
]
READONLY_HEADER = "# This code is autogenerated.\
\n# Code is generated by running custom script: python3 readme.py\
\n# Any manual changes to this file may cause incorrect behavior.\
\n# Any manual changes will be overwritten if the code is regenerated.\n"
BRANCH = "main" # default - do not change
# Duplicate name in working directory during checkout
# https://github.com/actions/checkout/issues/739
GITHUB_WORKSPACE = "${{ github.workspace }}"
GITHUB_CONCURRENCY_GROUP = (
"${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
)
# BRANCH = "sdk-preview" # this should be deleted when this branch is merged to main
hours_between_runs = 12
# define functions
def main(args):
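    """Regenerate GitHub Actions workflows and README.md for the CLI examples.

    Globs the sample directories for jobs, endpoints, resources, assets,
    scripts, and schedules, writes one workflow file per sample, rebuilds
    README.md, and optionally verifies README.md was already up to date.
    """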
# get list of notebooks
notebooks = sorted(glob.glob("**/*.ipynb", recursive=True))
# make all notebooks consistent
modify_notebooks(notebooks)
# get list of jobs
jobs = sorted(glob.glob("jobs/**/*job*.yml", recursive=True))
jobs += sorted(glob.glob("jobs/basics/*.yml", recursive=False))
jobs += sorted(glob.glob("jobs/*/basics/**/*job*.yml", recursive=True))
jobs += sorted(glob.glob("jobs/pipelines/**/*pipeline*.yml", recursive=True))
jobs += sorted(glob.glob("jobs/spark/*.yml", recursive=False))
jobs += sorted(
glob.glob("jobs/automl-standalone-jobs/**/cli-automl-*.yml", recursive=True)
)
jobs += sorted(
glob.glob("jobs/pipelines-with-components/**/*pipeline*.yml", recursive=True)
)
jobs += sorted(
glob.glob("jobs/automl-standalone-jobs/**/*cli-automl*.yml", recursive=True)
)
jobs += sorted(glob.glob("responsible-ai/**/cli-*.yml", recursive=True))
jobs += sorted(glob.glob("jobs/parallel/**/*pipeline*.yml", recursive=True))
    jobs = [
        job.replace(".yml", "")
        for job in jobs
        if not any(excluded in job for excluded in EXCLUDED_JOBS)
    ]
    # drop duplicates picked up by overlapping glob patterns, preserving order
    jobs = list(dict.fromkeys(jobs))
jobs_using_registry_components = sorted(
glob.glob(
"jobs/pipelines-with-components/basics/**/*pipeline*.yml", recursive=True
)
)
jobs_using_registry_components = [
job.replace(".yml", "")
for job in jobs_using_registry_components
if not any(excluded in job.replace(os.sep, "/") for excluded in EXCLUDED_JOBS)
]
# get list of endpoints
endpoints = sorted(glob.glob("endpoints/**/*endpoint.yml", recursive=True))
endpoints = [
endpoint.replace(".yml", "")
for endpoint in endpoints
if not any(
excluded in endpoint.replace(os.sep, "/") for excluded in EXCLUDED_ENDPOINTS
)
]
# get list of resources
resources = sorted(glob.glob("resources/**/*.yml", recursive=True))
resources = [
resource.replace(".yml", "")
for resource in resources
if not any(
excluded in resource.replace(os.sep, "/") for excluded in EXCLUDED_RESOURCES
)
]
# get list of assets
assets = sorted(glob.glob("assets/**/*.yml", recursive=True))
assets = [
asset.replace(".yml", "")
for asset in assets
if not any(
excluded in asset.replace(os.sep, "/") for excluded in EXCLUDED_ASSETS
)
]
# get list of scripts
scripts = sorted(glob.glob("*.sh", recursive=False))
scripts = [
script.replace(".sh", "")
for script in scripts
if not any(
excluded in script.replace(os.sep, "/") for excluded in EXCLUDED_SCRIPTS
)
]
# get list of schedules
schedules = sorted(glob.glob("schedules/**/*schedule.yml", recursive=True))
schedules = [
schedule.replace(".yml", "")
for schedule in schedules
if not any(
excluded in schedule.replace(os.sep, "/") for excluded in EXCLUDED_SCHEDULES
)
]
# write workflows
write_workflows(
jobs,
jobs_using_registry_components,
endpoints,
resources,
assets,
scripts,
schedules,
)
# read existing README.md
with open("README.md", "r") as f:
readme_before = f.read()
# write README.md
write_readme(jobs, endpoints, resources, assets, scripts, schedules)
# read modified README.md
with open("README.md", "r") as f:
readme_after = f.read()
# check if readme matches
if args.check_readme:
if not check_readme(readme_before, readme_after):
print("README.md file did not match...")
exit(2)
def modify_notebooks(notebooks):
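    """Rewrite each notebook's kernelspec to the Python 3.8 AzureML kernel."""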
# setup variables
kernelspec = {
"display_name": "Python 3.8 - AzureML",
"language": "python",
"name": "python38-azureml",
}
    # for each notebook
for notebook in notebooks:
# read in notebook
with open(notebook, "r") as f:
data = json.load(f)
# update metadata
data["metadata"]["kernelspec"] = kernelspec
# write notebook
with open(notebook, "w") as f:
json.dump(data, f, indent=1)
def write_readme(jobs, endpoints, resources, assets, scripts, schedules):
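    """Rebuild README.md from prefix.md, the generated status tables, and suffix.md.

    Each table row links a sample's YAML (or shell) file to its workflow badge;
    descriptions are scraped from a 'description:' line in the YAML when present.
    """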
# read in prefix.md and suffix.md
with open("prefix.md", "r") as f:
prefix = f.read()
with open("suffix.md", "r") as f:
suffix = f.read()
# define markdown tables
jobs_table = "\n**Jobs** ([jobs](jobs))\n\npath|status|description\n-|-|-\n"
endpoints_table = (
"\n**Endpoints** ([endpoints](endpoints))\n\npath|status|description\n-|-|-\n"
)
resources_table = (
"\n**Resources** ([resources](resources))\n\npath|status|description\n-|-|-\n"
)
assets_table = "\n**Assets** ([assets](assets))\n\npath|status|description\n-|-|-\n"
    scripts_table = "\n**Scripts**\n\npath|status\n-|-\n"
    schedules_table = "\n**Schedules**\n\npath|status\n-|-\n"
# process jobs
for job in jobs:
        # build entries for the jobs table
posix_job = job.replace(os.sep, "/")
job_name = posix_job.replace("/", "-")
status = f"[![{posix_job}](https://github.com/Azure/azureml-examples/workflows/cli-{job_name}/badge.svg?branch={BRANCH})](https://github.com/Azure/azureml-examples/actions/workflows/cli-{job_name}.yml)"
description = "*no description*"
try:
with open(f"{job}.yml", "r") as f:
for line in f.readlines():
if "description: " in str(line):
description = line.split(": ")[-1].strip()
break
except:
pass
        # add row to the jobs table
row = f"[{posix_job}.yml]({posix_job}.yml)|{status}|{description}\n"
jobs_table += row
# process endpoints
for endpoint in endpoints:
        # build entries for the endpoints table
posix_endpoint = endpoint.replace(os.sep, "/")
endpoint_name = posix_endpoint.replace("/", "-")
status = f"[![{posix_endpoint}](https://github.com/Azure/azureml-examples/workflows/cli-{endpoint_name}/badge.svg?branch={BRANCH})](https://github.com/Azure/azureml-examples/actions/workflows/cli-{endpoint_name}.yml)"
description = "*no description*"
try:
with open(f"{endpoint}.yml", "r") as f:
for line in f.readlines():
if "description: " in str(line):
description = line.split(": ")[-1].strip()
break
except:
pass
        # add row to the endpoints table
row = f"[{posix_endpoint}.yml]({posix_endpoint}.yml)|{status}|{description}\n"
endpoints_table += row
# process resources
for resource in resources:
        # build entries for the resources table
posix_resource = resource.replace(os.sep, "/")
resource_name = posix_resource.replace("/", "-")
status = f"[![{posix_resource}](https://github.com/Azure/azureml-examples/workflows/cli-{resource_name}/badge.svg?branch={BRANCH})](https://github.com/Azure/azureml-examples/actions/workflows/cli-{resource_name}.yml)"
description = "*no description*"
try:
with open(f"{resource}.yml", "r") as f:
for line in f.readlines():
if "description: " in str(line):
description = line.split(": ")[-1].strip()
break
except:
pass
        # add row to the resources table
row = f"[{posix_resource}.yml]({posix_resource}.yml)|{status}|{description}\n"
resources_table += row
# process assets
for asset in assets:
        # build entries for the assets table
posix_asset = asset.replace(os.sep, "/")
asset_name = posix_asset.replace("/", "-")
status = f"[![{posix_asset}](https://github.com/Azure/azureml-examples/workflows/cli-{asset_name}/badge.svg?branch={BRANCH})](https://github.com/Azure/azureml-examples/actions/workflows/cli-{asset_name}.yml)"
description = "*no description*"
try:
with open(f"{asset}.yml", "r") as f:
for line in f.readlines():
if "description: " in str(line):
description = line.split(": ")[-1].strip()
break
except:
pass
        # add row to the assets table
row = f"[{posix_asset}.yml]({posix_asset}.yml)|{status}|{description}\n"
assets_table += row
# process scripts
for script in scripts:
        # build entries for the scripts table
posix_script = script.replace(os.sep, "/")
status = f"[![{posix_script}](https://github.com/Azure/azureml-examples/workflows/cli-scripts-{script}/badge.svg?branch={BRANCH})](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-{script}.yml)"
        # add row to the scripts table
row = f"[{posix_script}.sh]({posix_script}.sh)|{status}\n"
scripts_table += row
# process schedules
for schedule in schedules:
        # build entries for the schedules table
posix_schedule = schedule.replace(os.sep, "/")
status = f"[![{posix_schedule}](https://github.com/Azure/azureml-examples/workflows/cli-schedules-{posix_schedule}/badge.svg?branch={BRANCH})](https://github.com/Azure/azureml-examples/actions/workflows/cli-schedules-{posix_schedule}.yml)"
        # add row to the schedules table
row = f"[{posix_schedule}.yml]({posix_schedule}.yml)|{status}\n"
schedules_table += row
# write README.md
print("writing README.md...")
with open("README.md", "w") as f:
f.write(
prefix
+ scripts_table
+ jobs_table
+ endpoints_table
+ resources_table
+ assets_table
+ schedules_table
+ suffix
)
print("Finished writing README.md...")
def write_workflows(
jobs,
jobs_using_registry_components,
endpoints,
resources,
assets,
scripts,
schedules,
):
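    """Write one GitHub Actions workflow per sample under ../.github/workflows."""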
print("writing .github/workflows...")
# process jobs
for job in jobs:
# write workflow file
write_job_workflow(job)
# process jobs_using_registry_components
for job in jobs_using_registry_components:
# write workflow file
write_job_using_registry_components_workflow(job)
# process endpoints
for endpoint in endpoints:
# write workflow file
write_endpoint_workflow(endpoint)
    # process resources (handled by the asset workflow writer)
for resource in resources:
# write workflow file
write_asset_workflow(resource)
    # process assets
for asset in assets:
# write workflow file
write_asset_workflow(asset)
# process scripts
for script in scripts:
# write workflow file
write_script_workflow(script)
# process schedules
for schedule in schedules:
# write workflow file
write_schedule_workflow(schedule)
def check_readme(before, after):
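    """Return True if README.md was unchanged by regeneration."""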
return before == after
def parse_path(path):
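    """Split a sample path into (filename, project_dir, hyphenated).

    The hyphenated form replaces path separators with dashes and is used
    for workflow file names and badge names.
    """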
filename = None
project_dir = None
hyphenated = None
try:
filename = path.split(os.sep)[-1]
except:
pass
try:
project_dir = os.sep.join(path.split(os.sep)[:-1])
except:
pass
try:
hyphenated = path.replace(os.sep, "-").replace("/", "-")
except:
pass
return filename, project_dir, hyphenated
def write_job_workflow(job):
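    """Generate the workflow that runs a single job sample via run-job.sh.

    Pipeline and Spark samples get extra path triggers; Spark samples also
    get data-upload and compute-setup steps, and AutoML image samples get a
    data preparation step.
    """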
filename, project_dir, hyphenated = parse_path(job)
    posix_project_dir = project_dir.replace(os.sep, "/")
    posix_job = job.replace(os.sep, "/")
    is_pipeline_sample = "jobs/pipelines" in posix_job
    is_spark_sample = "jobs/spark" in posix_job
schedule_hour, schedule_minute = get_schedule_time(filename)
# Duplicate name in working directory during checkout
# https://github.com/actions/checkout/issues/739
workflow_yaml = f"""{READONLY_HEADER}
name: cli-{hyphenated}
on:
workflow_dispatch:
schedule:
- cron: "{schedule_minute} {schedule_hour}/{hours_between_runs} * * *"
pull_request:
branches:
- main
paths:
- cli/{posix_project_dir}/**
- infra/bootstrapping/**
- .github/workflows/cli-{hyphenated}.yml\n"""
if is_pipeline_sample:
        workflow_yaml += "      - cli/run-pipeline-jobs.sh\n"
if is_spark_sample:
        workflow_yaml += "      - cli/jobs/spark/data/titanic.csv\n"
workflow_yaml += f""" - cli/setup.sh
permissions:
id-token: write
concurrency:
group: {GITHUB_CONCURRENCY_GROUP}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
client-id: ${{{{ secrets.OIDC_AZURE_CLIENT_ID }}}}
tenant-id: ${{{{ secrets.OIDC_AZURE_TENANT_ID }}}}
subscription-id: ${{{{ secrets.OIDC_AZURE_SUBSCRIPTION_ID }}}}
- name: bootstrap resources
run: |
echo '{GITHUB_CONCURRENCY_GROUP}';
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: Eagerly cache access tokens for required scopes
run: |
# Workaround for azure-cli's lack of support for ID token refresh
# Taken from: https://github.com/Azure/login/issues/372#issuecomment-2056289617
# Management
az account get-access-token --scope https://management.azure.com/.default --output none
# ML
az account get-access-token --scope https://ml.azure.com/.default --output none\n"""
if is_spark_sample:
workflow_yaml += get_spark_setup_workflow(job, posix_project_dir, filename)
workflow_yaml += f""" - name: run job
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";\n"""
if "automl" in job and "image" in job:
workflow_yaml += f""" bash \"{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh\" replace_template_values \"prepare_data.py\";
pip install azure-identity
bash \"{GITHUB_WORKSPACE}/sdk/python/setup.sh\"
python prepare_data.py --subscription $SUBSCRIPTION_ID --group $RESOURCE_GROUP_NAME --workspace $WORKSPACE_NAME\n"""
elif "autotuning" in job:
workflow_yaml += f""" bash -x generate-yml.sh\n"""
# workflow_yaml += f""" bash -x {os.path.relpath(".", project_dir)}/run-job.sh generate-yml.yml\n"""
workflow_yaml += f""" bash -x {os.path.relpath(".", project_dir).replace(os.sep, "/")}/run-job.sh {filename}.yml
working-directory: cli/{posix_project_dir}
- name: validate readme
run: |
python check-readme.py "{GITHUB_WORKSPACE}/cli/{posix_project_dir}"
working-directory: infra/bootstrapping
continue-on-error: false\n"""
# write workflow
with open(
f"..{os.sep}.github{os.sep}workflows{os.sep}cli-{job.replace(os.sep, '-').replace('/', '-')}.yml",
"w",
) as f:
f.write(workflow_yaml)
def write_job_using_registry_components_workflow(job):
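    """Generate the workflow that runs a pipeline job whose components come
    from a registry, via run-pipeline-job-with-registry-components.sh.
    """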
filename, project_dir, hyphenated = parse_path(job)
posix_project_dir = project_dir.replace(os.sep, "/")
folder_name = project_dir.split(os.sep)[-1]
    is_pipeline_sample = "jobs/pipelines" in job.replace(os.sep, "/")
schedule_hour, schedule_minute = get_schedule_time(filename)
# Duplicate name in working directory during checkout
# https://github.com/actions/checkout/issues/739
workflow_yaml = f"""{READONLY_HEADER}
name: cli-{hyphenated}-registry
on:
workflow_dispatch:
schedule:
- cron: "{schedule_minute} {schedule_hour}/{hours_between_runs} * * *"
pull_request:
branches:
- main
paths:
- cli/{posix_project_dir}/**
- infra/bootstrapping/**
- .github/workflows/cli-{hyphenated}-registry.yml\n"""
if is_pipeline_sample:
        workflow_yaml += "      - cli/run-pipeline-jobs.sh\n"
workflow_yaml += f""" - cli/setup.sh
permissions:
id-token: write
concurrency:
group: {GITHUB_CONCURRENCY_GROUP}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
client-id: ${{{{ secrets.OIDC_AZURE_CLIENT_ID }}}}
tenant-id: ${{{{ secrets.OIDC_AZURE_TENANT_ID }}}}
subscription-id: ${{{{ secrets.OIDC_AZURE_SUBSCRIPTION_ID }}}}
- name: bootstrap resources
run: |
echo '{GITHUB_CONCURRENCY_GROUP}';
bash bootstrap.sh
working-directory: infra
continue-on-error: false
- name: setup-cli
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: Eagerly cache access tokens for required scopes
run: |
# Workaround for azure-cli's lack of support for ID token refresh
# Taken from: https://github.com/Azure/login/issues/372#issuecomment-2056289617
# Management
az account get-access-token --scope https://management.azure.com/.default --output none
# ML
az account get-access-token --scope https://ml.azure.com/.default --output none
- name: validate readme
run: |
python check-readme.py "{GITHUB_WORKSPACE}/cli/{posix_project_dir}"
working-directory: infra/bootstrapping
continue-on-error: false
- name: run job
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";\n"""
if "automl" in job and "image" in job:
workflow_yaml += f""" bash \"{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh\" replace_template_values \"prepare_data.py\";
pip install azure-identity
bash \"{GITHUB_WORKSPACE}/sdk/python/setup.sh\"
python prepare_data.py --subscription $SUBSCRIPTION_ID --group $RESOURCE_GROUP_NAME --workspace $WORKSPACE_NAME\n"""
workflow_yaml += f""" bash -x {os.path.relpath(".", project_dir).replace(os.sep, "/")}/run-pipeline-job-with-registry-components.sh {filename} {folder_name}
working-directory: cli/{posix_project_dir}\n"""
# write workflow
with open(
f"..{os.sep}.github{os.sep}workflows{os.sep}cli-{job.replace(os.sep, '-').replace('/', '-')}-registry.yml",
"w",
) as f:
f.write(workflow_yaml)
def write_endpoint_workflow(endpoint):
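    """Generate the workflow that creates an endpoint, creates each
    non-excluded deployment found next to it, and deletes the endpoint at
    the end. The endpoint name is derived from the hyphenated path plus a
    random suffix to avoid collisions between runs.
    """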
filename, project_dir, hyphenated = parse_path(endpoint)
project_dir = project_dir.replace(os.sep, "/")
deployments = sorted(
glob.glob(project_dir + "/*deployment.yml", recursive=True)
+ glob.glob(project_dir + "/*deployment.yaml", recursive=True)
)
deployments = [
deployment
for deployment in deployments
if not any(excluded in deployment for excluded in EXCLUDED_DEPLOYMENTS)
]
schedule_hour, schedule_minute = get_schedule_time(filename)
    posix_endpoint = endpoint.replace(os.sep, "/")
    endpoint_type = (
        "online"
        if "endpoints/online/" in posix_endpoint
        else "batch"
        if "endpoints/batch/" in posix_endpoint
        else "unknown"
    )
endpoint_name = hyphenated[-28:].replace("-", "") + str(
random.randrange(1000, 9999)
)
create_endpoint_yaml = f"""{READONLY_HEADER}
name: cli-{hyphenated}
on:
workflow_dispatch:
schedule:
- cron: "{schedule_minute} {schedule_hour}/{hours_between_runs} * * *"
pull_request:
branches:
- main
paths:
- cli/{project_dir}/**
- cli/endpoints/{endpoint_type}/**
- infra/bootstrapping/**
- .github/workflows/cli-{hyphenated}.yml
- cli/setup.sh
permissions:
id-token: write
concurrency:
group: {GITHUB_CONCURRENCY_GROUP}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
client-id: ${{{{ secrets.OIDC_AZURE_CLIENT_ID }}}}
tenant-id: ${{{{ secrets.OIDC_AZURE_TENANT_ID }}}}
subscription-id: ${{{{ secrets.OIDC_AZURE_SUBSCRIPTION_ID }}}}
- name: bootstrap resources
run: |
bash bootstrap.sh
working-directory: infra/bootstrapping
continue-on-error: false
- name: setup-cli
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: Eagerly cache access tokens for required scopes
run: |
# Workaround for azure-cli's lack of support for ID token refresh
# Taken from: https://github.com/Azure/login/issues/372#issuecomment-2056289617
# Management
az account get-access-token --scope https://management.azure.com/.default --output none
# ML
az account get-access-token --scope https://ml.azure.com/.default --output none
- name: validate readme
run: |
python check-readme.py "{GITHUB_WORKSPACE}/cli/{project_dir}"
working-directory: infra/bootstrapping
continue-on-error: false
- name: delete endpoint if existing
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
az ml {endpoint_type}-endpoint delete -n {endpoint_name} -y
working-directory: cli
continue-on-error: true
- name: create endpoint
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
cat {endpoint}.yml
az ml {endpoint_type}-endpoint create -n {endpoint_name} -f {endpoint}.yml
working-directory: cli\n"""
cleanup_yaml = f""" - name: cleanup endpoint
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
az ml {endpoint_type}-endpoint delete -n {endpoint_name} -y
working-directory: cli\n"""
workflow_yaml = create_endpoint_yaml
    if deployments:
for deployment in deployments:
deployment = deployment.replace(".yml", "").replace(".yaml", "")
deployment_yaml = f""" - name: create deployment
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
cat {deployment}.yml
az ml {endpoint_type}-deployment create -e {endpoint_name} -f {deployment}.yml
working-directory: cli\n"""
workflow_yaml += deployment_yaml
workflow_yaml += cleanup_yaml
# write workflow
with open(f"../.github/workflows/cli-{hyphenated}.yml", "w") as f:
f.write(workflow_yaml)
def write_asset_workflow(asset):
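    """Generate the workflow that creates an asset with `az ml <type> create`,
    where <type> is the second segment of the sample path (also reused for
    resources, e.g. resources/compute/... -> `az ml compute create`).
    """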
filename, project_dir, hyphenated = parse_path(asset)
project_dir = project_dir.replace(os.sep, "/")
posix_asset = asset.replace(os.sep, "/")
schedule_hour, schedule_minute = get_schedule_time(filename)
workflow_yaml = f"""{READONLY_HEADER}
name: cli-{hyphenated}
on:
workflow_dispatch:
schedule:
- cron: "{schedule_minute} {schedule_hour}/{hours_between_runs} * * *"
pull_request:
branches:
- main
paths:
- cli/{posix_asset}.yml
- infra/bootstrapping/**
- .github/workflows/cli-{hyphenated}.yml
- cli/setup.sh
permissions:
id-token: write
concurrency:
group: {GITHUB_CONCURRENCY_GROUP}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
client-id: ${{{{ secrets.OIDC_AZURE_CLIENT_ID }}}}
tenant-id: ${{{{ secrets.OIDC_AZURE_TENANT_ID }}}}
subscription-id: ${{{{ secrets.OIDC_AZURE_SUBSCRIPTION_ID }}}}
- name: bootstrap resources
run: |
bash bootstrapping/bootstrap.sh
working-directory: infra
continue-on-error: false
- name: setup-cli
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: Eagerly cache access tokens for required scopes
run: |
# Workaround for azure-cli's lack of support for ID token refresh
# Taken from: https://github.com/Azure/login/issues/372#issuecomment-2056289617
# Management
az account get-access-token --scope https://management.azure.com/.default --output none
# ML
az account get-access-token --scope https://ml.azure.com/.default --output none
- name: validate readme
run: |
python check-readme.py "{GITHUB_WORKSPACE}/cli/{project_dir}"
working-directory: infra/bootstrapping
continue-on-error: false
- name: create asset
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
az ml {asset.split(os.sep)[1]} create -f {posix_asset}.yml
working-directory: cli\n"""
# write workflow
with open(
f"..{os.sep}.github{os.sep}workflows{os.sep}cli-{hyphenated}.yml", "w"
) as f:
f.write(workflow_yaml)
def write_script_workflow(script):
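    """Generate the workflow that runs a top-level shell script sample with bash -x."""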
filename, project_dir, hyphenated = parse_path(script)
project_dir = project_dir.replace(os.sep, "/")
schedule_hour, schedule_minute = get_schedule_time(filename)
workflow_yaml = f"""{READONLY_HEADER}
name: cli-scripts-{hyphenated}
on:
workflow_dispatch:
schedule:
- cron: "{schedule_minute} {schedule_hour}/{hours_between_runs} * * *"
pull_request:
branches:
- main
paths:
- cli/{script}.sh
- infra/bootstrapping/**
- .github/workflows/cli-scripts-{hyphenated}.yml
- cli/setup.sh
permissions:
id-token: write
concurrency:
group: {GITHUB_CONCURRENCY_GROUP}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
client-id: ${{{{ secrets.OIDC_AZURE_CLIENT_ID }}}}
tenant-id: ${{{{ secrets.OIDC_AZURE_TENANT_ID }}}}
subscription-id: ${{{{ secrets.OIDC_AZURE_SUBSCRIPTION_ID }}}}
- name: bootstrap resources
run: |
bash bootstrap.sh
working-directory: infra
continue-on-error: false
- name: setup-cli
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: Eagerly cache access tokens for required scopes
run: |
# Workaround for azure-cli's lack of support for ID token refresh
# Taken from: https://github.com/Azure/login/issues/372#issuecomment-2056289617
# Management
az account get-access-token --scope https://management.azure.com/.default --output none
# ML
az account get-access-token --scope https://ml.azure.com/.default --output none
- name: validate readme
run: |
python check-readme.py "{GITHUB_WORKSPACE}/cli/{project_dir}"
working-directory: infra/bootstrapping
continue-on-error: false
    - name: test script
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
set -e; bash -x {script}.sh
working-directory: cli\n"""
# write workflow
with open(f"../.github/workflows/cli-scripts-{hyphenated}.yml", "w") as f:
f.write(workflow_yaml)
def write_schedule_workflow(schedule):
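    """Generate the workflow that creates a schedule (named ci_test_<filename>)
    and then disables it.
    """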
filename, project_dir, hyphenated = parse_path(schedule)
project_dir = project_dir.replace(os.sep, "/")
posix_schedule = schedule.replace(os.sep, "/")
schedule_hour, schedule_minute = get_schedule_time(filename)
workflow_yaml = f"""{READONLY_HEADER}
name: cli-schedules-{hyphenated}
on:
workflow_dispatch:
schedule:
- cron: "{schedule_minute} {schedule_hour}/{hours_between_runs} * * *"
pull_request:
branches:
- main
paths:
- cli/{posix_schedule}.yml
- infra/bootstrapping/**
- .github/workflows/cli-schedules-{hyphenated}.yml
- cli/setup.sh
permissions:
id-token: write
concurrency:
group: {GITHUB_CONCURRENCY_GROUP}
cancel-in-progress: true
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: check out repo
uses: actions/checkout@v2
- name: azure login
uses: azure/login@v1
with:
client-id: ${{{{ secrets.OIDC_AZURE_CLIENT_ID }}}}
tenant-id: ${{{{ secrets.OIDC_AZURE_TENANT_ID }}}}
subscription-id: ${{{{ secrets.OIDC_AZURE_SUBSCRIPTION_ID }}}}
- name: bootstrap resources
run: |
bash bootstrap.sh
working-directory: infra
continue-on-error: false
- name: setup-cli
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
bash setup.sh
working-directory: cli
continue-on-error: true
- name: Eagerly cache access tokens for required scopes
run: |
# Workaround for azure-cli's lack of support for ID token refresh
# Taken from: https://github.com/Azure/login/issues/372#issuecomment-2056289617
# Management
az account get-access-token --scope https://management.azure.com/.default --output none
# ML
az account get-access-token --scope https://ml.azure.com/.default --output none
- name: validate readme
run: |
python check-readme.py "{GITHUB_WORKSPACE}/cli/{project_dir}"
working-directory: infra/bootstrapping
continue-on-error: false
- name: create schedule
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
az ml schedule create -f ./{posix_schedule}.yml --set name="ci_test_{filename}"
      working-directory: cli
- name: disable schedule
run: |
source "{GITHUB_WORKSPACE}/infra/bootstrapping/sdk_helpers.sh";
source "{GITHUB_WORKSPACE}/infra/bootstrapping/init_environment.sh";
az ml schedule disable --name ci_test_{filename}
working-directory: cli\n"""
# write workflow
with open(f"../.github/workflows/cli-schedules-{hyphenated}.yml", "w") as f:
f.write(workflow_yaml)
def get_schedule_time(filename):
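    """Map a filename to a deterministic (hour, minute) using a SHA-512 hash,
    spreading the generated cron schedules across each scheduling window
    (hours_between_runs hours) so the workflows do not all fire at once.
    """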
name_hash = int(hashlib.sha512(filename.encode()).hexdigest(), 16)
schedule_minute = name_hash % 60
schedule_hour = (name_hash // 60) % hours_between_runs
return schedule_hour, schedule_minute
def get_endpoint_name(filename, hyphenated):
# gets the endpoint name from the .yml file
with open(filename, "r") as f:
endpoint_name = yaml.safe_load(f)["name"]
return endpoint_name
def get_spark_setup_workflow(job, posix_project_dir, filename):
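    """Return extra workflow steps for Spark samples: upload the sample data
    to blob storage and, depending on the identity flavor in the sample name,
    set up identities and/or attach Spark compute before the job runs.
    """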
is_attached = "attached-spark" in job
is_user_identity = "user-identity" in job
is_managed_identity = "managed-identity" in job
is_default_identity = "default-identity" in job
workflow = f""" - name: upload data
run: |
bash -x upload-data-to-blob.sh jobs/spark/
working-directory: cli
continue-on-error: true\n"""
if is_managed_identity:
workflow += f""" - name: setup identities
run: |
bash -x setup-identities.sh
working-directory: cli/{posix_project_dir}
continue-on-error: true\n"""
if is_attached:
workflow += f""" - name: setup attached spark
working-directory: cli
continue-on-error: true"""
if is_attached and is_user_identity:
workflow += f"""
run: |
bash -x {posix_project_dir}/setup-attached-resources.sh resources/compute/attached-spark-user-identity.yml {posix_project_dir}/{filename}.yml\n"""
if is_attached and is_managed_identity:
workflow += f"""
run: |
bash -x {posix_project_dir}/setup-attached-resources.sh resources/compute/attached-spark-system-identity.yml {posix_project_dir}/{filename}.yml\n"""
if is_attached and is_default_identity:
workflow += f"""
run: |
bash -x {posix_project_dir}/setup-attached-resources.sh resources/compute/attached-spark.yml {posix_project_dir}/{filename}.yml\n"""
return workflow
# run functions
if __name__ == "__main__":
# setup argparse
parser = argparse.ArgumentParser()
    # treat --check-readme as a flag; argparse's type=bool treats any non-empty string as True
    parser.add_argument("--check-readme", action="store_true", default=False)
args = parser.parse_args()
# call main
main(args)
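
# Usage, from the cli/ directory of azureml-examples:
#   python readme.py                 # regenerate workflows and README.md
#   python readme.py --check-readme  # exit 2 if the regenerated README.md differs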