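"""Generate CI assets for the sdk/python notebook samples.

Running ``python3 readme.py`` (see READONLY_HEADER below) rewrites the
.github/workflows/sdk-*.yml workflow files, normalizes each notebook's kernelspec
metadata, and regenerates README.md from prefix.md and suffix.md.
"""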
# imports
import contextlib
import os
import json
import glob
import argparse
import hashlib

from configparser import ConfigParser

# define constants
ENABLE_MANUAL_CALLING = True  # whether the generated workflows can be triggered manually (workflow_dispatch)
NOT_TESTED_NOTEBOOKS = [
    "datastore",
    "mlflow-model-local-inference-test",
    "multicloud-configuration",
    "debug-online-endpoints-locally-in-visual-studio-code",
    "train-hyperparameter-tune-with-sklearn",
    "train-hyperparameter-tune-deploy-with-keras",
    "train-hyperparameter-tune-deploy-with-tensorflow",
    "interactive_data_wrangling",
    # mlflow SDK samples notebooks
    "mlflow_sdk_online_endpoints_progresive",
    "mlflow_sdk_online_endpoints",
    "mlflow_sdk_web_service",
    "scoring_to_mlmodel",
    "track_with_databricks_deploy_aml",
    "model_management",
    "run_history",
    "keras_mnist_with_mlflow",
    "logging_and_customizing_models",
    "xgboost_classification_mlflow",
    "xgboost_nested_runs",
    "xgboost_service_principal",
    "using_mlflow_rest_api",
    "yolov5/tutorial",
    "4.Provision-feature-store",
]  # these notebooks cannot be automated, so exclude them from testing
NOT_SCHEDULED_NOTEBOOKS = []  # these are too expensive, so do not run them every day
READONLY_HEADER = "# This code is autogenerated.\
\n# Code is generated by running custom script: python3 readme.py\
\n# Any manual changes to this file may cause incorrect behavior.\
\n# Any manual changes will be overwritten if the code is regenerated.\n"
# define the branch where these workflows are needed
# use the release branch when running on a release candidate, else make it empty
BRANCH = "main"  # default - do not change
# BRANCH = "sdk-preview"  # this should be deleted when this branch is merged to main
GITHUB_CONCURRENCY_GROUP = (
    "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
)
USE_FORECAST_REQUIREMENTS = "USE_FORECAST_REQUIREMENTS"
COMPUTE_NAMES = "COMPUTE_NAMES"
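# Both option keys above are looked up per-notebook in notebooks_config.ini (read in
# write_workflows), where each section is named after a notebook's base name. A minimal
# sketch of a section; the section name and values here are hypothetical, only the
# option keys come from this script:
#
#   [some-forecasting-notebook]
#   USE_FORECAST_REQUIREMENTS = 1
#   COMPUTE_NAMES = cpu-cluster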


def main(args):
    # get list of notebooks
    notebooks = sorted(glob.glob("**/*.ipynb", recursive=True))

    # write workflows
    write_workflows(notebooks)

    # modify notebooks
    modify_notebooks(notebooks)

    # write readme
    write_readme(notebooks)

    # write pipeline readme
    pipeline_dir = "jobs" + os.sep + "pipelines" + os.sep
    with change_working_dir(pipeline_dir):
        pipeline_notebooks = sorted(glob.glob("**/*.ipynb", recursive=True))
        pipeline_notebooks = [
            f"{pipeline_dir}{notebook}" for notebook in pipeline_notebooks
        ]
    write_readme(pipeline_notebooks, pipeline_folder=pipeline_dir)


def write_workflows(notebooks):
    print("writing .github/workflows...")
    cfg = ConfigParser()
    cfg.read(os.path.join("notebooks_config.ini"))
    for notebook in notebooks:
        notebook_path = notebook.replace(os.sep, "/")
        if not any(excluded in notebook_path for excluded in NOT_TESTED_NOTEBOOKS):
            # get notebook name
            name = os.path.basename(notebook).replace(".ipynb", "")
            folder = os.path.dirname(notebook)
            classification = folder.replace(os.sep, "-")

            enable_scheduled_runs = True
            if any(excluded in notebook_path for excluded in NOT_SCHEDULED_NOTEBOOKS):
                enable_scheduled_runs = False

            # write workflow file
            write_notebook_workflow(
                notebook, name, classification, folder, enable_scheduled_runs, cfg
            )
    print("finished writing .github/workflows")


def get_additional_requirements(req_name, req_path):
    return f"""
    - name: pip install {req_name} reqs
      run: pip install -r {req_path}"""


def get_mlflow_import(notebook, validation_yml):
    with open(notebook, "r", encoding="utf-8") as f:
        string_file = f.read()
        if (
            validation_yml
            or "import mlflow" in string_file
            or "from mlflow" in string_file
        ):
            return get_additional_requirements(
                "mlflow", "sdk/python/mlflow-requirements.txt"
            )
        else:
            return ""


def get_forecast_reqs(notebook_name, nb_config):
    is_required = int(
        nb_config.get(
            section=notebook_name, option=USE_FORECAST_REQUIREMENTS, fallback=0
        )
    )
    if is_required:
        return get_additional_requirements(
            "forecasting", "sdk/python/forecasting-requirements.txt"
        )
    else:
        return ""


def get_validation_yml(notebook_folder, notebook_name):
    validation_yml = ""
    validation_json_file_name = os.path.join(
        "..",
        "..",
        ".github",
        "test",
        "sdk",
        notebook_name.replace(".ipynb", ".json"),
    )

    if os.path.exists(validation_json_file_name):
        with open(validation_json_file_name, "r") as json_file:
            validation_file = json.load(json_file)
            for validation in validation_file["validations"]:
                validation_yml += get_validation_check_yml(
                    notebook_folder, notebook_name, validation
                )

    return validation_yml


def get_validation_check_yml(notebook_folder, notebook_name, validation):
    validation_name = validation["name"]
    validation_file_name = validation_name.replace(" ", "_")
    notebook_output_file = (
        os.path.basename(notebook_name).replace(".", ".output.").replace(os.sep, "/")
    )
    notebook_folder = notebook_folder.replace(os.sep, "/")
    full_folder_name = f"sdk/python/{notebook_folder}"
    github_workspace = "${{ github.workspace }}"

    check_yml = f"""
    - name: {validation_name}
      run: |
          python {github_workspace}/.github/test/scripts/{validation_file_name}.py \\
            --file_name {notebook_output_file} \\
            --folder . \\"""

    for param_name, param_value in validation["params"].items():
        if type(param_value) is list:
            check_yml += f"""
            --{param_name} \\"""

            for param_item in param_value:
                param_item_value = param_item.replace("\n", "\\n")
                check_yml += f"""
              \"{param_item_value}\" \\"""
        else:
            check_yml += f"""
            --{param_name} {param_value} \\"""

    check_yml += f"""
      working-directory: {full_folder_name} \\"""

    return check_yml[:-2]
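# get_validation_yml looks for a JSON file mirroring the notebook's relative path under
# .github/test/sdk/ at the repository root (two levels up from sdk/python). A sketch of
# the expected shape, with hypothetical validation names and params; only the
# "validations", "name", and "params" keys are read by the code above:
#
#   {
#     "validations": [
#       {"name": "check output", "params": {"file_names": ["std_log.txt"]}}
#     ]
#   }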


def write_notebook_workflow(
    notebook, name, classification, folder, enable_scheduled_runs, nb_config
):
    is_pipeline_notebook = ("jobs-pipelines" in classification) or (
        "assets-component" in classification
    )
    is_spark_notebook_sample = ("jobs-spark" in classification) or ("_spark_" in name)
    is_featurestore_sample = "featurestore_sample" in classification
    # Duplicate name in working directory during checkout
    # https://github.com/actions/checkout/issues/739
    github_workspace = "${{ github.workspace }}"
    forecast_import = get_forecast_reqs(name, nb_config)
    posix_folder = folder.replace(os.sep, "/")
    posix_notebook = notebook.replace(os.sep, "/")

    # Schedule notebooks at different times to reduce maximum quota usage.
    name_hash = int(hashlib.sha512(name.encode()).hexdigest(), 16)
    schedule_minute = name_hash % 60
    hours_between_runs = 12
    schedule_hour = (name_hash // 60) % hours_between_runs

    validation_yml = get_validation_yml(folder, notebook)
    mlflow_import = get_mlflow_import(notebook, validation_yml)

    workflow_yaml = f"""{READONLY_HEADER}
name: sdk-{classification}-{name}
# This file is created by sdk/python/readme.py.
# Please do not edit directly.
on:\n"""
    if ENABLE_MANUAL_CALLING:
        workflow_yaml += f"""  workflow_dispatch:\n"""
    if enable_scheduled_runs:
        workflow_yaml += f"""  schedule:
    - cron: "{schedule_minute} {schedule_hour}/{hours_between_runs} * * *"\n"""
    workflow_yaml += f"""  pull_request:
    branches:
      - main\n"""
    if BRANCH != "main":
        workflow_yaml += f"""      - {BRANCH}\n"""
        if is_pipeline_notebook:
            workflow_yaml += "      - pipeline/*\n"
    workflow_yaml += f"""    paths:
      - sdk/python/{posix_folder}/**
      - .github/workflows/sdk-{classification}-{name}.yml
      - sdk/python/dev-requirements.txt
      - infra/bootstrapping/**
      - sdk/python/setup.sh\n"""
    if is_featurestore_sample:
        workflow_yaml += f"""      - sdk/python/featurestore_sample/**"""
    workflow_yaml += f"""
permissions:
  id-token: write
concurrency:
  group: {GITHUB_CONCURRENCY_GROUP}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
    - name: check out repo
      uses: actions/checkout@v2
    - name: setup python
      uses: actions/setup-python@v2
      with:
        python-version: "3.10"
    - name: pip install notebook reqs
      run: pip install --no-cache-dir -r sdk/python/dev-requirements.txt{mlflow_import}{forecast_import}
    - name: azure login
      uses: azure/login@v1
      with:
        client-id: ${{{{ secrets.OIDC_AZURE_CLIENT_ID }}}}
        tenant-id: ${{{{ secrets.OIDC_AZURE_TENANT_ID }}}}
        subscription-id: ${{{{ secrets.OIDC_AZURE_SUBSCRIPTION_ID }}}}
    - name: bootstrap resources
      run: |
          echo '{GITHUB_CONCURRENCY_GROUP}';
          bash bootstrap.sh
      working-directory: infra/bootstrapping
      continue-on-error: false
    - name: setup SDK
      run: |
          source "{github_workspace}/infra/bootstrapping/sdk_helpers.sh";
          source "{github_workspace}/infra/bootstrapping/init_environment.sh";
          bash setup.sh
      working-directory: sdk/python
      continue-on-error: true
    - name: validate readme
      run: |
          python check-readme.py "{github_workspace}" "{github_workspace}/sdk/python/{posix_folder}"
      working-directory: infra/bootstrapping
      continue-on-error: false
    - name: setup-cli
      run: |
          source "{github_workspace}/infra/bootstrapping/sdk_helpers.sh";
          source "{github_workspace}/infra/bootstrapping/init_environment.sh";
          bash setup.sh
      working-directory: cli
      continue-on-error: true
    - name: Eagerly cache access tokens for required scopes
      run: |
          # Workaround for azure-cli's lack of support for ID token refresh
          # Taken from: https://github.com/Azure/login/issues/372#issuecomment-2056289617

          # Management
          az account get-access-token --scope https://management.azure.com/.default --output none
          # ML
          az account get-access-token --scope https://ml.azure.com/.default --output none\n"""
    if is_spark_notebook_sample:
        workflow_yaml += get_spark_config_workflow(posix_folder, name)
    if is_featurestore_sample:
        workflow_yaml += get_featurestore_config_workflow(posix_folder, name)
    workflow_yaml += f"""    - name: run {posix_notebook}
      run: |
          source "{github_workspace}/infra/bootstrapping/sdk_helpers.sh";
          source "{github_workspace}/infra/bootstrapping/init_environment.sh";
          bash "{github_workspace}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json";
          bash "{github_workspace}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "{name}.ipynb";
          [ -f "../../.azureml/config" ] && cat "../../.azureml/config";"""

    if name == "debug-online-endpoints-locally-in-visual-studio-code":
        workflow_yaml += f"""
          sed -i -e "s/<ENDPOINT_NAME>/localendpoint/g" {name}.ipynb

          # Create a dummy executable for VSCode
          mkdir -p /tmp/code
          touch /tmp/code/code
          chmod +x /tmp/code/code
          export PATH="/tmp/code:$PATH"\n"""

    papermill_option = ""
    if "endpoints-batch" in classification:
        papermill_option = " --log-output"

    if not ("automl" in folder):
        workflow_yaml += f"""
          papermill -k python {name}.ipynb {name}.output.ipynb{papermill_option}
      working-directory: sdk/python/{posix_folder}"""
    elif "nlp" in folder or "image" in folder:
        # need GPU cluster, so override the compute cluster name to dedicated
        workflow_yaml += f"""
          papermill -k python -p compute_name automl-gpu-cluster {name}.ipynb {name}.output.ipynb
      working-directory: sdk/python/{posix_folder}"""
    else:
        # need CPU cluster, so override the compute cluster name to dedicated
        workflow_yaml += f"""
          papermill -k python -p compute_name automl-cpu-cluster {name}.ipynb {name}.output.ipynb
      working-directory: sdk/python/{posix_folder}"""

    if name == "connections":
        workflow_yaml += """
      env:
        ACR_USERNAME: ${{ secrets.ACR_USERNAME }}
        ACR_PASSWORD: ${{ secrets.ACR_PASSWORD }}
        GIT_PAT: ${{ secrets.GIT_PAT }}
        PYTHON_FEED_SAS: ${{ secrets.PYTHON_FEED_SAS }}"""

    workflow_yaml += validation_yml

    workflow_yaml += f"""
    - name: upload notebook's working folder as an artifact
      if: ${{{{ always() }}}}
      uses: ./.github/actions/upload-artifact
      with:
        name: {name}
        path: sdk/python/{posix_folder}\n"""

    if nb_config.get(section=name, option=COMPUTE_NAMES, fallback=None):
        workflow_yaml += f"""
    - name: Remove the compute if the notebook did not do it properly.
      run: bash "{github_workspace}/infra/bootstrapping/remove_computes.sh" {nb_config.get(section=name, option=COMPUTE_NAMES)}\n"""

    workflow_file = os.path.join(
        "..", "..", ".github", "workflows", f"sdk-{classification}-{name}.yml"
    )
    workflow_before = ""
    if os.path.exists(workflow_file):
        with open(workflow_file, "r") as f:
            workflow_before = f.read()

    if workflow_yaml != workflow_before:
        # write the workflow file only when its content changed
        with open(workflow_file, "w") as f:
            f.write(workflow_yaml)
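# Naming follows the notebook location: for a hypothetical notebook at
# jobs/some-area/some-sample/sample.ipynb, the classification is
# "jobs-some-area-some-sample" and the workflow is written to
# .github/workflows/sdk-jobs-some-area-some-sample-sample.yml.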


def write_readme(notebooks, pipeline_folder=None):
    prefix = "prefix.md"
    suffix = "suffix.md"
    readme_file = "README.md"
    if pipeline_folder:
        prefix = os.path.join(pipeline_folder, prefix)
        suffix = os.path.join(pipeline_folder, suffix)
        readme_file = os.path.join(pipeline_folder, readme_file)

    if BRANCH == "":
        branch = "main"
    else:
        branch = BRANCH

    # read in prefix.md and suffix.md
    with open(prefix, "r") as f:
        prefix = f.read()
    with open(suffix, "r") as f:
        suffix = f.read()

    # define markdown tables
    notebook_table = f"Test Status is for branch - **_{branch}_**\n|Area|Sub-Area|Notebook|Description|Status|\n|--|--|--|--|--|\n"
    for notebook in notebooks:
        # get notebook name
        name = notebook.split(os.sep)[-1].replace(".ipynb", "")
        area = notebook.split(os.sep)[0]
        sub_area = notebook.split(os.sep)[1]
        folder = os.path.dirname(notebook)
        classification = folder.replace(os.sep, "-")

        description = "*no description*"  # default, kept if the notebook cannot be parsed
        try:
            # read in notebook
            with open(notebook, "r", encoding="utf-8") as f:
                data = json.load(f)

            try:
                if data["metadata"]["description"] is not None:
                    description = data["metadata"]["description"]["description"]
            except BaseException:
                pass

        except BaseException:
            print("Could not load", notebook)

        if any(excluded in notebook for excluded in NOT_TESTED_NOTEBOOKS):
            description += " - _This sample is excluded from automated tests_"
        if any(excluded in notebook for excluded in NOT_SCHEDULED_NOTEBOOKS):
            description += " - _This sample is only tested on demand_"

        if pipeline_folder:
            notebook = os.path.relpath(notebook, pipeline_folder)

        # add a row to the notebook table
        notebook_table += (
            write_readme_row(
                branch,
                notebook.replace(os.sep, "/"),
                name,
                classification,
                area,
                sub_area,
                description,
            )
            + "\n"
        )

    print("writing README.md...")
    with open(readme_file, "w") as f:
        f.write(prefix + notebook_table + suffix)
    print("finished writing README.md")


def write_readme_row(
    branch, notebook, name, classification, area, sub_area, description
):
    gh_link = "https://github.com/Azure/azureml-examples/actions/workflows"

    nb_name = f"[{name}]({notebook})"
    status = f"[![{name}]({gh_link}/sdk-{classification}-{name}.yml/badge.svg?branch={branch})]({gh_link}/sdk-{classification}-{name}.yml)"

    row = f"|{area}|{sub_area}|{nb_name}|{description}|{status}|"
    return row
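# Example of a generated row (values are hypothetical):
#   |jobs|pipelines|[sample](jobs/pipelines/sample/sample.ipynb)|*no description*|[![sample](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-pipelines-sample-sample.yml/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/sdk-jobs-pipelines-sample-sample.yml)|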


def modify_notebooks(notebooks):
    print("modifying notebooks...")
    # setup variables
    kernelspec = {
        "display_name": "Python 3.10 - SDK V2",
        "language": "python",
        "name": "python310-sdkv2",
    }

    # for each notebook
    for notebook in notebooks:
        # read in notebook
        with open(notebook, "r", encoding="utf-8") as f:
            data = json.load(f)

        # update metadata
        data["metadata"]["kernelspec"] = kernelspec

        # write notebook
        with open(notebook, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=1, ensure_ascii=False)
            f.write("\n")

    print("finished modifying notebooks...")


def get_spark_config_workflow(folder_name, file_name):
    workflow = f"""    - name: setup spark resources
      run: |
          bash -x jobs/spark/setup_spark.sh jobs/spark/ {folder_name}/{file_name}.ipynb
      working-directory: sdk/python
      continue-on-error: true\n"""

    return workflow


def get_featurestore_config_workflow(folder_name, file_name):
    is_sdk_notebook = "_sdk_" in file_name
    is_cli_notebook = "_cli_" in file_name
    is_vnet_notebook = "_vnet_" in file_name
    workflow = f"""    - name: setup feature-store resources"""
    if is_sdk_notebook:
        workflow += f"""
      run: |
          bash -x automation-test/setup-resources.sh automation-test/{file_name}.ipynb
      working-directory: sdk/python/featurestore_sample
      continue-on-error: true\n"""
    if is_cli_notebook:
        workflow += f"""
      run: |
          bash -x automation-test/setup-resources-cli.sh automation-test/{file_name}.ipynb
      working-directory: sdk/python/featurestore_sample
      continue-on-error: true\n"""
    if is_vnet_notebook:
        workflow += f"""
      run: |
          bash -x automation-test/setup-resources-vnet.sh automation-test/{file_name}.ipynb
      working-directory: sdk/python/featurestore_sample
      continue-on-error: true\n"""

    return workflow


@contextlib.contextmanager
def change_working_dir(path):
    """Context manager for changing the current working directory"""

    saved_path = os.getcwd()
    os.chdir(str(path))
    try:
        yield
    finally:
        os.chdir(saved_path)


# run functions
if __name__ == "__main__":
    # setup argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--check-readme", type=bool, default=False)
    args = parser.parse_args()

    # call main
    main(args)
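# Typical invocation, per READONLY_HEADER: run `python3 readme.py` from sdk/python to
# regenerate the workflow files, the notebook kernelspec metadata, and the README tables.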