ENH: Group AML experiments by PR number (#681)

Ensure that all AML jobs that originate from a PR are submitted
to the same experiment.
- Add capability to hardcode experiment name for AML submission via an
environment variable
- Add a PR job that looks for running AML jobs and cancels them
- Add timeouts for all smoke test jobs
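In effect, the grouping works as sketched below. This is an illustration only: the helper effective_experiment_name and the HIML_EXPERIMENT_NAME variable are both introduced in this commit, and the ref value is an example.

import os
from health_azure.himl import effective_experiment_name

# The PR workflows set HIML_EXPERIMENT_NAME to github.ref, e.g. "refs/pull/123/merge".
os.environ["HIML_EXPERIMENT_NAME"] = "refs/pull/123/merge"
# At submission time the environment variable overrides any explicit experiment name,
# and the ref is sanitized into a valid experiment name, so every job submitted from
# the same PR lands in the same experiment.
print(effective_experiment_name("some_explicit_name"))  # -> "refs_pull_123_merge"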
This commit is contained in:
Anton Schwaighofer, 2022-11-28 14:26:24 +00:00, committed by GitHub
Parent: 7c95a4f471
Commit: e81323f589
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
18 changed files: 278 additions and 67 deletions

.github/actions/cancel_azureml_jobs/action.yml (new file, 50 lines)

@ -0,0 +1,50 @@
name: 'Cancel AzureML jobs'
description: 'Cancel all queued and running AzureML jobs for this PR'
runs:
  using: "composite"
  steps:
    - name: Create AzureML config.json file
      shell: bash
      run: ./create_config.sh
    # Use a cache action to save the full conda environment, so that we don't have to reinstall it every time.
    # Paths are tied to the location of the miniconda installation, and may need adjustment on a different OS.
    - name: Retrieve cached Conda environment
      id: cache-conda
      uses: actions/cache@v3
      with:
        path: /usr/share/miniconda/envs/AzureML_SDK
        key: azureml-conda-${{ hashFiles('.github/actions/cancel_azureml_jobs/azureml-env.yml') }}
    # If the cache action didn't find a cache, then install the conda environment afresh.
    - name: Build Conda environment from scratch
      uses: conda-incubator/setup-miniconda@v2
      if: steps.cache-conda.outputs.cache-hit != 'true'
      with:
        environment-file: .github/actions/cancel_azureml_jobs/azureml-env.yml
        activate-environment: AzureML_SDK
    # Modify the path to point to the new or cached Conda environment.
    # This is effectively also what `conda activate` does.
    - name: Activate environment
      shell: bash
      run: |
        echo "Adding Conda bin folder to path"
        echo "/usr/share/miniconda/envs/AzureML_SDK/bin" >> $GITHUB_PATH
    - name: Conda info
      shell: bash
      run: conda info
    - name: Show active Python path
      shell: bash
      run: which python
    # The AzureML experiment whose jobs are cancelled is read from
    # the environment variable HIML_EXPERIMENT_NAME set in the workflow file.
    - name: Cancel AzureML jobs
      shell: bash
      run: |
        echo "Cancelling all unfinished AzureML jobs for this PR."
        python .github/actions/cancel_azureml_jobs/cancel_aml_jobs.py

.github/actions/cancel_azureml_jobs/azureml-env.yml (new file, 8 lines)

@ -0,0 +1,8 @@
name: AzureML_SDK
channels:
  - defaults
dependencies:
  - pip=20.1.1
  - python=3.7.3
  - pip:
      - azureml-sdk==1.36.0

.github/actions/cancel_azureml_jobs/cancel_aml_jobs.py (new file, 42 lines)

@ -0,0 +1,42 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import os

from azureml._restclient.constants import RunStatus
from azureml.core import Experiment, Run, Workspace
from azureml.core.authentication import ServicePrincipalAuthentication


def cancel_running_and_queued_jobs() -> None:
    print("Authenticating")
    auth = ServicePrincipalAuthentication(
        tenant_id='72f988bf-86f1-41af-91ab-2d7cd011db47',
        service_principal_id=os.environ["HIML_SERVICE_PRINCIPAL_ID"],
        service_principal_password=os.environ["HIML_SERVICE_PRINCIPAL_PASSWORD"])
    print("Getting AML workspace")
    workspace = Workspace.get(
        name="hi-ml",
        auth=auth,
        subscription_id=os.environ["HIML_SUBSCRIPTION_ID"],
        resource_group=os.environ["HIML_RESOURCE_GROUP"]
    )
    # The experiment name is the github.ref of the PR. Slashes are not valid in
    # experiment names, hence replace them with underscores.
    experiment_name = os.environ["HIML_EXPERIMENT_NAME"]
    experiment_name = experiment_name.replace("/", "_")
    print(f"Experiment: {experiment_name}")
    experiment = Experiment(workspace, name=experiment_name)
    print(f"Retrieved experiment {experiment.name}")
    for run in experiment.get_runs(include_children=True, properties={}):
        assert isinstance(run, Run)
        status_suffix = f"'{run.status}' run {run.id} ({run.display_name})"
        if run.status in (RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.FINALIZING, RunStatus.CANCELED,
                          RunStatus.CANCEL_REQUESTED):
            print(f"Skipping {status_suffix}")
        else:
            print(f"Cancelling {status_suffix}")
            run.cancel()


if __name__ == "__main__":
    cancel_running_and_queued_jobs()
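To debug this script locally, the environment it reads could be set up as below. This is a sketch: every value is a placeholder, and in CI the variables are populated from repository secrets and from github.ref.

import os

os.environ["HIML_SERVICE_PRINCIPAL_ID"] = "<service-principal-app-id>"
os.environ["HIML_SERVICE_PRINCIPAL_PASSWORD"] = "<service-principal-secret>"
os.environ["HIML_SUBSCRIPTION_ID"] = "<azure-subscription-id>"
os.environ["HIML_RESOURCE_GROUP"] = "<resource-group-name>"
os.environ["HIML_EXPERIMENT_NAME"] = "refs/pull/123/merge"  # the github.ref of the PR
cancel_running_and_queued_jobs()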

.github/workflows/cpath-pr.yml (28 lines changed)

@ -23,9 +23,20 @@ env:
HIML_WORKSPACE_NAME: ${{ secrets.HIML_WORKSPACE_NAME }}
HIML_SERVICE_PRINCIPAL_ID: ${{ secrets.HIML_SERVICE_PRINCIPAL_ID }}
HIML_SERVICE_PRINCIPAL_PASSWORD: ${{ secrets.HIML_SERVICE_PRINCIPAL_PASSWORD }}
# Set the AML experiment name for all AML jobs submitted during tests. github.ref looks like
# "refs/pull/123/merge" for PR builds.
HIML_EXPERIMENT_NAME: ${{ github.ref }}
jobs:
cancel-azureml:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Cancel previous AzureML runs
uses: ./.github/actions/cancel_azureml_jobs
flake8:
runs-on: ubuntu-20.04
steps:
@ -64,6 +75,7 @@ jobs:
pytest:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
@ -85,10 +97,7 @@ jobs:
if: always()
shell: bash -l {0}
run: |
branch_prefix="refs/heads/"
full_branch_name=$GITHUB_REF
branch_name_without_prefix=${full_branch_name#$branch_prefix}
python hi-ml-azure/run_pytest.py --mark=gpu --cluster=pr-gpu --conda_env=${{ env.folder }}/environment.yml --folder=${{ env.folder }} --coverage_module=${{ env.module_for_coverage_reporting }} --experiment="$branch_name_without_prefix"
python hi-ml-azure/run_pytest.py --mark=gpu --cluster=pr-gpu --conda_env=${{ env.folder }}/environment.yml --folder=${{ env.folder }} --coverage_module=${{ env.module_for_coverage_reporting }}
- name: Upload coverage reports to Codecov
# Coverage should also be uploaded if tests still fail.
@ -100,6 +109,7 @@ jobs:
smoke_test_cucim_slidespandaimagenetmil:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
@ -115,6 +125,7 @@ jobs:
smoke_test_openslide_slidespandaimagenetmil:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
@ -130,6 +141,7 @@ jobs:
smoke_test_tilespandaimagenetmil:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
@ -145,6 +157,7 @@ jobs:
smoke_test_tcgacrckimagenetmil:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
@ -160,6 +173,7 @@ jobs:
smoke_test_tcgacrcksslmil:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
@ -175,6 +189,7 @@ jobs:
smoke_test_crck_simclr:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
@ -190,6 +205,7 @@ jobs:
smoke_test_crck_flexible_finetuning:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
@ -205,6 +221,7 @@ jobs:
smoke_test_crck_loss_analysis:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
@ -220,6 +237,7 @@ jobs:
smoke_test_slides_panda_loss_analysis:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
@ -235,6 +253,7 @@ jobs:
smoke_test_slides_panda_no_ddp_sampler:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:
@ -250,6 +269,7 @@ jobs:
smoke_test_tiles_panda_no_ddp_sampler:
runs-on: ubuntu-20.04
needs: [ cancel-azureml ]
steps:
- uses: actions/checkout@v3
with:

.github/workflows/hi-ml-pr.yml (13 lines changed)

@ -22,11 +22,22 @@ env:
HIML_DIST_ARTIFACT_SUFFIX: '-dist'
HIML_PACKAGE_NAME_ARTIFACT_SUFFIX: '-package_name'
HIML_VERSION_ARTIFACT_SUFFIX: '-latest_version'
# Set the AML experiment name for all AML jobs submitted during tests. github.ref looks like
# "refs/pull/123/merge" for PR builds.
HIML_EXPERIMENT_NAME: ${{ github.ref }}
PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
PYPI_TEST_API_TOKEN: ${{ secrets.PYPI_TEST_API_TOKEN }}
jobs:
cancel-azureml:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Cancel previous AzureML runs
uses: ./.github/actions/cancel_azureml_jobs
flake8:
runs-on: ubuntu-20.04
steps:
@ -144,7 +155,7 @@ jobs:
test-artifact-pkg:
runs-on: ubuntu-20.04
needs: [ build-python ]
needs: [ build-python, cancel-azureml ]
strategy:
matrix:
folder: [ hi-ml, hi-ml-azure ]


@ -35,7 +35,8 @@ from azureml.train.hyperdrive import HyperDriveConfig, GridParameterSampling, Pr
from azureml.dataprep.fuse.daemon import MountContext
from health_azure.amulet import (ENV_AMLT_DATAREFERENCE_DATA, ENV_AMLT_DATAREFERENCE_OUTPUT, is_amulet_job)
from health_azure.utils import (create_python_environment, create_run_recovery_id, find_file_in_parent_to_pythonpath,
from health_azure.utils import (ENV_EXPERIMENT_NAME, create_python_environment, create_run_recovery_id,
find_file_in_parent_to_pythonpath,
is_run_and_child_runs_completed, is_running_in_azure_ml, register_environment,
run_duration_string_to_seconds, to_azure_friendly_string, RUN_CONTEXT, get_workspace,
PathOrString, DEFAULT_ENVIRONMENT_VARIABLES, get_ml_client,
@ -424,6 +425,30 @@ def get_display_name_v2(tags: Optional[Dict[str, Any]] = None) -> str:
return display_name
def effective_experiment_name(experiment_name: Optional[str],
                              entry_script: Optional[PathOrString] = None) -> str:
    """Choose the experiment name to use for the run. If the environment variable HIML_EXPERIMENT_NAME is set,
    use its value. Otherwise, use the argument `experiment_name`, or fall back to a default based on the
    entry point script.

    :param experiment_name: The name of the AzureML experiment in which the run should be submitted.
    :param entry_script: The script that should be run in AzureML.
    :return: The effective experiment name to use, based on the fallback rules above.
    """
    value_from_env = os.environ.get(ENV_EXPERIMENT_NAME, "")
    if value_from_env:
        raw_value = value_from_env
    elif experiment_name:
        raw_value = experiment_name
    elif entry_script is not None:
        raw_value = Path(entry_script).stem
    else:
        raise ValueError("No experiment name and no entry script provided.")
    cleaned_value = to_azure_friendly_string(raw_value)
    assert cleaned_value is not None, "Expecting an actual string"
    return cleaned_value
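As a quick illustration of the fallback order (not part of the diff; it assumes that to_azure_friendly_string replaces characters that are invalid in experiment names, such as "/", with underscores, consistent with the tests further down):

# With HIML_EXPERIMENT_NAME unset:
effective_experiment_name("my_experiment", Path("train.py"))  # -> "my_experiment"
effective_experiment_name("", Path("train.py"))               # -> "train"
# With HIML_EXPERIMENT_NAME="refs/pull/123/merge", both calls return "refs_pull_123_merge".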
def submit_run_v2(workspace: Optional[Workspace],
experiment_name: str,
environment: EnvironmentV2,
@ -852,6 +877,7 @@ def submit_to_azure_if_needed( # type: ignore
amlignore_path = snapshot_root_directory / AML_IGNORE_FILE
lines_to_append = [str(path) for path in (ignored_folders or [])]
with append_to_amlignore(amlignore=amlignore_path, lines_to_append=lines_to_append):
if strictly_aml_v1:
run_config = create_run_configuration(
@ -880,10 +906,8 @@ def submit_to_azure_if_needed( # type: ignore
else:
config_to_submit = script_run_config
effective_experiment_name = experiment_name or Path(script_run_config.script).stem
run = submit_run(workspace=workspace,
experiment_name=effective_experiment_name,
experiment_name=effective_experiment_name(experiment_name, script_run_config.script),
script_run_config=config_to_submit,
tags=tags,
wait_for_completion=wait_for_completion,
@ -898,7 +922,6 @@ def submit_to_azure_if_needed( # type: ignore
if entry_script is None:
entry_script = Path(sys.argv[0])
script_params = script_params or sys.argv[1:]
effective_experiment_name = experiment_name or Path(entry_script).stem
ml_client = get_ml_client(ml_client=ml_client, aml_workspace=workspace)
registered_env = register_environment_v2(environment, ml_client)
@ -908,7 +931,7 @@ def submit_to_azure_if_needed( # type: ignore
run = submit_run_v2(workspace=workspace,
input_datasets_v2=input_datasets_v2,
output_datasets_v2=output_datasets_v2,
experiment_name=effective_experiment_name,
experiment_name=effective_experiment_name(experiment_name, entry_script),
environment=registered_env,
snapshot_root_directory=snapshot_root_directory,
entry_script=entry_script,


@ -76,6 +76,9 @@ ENV_LOCAL_RANK = "LOCAL_RANK"
ENV_RANK = "RANK"
MASTER_PORT_DEFAULT = 6105
# Environment variables that affect job submission, in particular in builds
ENV_EXPERIMENT_NAME = "HIML_EXPERIMENT_NAME"
# Other Azure ML related variables
ENVIRONMENT_VERSION = "1"
FINAL_MODEL_FOLDER = "final_model"


@ -35,17 +35,16 @@ from azure.core.exceptions import ClientAuthenticationError, ResourceNotFoundErr
from azureml.data.azure_storage_datastore import AzureBlobDatastore
import health_azure.utils as util
from health_azure.himl import AML_IGNORE_FILE, append_to_amlignore
from health_azure.himl import AML_IGNORE_FILE, append_to_amlignore, effective_experiment_name
from health_azure.utils import (ENV_MASTER_ADDR, ENV_MASTER_PORT, MASTER_PORT_DEFAULT,
PackageDependency, create_argparser, get_credential)
from testazure.test_himl import RunTarget, render_and_run_test_script
from testazure.utils_testazure import (DEFAULT_IGNORE_FOLDERS, DEFAULT_WORKSPACE, MockRun, change_working_directory,
himl_azure_root, repository_root)
experiment_for_unittests, himl_azure_root, repository_root)
RUN_ID = uuid4().hex
RUN_NUMBER = 42
EXPERIMENT_NAME = "fancy-experiment"
AML_TESTS_EXPERIMENT = "test_experiment"
def oh_no() -> None:
@ -1103,16 +1102,16 @@ def test_get_latest_aml_run_from_experiment(num_runs: int, tags: Dict[str, str],
assert len(aml_runs) == expected_num_returned
def test_get_latest_aml_run_from_experiment_remote(tmp_path: Path) -> None:
def test_get_latest_aml_run_from_experiment_remote() -> None:
"""
Test that a remote run with particular tags can be correctly retrieved, ignoring any more recent
experiments which do not have the correct tags. Note: this test will instantiate 2 new Runs in the
workspace described in your config.json file, under an experiment defined by AML_TESTS_EXPERIMENT
workspace described in your config.json file, under an experiment defined by experiment_for_unittests()
"""
ws = DEFAULT_WORKSPACE.workspace
assert True
experiment = Experiment(ws, AML_TESTS_EXPERIMENT)
experiment = Experiment(ws, experiment_for_unittests())
config = ScriptRunConfig(
source_directory=".",
command=["cd ."], # command that does nothing
@ -1123,6 +1122,7 @@ def test_get_latest_aml_run_from_experiment_remote(tmp_path: Path) -> None:
amlignore=Path("") / AML_IGNORE_FILE,
lines_to_append=DEFAULT_IGNORE_FOLDERS):
first_run = experiment.submit(config)
first_run.display_name = "test_get_latest_aml_run_from_experiment_remote"
tags = {"experiment_type": "great_experiment"}
first_run.set_tags(tags)
first_run.wait_for_completion()
@ -1136,7 +1136,7 @@ def test_get_latest_aml_run_from_experiment_remote(tmp_path: Path) -> None:
second_run.remove_tags(tags)
# Retrieve latest run with given tags (expect first_run to be returned)
retrieved_runs = util.get_latest_aml_runs_from_experiment(AML_TESTS_EXPERIMENT, tags=tags, aml_workspace=ws)
retrieved_runs = util.get_latest_aml_runs_from_experiment(experiment_for_unittests(), tags=tags, aml_workspace=ws)
assert len(retrieved_runs) == 1
assert retrieved_runs[0].id == first_run.id
assert retrieved_runs[0].get_tags() == tags
@ -1258,7 +1258,7 @@ def test_download_file_from_run(tmp_path: Path, dummy_env_vars: Dict[str, str],
def test_download_file_from_run_remote(tmp_path: Path) -> None:
# This test will create a Run in your workspace (using only local compute)
ws = DEFAULT_WORKSPACE.workspace
experiment = Experiment(ws, AML_TESTS_EXPERIMENT)
experiment = Experiment(ws, experiment_for_unittests())
config = ScriptRunConfig(
source_directory=".",
command=["cd ."], # command that does nothing
@ -1268,6 +1268,7 @@ def test_download_file_from_run_remote(tmp_path: Path) -> None:
amlignore=Path("") / AML_IGNORE_FILE,
lines_to_append=DEFAULT_IGNORE_FOLDERS):
run = experiment.submit(config)
run.display_name = "test_download_file_from_run_remote"
file_to_upload = tmp_path / "dummy_file.txt"
file_contents = "Hello world"
@ -1308,7 +1309,7 @@ def test_download_run_file_during_run(tmp_path: Path) -> None:
information about the workspace to use, but pick up the current workspace.
"""
# Create a run that contains a simple txt file
experiment_name = "himl-tests"
experiment_name = effective_experiment_name("himl-tests")
run_to_download_from = util.create_aml_run_object(experiment_name=experiment_name,
workspace=DEFAULT_WORKSPACE.workspace)
file_contents = "Hello World!"
@ -1582,7 +1583,7 @@ def test_checkpoint_download_remote(tmp_path: Path) -> None:
prefix = "outputs/checkpoints/"
ws = DEFAULT_WORKSPACE.workspace
experiment = Experiment(ws, AML_TESTS_EXPERIMENT)
experiment = Experiment(ws, experiment_for_unittests())
config = ScriptRunConfig(
source_directory=".",
command=["cd ."], # command that does nothing
@ -1592,6 +1593,7 @@ def test_checkpoint_download_remote(tmp_path: Path) -> None:
amlignore=Path("") / AML_IGNORE_FILE,
lines_to_append=DEFAULT_IGNORE_FOLDERS):
run = experiment.submit(config)
run.display_name = "test_checkpoint_download_remote"
file_contents = "Hello world"
file_name = "" # for pyright
@ -2341,8 +2343,8 @@ def test_create_run() -> None:
"""
Test if we can create an AML run object here in the test suite, write logs and read them back in.
"""
run_name = "foo"
experiment_name = "himl-tests"
run_name = "test_create_run"
experiment_name = effective_experiment_name("himl-tests")
run: Optional[Run] = None
try:
run = util.create_aml_run_object(experiment_name=experiment_name, run_name=run_name,


@ -45,6 +45,7 @@ from health_azure.datasets import (
)
from health_azure.utils import (
DEFAULT_ENVIRONMENT_VARIABLES,
ENV_EXPERIMENT_NAME,
ENVIRONMENT_VERSION,
EXPERIMENT_RUN_SEPARATOR,
WORKSPACE_CONFIG_JSON,
@ -741,6 +742,7 @@ def test_submit_run(mock_workspace: mock.MagicMock,
assert "AzureML completed" not in out
@pytest.mark.fast
def test_submit_run_v2(tmp_path: Path) -> None:
def _mock_sweep(*args: Any, **kwargs: Any) -> MagicMock:
assert kwargs.get("compute") == dummy_compute_target
@ -864,7 +866,7 @@ def test_submit_run_v2(tmp_path: Path) -> None:
# 'command' should be called with the same args
print(mock_command.call)
# command should be called once when initialising the command job and once when updating the param sampling
mock_command.call_count == 2
assert mock_command.call_count == 2
mock_command.assert_any_call(
code=str(dummy_root_directory),
@ -1657,6 +1659,7 @@ def test_extract_v2_inputs_outputs_from_args() -> None:
assert len(output_datasets) == 0
@pytest.mark.fast
def test_get_display_name_v2() -> None:
dummy_display_name = "job display name"
expected_display_name = "job-display-name"
@ -1676,3 +1679,24 @@ def test_get_display_name_v2() -> None:
# if no tags provided, display name should be empty
display_name = himl.get_display_name_v2()
assert display_name == ""
@pytest.mark.fast
def test_experiment_name() -> None:
    """Test the logic for choosing experiment names: the environment variable HIML_EXPERIMENT_NAME takes
    precedence, then an explicitly given experiment name, then the stem of the entry script."""
    # When the test suite runs on GitHub, the environment variable "HIML_EXPERIMENT_NAME" will be set.
    # Remove it to test the default behaviour.
    with mock.patch.dict(os.environ):
        os.environ.pop(ENV_EXPERIMENT_NAME, None)
        assert himl.effective_experiment_name("explicit", Path()) == "explicit"
        assert himl.effective_experiment_name("", Path("from_script.py")) == "from_script"
        # Provide experiment names with special characters here that should be filtered out
        with mock.patch("health_azure.himl.to_azure_friendly_string", return_value="mock_return"):
            assert himl.effective_experiment_name("explicit", Path()) == "mock_return"
        assert himl.effective_experiment_name("explicit/", Path()) == "explicit_"
    with mock.patch.dict(os.environ, {ENV_EXPERIMENT_NAME: "name/from/env"}):
        assert himl.effective_experiment_name("explicit", Path()) == "name_from_env"
    with mock.patch.dict(os.environ, {ENV_EXPERIMENT_NAME: "name_from_env"}):
        assert himl.effective_experiment_name("explicit", Path()) == "name_from_env"
        assert himl.effective_experiment_name("", Path("from_script.py")) == "name_from_env"


@ -10,7 +10,8 @@ from contextlib import contextmanager
from pathlib import Path
from typing import Dict, Generator, Optional
from health_azure.utils import (UnitTestWorkspaceWrapper, WORKSPACE_CONFIG_JSON)
from health_azure.utils import (ENV_EXPERIMENT_NAME, WORKSPACE_CONFIG_JSON, UnitTestWorkspaceWrapper,
to_azure_friendly_string)
DEFAULT_DATASTORE = "himldatasets"
FALLBACK_SINGLE_RUN = "refs_pull_545_merge:refs_pull_545_merge_1626538212_d2b07afd"
@ -46,6 +47,15 @@ def repository_root() -> Path:
return himl_azure_root().parent
def experiment_for_unittests() -> str:
    """
    Gets the name of the experiment to use for tests, honouring the HIML_EXPERIMENT_NAME environment variable.
    """
    experiment_name = to_azure_friendly_string(os.getenv(ENV_EXPERIMENT_NAME, "unittests"))
    assert experiment_name is not None
    return experiment_name
@contextmanager
def change_working_directory(path_or_str: Path) -> Generator:
"""


@ -160,11 +160,11 @@ define DEFAULT_SMOKE_TEST_ARGS
endef
define AML_ONE_DEVICE_ARGS
--cluster=testing-nc6 --wait_for_completion --num_nodes=1 --max_num_gpus=1 --strictly_aml_v1=True
--cluster=testing-nc6 --wait_for_completion --num_nodes=1 --max_num_gpus=1 --strictly_aml_v1=True --max_run_duration=1h
endef
define AML_MULTIPLE_DEVICE_ARGS
--cluster=dedicated-nc24s-v2 --wait_for_completion --strictly_aml_v1=True
--cluster=dedicated-nc24s-v2 --wait_for_completion --strictly_aml_v1=True --max_run_duration=1h
endef
define DEEPSMILEDEFAULT_SMOKE_TEST_ARGS
@ -206,7 +206,7 @@ smoke_test_cucim_slidespandaimagenetmil_local:
# Once running in AML the following test takes around 6 minutes
smoke_test_cucim_slidespandaimagenetmil_aml:
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_cucim_slidespandaimagenetmil_aml;}
smoke_test_openslide_slidespandaimagenetmil_local:
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
@ -214,7 +214,7 @@ smoke_test_openslide_slidespandaimagenetmil_local:
smoke_test_openslide_slidespandaimagenetmil_aml:
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${OPENSLIDE_BACKEND_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${OPENSLIDE_BACKEND_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_openslide_slidespandaimagenetmil_aml;}
# The following test takes about 6 minutes
smoke_test_tilespandaimagenetmil_local:
@ -223,7 +223,7 @@ smoke_test_tilespandaimagenetmil_local:
smoke_test_tilespandaimagenetmil_aml:
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDATILES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_tilespandaimagenetmil_aml;}
# The following test takes about 30 seconds
smoke_test_tcgacrcksslmil_local:
@ -232,7 +232,7 @@ smoke_test_tcgacrcksslmil_local:
smoke_test_tcgacrcksslmil_aml:
{ ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_tcgacrcksslmil_aml;}
smoke_test_tcgacrckimagenetmil_local:
{ ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKIMANEGETMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
@ -240,7 +240,7 @@ smoke_test_tcgacrckimagenetmil_local:
smoke_test_tcgacrckimagenetmil_aml:
{ ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKIMANEGETMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_tcgacrckimagenetmil_aml;}
# The following test takes about 3 minutes
smoke_test_crck_simclr_local:
@ -249,7 +249,7 @@ smoke_test_crck_simclr_local:
smoke_test_crck_simclr_aml:
{ ${BASE_CPATH_RUNNER_COMMAND} ${CRCKSIMCLR_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
${CRCKSIMCLR_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
${CRCKSIMCLR_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_crck_simclr_aml;}
smoke_test_crck_flexible_finetuning_local:
{ ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
@ -257,7 +257,7 @@ smoke_test_crck_flexible_finetuning_local:
smoke_test_crck_flexible_finetuning_aml:
{ ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} ${CRCK_TUNING_ARGS};}
${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} ${CRCK_TUNING_ARGS} --tag smoke_test_crck_flexible_finetuning_aml;}
smoke_test_crck_loss_analysis_local:
{ ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
@ -265,7 +265,7 @@ smoke_test_crck_loss_analysis_local:
smoke_test_crck_loss_analysis_aml:
{ ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} ${LOSS_ANALYSIS_ARGS};}
${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} ${LOSS_ANALYSIS_ARGS} --tag smoke_test_crck_loss_analysis_aml;}
smoke_test_slides_panda_loss_analysis_local:
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
@ -273,7 +273,7 @@ smoke_test_slides_panda_loss_analysis_local:
smoke_test_slides_panda_loss_analysis_aml:
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${LOSS_ANALYSIS_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${LOSS_ANALYSIS_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_slides_panda_loss_analysis_aml;}
smoke_test_slides_panda_no_ddp_sampler_local:
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
@ -281,7 +281,7 @@ smoke_test_slides_panda_no_ddp_sampler_local:
smoke_test_slides_panda_no_ddp_sampler_aml:
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${DDP_SAMPLER_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${DDP_SAMPLER_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_slides_panda_no_ddp_sampler_aml;}
smoke_test_tiles_panda_no_ddp_sampler_local:
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDATILES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
@ -289,7 +289,7 @@ smoke_test_tiles_panda_no_ddp_sampler_local:
smoke_test_tiles_panda_no_ddp_sampler_aml:
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDATILES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${DDP_SAMPLER_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${DDP_SAMPLER_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_tiles_panda_no_ddp_sampler_aml;}
smoke_tests_local: smoke_test_cucim_slidespandaimagenetmil_local smoke_test_openslide_slidespandaimagenetmil_local smoke_test_tilespandaimagenetmil_local smoke_test_tcgacrcksslmil_local smoke_test_crck_simclr_local smoke_test_crck_flexible_finetuning_local smoke_test_tcgacrckimagenetmil_local smoke_test_crck_loss_analysis_local smoke_test_slides_panda_loss_analysis_local smoke_test_slides_panda_no_ddp_sampler_local smoke_test_tiles_panda_no_ddp_sampler_local


@ -50,3 +50,7 @@ class ExperimentConfig(param.Parameterized):
param.ClassSelector(class_=Path, default=None, allow_None=True,
doc="The path to the AzureML workspace configuration file. If not specified, the "
"configuration file in the current folder or one of its parents will be used.")
    max_run_duration: str = param.String(
        default="", doc="The maximum runtime that is allowed for this job in AzureML. This is given as a "
                        "floating point number with a string suffix s, m, h, d for seconds, minutes, hours, days. "
                        "Examples: '3.5h', '2d'")
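For illustration, a duration string in this format could be converted to seconds as sketched below. This is an assumption about the format only; the actual conversion in hi-ml is done by run_duration_string_to_seconds, whose implementation may differ.

def duration_to_seconds(duration: str) -> float:
    # "3.5h" -> 12600.0: the suffix selects the multiplier, the prefix is parsed as a float.
    multipliers = {"s": 1, "m": 60, "h": 60 * 60, "d": 24 * 60 * 60}
    return float(duration[:-1]) * multipliers[duration[-1]]

assert duration_to_seconds("3.5h") == 12600.0
assert duration_to_seconds("2d") == 172800.0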


@ -281,6 +281,7 @@ class Runner:
input_datasets=input_datasets, # type: ignore
num_nodes=self.experiment_config.num_nodes,
wait_for_completion=self.experiment_config.wait_for_completion,
max_run_duration=self.experiment_config.max_run_duration,
ignored_folders=[],
submit_to_azureml=bool(self.experiment_config.cluster),
docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,


@ -28,7 +28,7 @@ from health_ml.utils.regression_test_utils import (
compare_folder_contents,
compare_folders_and_run_outputs,
)
from testazure.utils_testazure import DEFAULT_WORKSPACE
from testazure.utils_testazure import DEFAULT_WORKSPACE, experiment_for_unittests
def create_folder_and_write_text(file: Path, text: str) -> None:
@ -242,7 +242,9 @@ def upload_to_run_and_compare(regression_test_subfolder: str, run_to_mock: str,
file_contents = "some file contents"
file_name = "contents.txt"
regression_test_folder = tmp_path / "expected"
run = create_aml_run_object(workspace=DEFAULT_WORKSPACE.workspace, experiment_name="test_regression_test_utils")
run = create_aml_run_object(workspace=DEFAULT_WORKSPACE.workspace,
experiment_name=experiment_for_unittests(),
run_name="upload_to_run_and_compare")
# Upload a single file to the newly created run. When comparing the run output files,
# and seeing this in the set of files that are expected to exist on the run, this should pass.
file1 = tmp_path / file_name


@ -2,12 +2,14 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import os
import shutil
import pytest
from pathlib import Path
from typing import Generator
from unittest.mock import DEFAULT, MagicMock, Mock, patch
from unittest import mock
from unittest.mock import MagicMock, Mock, patch
from _pytest.logging import LogCaptureFixture
from pytorch_lightning import LightningModule
@ -21,8 +23,8 @@ from health_ml.run_ml import MLRunner, get_mlflow_run_id_from_previous_loggers
from health_ml.utils.checkpoint_utils import CheckpointParser
from health_ml.utils.common_utils import is_gpu_available
from health_ml.utils.lightning_loggers import HimlMLFlowLogger, StoringLogger
from health_azure.utils import is_global_rank_zero
from testazure.utils_testazure import DEFAULT_WORKSPACE
from health_azure.utils import ENV_EXPERIMENT_NAME, is_global_rank_zero
from testazure.utils_testazure import DEFAULT_WORKSPACE, experiment_for_unittests
from testhiml.utils.fixed_paths_for_tests import mock_run_id
no_gpu = not is_gpu_available()
@ -299,11 +301,11 @@ def test_run(run_inference_only: bool, run_extra_val_epoch: bool, ml_runner_with
with patch("health_ml.run_ml.create_lightning_trainer") as mock_create_trainer:
with patch.multiple(
ml_runner_with_container,
checkpoint_handler=DEFAULT,
load_model_checkpoint=DEFAULT,
run_training=DEFAULT,
run_validation=DEFAULT,
run_inference=DEFAULT,
checkpoint_handler=mock.DEFAULT,
load_model_checkpoint=mock.DEFAULT,
run_training=mock.DEFAULT,
run_validation=mock.DEFAULT,
run_inference=mock.DEFAULT,
) as mocks:
mock_create_trainer.return_value = MagicMock(), MagicMock()
ml_runner_with_container.run()
@ -327,9 +329,9 @@ def test_run_inference_only(ml_runner_with_run_id: MLRunner) -> None:
with patch("health_ml.run_ml.create_lightning_trainer") as mock_create_trainer:
with patch.multiple(
ml_runner_with_run_id,
run_training=DEFAULT,
run_validation=DEFAULT,
validate_model_weights=DEFAULT
run_training=mock.DEFAULT,
run_validation=mock.DEFAULT,
validate_model_weights=mock.DEFAULT
) as mocks:
mock_trainer = MagicMock()
mock_create_trainer.return_value = mock_trainer, MagicMock()
@ -349,7 +351,7 @@ def test_resume_training_from_run_id(run_extra_val_epoch: bool, ml_runner_with_r
ml_runner_with_run_id.container.max_num_gpus = 0
ml_runner_with_run_id.container.max_epochs += 10
assert ml_runner_with_run_id.checkpoint_handler.trained_weights_path
with patch.multiple(ml_runner_with_run_id, run_validation=DEFAULT, run_inference=DEFAULT) as mocks:
with patch.multiple(ml_runner_with_run_id, run_validation=mock.DEFAULT, run_inference=mock.DEFAULT) as mocks:
ml_runner_with_run_id.run()
assert mocks["run_validation"].called == run_extra_val_epoch
mocks["run_inference"].assert_called_once()
@ -381,8 +383,7 @@ def test_log_on_vm(log_from_vm: bool) -> None:
container = HelloWorld()
container.max_epochs = 1
# Mimic an experiment name given on the command line.
experiment_name = "unittest"
container.experiment = experiment_name
container.experiment = experiment_for_unittests()
# The tag is used to identify the run, similar to the behaviour when submitting a run to AzureML.
tag = f"test_log_on_vm [{log_from_vm}]"
container.tag = tag
@ -404,18 +405,23 @@ def test_log_on_vm(log_from_vm: bool) -> None:
assert isinstance(logger, HimlMLFlowLogger)
@pytest.mark.fast
def test_experiment_name() -> None:
"""Test that the experiment name is set correctly, choosing either the experiment name given on the commandline
or the model name"""
container = HelloWorld()
# No experiment name given on the commandline: use the model name
model_name = "some_model"
container._model_name = model_name
assert container.effective_experiment_name == model_name
# Experiment name given on the commandline: use the experiment name
experiment_name = "unittest"
container.experiment = experiment_name
assert container.effective_experiment_name == experiment_name
# When the test suite runs on GitHub, the environment variable "HIML_EXPERIMENT_NAME" will be set.
# Remove it to test the default behaviour.
with patch.dict(os.environ):
os.environ.pop(ENV_EXPERIMENT_NAME, None)
container = HelloWorld()
# No experiment name given on the commandline: use the model name
model_name = "some_model"
container._model_name = model_name
assert container.effective_experiment_name == model_name
# Experiment name given on the commandline: use the experiment name
experiment_name = experiment_for_unittests()
container.experiment = experiment_name
assert container.effective_experiment_name == experiment_name
def test_get_mlflow_run_id_from_previous_loggers() -> None:


@ -12,6 +12,7 @@ from torchvision.transforms import Compose, Resize, CenterCrop
from azureml.core import Run
from health_azure import object_to_yaml, create_aml_run_object
from health_azure.himl import effective_experiment_name
from health_azure.utils import is_running_in_azure_ml
from health_ml.utils.serialization import ModelInfo
from testazure.utils_testazure import DEFAULT_WORKSPACE
@ -107,7 +108,7 @@ def test_serialization_roundtrip() -> None:
def test_get_metadata() -> None:
"""Test if model metadata is read correctly from the AzureML run."""
run_name = "foo"
experiment_name = "himl-tests"
experiment_name = effective_experiment_name("himl-tests")
run: Optional[Run] = None
try:
run = create_aml_run_object(


@ -6,6 +6,7 @@ from pathlib import Path
from typing import Optional
from functools import lru_cache
from health_azure.himl import effective_experiment_name
from health_azure.utils import PathOrString
from health_azure.utils import create_aml_run_object
from testazure.utils_testazure import DEFAULT_WORKSPACE
@ -50,7 +51,7 @@ def mock_run_id(id: int = 0) -> str:
:return: The run id of the created run that contains the checkpoint.
"""
experiment_name = "himl-tests"
experiment_name = effective_experiment_name("himl-tests")
run_to_download_from = create_aml_run_object(experiment_name=experiment_name, workspace=DEFAULT_WORKSPACE.workspace)
full_file_path = full_test_data_path(suffix="hello_world_checkpoint.ckpt")


@ -20,13 +20,14 @@ from azureml._restclient.constants import RunStatus
from azureml.core import Run
from health_azure import RUN_CONTEXT, create_aml_run_object
from health_azure.himl import effective_experiment_name
from health_ml.utils import AzureMLLogger, AzureMLProgressBar, log_learning_rate, log_on_epoch
from health_ml.utils.logging import _preprocess_hyperparams
from testhiml.utils_testhiml import DEFAULT_WORKSPACE
def create_unittest_run_object(snapshot_directory: Optional[Path] = None) -> Run:
return create_aml_run_object(experiment_name="himl-tests",
return create_aml_run_object(experiment_name=effective_experiment_name("himl-tests"),
workspace=DEFAULT_WORKSPACE.workspace,
snapshot_directory=snapshot_directory or ".")
@ -295,7 +296,9 @@ def test_azureml_logger_actual_run() -> None:
"""
When running outside of AzureML, a new run should be created.
"""
logger = AzureMLLogger(enable_logging_outside_azure_ml=True, workspace=DEFAULT_WORKSPACE.workspace)
logger = AzureMLLogger(enable_logging_outside_azure_ml=True,
workspace=DEFAULT_WORKSPACE.workspace,
run_name="test_azureml_logger_actual_run")
assert not logger.is_running_in_azure_ml
assert logger.run is not None
assert logger.run != RUN_CONTEXT