diff --git a/.github/actions/cancel_azureml_jobs/action.yml b/.github/actions/cancel_azureml_jobs/action.yml new file mode 100644 index 00000000..392e5e24 --- /dev/null +++ b/.github/actions/cancel_azureml_jobs/action.yml @@ -0,0 +1,50 @@ +name: 'Cancel environment setup' +description: 'Set up environment hi-ml-cpath workflows' + +runs: + using: "composite" + steps: + - name: Create AzureML config.json file + shell: bash + run: ./create_config.sh + + # Use a cache action to save the full conda environment, so that we don't have to reinstall it every time. + # Paths are tied to the location of the miniconda installation, and may need adjustment on a different OS. + - name: Retrieve cached Conda environment + id: cache-conda + uses: actions/cache@v3 + with: + path: /usr/share/miniconda/envs/AzureML_SDK + key: azureml-conda-${{ hashFiles('.github/actions/cancel_azureml_jobs/azureml-env.yml') }} + + # If the cache action didn't find a cache, then install the conda environment afresh. + - name: Build Conda environment from scratch + uses: conda-incubator/setup-miniconda@v2 + if: steps.cache-conda.outputs.cache-hit != 'true' + with: + environment-file: .github/actions/cancel_azureml_jobs/azureml-env.yml + activate-environment: AzureML_SDK + + # Modify the path to point to the new or cached Conda environment. + # This is effectively also what `conda activate` does. + - name: Activate environment + shell: bash + run: | + echo "Adding Conda bin folder to path" + echo "/usr/share/miniconda/envs/AzureML_SDK/bin" >> $GITHUB_PATH + + - name: Conda info + shell: bash + run: conda info + + - name: Show active Python path + shell: bash + run: which python + + # The AzureML experiment where the jobs are deleted is read from + # the environment variable HIML_EXPERIMENT_NAME set in the workflow file. + - name: Cancel AzureML jobs + shell: bash + run: | + echo "Cancelling all unfinished AzureML jobs for this PR." 
+ python .github/actions/cancel_azureml_jobs/cancel_aml_jobs.py diff --git a/.github/actions/cancel_azureml_jobs/azureml-env.yml b/.github/actions/cancel_azureml_jobs/azureml-env.yml new file mode 100644 index 00000000..8f29929a --- /dev/null +++ b/.github/actions/cancel_azureml_jobs/azureml-env.yml @@ -0,0 +1,8 @@ +name: AzureML_SDK +channels: + - defaults +dependencies: + - pip=20.1.1 + - python=3.7.3 + - pip: + - azureml-sdk==1.36.0 diff --git a/.github/actions/cancel_azureml_jobs/cancel_aml_jobs.py b/.github/actions/cancel_azureml_jobs/cancel_aml_jobs.py new file mode 100644 index 00000000..ea5460ec --- /dev/null +++ b/.github/actions/cancel_azureml_jobs/cancel_aml_jobs.py @@ -0,0 +1,42 @@ +# ------------------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. +# ------------------------------------------------------------------------------------------ +import os + +from azureml._restclient.constants import RunStatus +from azureml.core import Experiment, Run, Workspace +from azureml.core.authentication import ServicePrincipalAuthentication + + +def cancel_running_and_queued_jobs() -> None: + print("Authenticating") + auth = ServicePrincipalAuthentication( + tenant_id='72f988bf-86f1-41af-91ab-2d7cd011db47', + service_principal_id=os.environ["HIML_SERVICE_PRINCIPAL_ID"], + service_principal_password=os.environ["HIML_SERVICE_PRINCIPAL_PASSWORD"]) + print("Getting AML workspace") + workspace = Workspace.get( + name="hi-ml", + auth=auth, + subscription_id=os.environ["HIML_SUBSCRIPTION_ID"], + resource_group=os.environ["HIML_RESOURCE_GROUP"] + ) + experiment_name = os.environ["HIML_EXPERIMENT_NAME"] + experiment_name = experiment_name.replace("/", "_") + print(f"Experiment: {experiment_name}") + experiment = Experiment(workspace, name=experiment_name) + print(f"Retrieved experiment 
{experiment.name}") + for run in experiment.get_runs(include_children=True, properties={}): + assert isinstance(run, Run) + status_suffix = f"'{run.status}' run {run.id} ({run.display_name})" + if run.status in (RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.FINALIZING, RunStatus.CANCELED, + RunStatus.CANCEL_REQUESTED): + print(f"Skipping {status_suffix}") + else: + print(f"Cancelling {status_suffix}") + run.cancel() + + +if __name__ == "__main__": + cancel_running_and_queued_jobs() diff --git a/.github/workflows/cpath-pr.yml b/.github/workflows/cpath-pr.yml index 47330b1f..8872ec86 100644 --- a/.github/workflows/cpath-pr.yml +++ b/.github/workflows/cpath-pr.yml @@ -23,9 +23,20 @@ env: HIML_WORKSPACE_NAME: ${{ secrets.HIML_WORKSPACE_NAME }} HIML_SERVICE_PRINCIPAL_ID: ${{ secrets.HIML_SERVICE_PRINCIPAL_ID }} HIML_SERVICE_PRINCIPAL_PASSWORD: ${{ secrets.HIML_SERVICE_PRINCIPAL_PASSWORD }} + # Set the AML experiment name for all AML jobs submitted during tests. Github.ref looks like + # "refs/pull/123/merge" for PR builds. 
+ HIML_EXPERIMENT_NAME: ${{ github.ref }} jobs: + cancel-azureml: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Cancel previous AzureML runs + uses: ./.github/actions/cancel_azureml_jobs + flake8: runs-on: ubuntu-20.04 steps: @@ -64,6 +75,7 @@ jobs: pytest: runs-on: ubuntu-20.04 + needs: [ cancel-azureml ] steps: - uses: actions/checkout@v3 with: @@ -85,10 +97,7 @@ jobs: if: always() shell: bash -l {0} run: | - branch_prefix="refs/heads/" - full_branch_name=$GITHUB_REF - branch_name_without_prefix=${full_branch_name#$branch_prefix} - python hi-ml-azure/run_pytest.py --mark=gpu --cluster=pr-gpu --conda_env=${{ env.folder }}/environment.yml --folder=${{ env.folder }} --coverage_module=${{ env.module_for_coverage_reporting }} --experiment="$branch_name_without_prefix" + python hi-ml-azure/run_pytest.py --mark=gpu --cluster=pr-gpu --conda_env=${{ env.folder }}/environment.yml --folder=${{ env.folder }} --coverage_module=${{ env.module_for_coverage_reporting }} - name: Upload coverage reports to Codecov # Coverage should also be uploaded if tests still fail. 
@@ -100,6 +109,7 @@ jobs: smoke_test_cucim_slidespandaimagenetmil: runs-on: ubuntu-20.04 + needs: [ cancel-azureml ] steps: - uses: actions/checkout@v3 with: @@ -115,6 +125,7 @@ jobs: smoke_test_openslide_slidespandaimagenetmil: runs-on: ubuntu-20.04 + needs: [ cancel-azureml ] steps: - uses: actions/checkout@v3 with: @@ -130,6 +141,7 @@ jobs: smoke_test_tilespandaimagenetmil: runs-on: ubuntu-20.04 + needs: [ cancel-azureml ] steps: - uses: actions/checkout@v3 with: @@ -145,6 +157,7 @@ jobs: smoke_test_tcgacrckimagenetmil: runs-on: ubuntu-20.04 + needs: [ cancel-azureml ] steps: - uses: actions/checkout@v3 with: @@ -160,6 +173,7 @@ jobs: smoke_test_tcgacrcksslmil: runs-on: ubuntu-20.04 + needs: [ cancel-azureml ] steps: - uses: actions/checkout@v3 with: @@ -175,6 +189,7 @@ jobs: smoke_test_crck_simclr: runs-on: ubuntu-20.04 + needs: [ cancel-azureml ] steps: - uses: actions/checkout@v3 with: @@ -190,6 +205,7 @@ jobs: smoke_test_crck_flexible_finetuning: runs-on: ubuntu-20.04 + needs: [ cancel-azureml ] steps: - uses: actions/checkout@v3 with: @@ -205,6 +221,7 @@ jobs: smoke_test_crck_loss_analysis: runs-on: ubuntu-20.04 + needs: [ cancel-azureml ] steps: - uses: actions/checkout@v3 with: @@ -220,6 +237,7 @@ jobs: smoke_test_slides_panda_loss_analysis: runs-on: ubuntu-20.04 + needs: [ cancel-azureml ] steps: - uses: actions/checkout@v3 with: @@ -235,6 +253,7 @@ jobs: smoke_test_slides_panda_no_ddp_sampler: runs-on: ubuntu-20.04 + needs: [ cancel-azureml ] steps: - uses: actions/checkout@v3 with: @@ -250,6 +269,7 @@ jobs: smoke_test_tiles_panda_no_ddp_sampler: runs-on: ubuntu-20.04 + needs: [ cancel-azureml ] steps: - uses: actions/checkout@v3 with: diff --git a/.github/workflows/hi-ml-pr.yml b/.github/workflows/hi-ml-pr.yml index 4a114068..fe7bb210 100644 --- a/.github/workflows/hi-ml-pr.yml +++ b/.github/workflows/hi-ml-pr.yml @@ -22,11 +22,22 @@ env: HIML_DIST_ARTIFACT_SUFFIX: '-dist' HIML_PACKAGE_NAME_ARTIFACT_SUFFIX: '-package_name' HIML_VERSION_ARTIFACT_SUFFIX: 
'-latest_version' + # Set the AML experiment name for all AML jobs submitted during tests. Github.ref looks like + # "refs/pull/123/merge" for PR builds. + HIML_EXPERIMENT_NAME: ${{ github.ref }} PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }} PYPI_TEST_API_TOKEN: ${{ secrets.PYPI_TEST_API_TOKEN }} jobs: + cancel-azureml: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Cancel previous AzureML runs + uses: ./.github/actions/cancel_azureml_jobs + flake8: runs-on: ubuntu-20.04 steps: @@ -144,7 +155,7 @@ jobs: test-artifact-pkg: runs-on: ubuntu-20.04 - needs: [ build-python ] + needs: [ build-python, cancel-azureml ] strategy: matrix: folder: [ hi-ml, hi-ml-azure ] diff --git a/hi-ml-azure/src/health_azure/himl.py b/hi-ml-azure/src/health_azure/himl.py index ed9ebb3e..64c81bfb 100644 --- a/hi-ml-azure/src/health_azure/himl.py +++ b/hi-ml-azure/src/health_azure/himl.py @@ -35,7 +35,8 @@ from azureml.train.hyperdrive import HyperDriveConfig, GridParameterSampling, Pr from azureml.dataprep.fuse.daemon import MountContext from health_azure.amulet import (ENV_AMLT_DATAREFERENCE_DATA, ENV_AMLT_DATAREFERENCE_OUTPUT, is_amulet_job) -from health_azure.utils import (create_python_environment, create_run_recovery_id, find_file_in_parent_to_pythonpath, +from health_azure.utils import (ENV_EXPERIMENT_NAME, create_python_environment, create_run_recovery_id, + find_file_in_parent_to_pythonpath, is_run_and_child_runs_completed, is_running_in_azure_ml, register_environment, run_duration_string_to_seconds, to_azure_friendly_string, RUN_CONTEXT, get_workspace, PathOrString, DEFAULT_ENVIRONMENT_VARIABLES, get_ml_client, @@ -424,6 +425,30 @@ def get_display_name_v2(tags: Optional[Dict[str, Any]] = None) -> str: return display_name +def effective_experiment_name(experiment_name: Optional[str], + entry_script: Optional[PathOrString] = None) -> str: + """Choose the experiment name to use for the run. 
If provided in the environment variable HIML_EXPERIMENT_NAME, + then use that. Otherwise, use the argument `experiment_name`, or fall back to the default based on the + entry point script. + + :param experiment_name: The name of the AzureML experiment in which the run should be submitted. + :param entry_script: The script that should be run in AzureML. + :return: The effective experiment name to use, based on the fallback rules above. + """ + value_from_env = os.environ.get(ENV_EXPERIMENT_NAME, "") + if value_from_env: + raw_value = value_from_env + elif experiment_name: + raw_value = experiment_name + elif entry_script is not None: + raw_value = Path(entry_script).stem + else: + raise ValueError("No experiment name provided, and no entry script provided. ") + cleaned_value = to_azure_friendly_string(raw_value) + assert cleaned_value is not None, "Expecting an actual string" + return cleaned_value + + def submit_run_v2(workspace: Optional[Workspace], experiment_name: str, environment: EnvironmentV2, @@ -852,6 +877,7 @@ def submit_to_azure_if_needed( # type: ignore amlignore_path = snapshot_root_directory / AML_IGNORE_FILE lines_to_append = [str(path) for path in (ignored_folders or [])] + with append_to_amlignore(amlignore=amlignore_path, lines_to_append=lines_to_append): if strictly_aml_v1: run_config = create_run_configuration( @@ -880,10 +906,8 @@ def submit_to_azure_if_needed( # type: ignore else: config_to_submit = script_run_config - effective_experiment_name = experiment_name or Path(script_run_config.script).stem - run = submit_run(workspace=workspace, - experiment_name=effective_experiment_name, + experiment_name=effective_experiment_name(experiment_name, script_run_config.script), script_run_config=config_to_submit, tags=tags, wait_for_completion=wait_for_completion, @@ -898,7 +922,6 @@ def submit_to_azure_if_needed( # type: ignore if entry_script is None: entry_script = Path(sys.argv[0]) script_params = script_params or sys.argv[1:] - 
effective_experiment_name = experiment_name or Path(entry_script).stem ml_client = get_ml_client(ml_client=ml_client, aml_workspace=workspace) registered_env = register_environment_v2(environment, ml_client) @@ -908,7 +931,7 @@ def submit_to_azure_if_needed( # type: ignore run = submit_run_v2(workspace=workspace, input_datasets_v2=input_datasets_v2, output_datasets_v2=output_datasets_v2, - experiment_name=effective_experiment_name, + experiment_name=effective_experiment_name(experiment_name, entry_script), environment=registered_env, snapshot_root_directory=snapshot_root_directory, entry_script=entry_script, diff --git a/hi-ml-azure/src/health_azure/utils.py b/hi-ml-azure/src/health_azure/utils.py index 94492323..87d54ceb 100644 --- a/hi-ml-azure/src/health_azure/utils.py +++ b/hi-ml-azure/src/health_azure/utils.py @@ -76,6 +76,9 @@ ENV_LOCAL_RANK = "LOCAL_RANK" ENV_RANK = "RANK" MASTER_PORT_DEFAULT = 6105 +# Environment variables that affect job submission, in particular in builds +ENV_EXPERIMENT_NAME = "HIML_EXPERIMENT_NAME" + # Other Azure ML related variables ENVIRONMENT_VERSION = "1" FINAL_MODEL_FOLDER = "final_model" diff --git a/hi-ml-azure/testazure/testazure/test_azure_util.py b/hi-ml-azure/testazure/testazure/test_azure_util.py index f83be6d9..40960460 100644 --- a/hi-ml-azure/testazure/testazure/test_azure_util.py +++ b/hi-ml-azure/testazure/testazure/test_azure_util.py @@ -35,17 +35,16 @@ from azure.core.exceptions import ClientAuthenticationError, ResourceNotFoundErr from azureml.data.azure_storage_datastore import AzureBlobDatastore import health_azure.utils as util -from health_azure.himl import AML_IGNORE_FILE, append_to_amlignore +from health_azure.himl import AML_IGNORE_FILE, append_to_amlignore, effective_experiment_name from health_azure.utils import (ENV_MASTER_ADDR, ENV_MASTER_PORT, MASTER_PORT_DEFAULT, PackageDependency, create_argparser, get_credential) from testazure.test_himl import RunTarget, render_and_run_test_script from 
testazure.utils_testazure import (DEFAULT_IGNORE_FOLDERS, DEFAULT_WORKSPACE, MockRun, change_working_directory, -                                          himl_azure_root, repository_root) +                                          experiment_for_unittests, himl_azure_root, repository_root) RUN_ID = uuid4().hex RUN_NUMBER = 42 EXPERIMENT_NAME = "fancy-experiment" -AML_TESTS_EXPERIMENT = "test_experiment" def oh_no() -> None: @@ -1103,16 +1102,16 @@ def test_get_latest_aml_run_from_experiment(num_runs: int, tags: Dict[str, str], assert len(aml_runs) == expected_num_returned -def test_get_latest_aml_run_from_experiment_remote(tmp_path: Path) -> None: +def test_get_latest_aml_run_from_experiment_remote() -> None: """ Test that a remote run with particular tags can be correctly retrieved, ignoring any more recent experiments which do not have the correct tags. Note: this test will instantiate 2 new Runs in the - workspace described in your config.json file, under an experiment defined by AML_TESTS_EXPERIMENT + workspace described in your config.json file, under an experiment defined by experiment_for_unittests() """ ws = DEFAULT_WORKSPACE.workspace assert True - experiment = Experiment(ws, AML_TESTS_EXPERIMENT) + experiment = Experiment(ws, experiment_for_unittests()) config = ScriptRunConfig( source_directory=".", command=["cd ."], # command that does nothing @@ -1123,6 +1122,7 @@ def test_get_latest_aml_run_from_experiment_remote(tmp_path: Path) -> None: amlignore=Path("") / AML_IGNORE_FILE, lines_to_append=DEFAULT_IGNORE_FOLDERS): first_run = experiment.submit(config) + first_run.display_name = "test_get_latest_aml_run_from_experiment_remote" tags = {"experiment_type": "great_experiment"} first_run.set_tags(tags) first_run.wait_for_completion() @@ -1136,7 +1136,7 @@ def test_get_latest_aml_run_from_experiment_remote(tmp_path: Path) -> None: second_run.remove_tags(tags) # Retrieve latest run with given tags (expect first_run to be returned) - retrieved_runs = util.get_latest_aml_runs_from_experiment(AML_TESTS_EXPERIMENT, tags=tags, 
aml_workspace=ws) + retrieved_runs = util.get_latest_aml_runs_from_experiment(experiment_for_unittests(), tags=tags, aml_workspace=ws) assert len(retrieved_runs) == 1 assert retrieved_runs[0].id == first_run.id assert retrieved_runs[0].get_tags() == tags @@ -1258,7 +1258,7 @@ def test_download_file_from_run(tmp_path: Path, dummy_env_vars: Dict[str, str], def test_download_file_from_run_remote(tmp_path: Path) -> None: # This test will create a Run in your workspace (using only local compute) ws = DEFAULT_WORKSPACE.workspace - experiment = Experiment(ws, AML_TESTS_EXPERIMENT) + experiment = Experiment(ws, experiment_for_unittests()) config = ScriptRunConfig( source_directory=".", command=["cd ."], # command that does nothing @@ -1268,6 +1268,7 @@ def test_download_file_from_run_remote(tmp_path: Path) -> None: amlignore=Path("") / AML_IGNORE_FILE, lines_to_append=DEFAULT_IGNORE_FOLDERS): run = experiment.submit(config) + run.display_name = "test_download_file_from_run_remote" file_to_upload = tmp_path / "dummy_file.txt" file_contents = "Hello world" @@ -1308,7 +1309,7 @@ def test_download_run_file_during_run(tmp_path: Path) -> None: information about the workspace to use, but pick up the current workspace. """ # Create a run that contains a simple txt file - experiment_name = "himl-tests" + experiment_name = effective_experiment_name("himl-tests") run_to_download_from = util.create_aml_run_object(experiment_name=experiment_name, workspace=DEFAULT_WORKSPACE.workspace) file_contents = "Hello World!" 
@@ -1582,7 +1583,7 @@ def test_checkpoint_download_remote(tmp_path: Path) -> None: prefix = "outputs/checkpoints/" ws = DEFAULT_WORKSPACE.workspace - experiment = Experiment(ws, AML_TESTS_EXPERIMENT) + experiment = Experiment(ws, experiment_for_unittests()) config = ScriptRunConfig( source_directory=".", command=["cd ."], # command that does nothing @@ -1592,6 +1593,7 @@ def test_checkpoint_download_remote(tmp_path: Path) -> None: amlignore=Path("") / AML_IGNORE_FILE, lines_to_append=DEFAULT_IGNORE_FOLDERS): run = experiment.submit(config) + run.display_name = "test_checkpoint_download_remote" file_contents = "Hello world" file_name = "" # for pyright @@ -2341,8 +2343,8 @@ def test_create_run() -> None: """ Test if we can create an AML run object here in the test suite, write logs and read them back in. """ - run_name = "foo" - experiment_name = "himl-tests" + run_name = "test_create_run" + experiment_name = effective_experiment_name("himl-tests") run: Optional[Run] = None try: run = util.create_aml_run_object(experiment_name=experiment_name, run_name=run_name, diff --git a/hi-ml-azure/testazure/testazure/test_himl.py b/hi-ml-azure/testazure/testazure/test_himl.py index 6d75a2c9..e092784b 100644 --- a/hi-ml-azure/testazure/testazure/test_himl.py +++ b/hi-ml-azure/testazure/testazure/test_himl.py @@ -45,6 +45,7 @@ from health_azure.datasets import ( ) from health_azure.utils import ( DEFAULT_ENVIRONMENT_VARIABLES, + ENV_EXPERIMENT_NAME, ENVIRONMENT_VERSION, EXPERIMENT_RUN_SEPARATOR, WORKSPACE_CONFIG_JSON, @@ -741,6 +742,7 @@ def test_submit_run(mock_workspace: mock.MagicMock, assert "AzureML completed" not in out +@pytest.mark.fast def test_submit_run_v2(tmp_path: Path) -> None: def _mock_sweep(*args: Any, **kwargs: Any) -> MagicMock: assert kwargs.get("compute") == dummy_compute_target @@ -864,7 +866,7 @@ def test_submit_run_v2(tmp_path: Path) -> None: # 'command' should be called with the same args print(mock_command.call) # command should be called once when 
initialising the command job and once when updating the param sampling - mock_command.call_count == 2 + assert mock_command.call_count == 2 mock_command.assert_any_call( code=str(dummy_root_directory), @@ -1657,6 +1659,7 @@ def test_extract_v2_inputs_outputs_from_args() -> None: assert len(output_datasets) == 0 +@pytest.mark.fast def test_get_display_name_v2() -> None: dummy_display_name = "job display name" expected_display_name = "job-display-name" @@ -1676,3 +1679,24 @@ def test_get_display_name_v2() -> None: # if no tags provided, display name should be empty display_name = himl.get_display_name_v2() assert display_name == "" + + +@pytest.mark.fast +def test_experiment_name() -> None: + """Test the logic for choosing experiment names: Explicitly given experiment name should be used if provided, + otherwise fall back to environment variables and the script name.""" + # When the test suite runs on the Github, the environment variable "HIML_EXPERIMENT_NAME" will be set. + # Remove it to test the default behaviour. 
+ with mock.patch.dict(os.environ): + os.environ.pop(ENV_EXPERIMENT_NAME, None) + assert himl.effective_experiment_name("explicit", Path()) == "explicit" + assert himl.effective_experiment_name("", Path("from_script.py")) == "from_script" + # Provide experiment names with special characters here that should be filtered out + with mock.patch("health_azure.himl.to_azure_friendly_string", return_value="mock_return"): + assert himl.effective_experiment_name("explicit", Path()) == "mock_return" + assert himl.effective_experiment_name("explicit/", Path()) == "explicit_" + with mock.patch.dict(os.environ, {ENV_EXPERIMENT_NAME: "name/from/env"}): + assert himl.effective_experiment_name("explicit", Path()) == "name_from_env" + with mock.patch.dict(os.environ, {ENV_EXPERIMENT_NAME: "name_from_env"}): + assert himl.effective_experiment_name("explicit", Path()) == "name_from_env" + assert himl.effective_experiment_name("", Path("from_script.py")) == "name_from_env" diff --git a/hi-ml-azure/testazure/testazure/utils_testazure.py b/hi-ml-azure/testazure/testazure/utils_testazure.py index 3c51d1a3..3eb33f3d 100644 --- a/hi-ml-azure/testazure/testazure/utils_testazure.py +++ b/hi-ml-azure/testazure/testazure/utils_testazure.py @@ -10,7 +10,8 @@ from contextlib import contextmanager from pathlib import Path from typing import Dict, Generator, Optional -from health_azure.utils import (UnitTestWorkspaceWrapper, WORKSPACE_CONFIG_JSON) +from health_azure.utils import (ENV_EXPERIMENT_NAME, WORKSPACE_CONFIG_JSON, UnitTestWorkspaceWrapper, + to_azure_friendly_string) DEFAULT_DATASTORE = "himldatasets" FALLBACK_SINGLE_RUN = "refs_pull_545_merge:refs_pull_545_merge_1626538212_d2b07afd" @@ -46,6 +47,15 @@ def repository_root() -> Path: return himl_azure_root().parent +def experiment_for_unittests() -> str: + """ + Gets the name of the experiment to use for tests. 
+ """ + experiment_name = to_azure_friendly_string(os.getenv(ENV_EXPERIMENT_NAME, "unittests")) + assert experiment_name is not None + return experiment_name + + @contextmanager def change_working_directory(path_or_str: Path) -> Generator: """ diff --git a/hi-ml-cpath/Makefile b/hi-ml-cpath/Makefile index ce612596..0b03a047 100644 --- a/hi-ml-cpath/Makefile +++ b/hi-ml-cpath/Makefile @@ -160,11 +160,11 @@ define DEFAULT_SMOKE_TEST_ARGS endef define AML_ONE_DEVICE_ARGS ---cluster=testing-nc6 --wait_for_completion --num_nodes=1 --max_num_gpus=1 --strictly_aml_v1=True +--cluster=testing-nc6 --wait_for_completion --num_nodes=1 --max_num_gpus=1 --strictly_aml_v1=True --max_run_duration=1h endef define AML_MULTIPLE_DEVICE_ARGS ---cluster=dedicated-nc24s-v2 --wait_for_completion --strictly_aml_v1=True +--cluster=dedicated-nc24s-v2 --wait_for_completion --strictly_aml_v1=True --max_run_duration=1h endef define DEEPSMILEDEFAULT_SMOKE_TEST_ARGS @@ -206,7 +206,7 @@ smoke_test_cucim_slidespandaimagenetmil_local: # Once running in AML the following test takes around 6 minutes smoke_test_cucim_slidespandaimagenetmil_aml: { ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ - ${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};} + ${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_cucim_slidespandaimagenetmil_aml;} smoke_test_openslide_slidespandaimagenetmil_local: { ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ @@ -214,7 +214,7 @@ smoke_test_openslide_slidespandaimagenetmil_local: smoke_test_openslide_slidespandaimagenetmil_aml: { ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ - ${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${OPENSLIDE_BACKEND_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};} + ${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${OPENSLIDE_BACKEND_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_openslide_slidespandaimagenetmil_aml;} # The 
following test takes about 6 minutes smoke_test_tilespandaimagenetmil_local: @@ -223,7 +223,7 @@ smoke_test_tilespandaimagenetmil_local: smoke_test_tilespandaimagenetmil_aml: { ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDATILES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ - ${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};} + ${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_tilespandaimagenetmil_aml;} # The following test takes about 30 seconds smoke_test_tcgacrcksslmil_local: @@ -232,7 +232,7 @@ smoke_test_tcgacrcksslmil_local: smoke_test_tcgacrcksslmil_aml: { ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ - ${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};} + ${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_tcgacrcksslmil_aml;} smoke_test_tcgacrckimagenetmil_local: { ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKIMANEGETMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ @@ -240,7 +240,7 @@ smoke_test_tcgacrckimagenetmil_local: smoke_test_tcgacrckimagenetmil_aml: { ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKIMANEGETMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ - ${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};} + ${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_tcgacrckimagenetmil_aml;} # The following test takes about 3 minutes smoke_test_crck_simclr_local: @@ -249,7 +249,7 @@ smoke_test_crck_simclr_local: smoke_test_crck_simclr_aml: { ${BASE_CPATH_RUNNER_COMMAND} ${CRCKSIMCLR_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ - ${CRCKSIMCLR_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};} + ${CRCKSIMCLR_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_crck_simclr_aml;} smoke_test_crck_flexible_finetuning_local: { ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ @@ -257,7 +257,7 @@ smoke_test_crck_flexible_finetuning_local: smoke_test_crck_flexible_finetuning_aml: { ${BASE_CPATH_RUNNER_COMMAND} 
${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ - ${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} ${CRCK_TUNING_ARGS};} + ${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} ${CRCK_TUNING_ARGS} --tag smoke_test_crck_flexible_finetuning_aml;} smoke_test_crck_loss_analysis_local: { ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ @@ -265,7 +265,7 @@ smoke_test_crck_loss_analysis_local: smoke_test_crck_loss_analysis_aml: { ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ - ${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} ${LOSS_ANALYSIS_ARGS};} + ${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} ${LOSS_ANALYSIS_ARGS} --tag smoke_test_crck_loss_analysis_aml;} smoke_test_slides_panda_loss_analysis_local: { ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ @@ -273,7 +273,7 @@ smoke_test_slides_panda_loss_analysis_local: smoke_test_slides_panda_loss_analysis_aml: { ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ - ${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${LOSS_ANALYSIS_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};} + ${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${LOSS_ANALYSIS_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_slides_panda_loss_analysis_aml;} smoke_test_slides_panda_no_ddp_sampler_local: { ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ @@ -281,7 +281,7 @@ smoke_test_slides_panda_no_ddp_sampler_local: smoke_test_slides_panda_no_ddp_sampler_aml: { ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ - ${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${DDP_SAMPLER_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};} + ${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${DDP_SAMPLER_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_slides_panda_no_ddp_sampler_aml;} smoke_test_tiles_panda_no_ddp_sampler_local: { ${BASE_CPATH_RUNNER_COMMAND} 
${DEEPSMILEPANDATILES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ @@ -289,7 +289,7 @@ smoke_test_tiles_panda_no_ddp_sampler_local: smoke_test_tiles_panda_no_ddp_sampler_aml: { ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDATILES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \ - ${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${DDP_SAMPLER_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};} + ${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${DDP_SAMPLER_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_tiles_panda_no_ddp_sampler_aml;} smoke tests local: smoke_test_cucim_slidespandaimagenetmil_local smoke_test_openslide_slidespandaimagenetmil_local smoke_test_tilespandaimagenetmil_local smoke_test_tcgacrcksslmil_local smoke_test_crck_simclr_local smoke_test_crck_flexible_finetuning_local smoke_test_tcgacrckimagenetmil_local smoke_test_crck_loss_analysis_local smoke_test_slides_panda_loss_analysis_local smoke_test_slides_panda_no_ddp_sampler_local smoke_test_tiles_panda_no_ddp_sampler_local diff --git a/hi-ml/src/health_ml/experiment_config.py b/hi-ml/src/health_ml/experiment_config.py index 1ddc4c77..f3e29d79 100644 --- a/hi-ml/src/health_ml/experiment_config.py +++ b/hi-ml/src/health_ml/experiment_config.py @@ -50,3 +50,7 @@ class ExperimentConfig(param.Parameterized): param.ClassSelector(class_=Path, default=None, allow_None=True, doc="The path to the AzureML workspace configuration file. If not specified, the " "configuration file in the current folder or one of its parents will be used.") + max_run_duration: str = param.String( + default="", doc="The maximum runtime that is allowed for this job in AzureML. This is given as a floating" + "point number with a string suffix s, m, h, d for seconds, minutes, hours, day. 
Examples: '3.5h', '2d'" + ) diff --git a/hi-ml/src/health_ml/runner.py b/hi-ml/src/health_ml/runner.py index b1a2a502..d4d335ea 100755 --- a/hi-ml/src/health_ml/runner.py +++ b/hi-ml/src/health_ml/runner.py @@ -281,6 +281,7 @@ class Runner: input_datasets=input_datasets, # type: ignore num_nodes=self.experiment_config.num_nodes, wait_for_completion=self.experiment_config.wait_for_completion, + max_run_duration=self.experiment_config.max_run_duration, ignored_folders=[], submit_to_azureml=bool(self.experiment_config.cluster), docker_base_image=DEFAULT_DOCKER_BASE_IMAGE, diff --git a/hi-ml/testhiml/testhiml/test_regression_test_utils.py b/hi-ml/testhiml/testhiml/test_regression_test_utils.py index 7bdb9ee9..d834438f 100644 --- a/hi-ml/testhiml/testhiml/test_regression_test_utils.py +++ b/hi-ml/testhiml/testhiml/test_regression_test_utils.py @@ -28,7 +28,7 @@ from health_ml.utils.regression_test_utils import ( compare_folder_contents, compare_folders_and_run_outputs, ) -from testazure.utils_testazure import DEFAULT_WORKSPACE +from testazure.utils_testazure import DEFAULT_WORKSPACE, experiment_for_unittests def create_folder_and_write_text(file: Path, text: str) -> None: @@ -242,7 +242,9 @@ def upload_to_run_and_compare(regression_test_subfolder: str, run_to_mock: str, file_contents = "some file contents" file_name = "contents.txt" regression_test_folder = tmp_path / "expected" - run = create_aml_run_object(workspace=DEFAULT_WORKSPACE.workspace, experiment_name="test_regression_test_utils") + run = create_aml_run_object(workspace=DEFAULT_WORKSPACE.workspace, + experiment_name=experiment_for_unittests(), + run_name="upload_to_run_and_compare") # Upload a single file to the newly created run. When comparing the run output files, # and seeing this in the set of files that are expected to exist on the run, this should pass. 
file1 = tmp_path / file_name diff --git a/hi-ml/testhiml/testhiml/test_run_ml.py b/hi-ml/testhiml/testhiml/test_run_ml.py index 45f81c2e..fea670ec 100644 --- a/hi-ml/testhiml/testhiml/test_run_ml.py +++ b/hi-ml/testhiml/testhiml/test_run_ml.py @@ -2,12 +2,14 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ +import os import shutil import pytest from pathlib import Path from typing import Generator -from unittest.mock import DEFAULT, MagicMock, Mock, patch +from unittest import mock +from unittest.mock import MagicMock, Mock, patch from _pytest.logging import LogCaptureFixture from pytorch_lightning import LightningModule @@ -21,8 +23,8 @@ from health_ml.run_ml import MLRunner, get_mlflow_run_id_from_previous_loggers from health_ml.utils.checkpoint_utils import CheckpointParser from health_ml.utils.common_utils import is_gpu_available from health_ml.utils.lightning_loggers import HimlMLFlowLogger, StoringLogger -from health_azure.utils import is_global_rank_zero -from testazure.utils_testazure import DEFAULT_WORKSPACE +from health_azure.utils import ENV_EXPERIMENT_NAME, is_global_rank_zero +from testazure.utils_testazure import DEFAULT_WORKSPACE, experiment_for_unittests from testhiml.utils.fixed_paths_for_tests import mock_run_id no_gpu = not is_gpu_available() @@ -299,11 +301,11 @@ def test_run(run_inference_only: bool, run_extra_val_epoch: bool, ml_runner_with with patch("health_ml.run_ml.create_lightning_trainer") as mock_create_trainer: with patch.multiple( ml_runner_with_container, - checkpoint_handler=DEFAULT, - load_model_checkpoint=DEFAULT, - run_training=DEFAULT, - run_validation=DEFAULT, - run_inference=DEFAULT, + checkpoint_handler=mock.DEFAULT, + load_model_checkpoint=mock.DEFAULT, + run_training=mock.DEFAULT, + run_validation=mock.DEFAULT, + 
run_inference=mock.DEFAULT, ) as mocks: mock_create_trainer.return_value = MagicMock(), MagicMock() ml_runner_with_container.run() @@ -327,9 +329,9 @@ def test_run_inference_only(ml_runner_with_run_id: MLRunner) -> None: with patch("health_ml.run_ml.create_lightning_trainer") as mock_create_trainer: with patch.multiple( ml_runner_with_run_id, - run_training=DEFAULT, - run_validation=DEFAULT, - validate_model_weights=DEFAULT + run_training=mock.DEFAULT, + run_validation=mock.DEFAULT, + validate_model_weights=mock.DEFAULT ) as mocks: mock_trainer = MagicMock() mock_create_trainer.return_value = mock_trainer, MagicMock() @@ -349,7 +351,7 @@ def test_resume_training_from_run_id(run_extra_val_epoch: bool, ml_runner_with_r ml_runner_with_run_id.container.max_num_gpus = 0 ml_runner_with_run_id.container.max_epochs += 10 assert ml_runner_with_run_id.checkpoint_handler.trained_weights_path - with patch.multiple(ml_runner_with_run_id, run_validation=DEFAULT, run_inference=DEFAULT) as mocks: + with patch.multiple(ml_runner_with_run_id, run_validation=mock.DEFAULT, run_inference=mock.DEFAULT) as mocks: ml_runner_with_run_id.run() assert mocks["run_validation"].called == run_extra_val_epoch mocks["run_inference"].assert_called_once() @@ -381,8 +383,7 @@ def test_log_on_vm(log_from_vm: bool) -> None: container = HelloWorld() container.max_epochs = 1 # Mimic an experiment name given on the command line. - experiment_name = "unittest" - container.experiment = experiment_name + container.experiment = experiment_for_unittests() # The tag is used to identify the run, similar to the behaviour when submitting a run to AzureML. 
tag = f"test_log_on_vm [{log_from_vm}]" container.tag = tag @@ -404,18 +405,23 @@ def test_log_on_vm(log_from_vm: bool) -> None: assert isinstance(logger, HimlMLFlowLogger) +@pytest.mark.fast def test_experiment_name() -> None: """Test that the experiment name is set correctly, choosing either the experiment name given on the commandline or the model name""" - container = HelloWorld() - # No experiment name given on the commandline: use the model name - model_name = "some_model" - container._model_name = model_name - assert container.effective_experiment_name == model_name - # Experiment name given on the commandline: use the experiment name - experiment_name = "unittest" - container.experiment = experiment_name - assert container.effective_experiment_name == experiment_name + # When the test suite runs on the Github, the environment variable "HIML_EXPERIMENT_NAME" will be set. + # Remove it to test the default behaviour. + with patch.dict(os.environ): + os.environ.pop(ENV_EXPERIMENT_NAME, None) + container = HelloWorld() + # No experiment name given on the commandline: use the model name + model_name = "some_model" + container._model_name = model_name + assert container.effective_experiment_name == model_name + # Experiment name given on the commandline: use the experiment name + experiment_name = experiment_for_unittests() + container.experiment = experiment_name + assert container.effective_experiment_name == experiment_name def test_get_mlflow_run_id_from_previous_loggers() -> None: diff --git a/hi-ml/testhiml/testhiml/test_serialization.py b/hi-ml/testhiml/testhiml/test_serialization.py index 4d5442f0..03f31d38 100644 --- a/hi-ml/testhiml/testhiml/test_serialization.py +++ b/hi-ml/testhiml/testhiml/test_serialization.py @@ -12,6 +12,7 @@ from torchvision.transforms import Compose, Resize, CenterCrop from azureml.core import Run from health_azure import object_to_yaml, create_aml_run_object +from health_azure.himl import effective_experiment_name from 
health_azure.utils import is_running_in_azure_ml from health_ml.utils.serialization import ModelInfo from testazure.utils_testazure import DEFAULT_WORKSPACE @@ -107,7 +108,7 @@ def test_serialization_roundtrip() -> None: def test_get_metadata() -> None: """Test if model metadata is read correctly from the AzureML run.""" run_name = "foo" - experiment_name = "himl-tests" + experiment_name = effective_experiment_name("himl-tests") run: Optional[Run] = None try: run = create_aml_run_object( diff --git a/hi-ml/testhiml/testhiml/utils/fixed_paths_for_tests.py b/hi-ml/testhiml/testhiml/utils/fixed_paths_for_tests.py index ed828b22..7547479f 100644 --- a/hi-ml/testhiml/testhiml/utils/fixed_paths_for_tests.py +++ b/hi-ml/testhiml/testhiml/utils/fixed_paths_for_tests.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Optional from functools import lru_cache +from health_azure.himl import effective_experiment_name from health_azure.utils import PathOrString from health_azure.utils import create_aml_run_object from testazure.utils_testazure import DEFAULT_WORKSPACE @@ -50,7 +51,7 @@ def mock_run_id(id: int = 0) -> str: :return: The run id of the created run that contains the checkpoint. 
""" - experiment_name = "himl-tests" + experiment_name = effective_experiment_name("himl-tests") run_to_download_from = create_aml_run_object(experiment_name=experiment_name, workspace=DEFAULT_WORKSPACE.workspace) full_file_path = full_test_data_path(suffix="hello_world_checkpoint.ckpt") diff --git a/hi-ml/testhiml/testhiml/utils/test_logging.py b/hi-ml/testhiml/testhiml/utils/test_logging.py index 2c64fe4b..d5892e67 100644 --- a/hi-ml/testhiml/testhiml/utils/test_logging.py +++ b/hi-ml/testhiml/testhiml/utils/test_logging.py @@ -20,13 +20,14 @@ from azureml._restclient.constants import RunStatus from azureml.core import Run from health_azure import RUN_CONTEXT, create_aml_run_object +from health_azure.himl import effective_experiment_name from health_ml.utils import AzureMLLogger, AzureMLProgressBar, log_learning_rate, log_on_epoch from health_ml.utils.logging import _preprocess_hyperparams from testhiml.utils_testhiml import DEFAULT_WORKSPACE def create_unittest_run_object(snapshot_directory: Optional[Path] = None) -> Run: - return create_aml_run_object(experiment_name="himl-tests", + return create_aml_run_object(experiment_name=effective_experiment_name("himl-tests"), workspace=DEFAULT_WORKSPACE.workspace, snapshot_directory=snapshot_directory or ".") @@ -295,7 +296,9 @@ def test_azureml_logger_actual_run() -> None: """ When running outside of AzureML, a new run should be created. """ - logger = AzureMLLogger(enable_logging_outside_azure_ml=True, workspace=DEFAULT_WORKSPACE.workspace) + logger = AzureMLLogger(enable_logging_outside_azure_ml=True, + workspace=DEFAULT_WORKSPACE.workspace, + run_name="test_azureml_logger_actual_run") assert not logger.is_running_in_azure_ml assert logger.run is not None assert logger.run != RUN_CONTEXT