Mirror of https://github.com/microsoft/hi-ml.git
ENH: Group AML experiments by PR number (#681)
Ensure that all AML experiments that originate from a PR are submitted to the same experiment. - Add capability to hardcode experiment name for AML submission via an environment variable - Add a PR job that looks for running AML jobs and cancels them - Add timeouts for all smoke test jobs
This commit is contained in:
Parent
7c95a4f471
Commit
e81323f589
|
@ -0,0 +1,50 @@
|
|||
# Composite GitHub Action that prepares a Conda environment with the AzureML SDK
# and then runs the helper script that cancels all unfinished AzureML jobs for
# the experiment named by the HIML_EXPERIMENT_NAME environment variable.
name: 'Cancel environment setup'
description: 'Set up environment hi-ml-cpath workflows'

runs:
  using: "composite"
  steps:
    # Writes the AzureML workspace config.json from workflow secrets, so that
    # the Python script below can connect to the workspace.
    - name: Create AzureML config.json file
      shell: bash
      run: ./create_config.sh

    # Use a cache action to save the full conda environment, so that we don't have to reinstall it every time.
    # Paths are tied to the location of the miniconda installation, and may need adjustment on a different OS.
    - name: Retrieve cached Conda environment
      id: cache-conda
      uses: actions/cache@v3
      with:
        path: /usr/share/miniconda/envs/AzureML_SDK
        # Cache key is derived from the environment file, so editing the file invalidates the cache.
        key: azureml-conda-${{ hashFiles('.github/actions/cancel_azureml_jobs/azureml-env.yml') }}

    # If the cache action didn't find a cache, then install the conda environment afresh.
    - name: Build Conda environment from scratch
      uses: conda-incubator/setup-miniconda@v2
      if: steps.cache-conda.outputs.cache-hit != 'true'
      with:
        environment-file: .github/actions/cancel_azureml_jobs/azureml-env.yml
        activate-environment: AzureML_SDK

    # Modify the path to point to the new or cached Conda environment.
    # This is effectively also what `conda activate` does.
    - name: Activate environment
      shell: bash
      run: |
        echo "Adding Conda bin folder to path"
        echo "/usr/share/miniconda/envs/AzureML_SDK/bin" >> $GITHUB_PATH

    # Diagnostics only: show the Conda setup and the Python interpreter in use.
    - name: Conda info
      shell: bash
      run: conda info

    - name: Show active Python path
      shell: bash
      run: which python

    # The AzureML experiment where the jobs are deleted is read from
    # the environment variable HIML_EXPERIMENT_NAME set in the workflow file.
    - name: Cancel AzureML jobs
      shell: bash
      run: |
        echo "Cancelling all unfinished AzureML jobs for this PR."
        python .github/actions/cancel_azureml_jobs/cancel_aml_jobs.py
|
|
@ -0,0 +1,8 @@
|
|||
# Minimal Conda environment for the AzureML job-cancellation helper
# (.github/actions/cancel_azureml_jobs/cancel_aml_jobs.py).
# Versions are pinned so that the CI cache key (a hash of this file) stays stable.
name: AzureML_SDK
channels:
  - defaults
dependencies:
  - pip=20.1.1
  - python=3.7.3
  - pip:
      - azureml-sdk==1.36.0
|
|
@ -0,0 +1,42 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import os
|
||||
|
||||
from azureml._restclient.constants import RunStatus
|
||||
from azureml.core import Experiment, Run, Workspace
|
||||
from azureml.core.authentication import ServicePrincipalAuthentication
|
||||
|
||||
|
||||
def cancel_running_and_queued_jobs() -> None:
    """Cancel every AzureML run (including child runs) of one experiment that is
    not yet finished or already being cancelled.

    The experiment is named by the environment variable HIML_EXPERIMENT_NAME.
    Authentication uses a service principal whose credentials come from
    HIML_SERVICE_PRINCIPAL_ID and HIML_SERVICE_PRINCIPAL_PASSWORD; the workspace
    is located via HIML_SUBSCRIPTION_ID and HIML_RESOURCE_GROUP.
    """
    print("Authenticating")
    service_principal = ServicePrincipalAuthentication(
        tenant_id='72f988bf-86f1-41af-91ab-2d7cd011db47',
        service_principal_id=os.environ["HIML_SERVICE_PRINCIPAL_ID"],
        service_principal_password=os.environ["HIML_SERVICE_PRINCIPAL_PASSWORD"])
    print("Getting AML workspace")
    aml_workspace = Workspace.get(
        name="hi-ml",
        auth=service_principal,
        subscription_id=os.environ["HIML_SUBSCRIPTION_ID"],
        resource_group=os.environ["HIML_RESOURCE_GROUP"]
    )
    # "/" is not valid in AzureML experiment names. github.ref values like
    # "refs/pull/123/merge" must be sanitized the same way as at submission time.
    experiment_name = os.environ["HIML_EXPERIMENT_NAME"].replace("/", "_")
    print(f"Experiment: {experiment_name}")
    experiment = Experiment(aml_workspace, name=experiment_name)
    print(f"Retrieved experiment {experiment.name}")
    # Runs in these states need no (further) cancellation.
    finished_states = (RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.FINALIZING, RunStatus.CANCELED,
                       RunStatus.CANCEL_REQUESTED)
    for run in experiment.get_runs(include_children=True, properties={}):
        assert isinstance(run, Run)
        status_suffix = f"'{run.status}' run {run.id} ({run.display_name})"
        if run.status in finished_states:
            print(f"Skipping {status_suffix}")
        else:
            print(f"Cancelling {status_suffix}")
            run.cancel()


if __name__ == "__main__":
    cancel_running_and_queued_jobs()
|
|
@ -23,9 +23,20 @@ env:
|
|||
HIML_WORKSPACE_NAME: ${{ secrets.HIML_WORKSPACE_NAME }}
|
||||
HIML_SERVICE_PRINCIPAL_ID: ${{ secrets.HIML_SERVICE_PRINCIPAL_ID }}
|
||||
HIML_SERVICE_PRINCIPAL_PASSWORD: ${{ secrets.HIML_SERVICE_PRINCIPAL_PASSWORD }}
|
||||
# Set the AML experiment name for all AML jobs submitted during tests. Github.ref looks like
|
||||
# "refs/pull/123/merge" for PR builds.
|
||||
HIML_EXPERIMENT_NAME: ${{ github.ref }}
|
||||
|
||||
jobs:
|
||||
|
||||
cancel-azureml:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Cancel previous AzureML runs
|
||||
uses: ./.github/actions/cancel_azureml_jobs
|
||||
|
||||
flake8:
|
||||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
|
@ -64,6 +75,7 @@ jobs:
|
|||
|
||||
pytest:
|
||||
runs-on: ubuntu-20.04
|
||||
needs: [ cancel-azureml ]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
|
@ -85,10 +97,7 @@ jobs:
|
|||
if: always()
|
||||
shell: bash -l {0}
|
||||
run: |
|
||||
branch_prefix="refs/heads/"
|
||||
full_branch_name=$GITHUB_REF
|
||||
branch_name_without_prefix=${full_branch_name#$branch_prefix}
|
||||
python hi-ml-azure/run_pytest.py --mark=gpu --cluster=pr-gpu --conda_env=${{ env.folder }}/environment.yml --folder=${{ env.folder }} --coverage_module=${{ env.module_for_coverage_reporting }} --experiment="$branch_name_without_prefix"
|
||||
python hi-ml-azure/run_pytest.py --mark=gpu --cluster=pr-gpu --conda_env=${{ env.folder }}/environment.yml --folder=${{ env.folder }} --coverage_module=${{ env.module_for_coverage_reporting }}
|
||||
|
||||
- name: Upload coverage reports to Codecov
|
||||
# Coverage should also be uploaded if tests still fail.
|
||||
|
@ -100,6 +109,7 @@ jobs:
|
|||
|
||||
smoke_test_cucim_slidespandaimagenetmil:
|
||||
runs-on: ubuntu-20.04
|
||||
needs: [ cancel-azureml ]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
|
@ -115,6 +125,7 @@ jobs:
|
|||
|
||||
smoke_test_openslide_slidespandaimagenetmil:
|
||||
runs-on: ubuntu-20.04
|
||||
needs: [ cancel-azureml ]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
|
@ -130,6 +141,7 @@ jobs:
|
|||
|
||||
smoke_test_tilespandaimagenetmil:
|
||||
runs-on: ubuntu-20.04
|
||||
needs: [ cancel-azureml ]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
|
@ -145,6 +157,7 @@ jobs:
|
|||
|
||||
smoke_test_tcgacrckimagenetmil:
|
||||
runs-on: ubuntu-20.04
|
||||
needs: [ cancel-azureml ]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
|
@ -160,6 +173,7 @@ jobs:
|
|||
|
||||
smoke_test_tcgacrcksslmil:
|
||||
runs-on: ubuntu-20.04
|
||||
needs: [ cancel-azureml ]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
|
@ -175,6 +189,7 @@ jobs:
|
|||
|
||||
smoke_test_crck_simclr:
|
||||
runs-on: ubuntu-20.04
|
||||
needs: [ cancel-azureml ]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
|
@ -190,6 +205,7 @@ jobs:
|
|||
|
||||
smoke_test_crck_flexible_finetuning:
|
||||
runs-on: ubuntu-20.04
|
||||
needs: [ cancel-azureml ]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
|
@ -205,6 +221,7 @@ jobs:
|
|||
|
||||
smoke_test_crck_loss_analysis:
|
||||
runs-on: ubuntu-20.04
|
||||
needs: [ cancel-azureml ]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
|
@ -220,6 +237,7 @@ jobs:
|
|||
|
||||
smoke_test_slides_panda_loss_analysis:
|
||||
runs-on: ubuntu-20.04
|
||||
needs: [ cancel-azureml ]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
|
@ -235,6 +253,7 @@ jobs:
|
|||
|
||||
smoke_test_slides_panda_no_ddp_sampler:
|
||||
runs-on: ubuntu-20.04
|
||||
needs: [ cancel-azureml ]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
|
@ -250,6 +269,7 @@ jobs:
|
|||
|
||||
smoke_test_tiles_panda_no_ddp_sampler:
|
||||
runs-on: ubuntu-20.04
|
||||
needs: [ cancel-azureml ]
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
|
|
|
@ -22,11 +22,22 @@ env:
|
|||
HIML_DIST_ARTIFACT_SUFFIX: '-dist'
|
||||
HIML_PACKAGE_NAME_ARTIFACT_SUFFIX: '-package_name'
|
||||
HIML_VERSION_ARTIFACT_SUFFIX: '-latest_version'
|
||||
# Set the AML experiment name for all AML jobs submitted during tests. Github.ref looks like
|
||||
# "refs/pull/123/merge" for PR builds.
|
||||
HIML_EXPERIMENT_NAME: ${{ github.ref }}
|
||||
PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
|
||||
PYPI_TEST_API_TOKEN: ${{ secrets.PYPI_TEST_API_TOKEN }}
|
||||
|
||||
jobs:
|
||||
|
||||
cancel-azureml:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Cancel previous AzureML runs
|
||||
uses: ./.github/actions/cancel_azureml_jobs
|
||||
|
||||
flake8:
|
||||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
|
@ -144,7 +155,7 @@ jobs:
|
|||
|
||||
test-artifact-pkg:
|
||||
runs-on: ubuntu-20.04
|
||||
needs: [ build-python ]
|
||||
needs: [ build-python, cancel-azureml ]
|
||||
strategy:
|
||||
matrix:
|
||||
folder: [ hi-ml, hi-ml-azure ]
|
||||
|
|
|
@ -35,7 +35,8 @@ from azureml.train.hyperdrive import HyperDriveConfig, GridParameterSampling, Pr
|
|||
from azureml.dataprep.fuse.daemon import MountContext
|
||||
|
||||
from health_azure.amulet import (ENV_AMLT_DATAREFERENCE_DATA, ENV_AMLT_DATAREFERENCE_OUTPUT, is_amulet_job)
|
||||
from health_azure.utils import (create_python_environment, create_run_recovery_id, find_file_in_parent_to_pythonpath,
|
||||
from health_azure.utils import (ENV_EXPERIMENT_NAME, create_python_environment, create_run_recovery_id,
|
||||
find_file_in_parent_to_pythonpath,
|
||||
is_run_and_child_runs_completed, is_running_in_azure_ml, register_environment,
|
||||
run_duration_string_to_seconds, to_azure_friendly_string, RUN_CONTEXT, get_workspace,
|
||||
PathOrString, DEFAULT_ENVIRONMENT_VARIABLES, get_ml_client,
|
||||
|
@ -424,6 +425,30 @@ def get_display_name_v2(tags: Optional[Dict[str, Any]] = None) -> str:
|
|||
return display_name
|
||||
|
||||
|
||||
def effective_experiment_name(experiment_name: Optional[str],
                              entry_script: Optional[PathOrString] = None) -> str:
    """Choose the experiment name to use for the run.

    Precedence: a non-empty value of the environment variable HIML_EXPERIMENT_NAME wins,
    then the `experiment_name` argument, then the stem of the entry point script. The
    chosen name is sanitized via `to_azure_friendly_string` before being returned.

    :param experiment_name: The name of the AzureML experiment in which the run should be submitted.
    :param entry_script: The script that should be run in AzureML.
    :return: The effective experiment name to use, based on the fallback rules above.
    :raises ValueError: If neither an experiment name nor an entry script is available.
    """
    raw_value = os.environ.get(ENV_EXPERIMENT_NAME, "") or experiment_name
    if not raw_value:
        if entry_script is None:
            raise ValueError("No experiment name provided, and no entry script provided. ")
        raw_value = Path(entry_script).stem
    cleaned_value = to_azure_friendly_string(raw_value)
    assert cleaned_value is not None, "Expecting an actual string"
    return cleaned_value
|
||||
|
||||
|
||||
def submit_run_v2(workspace: Optional[Workspace],
|
||||
experiment_name: str,
|
||||
environment: EnvironmentV2,
|
||||
|
@ -852,6 +877,7 @@ def submit_to_azure_if_needed( # type: ignore
|
|||
|
||||
amlignore_path = snapshot_root_directory / AML_IGNORE_FILE
|
||||
lines_to_append = [str(path) for path in (ignored_folders or [])]
|
||||
|
||||
with append_to_amlignore(amlignore=amlignore_path, lines_to_append=lines_to_append):
|
||||
if strictly_aml_v1:
|
||||
run_config = create_run_configuration(
|
||||
|
@ -880,10 +906,8 @@ def submit_to_azure_if_needed( # type: ignore
|
|||
else:
|
||||
config_to_submit = script_run_config
|
||||
|
||||
effective_experiment_name = experiment_name or Path(script_run_config.script).stem
|
||||
|
||||
run = submit_run(workspace=workspace,
|
||||
experiment_name=effective_experiment_name,
|
||||
experiment_name=effective_experiment_name(experiment_name, script_run_config.script),
|
||||
script_run_config=config_to_submit,
|
||||
tags=tags,
|
||||
wait_for_completion=wait_for_completion,
|
||||
|
@ -898,7 +922,6 @@ def submit_to_azure_if_needed( # type: ignore
|
|||
if entry_script is None:
|
||||
entry_script = Path(sys.argv[0])
|
||||
script_params = script_params or sys.argv[1:]
|
||||
effective_experiment_name = experiment_name or Path(entry_script).stem
|
||||
|
||||
ml_client = get_ml_client(ml_client=ml_client, aml_workspace=workspace)
|
||||
registered_env = register_environment_v2(environment, ml_client)
|
||||
|
@ -908,7 +931,7 @@ def submit_to_azure_if_needed( # type: ignore
|
|||
run = submit_run_v2(workspace=workspace,
|
||||
input_datasets_v2=input_datasets_v2,
|
||||
output_datasets_v2=output_datasets_v2,
|
||||
experiment_name=effective_experiment_name,
|
||||
experiment_name=effective_experiment_name(experiment_name, entry_script),
|
||||
environment=registered_env,
|
||||
snapshot_root_directory=snapshot_root_directory,
|
||||
entry_script=entry_script,
|
||||
|
|
|
@ -76,6 +76,9 @@ ENV_LOCAL_RANK = "LOCAL_RANK"
|
|||
ENV_RANK = "RANK"
|
||||
MASTER_PORT_DEFAULT = 6105
|
||||
|
||||
# Environment variables that affect job submission, in particular in builds
|
||||
ENV_EXPERIMENT_NAME = "HIML_EXPERIMENT_NAME"
|
||||
|
||||
# Other Azure ML related variables
|
||||
ENVIRONMENT_VERSION = "1"
|
||||
FINAL_MODEL_FOLDER = "final_model"
|
||||
|
|
|
@ -35,17 +35,16 @@ from azure.core.exceptions import ClientAuthenticationError, ResourceNotFoundErr
|
|||
from azureml.data.azure_storage_datastore import AzureBlobDatastore
|
||||
|
||||
import health_azure.utils as util
|
||||
from health_azure.himl import AML_IGNORE_FILE, append_to_amlignore
|
||||
from health_azure.himl import AML_IGNORE_FILE, append_to_amlignore, effective_experiment_name
|
||||
from health_azure.utils import (ENV_MASTER_ADDR, ENV_MASTER_PORT, MASTER_PORT_DEFAULT,
|
||||
PackageDependency, create_argparser, get_credential)
|
||||
from testazure.test_himl import RunTarget, render_and_run_test_script
|
||||
from testazure.utils_testazure import (DEFAULT_IGNORE_FOLDERS, DEFAULT_WORKSPACE, MockRun, change_working_directory,
|
||||
himl_azure_root, repository_root)
|
||||
experiment_for_unittests, himl_azure_root, repository_root)
|
||||
|
||||
RUN_ID = uuid4().hex
|
||||
RUN_NUMBER = 42
|
||||
EXPERIMENT_NAME = "fancy-experiment"
|
||||
AML_TESTS_EXPERIMENT = "test_experiment"
|
||||
|
||||
|
||||
def oh_no() -> None:
|
||||
|
@ -1103,16 +1102,16 @@ def test_get_latest_aml_run_from_experiment(num_runs: int, tags: Dict[str, str],
|
|||
assert len(aml_runs) == expected_num_returned
|
||||
|
||||
|
||||
def test_get_latest_aml_run_from_experiment_remote(tmp_path: Path) -> None:
|
||||
def test_get_latest_aml_run_from_experiment_remote() -> None:
|
||||
"""
|
||||
Test that a remote run with particular tags can be correctly retrieved, ignoring any more recent
|
||||
experiments which do not have the correct tags. Note: this test will instantiate 2 new Runs in the
|
||||
workspace described in your config.json file, under an experiment defined by AML_TESTS_EXPERIMENT
|
||||
workspace described in your config.json file, under an experiment defined by experiment_for_unittests()
|
||||
"""
|
||||
ws = DEFAULT_WORKSPACE.workspace
|
||||
assert True
|
||||
|
||||
experiment = Experiment(ws, AML_TESTS_EXPERIMENT)
|
||||
experiment = Experiment(ws, experiment_for_unittests())
|
||||
config = ScriptRunConfig(
|
||||
source_directory=".",
|
||||
command=["cd ."], # command that does nothing
|
||||
|
@ -1123,6 +1122,7 @@ def test_get_latest_aml_run_from_experiment_remote(tmp_path: Path) -> None:
|
|||
amlignore=Path("") / AML_IGNORE_FILE,
|
||||
lines_to_append=DEFAULT_IGNORE_FOLDERS):
|
||||
first_run = experiment.submit(config)
|
||||
first_run.diplay_name = "test_get_latest_aml_run_from_experiment_remote"
|
||||
tags = {"experiment_type": "great_experiment"}
|
||||
first_run.set_tags(tags)
|
||||
first_run.wait_for_completion()
|
||||
|
@ -1136,7 +1136,7 @@ def test_get_latest_aml_run_from_experiment_remote(tmp_path: Path) -> None:
|
|||
second_run.remove_tags(tags)
|
||||
|
||||
# Retrieve latest run with given tags (expect first_run to be returned)
|
||||
retrieved_runs = util.get_latest_aml_runs_from_experiment(AML_TESTS_EXPERIMENT, tags=tags, aml_workspace=ws)
|
||||
retrieved_runs = util.get_latest_aml_runs_from_experiment(experiment_for_unittests(), tags=tags, aml_workspace=ws)
|
||||
assert len(retrieved_runs) == 1
|
||||
assert retrieved_runs[0].id == first_run.id
|
||||
assert retrieved_runs[0].get_tags() == tags
|
||||
|
@ -1258,7 +1258,7 @@ def test_download_file_from_run(tmp_path: Path, dummy_env_vars: Dict[str, str],
|
|||
def test_download_file_from_run_remote(tmp_path: Path) -> None:
|
||||
# This test will create a Run in your workspace (using only local compute)
|
||||
ws = DEFAULT_WORKSPACE.workspace
|
||||
experiment = Experiment(ws, AML_TESTS_EXPERIMENT)
|
||||
experiment = Experiment(ws, experiment_for_unittests())
|
||||
config = ScriptRunConfig(
|
||||
source_directory=".",
|
||||
command=["cd ."], # command that does nothing
|
||||
|
@ -1268,6 +1268,7 @@ def test_download_file_from_run_remote(tmp_path: Path) -> None:
|
|||
amlignore=Path("") / AML_IGNORE_FILE,
|
||||
lines_to_append=DEFAULT_IGNORE_FOLDERS):
|
||||
run = experiment.submit(config)
|
||||
run.diplay_name = "test_download_file_from_run_remote"
|
||||
|
||||
file_to_upload = tmp_path / "dummy_file.txt"
|
||||
file_contents = "Hello world"
|
||||
|
@ -1308,7 +1309,7 @@ def test_download_run_file_during_run(tmp_path: Path) -> None:
|
|||
information about the workspace to use, but pick up the current workspace.
|
||||
"""
|
||||
# Create a run that contains a simple txt file
|
||||
experiment_name = "himl-tests"
|
||||
experiment_name = effective_experiment_name("himl-tests")
|
||||
run_to_download_from = util.create_aml_run_object(experiment_name=experiment_name,
|
||||
workspace=DEFAULT_WORKSPACE.workspace)
|
||||
file_contents = "Hello World!"
|
||||
|
@ -1582,7 +1583,7 @@ def test_checkpoint_download_remote(tmp_path: Path) -> None:
|
|||
prefix = "outputs/checkpoints/"
|
||||
|
||||
ws = DEFAULT_WORKSPACE.workspace
|
||||
experiment = Experiment(ws, AML_TESTS_EXPERIMENT)
|
||||
experiment = Experiment(ws, experiment_for_unittests())
|
||||
config = ScriptRunConfig(
|
||||
source_directory=".",
|
||||
command=["cd ."], # command that does nothing
|
||||
|
@ -1592,6 +1593,7 @@ def test_checkpoint_download_remote(tmp_path: Path) -> None:
|
|||
amlignore=Path("") / AML_IGNORE_FILE,
|
||||
lines_to_append=DEFAULT_IGNORE_FOLDERS):
|
||||
run = experiment.submit(config)
|
||||
run.display_name = "test_checkpoint_download_remote"
|
||||
|
||||
file_contents = "Hello world"
|
||||
file_name = "" # for pyright
|
||||
|
@ -2341,8 +2343,8 @@ def test_create_run() -> None:
|
|||
"""
|
||||
Test if we can create an AML run object here in the test suite, write logs and read them back in.
|
||||
"""
|
||||
run_name = "foo"
|
||||
experiment_name = "himl-tests"
|
||||
run_name = "test_create_run"
|
||||
experiment_name = effective_experiment_name("himl-tests")
|
||||
run: Optional[Run] = None
|
||||
try:
|
||||
run = util.create_aml_run_object(experiment_name=experiment_name, run_name=run_name,
|
||||
|
|
|
@ -45,6 +45,7 @@ from health_azure.datasets import (
|
|||
)
|
||||
from health_azure.utils import (
|
||||
DEFAULT_ENVIRONMENT_VARIABLES,
|
||||
ENV_EXPERIMENT_NAME,
|
||||
ENVIRONMENT_VERSION,
|
||||
EXPERIMENT_RUN_SEPARATOR,
|
||||
WORKSPACE_CONFIG_JSON,
|
||||
|
@ -741,6 +742,7 @@ def test_submit_run(mock_workspace: mock.MagicMock,
|
|||
assert "AzureML completed" not in out
|
||||
|
||||
|
||||
@pytest.mark.fast
|
||||
def test_submit_run_v2(tmp_path: Path) -> None:
|
||||
def _mock_sweep(*args: Any, **kwargs: Any) -> MagicMock:
|
||||
assert kwargs.get("compute") == dummy_compute_target
|
||||
|
@ -864,7 +866,7 @@ def test_submit_run_v2(tmp_path: Path) -> None:
|
|||
# 'command' should be called with the same args
|
||||
print(mock_command.call)
|
||||
# command should be called once when initialising the command job and once when updating the param sampling
|
||||
mock_command.call_count == 2
|
||||
assert mock_command.call_count == 2
|
||||
|
||||
mock_command.assert_any_call(
|
||||
code=str(dummy_root_directory),
|
||||
|
@ -1657,6 +1659,7 @@ def test_extract_v2_inputs_outputs_from_args() -> None:
|
|||
assert len(output_datasets) == 0
|
||||
|
||||
|
||||
@pytest.mark.fast
|
||||
def test_get_display_name_v2() -> None:
|
||||
dummy_display_name = "job display name"
|
||||
expected_display_name = "job-display-name"
|
||||
|
@ -1676,3 +1679,24 @@ def test_get_display_name_v2() -> None:
|
|||
# if no tags provided, display name should be empty
|
||||
display_name = himl.get_display_name_v2()
|
||||
assert display_name == ""
|
||||
|
||||
|
||||
@pytest.mark.fast
def test_experiment_name() -> None:
    """Test the logic for choosing experiment names: the environment variable HIML_EXPERIMENT_NAME
    takes precedence, then the explicitly given experiment name, then the entry script name."""
    # When the test suite runs on the Github, the environment variable "HIML_EXPERIMENT_NAME" will be set.
    # Remove it to test the default behaviour.
    with mock.patch.dict(os.environ):
        os.environ.pop(ENV_EXPERIMENT_NAME, None)
        # With no environment variable, the explicit argument is used.
        assert himl.effective_experiment_name("explicit", Path()) == "explicit"
        # With no explicit name either, the experiment is named after the entry script's stem.
        assert himl.effective_experiment_name("", Path("from_script.py")) == "from_script"
        # The chosen name is passed through to_azure_friendly_string before being returned.
        with mock.patch("health_azure.himl.to_azure_friendly_string", return_value="mock_return"):
            assert himl.effective_experiment_name("explicit", Path()) == "mock_return"
        # Provide experiment names with special characters here that should be filtered out
        assert himl.effective_experiment_name("explicit/", Path()) == "explicit_"
    # The environment variable overrides both the explicit name and the script name,
    # and is itself sanitized.
    with mock.patch.dict(os.environ, {ENV_EXPERIMENT_NAME: "name/from/env"}):
        assert himl.effective_experiment_name("explicit", Path()) == "name_from_env"
    with mock.patch.dict(os.environ, {ENV_EXPERIMENT_NAME: "name_from_env"}):
        assert himl.effective_experiment_name("explicit", Path()) == "name_from_env"
        assert himl.effective_experiment_name("", Path("from_script.py")) == "name_from_env"
||||
|
|
|
@ -10,7 +10,8 @@ from contextlib import contextmanager
|
|||
from pathlib import Path
|
||||
from typing import Dict, Generator, Optional
|
||||
|
||||
from health_azure.utils import (UnitTestWorkspaceWrapper, WORKSPACE_CONFIG_JSON)
|
||||
from health_azure.utils import (ENV_EXPERIMENT_NAME, WORKSPACE_CONFIG_JSON, UnitTestWorkspaceWrapper,
|
||||
to_azure_friendly_string)
|
||||
|
||||
DEFAULT_DATASTORE = "himldatasets"
|
||||
FALLBACK_SINGLE_RUN = "refs_pull_545_merge:refs_pull_545_merge_1626538212_d2b07afd"
|
||||
|
@ -46,6 +47,15 @@ def repository_root() -> Path:
|
|||
return himl_azure_root().parent
|
||||
|
||||
|
||||
def experiment_for_unittests() -> str:
    """
    Gets the name of the AzureML experiment that unit tests should use.

    Reads the environment variable HIML_EXPERIMENT_NAME (falling back to "unittests")
    and sanitizes it via `to_azure_friendly_string`.
    """
    raw_name = os.getenv(ENV_EXPERIMENT_NAME, "unittests")
    sanitized_name = to_azure_friendly_string(raw_name)
    assert sanitized_name is not None
    return sanitized_name
|
||||
|
||||
|
||||
@contextmanager
|
||||
def change_working_directory(path_or_str: Path) -> Generator:
|
||||
"""
|
||||
|
|
|
@ -160,11 +160,11 @@ define DEFAULT_SMOKE_TEST_ARGS
|
|||
endef
|
||||
|
||||
define AML_ONE_DEVICE_ARGS
|
||||
--cluster=testing-nc6 --wait_for_completion --num_nodes=1 --max_num_gpus=1 --strictly_aml_v1=True
|
||||
--cluster=testing-nc6 --wait_for_completion --num_nodes=1 --max_num_gpus=1 --strictly_aml_v1=True --max_run_duration=1h
|
||||
endef
|
||||
|
||||
define AML_MULTIPLE_DEVICE_ARGS
|
||||
--cluster=dedicated-nc24s-v2 --wait_for_completion --strictly_aml_v1=True
|
||||
--cluster=dedicated-nc24s-v2 --wait_for_completion --strictly_aml_v1=True --max_run_duration=1h
|
||||
endef
|
||||
|
||||
define DEEPSMILEDEFAULT_SMOKE_TEST_ARGS
|
||||
|
@ -206,7 +206,7 @@ smoke_test_cucim_slidespandaimagenetmil_local:
|
|||
# Once running in AML the following test takes around 6 minutes
|
||||
smoke_test_cucim_slidespandaimagenetmil_aml:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
|
||||
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_cucim_slidespandaimagenetmil_aml;}
|
||||
|
||||
smoke_test_openslide_slidespandaimagenetmil_local:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
|
@ -214,7 +214,7 @@ smoke_test_openslide_slidespandaimagenetmil_local:
|
|||
|
||||
smoke_test_openslide_slidespandaimagenetmil_aml:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${OPENSLIDE_BACKEND_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
|
||||
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${OPENSLIDE_BACKEND_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_openslide_slidespandaimagenetmil_aml;}
|
||||
|
||||
# The following test takes about 6 minutes
|
||||
smoke_test_tilespandaimagenetmil_local:
|
||||
|
@ -223,7 +223,7 @@ smoke_test_tilespandaimagenetmil_local:
|
|||
|
||||
smoke_test_tilespandaimagenetmil_aml:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDATILES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
|
||||
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_tilespandaimagenetmil_aml;}
|
||||
|
||||
# The following test takes about 30 seconds
|
||||
smoke_test_tcgacrcksslmil_local:
|
||||
|
@ -232,7 +232,7 @@ smoke_test_tcgacrcksslmil_local:
|
|||
|
||||
smoke_test_tcgacrcksslmil_aml:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
|
||||
${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_tcgacrcksslmil_aml;}
|
||||
|
||||
smoke_test_tcgacrckimagenetmil_local:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKIMANEGETMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
|
@ -240,7 +240,7 @@ smoke_test_tcgacrckimagenetmil_local:
|
|||
|
||||
smoke_test_tcgacrckimagenetmil_aml:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKIMANEGETMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
|
||||
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_tcgacrckimagenetmil_aml;}
|
||||
|
||||
# The following test takes about 3 minutes
|
||||
smoke_test_crck_simclr_local:
|
||||
|
@ -249,7 +249,7 @@ smoke_test_crck_simclr_local:
|
|||
|
||||
smoke_test_crck_simclr_aml:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${CRCKSIMCLR_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
${CRCKSIMCLR_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
|
||||
${CRCKSIMCLR_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_crck_simclr_aml;}
|
||||
|
||||
smoke_test_crck_flexible_finetuning_local:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
|
@ -257,7 +257,7 @@ smoke_test_crck_flexible_finetuning_local:
|
|||
|
||||
smoke_test_crck_flexible_finetuning_aml:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} ${CRCK_TUNING_ARGS};}
|
||||
${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} ${CRCK_TUNING_ARGS} --tag smoke_test_crck_flexible_finetuning_aml;}
|
||||
|
||||
smoke_test_crck_loss_analysis_local:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
|
@ -265,7 +265,7 @@ smoke_test_crck_loss_analysis_local:
|
|||
|
||||
smoke_test_crck_loss_analysis_aml:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${TCGACRCKSSLMIL_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} ${LOSS_ANALYSIS_ARGS};}
|
||||
${TCGACRCKSSLMIL_SMOKE_TEST_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} ${LOSS_ANALYSIS_ARGS} --tag smoke_test_crck_loss_analysis_aml;}
|
||||
|
||||
smoke_test_slides_panda_loss_analysis_local:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
|
@ -273,7 +273,7 @@ smoke_test_slides_panda_loss_analysis_local:
|
|||
|
||||
smoke_test_slides_panda_loss_analysis_aml:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${LOSS_ANALYSIS_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
|
||||
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${LOSS_ANALYSIS_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_slides_panda_loss_analysis_aml;}
|
||||
|
||||
smoke_test_slides_panda_no_ddp_sampler_local:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
|
@ -281,7 +281,7 @@ smoke_test_slides_panda_no_ddp_sampler_local:
|
|||
|
||||
smoke_test_slides_panda_no_ddp_sampler_aml:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDASLIDES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${DDP_SAMPLER_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
|
||||
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${DDP_SAMPLER_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_slides_panda_no_ddp_sampler_aml;}
|
||||
|
||||
smoke_test_tiles_panda_no_ddp_sampler_local:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDATILES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
|
@ -289,7 +289,7 @@ smoke_test_tiles_panda_no_ddp_sampler_local:
|
|||
|
||||
smoke_test_tiles_panda_no_ddp_sampler_aml:
|
||||
{ ${BASE_CPATH_RUNNER_COMMAND} ${DEEPSMILEPANDATILES_ARGS} ${DEFAULT_SMOKE_TEST_ARGS} \
|
||||
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${DDP_SAMPLER_ARGS} ${AML_MULTIPLE_DEVICE_ARGS};}
|
||||
${DEEPSMILEDEFAULT_SMOKE_TEST_ARGS} ${DDP_SAMPLER_ARGS} ${AML_MULTIPLE_DEVICE_ARGS} --tag smoke_test_tiles_panda_no_ddp_sampler_aml;}
|
||||
|
||||
smoke tests local: smoke_test_cucim_slidespandaimagenetmil_local smoke_test_openslide_slidespandaimagenetmil_local smoke_test_tilespandaimagenetmil_local smoke_test_tcgacrcksslmil_local smoke_test_crck_simclr_local smoke_test_crck_flexible_finetuning_local smoke_test_tcgacrckimagenetmil_local smoke_test_crck_loss_analysis_local smoke_test_slides_panda_loss_analysis_local smoke_test_slides_panda_no_ddp_sampler_local smoke_test_tiles_panda_no_ddp_sampler_local
|
||||
|
||||
|
|
|
@ -50,3 +50,7 @@ class ExperimentConfig(param.Parameterized):
|
|||
param.ClassSelector(class_=Path, default=None, allow_None=True,
|
||||
doc="The path to the AzureML workspace configuration file. If not specified, the "
|
||||
"configuration file in the current folder or one of its parents will be used.")
|
||||
max_run_duration: str = param.String(
|
||||
default="", doc="The maximum runtime that is allowed for this job in AzureML. This is given as a floating"
|
||||
"point number with a string suffix s, m, h, d for seconds, minutes, hours, day. Examples: '3.5h', '2d'"
|
||||
)
|
||||
|
|
|
@ -281,6 +281,7 @@ class Runner:
|
|||
input_datasets=input_datasets, # type: ignore
|
||||
num_nodes=self.experiment_config.num_nodes,
|
||||
wait_for_completion=self.experiment_config.wait_for_completion,
|
||||
max_run_duration=self.experiment_config.max_run_duration,
|
||||
ignored_folders=[],
|
||||
submit_to_azureml=bool(self.experiment_config.cluster),
|
||||
docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,
|
||||
|
|
|
@ -28,7 +28,7 @@ from health_ml.utils.regression_test_utils import (
|
|||
compare_folder_contents,
|
||||
compare_folders_and_run_outputs,
|
||||
)
|
||||
from testazure.utils_testazure import DEFAULT_WORKSPACE
|
||||
from testazure.utils_testazure import DEFAULT_WORKSPACE, experiment_for_unittests
|
||||
|
||||
|
||||
def create_folder_and_write_text(file: Path, text: str) -> None:
|
||||
|
@ -242,7 +242,9 @@ def upload_to_run_and_compare(regression_test_subfolder: str, run_to_mock: str,
|
|||
file_contents = "some file contents"
|
||||
file_name = "contents.txt"
|
||||
regression_test_folder = tmp_path / "expected"
|
||||
run = create_aml_run_object(workspace=DEFAULT_WORKSPACE.workspace, experiment_name="test_regression_test_utils")
|
||||
run = create_aml_run_object(workspace=DEFAULT_WORKSPACE.workspace,
|
||||
experiment_name=experiment_for_unittests(),
|
||||
run_name="upload_to_run_and_compare")
|
||||
# Upload a single file to the newly created run. When comparing the run output files,
|
||||
# and seeing this in the set of files that are expected to exist on the run, this should pass.
|
||||
file1 = tmp_path / file_name
|
||||
|
|
|
@ -2,12 +2,14 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import os
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Generator
|
||||
from unittest.mock import DEFAULT, MagicMock, Mock, patch
|
||||
from unittest import mock
|
||||
from unittest.mock import MagicMock, Mock, patch
|
||||
from _pytest.logging import LogCaptureFixture
|
||||
from pytorch_lightning import LightningModule
|
||||
|
||||
|
@ -21,8 +23,8 @@ from health_ml.run_ml import MLRunner, get_mlflow_run_id_from_previous_loggers
|
|||
from health_ml.utils.checkpoint_utils import CheckpointParser
|
||||
from health_ml.utils.common_utils import is_gpu_available
|
||||
from health_ml.utils.lightning_loggers import HimlMLFlowLogger, StoringLogger
|
||||
from health_azure.utils import is_global_rank_zero
|
||||
from testazure.utils_testazure import DEFAULT_WORKSPACE
|
||||
from health_azure.utils import ENV_EXPERIMENT_NAME, is_global_rank_zero
|
||||
from testazure.utils_testazure import DEFAULT_WORKSPACE, experiment_for_unittests
|
||||
from testhiml.utils.fixed_paths_for_tests import mock_run_id
|
||||
|
||||
no_gpu = not is_gpu_available()
|
||||
|
@ -299,11 +301,11 @@ def test_run(run_inference_only: bool, run_extra_val_epoch: bool, ml_runner_with
|
|||
with patch("health_ml.run_ml.create_lightning_trainer") as mock_create_trainer:
|
||||
with patch.multiple(
|
||||
ml_runner_with_container,
|
||||
checkpoint_handler=DEFAULT,
|
||||
load_model_checkpoint=DEFAULT,
|
||||
run_training=DEFAULT,
|
||||
run_validation=DEFAULT,
|
||||
run_inference=DEFAULT,
|
||||
checkpoint_handler=mock.DEFAULT,
|
||||
load_model_checkpoint=mock.DEFAULT,
|
||||
run_training=mock.DEFAULT,
|
||||
run_validation=mock.DEFAULT,
|
||||
run_inference=mock.DEFAULT,
|
||||
) as mocks:
|
||||
mock_create_trainer.return_value = MagicMock(), MagicMock()
|
||||
ml_runner_with_container.run()
|
||||
|
@ -327,9 +329,9 @@ def test_run_inference_only(ml_runner_with_run_id: MLRunner) -> None:
|
|||
with patch("health_ml.run_ml.create_lightning_trainer") as mock_create_trainer:
|
||||
with patch.multiple(
|
||||
ml_runner_with_run_id,
|
||||
run_training=DEFAULT,
|
||||
run_validation=DEFAULT,
|
||||
validate_model_weights=DEFAULT
|
||||
run_training=mock.DEFAULT,
|
||||
run_validation=mock.DEFAULT,
|
||||
validate_model_weights=mock.DEFAULT
|
||||
) as mocks:
|
||||
mock_trainer = MagicMock()
|
||||
mock_create_trainer.return_value = mock_trainer, MagicMock()
|
||||
|
@ -349,7 +351,7 @@ def test_resume_training_from_run_id(run_extra_val_epoch: bool, ml_runner_with_r
|
|||
ml_runner_with_run_id.container.max_num_gpus = 0
|
||||
ml_runner_with_run_id.container.max_epochs += 10
|
||||
assert ml_runner_with_run_id.checkpoint_handler.trained_weights_path
|
||||
with patch.multiple(ml_runner_with_run_id, run_validation=DEFAULT, run_inference=DEFAULT) as mocks:
|
||||
with patch.multiple(ml_runner_with_run_id, run_validation=mock.DEFAULT, run_inference=mock.DEFAULT) as mocks:
|
||||
ml_runner_with_run_id.run()
|
||||
assert mocks["run_validation"].called == run_extra_val_epoch
|
||||
mocks["run_inference"].assert_called_once()
|
||||
|
@ -381,8 +383,7 @@ def test_log_on_vm(log_from_vm: bool) -> None:
|
|||
container = HelloWorld()
|
||||
container.max_epochs = 1
|
||||
# Mimic an experiment name given on the command line.
|
||||
experiment_name = "unittest"
|
||||
container.experiment = experiment_name
|
||||
container.experiment = experiment_for_unittests()
|
||||
# The tag is used to identify the run, similar to the behaviour when submitting a run to AzureML.
|
||||
tag = f"test_log_on_vm [{log_from_vm}]"
|
||||
container.tag = tag
|
||||
|
@ -404,18 +405,23 @@ def test_log_on_vm(log_from_vm: bool) -> None:
|
|||
assert isinstance(logger, HimlMLFlowLogger)
|
||||
|
||||
|
||||
@pytest.mark.fast
|
||||
def test_experiment_name() -> None:
|
||||
"""Test that the experiment name is set correctly, choosing either the experiment name given on the commandline
|
||||
or the model name"""
|
||||
container = HelloWorld()
|
||||
# No experiment name given on the commandline: use the model name
|
||||
model_name = "some_model"
|
||||
container._model_name = model_name
|
||||
assert container.effective_experiment_name == model_name
|
||||
# Experiment name given on the commandline: use the experiment name
|
||||
experiment_name = "unittest"
|
||||
container.experiment = experiment_name
|
||||
assert container.effective_experiment_name == experiment_name
|
||||
# When the test suite runs on the Github, the environment variable "HIML_EXPERIMENT_NAME" will be set.
|
||||
# Remove it to test the default behaviour.
|
||||
with patch.dict(os.environ):
|
||||
os.environ.pop(ENV_EXPERIMENT_NAME, None)
|
||||
container = HelloWorld()
|
||||
# No experiment name given on the commandline: use the model name
|
||||
model_name = "some_model"
|
||||
container._model_name = model_name
|
||||
assert container.effective_experiment_name == model_name
|
||||
# Experiment name given on the commandline: use the experiment name
|
||||
experiment_name = experiment_for_unittests()
|
||||
container.experiment = experiment_name
|
||||
assert container.effective_experiment_name == experiment_name
|
||||
|
||||
|
||||
def test_get_mlflow_run_id_from_previous_loggers() -> None:
|
||||
|
|
|
@ -12,6 +12,7 @@ from torchvision.transforms import Compose, Resize, CenterCrop
|
|||
from azureml.core import Run
|
||||
|
||||
from health_azure import object_to_yaml, create_aml_run_object
|
||||
from health_azure.himl import effective_experiment_name
|
||||
from health_azure.utils import is_running_in_azure_ml
|
||||
from health_ml.utils.serialization import ModelInfo
|
||||
from testazure.utils_testazure import DEFAULT_WORKSPACE
|
||||
|
@ -107,7 +108,7 @@ def test_serialization_roundtrip() -> None:
|
|||
def test_get_metadata() -> None:
|
||||
"""Test if model metadata is read correctly from the AzureML run."""
|
||||
run_name = "foo"
|
||||
experiment_name = "himl-tests"
|
||||
experiment_name = effective_experiment_name("himl-tests")
|
||||
run: Optional[Run] = None
|
||||
try:
|
||||
run = create_aml_run_object(
|
||||
|
|
|
@ -6,6 +6,7 @@ from pathlib import Path
|
|||
from typing import Optional
|
||||
from functools import lru_cache
|
||||
|
||||
from health_azure.himl import effective_experiment_name
|
||||
from health_azure.utils import PathOrString
|
||||
from health_azure.utils import create_aml_run_object
|
||||
from testazure.utils_testazure import DEFAULT_WORKSPACE
|
||||
|
@ -50,7 +51,7 @@ def mock_run_id(id: int = 0) -> str:
|
|||
:return: The run id of the created run that contains the checkpoint.
|
||||
"""
|
||||
|
||||
experiment_name = "himl-tests"
|
||||
experiment_name = effective_experiment_name("himl-tests")
|
||||
run_to_download_from = create_aml_run_object(experiment_name=experiment_name, workspace=DEFAULT_WORKSPACE.workspace)
|
||||
full_file_path = full_test_data_path(suffix="hello_world_checkpoint.ckpt")
|
||||
|
||||
|
|
|
@ -20,13 +20,14 @@ from azureml._restclient.constants import RunStatus
|
|||
from azureml.core import Run
|
||||
|
||||
from health_azure import RUN_CONTEXT, create_aml_run_object
|
||||
from health_azure.himl import effective_experiment_name
|
||||
from health_ml.utils import AzureMLLogger, AzureMLProgressBar, log_learning_rate, log_on_epoch
|
||||
from health_ml.utils.logging import _preprocess_hyperparams
|
||||
from testhiml.utils_testhiml import DEFAULT_WORKSPACE
|
||||
|
||||
|
||||
def create_unittest_run_object(snapshot_directory: Optional[Path] = None) -> Run:
|
||||
return create_aml_run_object(experiment_name="himl-tests",
|
||||
return create_aml_run_object(experiment_name=effective_experiment_name("himl-tests"),
|
||||
workspace=DEFAULT_WORKSPACE.workspace,
|
||||
snapshot_directory=snapshot_directory or ".")
|
||||
|
||||
|
@ -295,7 +296,9 @@ def test_azureml_logger_actual_run() -> None:
|
|||
"""
|
||||
When running outside of AzureML, a new run should be created.
|
||||
"""
|
||||
logger = AzureMLLogger(enable_logging_outside_azure_ml=True, workspace=DEFAULT_WORKSPACE.workspace)
|
||||
logger = AzureMLLogger(enable_logging_outside_azure_ml=True,
|
||||
workspace=DEFAULT_WORKSPACE.workspace,
|
||||
run_name="test_azureml_logger_actual_run")
|
||||
assert not logger.is_running_in_azure_ml
|
||||
assert logger.run is not None
|
||||
assert logger.run != RUN_CONTEXT
|
||||
|
|
Загрузка…
Ссылка в новой задаче