Moving InnerEye's Azure code to hi-ml package (#548)
Moving InnerEye to use the new HI-ML package. See issue 62 in the HI-ML repository.
Parent
b3516f3a00
Commit
b35399fb84
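
The dataset plumbing that this commit deletes from InnerEye (get_or_create_dataset, get_dataset_consumption) is replaced by the hi-ml package. For orientation only, a minimal sketch of the new API, pieced together from the imports and call sites changed in the diff below; the workspace handle, dataset name, mount folder and datastore name are placeholders:

from azureml.core import Workspace
from health.azure.datasets import DatasetConfig, get_or_create_dataset

workspace = Workspace.from_config()  # any AzureML workspace handle would do here

# Describe a dataset that hi-ml should mount or download for an AzureML run
dataset = DatasetConfig(name="my_dataset",               # placeholder dataset name
                        target_folder="my_mount_point",  # placeholder mount point
                        use_mounting=True,
                        datastore="my_datastore")        # placeholder datastore name

# Retrieve the dataset directly, creating and registering it if it does not yet exist
azureml_dataset = get_or_create_dataset(workspace=workspace,
                                        datastore_name="my_datastore",
                                        dataset_name="my_dataset")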
@ -13,6 +1,7 @@ jobs:
- uses: actions/checkout@v2
  with:
    lfs: true
    submodules: true

- name: flake8
  run: |
@ -58,7 +59,8 @@ jobs:
- uses: actions/checkout@v2
  with:
    lfs: true
    submodules: true

- uses: conda-incubator/setup-miniconda@v2
  with:
    activate-environment: InnerEye
@ -131,6 +131,7 @@ dmypy.json

# other
.vscode/
.devcontainer/
/InnerEye/ML/src/aml_config
*.exe
@ -11,8 +11,9 @@
|
|||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="InnerEye/ML/runner.py" />
|
||||
<option name="PARAMETERS" value="--azureml --model=HelloContainer --train" />
|
||||
<option name="PARAMETERS" value="--azureml --model=HelloContainer --cluster=training-nc12" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="pytest all simple tests" type="tests" factoryName="py.test">
|
||||
<module name="InnerEye-DeepLearning" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/Tests" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="_new_keywords" value="""" />
|
||||
<option name="_new_parameters" value="""" />
|
||||
<option name="_new_additionalArguments" value=""-m \"not (gpu or azureml or after_training_single_run or after_training_ensemble_run or inference or after_training_2node or after_training_glaucoma_cv_run or after_training_hello_container)\""" />
|
||||
<option name="_new_target" value=""$PROJECT_DIR$/Tests"" />
|
||||
<option name="_new_targetType" value=""PATH"" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
@ -28,6 +28,8 @@ jobs that run in AzureML.
- ([#533](https://github.com/microsoft/InnerEye-DeepLearning/pull/533)) Better defaults for inference on ensemble children.
- ([#536](https://github.com/microsoft/InnerEye-DeepLearning/pull/536)) Inference will not run on the validation set by default; this can be turned on
  via the `--inference_on_val_set` flag.
- ([#548](https://github.com/microsoft/InnerEye-DeepLearning/pull/548)) Many Azure-related functions have been moved
  out of the toolbox, into the separate hi-ml Python package.
- ([#502](https://github.com/microsoft/InnerEye-DeepLearning/pull/502)) Renamed command line option 'perform_training_set_inference' to 'inference_on_train_set'. Replaced command line option 'perform_validation_and_test_set_inference' with the pair of options 'inference_on_val_set' and 'inference_on_test_set'.
- ([#496](https://github.com/microsoft/InnerEye-DeepLearning/pull/496)) All plots are now saved as PNG, rather than JPG.
- ([#497](https://github.com/microsoft/InnerEye-DeepLearning/pull/497)) Reducing the size of the code snapshot that
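
As a concrete illustration of the renamed options above, a hypothetical runner invocation (model, cluster and flags taken from the run configurations and changelog entries in this commit) would now read:
python InnerEye/ML/runner.py --model=HelloContainer --azureml --cluster=training-nc12 --inference_on_test_set --inference_on_val_set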
@ -6,20 +6,17 @@ from __future__ import annotations
|
|||
|
||||
import getpass
|
||||
import logging
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
|
||||
import param
|
||||
from azureml.core import Dataset, Datastore, Run, ScriptRunConfig, Workspace
|
||||
from azureml.core import Run, ScriptRunConfig, Workspace
|
||||
from azureml.core.authentication import InteractiveLoginAuthentication, ServicePrincipalAuthentication
|
||||
from azureml.data import FileDataset
|
||||
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
|
||||
from azureml.train.hyperdrive import HyperDriveConfig
|
||||
from git import Repo
|
||||
|
||||
from InnerEye.Azure.azure_util import fetch_run, is_offline_run_context, remove_arg
|
||||
from InnerEye.Azure.azure_util import fetch_run, is_offline_run_context
|
||||
from InnerEye.Azure.secrets_handling import SecretsHandling, read_all_settings
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.Common.generic_parsing import GenericConfig
|
||||
|
@ -240,64 +237,6 @@ class AzureConfig(GenericConfig):
|
|||
"""
|
||||
return fetch_run(workspace=self.get_workspace(), run_recovery_id=run_recovery_id)
|
||||
|
||||
def get_or_create_dataset(self, azure_dataset_id: str) -> FileDataset:
|
||||
"""
|
||||
Looks in the AzureML datastore for a dataset of the given name. If there is no such dataset, a dataset is
|
||||
created and registered, assuming that the files are in a folder that has the same name as the dataset.
|
||||
For example, if azure_dataset_id is 'foo', then the 'foo' dataset should be pointing to the folder
|
||||
<container_root>/datasets/foo/
|
||||
"""
|
||||
if not self.azureml_datastore:
|
||||
raise ValueError("No value set for 'azureml_datastore' (name of the datastore in the AzureML workspace)")
|
||||
if not azure_dataset_id:
|
||||
raise ValueError("No dataset ID provided.")
|
||||
workspace = self.get_workspace()
|
||||
logging.info(f"Retrieving datastore '{self.azureml_datastore}' from AzureML workspace {workspace.name}")
|
||||
datastore = Datastore.get(workspace, self.azureml_datastore)
|
||||
try:
|
||||
logging.info(f"Trying to retrieve AzureML Dataset '{azure_dataset_id}'")
|
||||
azureml_dataset = Dataset.get_by_name(workspace, name=azure_dataset_id)
|
||||
logging.info("Dataset found.")
|
||||
except:
|
||||
logging.info(f"Dataset does not yet exist, creating a new one from data in folder '{azure_dataset_id}'")
|
||||
# Ensure that there is a / at the end of the file path, otherwise folders that share a prefix could create
|
||||
# trouble (for example, folders foo and foo_bar exist, and I'm trying to create a dataset from "foo")
|
||||
azureml_dataset = Dataset.File.from_files(path=(datastore, azure_dataset_id + "/"))
|
||||
logging.info("Registering the dataset for future use.")
|
||||
azureml_dataset.register(workspace, name=azure_dataset_id)
|
||||
return azureml_dataset
|
||||
|
||||
def get_dataset_consumption(self,
|
||||
azure_dataset_id: str,
|
||||
dataset_index: int,
|
||||
mountpoint: str) -> DatasetConsumptionConfig:
|
||||
"""
|
||||
Creates a configuration for using an AzureML dataset inside of an AzureML run. This will make the AzureML
|
||||
dataset with given name available as a named input, using INPUT_DATA_KEY as the key.
|
||||
:param mountpoint: The path at which the dataset should be made available.
|
||||
:param azure_dataset_id: The name of the dataset in blob storage to be used for this run. This can be an empty
|
||||
string to not use any datasets.
|
||||
:param dataset_index: suffix for the dataset name, dataset name will be set to INPUT_DATA_KEY_idx
|
||||
"""
|
||||
status = f"Dataset {azure_dataset_id} (index {dataset_index}) will be "
|
||||
azureml_dataset = self.get_or_create_dataset(azure_dataset_id=azure_dataset_id)
|
||||
if not azureml_dataset:
|
||||
raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
|
||||
named_input = azureml_dataset.as_named_input(f"{INPUT_DATA_KEY}_{dataset_index}")
|
||||
path_on_compute = mountpoint or None
|
||||
if self.use_dataset_mount:
|
||||
status += "mounted at "
|
||||
result = named_input.as_mount(path_on_compute)
|
||||
else:
|
||||
status += "downloaded to "
|
||||
result = named_input.as_download(path_on_compute)
|
||||
if path_on_compute:
|
||||
status += f"{path_on_compute}."
|
||||
else:
|
||||
status += "a randomly chosen folder."
|
||||
logging.info(status)
|
||||
return result
|
||||
|
||||
|
||||
@dataclass
|
||||
class SourceConfig:
|
||||
|
@ -313,13 +252,6 @@ class SourceConfig:
|
|||
upload_timeout_seconds: int = 36000
|
||||
environment_variables: Optional[Dict[str, str]] = None
|
||||
|
||||
def set_script_params_except_submit_flag(self) -> None:
|
||||
"""
|
||||
Populates the script_param field of the present object from the arguments in sys.argv, with the exception
|
||||
of the "azureml" flag.
|
||||
"""
|
||||
self.script_params = remove_arg(AZURECONFIG_SUBMIT_TO_AZUREML, sys.argv[1:])
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParserResult:
|
||||
|
|
|
@ -4,37 +4,25 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
import argparse
|
||||
import getpass
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
from argparse import ArgumentError, ArgumentParser, Namespace
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from azureml.core import Environment, Experiment, Run, ScriptRunConfig
|
||||
from azureml.core.runconfig import MpiConfiguration, RunConfiguration
|
||||
from azureml.core.workspace import WORKSPACE_DEFAULT_BLOB_STORE_NAME
|
||||
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
|
||||
|
||||
from InnerEye.Azure import azure_util
|
||||
from InnerEye.Azure.azure_config import AzureConfig, ParserResult, SourceConfig
|
||||
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME, \
|
||||
RUN_RECOVERY_ID_KEY_NAME, is_offline_run_context, merge_conda_dependencies
|
||||
from InnerEye.Azure.azure_config import AzureConfig, ParserResult
|
||||
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME
|
||||
from InnerEye.Azure.secrets_handling import read_all_settings
|
||||
from InnerEye.Azure.tensorboard_monitor import AMLTensorBoardMonitorConfig, monitor
|
||||
from InnerEye.Common.generic_parsing import GenericConfig
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
from InnerEye.ML.utils.config_loader import ModelConfigLoader
|
||||
from health.azure.datasets import DatasetConfig
|
||||
|
||||
SLEEP_TIME_SECONDS = 30
|
||||
|
||||
RUN_RECOVERY_FILE = "most_recent_run.txt"
|
||||
# The version to use when creating an AzureML Python environment. We create all environments with a unique hashed
|
||||
# name, hence version will always be fixed
|
||||
ENVIRONMENT_VERSION = "1"
|
||||
DEFAULT_DOCKER_BASE_IMAGE = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04"
|
||||
|
||||
# Environment variables used for multi-node training
|
||||
ENV_AZ_BATCHAI_MPI_MASTER_NODE = "AZ_BATCHAI_MPI_MASTER_NODE"
|
||||
|
@ -47,47 +35,6 @@ ENV_GLOBAL_RANK = "GLOBAL_RANK"
|
|||
ENV_LOCAL_RANK = "LOCAL_RANK"
|
||||
|
||||
|
||||
def submit_to_azureml(azure_config: AzureConfig,
|
||||
source_config: SourceConfig,
|
||||
all_azure_dataset_ids: List[str],
|
||||
all_dataset_mountpoints: List[str]) -> Run:
|
||||
"""
|
||||
The main entry point when submitting the runner script to AzureML.
|
||||
It creates an AzureML workspace if needed, submits an experiment using the code
|
||||
as specified in source_config, and waits for completion if needed.
|
||||
:param azure_config: azure related configurations to set up a valid workspace
|
||||
:param source_config: The information about which code should be submitted, and which arguments should be used.
|
||||
:param all_azure_dataset_ids: The name of all datasets on blob storage that will be used for this run.
|
||||
:param all_dataset_mountpoints: When using mounted datasets in AzureML, these are the per-dataset mount points.
|
||||
The list must have the same length as all_azure_dataset_ids.
|
||||
"""
|
||||
azure_run: Optional[Run] = None
|
||||
|
||||
# When running as part of the PR build, jobs frequently get interrupted by new pushes to the repository.
|
||||
# In this case, we'd like to cancel the current AzureML run before exiting, to reduce cost.
|
||||
# However, at present, this does NOT work; the SIGINT is not propagated through.
|
||||
def interrupt_handler(signal: int, _: Any) -> None:
|
||||
logging.info('Process interrupted via signal {}'.format(str(signal)))
|
||||
if azure_run:
|
||||
logging.info('Trying to terminate the AzureML job now.')
|
||||
azure_run.cancel()
|
||||
sys.exit(0)
|
||||
|
||||
for s in [signal.SIGINT, signal.SIGTERM]:
|
||||
signal.signal(s, interrupt_handler)
|
||||
# create train/test experiment
|
||||
script_run_config = create_run_config(azure_config, source_config, all_azure_dataset_ids, all_dataset_mountpoints)
|
||||
commandline_args = " ".join(source_config.script_params)
|
||||
azure_run = create_and_submit_experiment(azure_config, script_run_config, commandline_args=commandline_args)
|
||||
|
||||
if azure_config.wait_for_completion:
|
||||
# We want the job output to be visible on the console, but the program should not exit if the
|
||||
# job fails because we need to download the pytest result file.
|
||||
azure_run.wait_for_completion(show_output=True, raise_on_error=False)
|
||||
|
||||
return azure_run
|
||||
|
||||
|
||||
def get_git_tags(azure_config: AzureConfig) -> Dict[str, str]:
|
||||
"""
|
||||
Creates a dictionary with git-related information, like branch and commit ID. The dictionary key is a string
|
||||
|
@ -107,27 +54,25 @@ def get_git_tags(azure_config: AzureConfig) -> Dict[str, str]:
|
|||
}
|
||||
|
||||
|
||||
def set_run_tags(run: Run, azure_config: AzureConfig, commandline_args: str) -> None:
|
||||
def additional_run_tags(azure_config: AzureConfig, commandline_args: str) -> Dict[str, str]:
|
||||
"""
|
||||
Set metadata for the run
|
||||
:param run: Run to set metadata for.
|
||||
Gets the set of tags that will be added to the AzureML run as metadata, like git status and user name.
|
||||
:param azure_config: The configurations for the present AzureML job
|
||||
:param commandline_args: A string that holds all commandline arguments that were used for the present run.
|
||||
"""
|
||||
git_information = get_git_tags(azure_config)
|
||||
run.set_tags({
|
||||
return {
|
||||
"tag": azure_config.tag,
|
||||
"model_name": azure_config.model,
|
||||
"execution_mode": ModelExecutionMode.TRAIN.value if azure_config.train else ModelExecutionMode.TEST.value,
|
||||
RUN_RECOVERY_ID_KEY_NAME: azure_util.create_run_recovery_id(run=run),
|
||||
RUN_RECOVERY_FROM_ID_KEY_NAME: azure_config.run_recovery_id,
|
||||
"build_number": str(azure_config.build_number),
|
||||
"build_user": azure_config.build_user,
|
||||
"build_user_email": azure_config.build_user_email,
|
||||
**git_information,
|
||||
"commandline_args": commandline_args,
|
||||
CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY: -1,
|
||||
})
|
||||
CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY: "-1",
|
||||
}
|
||||
|
||||
|
||||
def create_experiment_name(azure_config: AzureConfig) -> str:
|
||||
|
@ -144,136 +89,19 @@ def create_experiment_name(azure_config: AzureConfig) -> str:
|
|||
return branch or getpass.getuser() + f"_local_branch_{date.today().strftime('%Y%m')}"
|
||||
|
||||
|
||||
def create_and_submit_experiment(azure_config: AzureConfig,
|
||||
script_run_config: ScriptRunConfig,
|
||||
commandline_args: str) -> Run:
|
||||
"""
|
||||
Creates an AzureML experiment in the workspace and submits it for execution.
|
||||
:param azure_config: azure related configurations to setup a valid workspace.
|
||||
:param script_run_config: The configuration for the script that should be run inside of AzureML.
|
||||
:param commandline_args: A string with all commandline arguments that were provided to the runner. These are only
|
||||
used to set a tag on the submitted AzureML run.
|
||||
:returns: Run object for the submitted AzureML run
|
||||
"""
|
||||
workspace = azure_config.get_workspace()
|
||||
experiment_name = create_experiment_name(azure_config)
|
||||
exp = Experiment(workspace=workspace, name=azure_util.to_azure_friendly_string(experiment_name))
|
||||
|
||||
# submit a training/testing run associated with the experiment
|
||||
run: Run = exp.submit(script_run_config)
|
||||
|
||||
if is_offline_run_context(run):
|
||||
# This codepath will only be executed in unit tests, when exp.submit is mocked.
|
||||
return run
|
||||
|
||||
# Set metadata for the run.
|
||||
set_run_tags(run, azure_config, commandline_args=commandline_args)
|
||||
|
||||
print("\n==============================================================================")
|
||||
print(f"Successfully queued new run {run.id} in experiment: {exp.name}")
|
||||
|
||||
if azure_config.run_recovery_id:
|
||||
print(f"\nRecovered from: {azure_config.run_recovery_id}")
|
||||
|
||||
recovery_id = azure_util.create_run_recovery_id(run)
|
||||
recovery_file = Path(RUN_RECOVERY_FILE)
|
||||
if recovery_file.exists():
|
||||
recovery_file.unlink()
|
||||
recovery_file.write_text(recovery_id)
|
||||
|
||||
print("Experiment URL: {}".format(exp.get_portal_url()))
|
||||
print("Run URL: {}".format(run.get_portal_url()))
|
||||
print("If this run fails, re-start runner.py and supply these additional arguments: "
|
||||
f"--run_recovery_id={recovery_id}")
|
||||
print(f"The run recovery ID has been written to this file: {recovery_file}")
|
||||
print("==============================================================================")
|
||||
if azure_config.tensorboard and azure_config.azureml:
|
||||
print("Starting TensorBoard now because you specified --tensorboard")
|
||||
monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[run.id]), azure_config=azure_config)
|
||||
else:
|
||||
print(f"To monitor this run locally using TensorBoard, run the script: "
|
||||
f"InnerEye/Azure/tensorboard_monitor.py --run_ids={run.id}")
|
||||
print("==============================================================================")
|
||||
return run
|
||||
|
||||
|
||||
def get_or_create_python_environment(azure_config: AzureConfig,
|
||||
source_config: SourceConfig,
|
||||
environment_name: str = "",
|
||||
register_environment: bool = True) -> Environment:
|
||||
"""
|
||||
Creates a description for the Python execution environment in AzureML, based on the Conda environment
|
||||
definition files that are specified in `source_config`. If an environment with this Conda definition already
|
||||
exists, it is retrieved, otherwise created afresh.
|
||||
:param azure_config: azure related configurations to use for model scale-out behaviour
|
||||
:param source_config: configurations for model execution, such as name and execution mode
|
||||
:param environment_name: If specified, try to retrieve the existing Python environment with this name. If that
|
||||
is not found, create one from the Conda files provided. This parameter is meant to be used when running
|
||||
inference for an existing model.
|
||||
:param register_environment: If True, the Python environment will be registered in the AzureML workspace. If
|
||||
False, it will only be created, but not registered. Use this for unit testing.
|
||||
"""
|
||||
# Merge the project-specific dependencies with the packages that InnerEye itself needs. This should not be
|
||||
# necessary if the innereye package is installed. It is necessary when working with an outer project and
|
||||
# InnerEye as a git submodule and submitting jobs from the local machine.
|
||||
# In case of version conflicts, the package version in the outer project is given priority.
|
||||
conda_dependencies, merged_yaml = merge_conda_dependencies(source_config.conda_dependencies_files) # type: ignore
|
||||
if azure_config.pip_extra_index_url:
|
||||
# When an extra-index-url is supplied, swap the order in which packages are searched for.
|
||||
# This is necessary if we need to consume packages from extra-index that clash with names of packages on
|
||||
# pypi
|
||||
conda_dependencies.set_pip_option(f"--index-url {azure_config.pip_extra_index_url}")
|
||||
conda_dependencies.set_pip_option("--extra-index-url https://pypi.org/simple")
|
||||
env_variables = {
|
||||
"AZUREML_OUTPUT_UPLOAD_TIMEOUT_SEC": str(source_config.upload_timeout_seconds),
|
||||
# Occasionally uploading data during the run takes too long, and makes the job fail. Default is 300.
|
||||
"AZUREML_RUN_KILL_SIGNAL_TIMEOUT_SEC": "900",
|
||||
"MKL_SERVICE_FORCE_INTEL": "1",
|
||||
# Switching to a new software stack in AML for mounting datasets
|
||||
"RSLEX_DIRECT_VOLUME_MOUNT": "true",
|
||||
"RSLEX_DIRECT_VOLUME_MOUNT_MAX_CACHE_SIZE": "1",
|
||||
**(source_config.environment_variables or {})
|
||||
}
|
||||
base_image = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04"
|
||||
# Create a name for the environment that will likely uniquely identify it. AzureML does hashing on top of that,
|
||||
# and will re-use existing environments even if they don't have the same name.
|
||||
# Hashing should include everything that can reasonably change. Rely on hashlib here, because the built-in
|
||||
# hash function gives different results for the same string in different python instances.
|
||||
hash_string = "\n".join([merged_yaml, azure_config.docker_shm_size, base_image, str(env_variables)])
|
||||
sha1 = hashlib.sha1(hash_string.encode("utf8"))
|
||||
overall_hash = sha1.hexdigest()[:32]
|
||||
unique_env_name = f"InnerEye-{overall_hash}"
|
||||
try:
|
||||
env_name_to_find = environment_name or unique_env_name
|
||||
env = Environment.get(azure_config.get_workspace(), name=env_name_to_find, version=ENVIRONMENT_VERSION)
|
||||
logging.info(f"Using existing Python environment '{env.name}'.")
|
||||
return env
|
||||
except Exception:
|
||||
logging.info(f"Python environment '{unique_env_name}' does not yet exist, creating and registering it.")
|
||||
env = Environment(name=unique_env_name)
|
||||
env.docker.enabled = True
|
||||
env.docker.shm_size = azure_config.docker_shm_size
|
||||
env.python.conda_dependencies = conda_dependencies
|
||||
env.docker.base_image = base_image
|
||||
env.environment_variables = env_variables
|
||||
if register_environment:
|
||||
env.register(azure_config.get_workspace())
|
||||
return env
|
||||
|
||||
|
||||
def create_dataset_consumptions(azure_config: AzureConfig,
|
||||
all_azure_dataset_ids: List[str],
|
||||
all_dataset_mountpoints: List[str]) -> List[DatasetConsumptionConfig]:
|
||||
def create_dataset_configs(azure_config: AzureConfig,
|
||||
all_azure_dataset_ids: List[str],
|
||||
all_dataset_mountpoints: List[str]) -> List[DatasetConfig]:
|
||||
"""
|
||||
Sets up all the dataset consumption objects for the datasets provided. Datasets that have an empty name will be
|
||||
skipped.
|
||||
:param azure_config: azure related configurations to use for model scale-out behaviour
|
||||
:param all_azure_dataset_ids: The name of all datasets on blob storage that will be used for this run.
|
||||
:param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
|
||||
:return: A list of DatasetConsumptionConfig, in the same order as datasets were provided in all_azure_dataset_ids,
|
||||
:return: A list of DatasetConfig objects, in the same order as datasets were provided in all_azure_dataset_ids,
|
||||
omitting datasets with an empty name.
|
||||
"""
|
||||
dataset_consumptions: List[DatasetConsumptionConfig] = []
|
||||
datasets: List[DatasetConfig] = []
|
||||
if len(all_dataset_mountpoints) > 0:
|
||||
if len(all_azure_dataset_ids) != len(all_dataset_mountpoints):
|
||||
raise ValueError(f"The number of dataset mount points ({len(all_dataset_mountpoints)}) "
|
||||
|
@ -282,64 +110,14 @@ def create_dataset_consumptions(azure_config: AzureConfig,
|
|||
all_dataset_mountpoints = [""] * len(all_azure_dataset_ids)
|
||||
for i, (dataset_id, mount_point) in enumerate(zip(all_azure_dataset_ids, all_dataset_mountpoints)):
|
||||
if dataset_id:
|
||||
dataset_consumption = azure_config.get_dataset_consumption(dataset_id, i, mount_point)
|
||||
dataset_consumptions.append(dataset_consumption)
|
||||
datasets.append(DatasetConfig(name=dataset_id,
|
||||
target_folder=mount_point,
|
||||
use_mounting=azure_config.use_dataset_mount,
|
||||
datastore=azure_config.azureml_datastore))
|
||||
elif mount_point:
|
||||
raise ValueError(f"Inconsistent setup: Dataset name at index {i} is empty, but a mount point has "
|
||||
f"been provided ('{mount_point}')")
|
||||
return dataset_consumptions
|
||||
|
||||
|
||||
def create_run_config(azure_config: AzureConfig,
|
||||
source_config: SourceConfig,
|
||||
all_azure_dataset_ids: List[str],
|
||||
all_dataset_mountpoints: List[str],
|
||||
environment_name: str = "") -> ScriptRunConfig:
|
||||
"""
|
||||
Creates a configuration to run the InnerEye training script in AzureML.
|
||||
:param azure_config: azure related configurations to use for model scale-out behaviour
|
||||
:param source_config: configurations for model execution, such as name and execution mode
|
||||
:param all_azure_dataset_ids: The name of all datasets on blob storage that will be used for this run.
|
||||
:param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
|
||||
:param environment_name: If specified, try to retrieve the existing Python environment with this name. If that
|
||||
is not found, create one from the Conda files provided in `source_config`. This parameter is meant to be used
|
||||
when running inference for an existing model.
|
||||
:return: The configured script run.
|
||||
"""
|
||||
dataset_consumptions = create_dataset_consumptions(azure_config, all_azure_dataset_ids, all_dataset_mountpoints)
|
||||
# AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
|
||||
entry_script_relative_path = source_config.entry_script.relative_to(source_config.root_folder).as_posix()
|
||||
logging.info(f"Entry script {entry_script_relative_path} ({source_config.entry_script} relative to "
|
||||
f"source directory {source_config.root_folder})")
|
||||
max_run_duration = None
|
||||
if azure_config.max_run_duration:
|
||||
max_run_duration = run_duration_string_to_seconds(azure_config.max_run_duration)
|
||||
workspace = azure_config.get_workspace()
|
||||
run_config = RunConfiguration(
|
||||
script=entry_script_relative_path,
|
||||
arguments=source_config.script_params,
|
||||
)
|
||||
run_config.environment = get_or_create_python_environment(azure_config, source_config,
|
||||
environment_name=environment_name)
|
||||
run_config.target = azure_config.cluster
|
||||
run_config.max_run_duration_seconds = max_run_duration
|
||||
if azure_config.num_nodes > 1:
|
||||
distributed_job_config = MpiConfiguration(node_count=azure_config.num_nodes)
|
||||
run_config.mpi = distributed_job_config
|
||||
run_config.framework = "Python"
|
||||
run_config.communicator = "IntelMpi"
|
||||
run_config.node_count = distributed_job_config.node_count
|
||||
if len(dataset_consumptions) > 0:
|
||||
run_config.data = {dataset.name: dataset for dataset in dataset_consumptions}
|
||||
# Use blob storage for storing the source, rather than the FileShares section of the storage account.
|
||||
run_config.source_directory_data_store = workspace.datastores.get(WORKSPACE_DEFAULT_BLOB_STORE_NAME).name
|
||||
script_run_config = ScriptRunConfig(
|
||||
source_directory=str(source_config.root_folder),
|
||||
run_config=run_config,
|
||||
)
|
||||
if azure_config.hyperdrive:
|
||||
script_run_config = source_config.hyperdrive_config_func(script_run_config) # type: ignore
|
||||
return script_run_config
|
||||
return datasets
|
||||
|
||||
|
||||
def create_runner_parser(model_config_class: type = None) -> argparse.ArgumentParser:
|
||||
|
|
|
@ -5,19 +5,15 @@
|
|||
import logging
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Generator, List, Optional, Tuple
|
||||
|
||||
import conda_merge
|
||||
import ruamel.yaml
|
||||
from azureml._restclient.constants import RunStatus
|
||||
from azureml.core import Experiment, Run, Workspace, get_run
|
||||
from azureml.core.conda_dependencies import CondaDependencies
|
||||
from azureml.exceptions import UserErrorException
|
||||
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME
|
||||
from health.azure.azure_util import create_run_recovery_id
|
||||
|
||||
DEFAULT_CROSS_VALIDATION_SPLIT_INDEX = -1
|
||||
EXPERIMENT_RUN_SEPARATOR = ":"
|
||||
|
@ -45,16 +41,6 @@ INNEREYE_SDK_NAME = "innereye"
|
|||
INNEREYE_SDK_VERSION = "1.0"
|
||||
|
||||
|
||||
def create_run_recovery_id(run: Run) -> str:
|
||||
"""
|
||||
Creates a recovery id for a run so that its checkpoints can be recovered for training/testing
|
||||
|
||||
:param run: an instantiated run.
|
||||
:return: recovery id for a given run in format: [experiment name]:[run id]
|
||||
"""
|
||||
return str(run.experiment.name + EXPERIMENT_RUN_SEPARATOR + run.id)
|
||||
|
||||
|
||||
def split_recovery_id(id: str) -> Tuple[str, str]:
|
||||
"""
|
||||
Splits a run ID into the experiment name and the actual run.
|
||||
|
@ -149,12 +135,13 @@ def fetch_child_runs(run: Run, status: Optional[str] = None,
|
|||
run = PARENT_RUN_CONTEXT
|
||||
children_runs = list(run.get_children(tags=RUN_RECOVERY_ID_KEY_NAME))
|
||||
if 0 < expected_number_cross_validation_splits != len(children_runs):
|
||||
logging.warning(
|
||||
f"The expected number of child runs was {expected_number_cross_validation_splits}."
|
||||
f"Fetched only: {len(children_runs)} runs. Now trying to fetch them manually.")
|
||||
run_ids_to_evaluate = [f"{create_run_recovery_id(run)}_{i}"
|
||||
for i in range(expected_number_cross_validation_splits)]
|
||||
children_runs = [fetch_run(run.experiment.workspace, id) for id in run_ids_to_evaluate]
|
||||
if 0 < expected_number_cross_validation_splits != len(children_runs):
|
||||
logging.warning(
|
||||
f"The expected number of child runs was {expected_number_cross_validation_splits}."
|
||||
f"Fetched only: {len(children_runs)} runs. Now trying to fetch them manually.")
|
||||
run_ids_to_evaluate = [f"{create_run_recovery_id(run)}_{i}"
|
||||
for i in range(expected_number_cross_validation_splits)]
|
||||
children_runs = [fetch_run(run.experiment.workspace, id) for id in run_ids_to_evaluate]
|
||||
if status is not None:
|
||||
children_runs = [child_run for child_run in children_runs if child_run.get_status() == status]
|
||||
return children_runs
|
||||
|
@ -237,53 +224,6 @@ def strip_prefix(string: str, prefix: str) -> str:
|
|||
return string
|
||||
|
||||
|
||||
def _log_conda_dependencies_stats(conda: CondaDependencies, message_prefix: str) -> None:
|
||||
"""
|
||||
Write number of conda and pip packages to logs.
|
||||
:param conda: A conda dependencies object
|
||||
:param message_prefix: A message to prefix to the log string.
|
||||
"""
|
||||
conda_packages_count = len(list(conda.conda_packages))
|
||||
pip_packages_count = len(list(conda.pip_packages))
|
||||
logging.info(f"{message_prefix}: {conda_packages_count} conda packages, {pip_packages_count} pip packages")
|
||||
logging.debug(" Conda packages:")
|
||||
for p in conda.conda_packages:
|
||||
logging.debug(f" {p}")
|
||||
logging.debug(" Pip packages:")
|
||||
for p in conda.pip_packages:
|
||||
logging.debug(f" {p}")
|
||||
|
||||
|
||||
def merge_conda_files(files: List[Path], result_file: Path) -> None:
|
||||
"""
|
||||
Merges the given Conda environment files using the conda_merge package, and writes the merged file to disk.
|
||||
:param files: The Conda environment files to read.
|
||||
:param result_file: The location where the merge results should be written.
|
||||
"""
|
||||
# This code is a slightly modified version of conda_merge. That code can't be re-used easily
|
||||
# because it defaults to writing to stdout.
|
||||
env_definitions = [conda_merge.read_file(str(f)) for f in files]
|
||||
unified_definition = {}
|
||||
NAME = "name"
|
||||
CHANNELS = "channels"
|
||||
DEPENDENCIES = "dependencies"
|
||||
name = conda_merge.merge_names(env.get(NAME) for env in env_definitions)
|
||||
if name:
|
||||
unified_definition[NAME] = name
|
||||
try:
|
||||
channels = conda_merge.merge_channels(env.get(CHANNELS) for env in env_definitions)
|
||||
except conda_merge.MergeError:
|
||||
logging.error("Failed to merge channel priorities.")
|
||||
raise
|
||||
if channels:
|
||||
unified_definition[CHANNELS] = channels
|
||||
deps = conda_merge.merge_dependencies(env.get(DEPENDENCIES) for env in env_definitions)
|
||||
if deps:
|
||||
unified_definition[DEPENDENCIES] = deps
|
||||
with result_file.open("w") as f:
|
||||
ruamel.yaml.dump(unified_definition, f, indent=2, default_flow_style=False)
|
||||
|
||||
|
||||
def get_all_environment_files(project_root: Path) -> List[Path]:
|
||||
"""
|
||||
Returns a list of all Conda environment files that should be used. This is firstly the InnerEye conda file,
|
||||
|
@ -299,27 +239,6 @@ def get_all_environment_files(project_root: Path) -> List[Path]:
|
|||
return files
|
||||
|
||||
|
||||
def merge_conda_dependencies(files: List[Path]) -> Tuple[CondaDependencies, str]:
|
||||
"""
|
||||
Creates a CondaDependencies object from the Conda environments specified in one or more files.
|
||||
The resulting object contains the union of the Conda and pip packages in the files, where merging
|
||||
is done via the conda_merge package.
|
||||
:param files: The Conda environment files to read.
|
||||
:return: Tuple of (CondaDependencies object that contains packages from all the files,
|
||||
string contents of the merge Conda environment)
|
||||
"""
|
||||
for file in files:
|
||||
_log_conda_dependencies_stats(CondaDependencies(file), f"Conda environment in {file}")
|
||||
temp_merged_file = tempfile.NamedTemporaryFile(delete=False)
|
||||
merged_file_path = Path(temp_merged_file.name)
|
||||
merge_conda_files(files, result_file=merged_file_path)
|
||||
merged_dependencies = CondaDependencies(temp_merged_file.name)
|
||||
_log_conda_dependencies_stats(merged_dependencies, "Merged Conda environment")
|
||||
merged_file_contents = merged_file_path.read_text()
|
||||
temp_merged_file.close()
|
||||
return merged_dependencies, merged_file_contents
|
||||
|
||||
|
||||
def tag_values_all_distinct(runs: List[Run], tag: str) -> bool:
|
||||
"""
|
||||
Returns True iff the runs all have the specified tag and all the values are different.
|
||||
|
@ -395,26 +314,6 @@ def is_running_on_azure_agent() -> bool:
|
|||
return bool(os.environ.get("AGENT_OS", None))
|
||||
|
||||
|
||||
def is_run_and_child_runs_completed(run: Run) -> bool:
|
||||
"""
|
||||
Checks if the given run has successfully completed. If the run has child runs, it also checks if the child runs
|
||||
completed successfully.
|
||||
:param run: The AzureML run to check.
|
||||
:return: True if the run and all child runs completed successfully.
|
||||
"""
|
||||
|
||||
def is_completed(run: Run) -> bool:
|
||||
status = run.get_status()
|
||||
if run.status == RunStatus.COMPLETED:
|
||||
return True
|
||||
logging.info(f"Run {run.id} in experiment {run.experiment.name} finished with status {status}.")
|
||||
return False
|
||||
|
||||
runs = list(run.get_children())
|
||||
runs.append(run)
|
||||
return all(is_completed(run) for run in runs)
|
||||
|
||||
|
||||
def get_comparison_baseline_paths(outputs_folder: Path,
|
||||
blob_path: Path, run: Run,
|
||||
dataset_csv_file_name: str) -> \
|
||||
|
@ -458,43 +357,3 @@ def step_up_directories(path: Path) -> Generator[Path, None, None]:
|
|||
if parent == path:
|
||||
break
|
||||
path = parent
|
||||
|
||||
|
||||
def remove_arg(arg: str, args: List[str]) -> List[str]:
|
||||
"""
|
||||
Remove an argument from a list of arguments. The argument list is assumed to contain
|
||||
elements of the form:
|
||||
"-a", "--arg1", "--arg2", "value2", or "--arg3=value"
|
||||
If there is an item matching "--arg" then it will be removed from the list.
|
||||
|
||||
:param arg: Argument to look for.
|
||||
:param args: List of arguments to scan.
|
||||
:return: List of arguments with --arg removed, if present.
|
||||
"""
|
||||
arg_opt = f"--{arg}"
|
||||
no_arg_opt = f"--no-{arg}"
|
||||
retained_args = []
|
||||
i = 0
|
||||
while i < len(args):
|
||||
arg = args[i]
|
||||
if arg.startswith(arg_opt):
|
||||
if len(arg) == len(arg_opt):
|
||||
# The commandline argument is "--arg", with something possibly following: This can either be
|
||||
# "--arg_opt value" or "--arg_opt --some_other_param"
|
||||
if i < (len(args) - 1):
|
||||
# If the next argument starts with a "-" then assume that it does not belong to the --arg
|
||||
# argument. If there is no "-", assume it belongs to the --arg_opt argument, and skip both
|
||||
if not args[i + 1].startswith("-"):
|
||||
i = i + 1
|
||||
elif arg[len(arg_opt)] == "=":
|
||||
# The commandline argument is "--arg=value": Continue with next arg
|
||||
pass
|
||||
else:
|
||||
# The argument list contains an argument like "--arg_other_param": Keep that.
|
||||
retained_args.append(arg)
|
||||
elif arg == no_arg_opt:
|
||||
pass
|
||||
else:
|
||||
retained_args.append(arg)
|
||||
i = i + 1
|
||||
return retained_args
|
||||
|
|
|
@ -14,7 +14,6 @@ from functools import wraps
|
|||
from pathlib import Path
|
||||
from typing import Any, Callable, Generator, Iterable, List, Optional, Union
|
||||
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.Common.fixed_paths import repository_root_directory
|
||||
from InnerEye.Common.type_annotations import PathOrString
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
|
@ -25,7 +24,6 @@ MAX_PATH_LENGTH = 260
|
|||
empty_string_to_none = lambda x: None if (x is None or len(x.strip()) == 0) else x
|
||||
string_to_path = lambda x: None if (x is None or len(x.strip()) == 0) else Path(x)
|
||||
|
||||
|
||||
SUBJECT_METRICS_FILE_NAME = "metrics.csv"
|
||||
EPOCH_METRICS_FILE_NAME = "epoch_metrics.csv"
|
||||
METRICS_AGGREGATES_FILE = "metrics_aggregates.csv"
|
||||
|
@ -390,20 +388,6 @@ def remove_file_or_directory(pth: Path) -> None:
|
|||
pth.unlink()
|
||||
|
||||
|
||||
def add_folder_to_sys_path_if_needed(folder_under_repo_root: str) -> None:
|
||||
"""
|
||||
Checks if the Python paths in sys.path already contain the given folder, which is expected to be relative
|
||||
to the repository root. If that folder is not yet in sys.path, add it.
|
||||
"""
|
||||
full_folder = repository_root_directory() / folder_under_repo_root
|
||||
for path_str in sys.path:
|
||||
path = Path(path_str)
|
||||
if path == full_folder:
|
||||
return
|
||||
print(f"Adding {full_folder} to sys.path")
|
||||
sys.path.append(str(full_folder))
|
||||
|
||||
|
||||
@contextmanager
|
||||
def change_working_directory(path_or_str: PathOrString) -> Generator:
|
||||
"""
|
||||
|
@ -414,16 +398,3 @@ def change_working_directory(path_or_str: PathOrString) -> Generator:
|
|||
os.chdir(new_path)
|
||||
yield
|
||||
os.chdir(old_path)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def append_to_amlignore(lines_to_append: List[str]) -> Generator:
|
||||
"""
|
||||
Context manager that appends lines to the .amlignore file, and reverts to the previous contents after.
|
||||
"""
|
||||
amlignore = fixed_paths.repository_root_directory(".amlignore")
|
||||
old_contents = amlignore.read_text()
|
||||
new_contents = old_contents.splitlines() + lines_to_append
|
||||
amlignore.write_text("\n".join(new_contents))
|
||||
yield
|
||||
amlignore.write_text(old_contents)
|
||||
|
|
|
@ -2,7 +2,9 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
@ -91,3 +93,28 @@ def get_environment_yaml_file() -> Path:
|
|||
raise ValueError(f"File {ENVIRONMENT_YAML_FILE_NAME} was not found not found in the package folder "
|
||||
f"{INNEREYE_PACKAGE_ROOT}, and not in the repository root {repository_root_directory()}.")
|
||||
return env
|
||||
|
||||
|
||||
def add_submodules_to_path() -> None:
|
||||
"""
|
||||
This function adds all submodules that the code uses to sys.path and to the environment variables. This is
|
||||
necessary to make the code work without any further changes when switching from/to using hi-ml as a package
|
||||
or as a submodule for development.
|
||||
It also adds the InnerEye root folder to sys.path. The latter is necessary to make AzureML and Pytorch Lightning
|
||||
work together: When spawning additional processes for DDP, the working directory is not correctly picked
|
||||
up in sys.path.
|
||||
"""
|
||||
innereye_root = repository_root_directory()
|
||||
folders_to_add = [(innereye_root, "InnerEye"),
|
||||
(innereye_root / "fastMRI", "fastmri"),
|
||||
(innereye_root / "hi-ml" / "src", "health")]
|
||||
for (folder, subfolder_that_must_exist) in folders_to_add:
|
||||
if (folder / subfolder_that_must_exist).is_dir():
|
||||
folder_str = str(folder)
|
||||
if folder_str not in sys.path:
|
||||
logging.debug(f"Adding folder {folder} to sys.path")
|
||||
sys.path.insert(0, folder_str)
|
||||
else:
|
||||
logging.debug(f"Not adding folder {folder} because it is already in sys.path")
|
||||
else:
|
||||
logging.debug(f"Not adding folder {folder} because it does not have subfolder {subfolder_that_must_exist}")
|
||||
|
|
|
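
The report notebooks and the fastMRI wrapper changed below now rely on the add_submodules_to_path helper added above instead of ad-hoc sys.path editing. The call pattern, exactly as it appears in the updated notebooks, is:

from InnerEye.Common.fixed_paths import add_submodules_to_path
add_submodules_to_path()  # makes the fastMRI and hi-ml submodule sources importable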
@ -16,11 +16,7 @@ import torch
|
|||
from pytorch_lightning import LightningDataModule, LightningModule
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
from InnerEye.Common.common_util import add_folder_to_sys_path_if_needed
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
|
||||
add_folder_to_sys_path_if_needed("fastMRI")
|
||||
|
||||
from fastmri.data.subsample import create_mask_for_mask_type
|
||||
from fastmri.data.transforms import VarNetDataTransform
|
||||
from fastmri.pl_modules import FastMriDataModule, VarNetModule
|
||||
|
|
|
@ -73,9 +73,8 @@ def main(yaml_file_path: Path) -> None:
|
|||
In addition, the arguments '--image_channel' and '--gt_channel' must be specified (see below).
|
||||
"""
|
||||
config, runner_config, args = get_configs(SegmentationModelBase(should_validate=False), yaml_file_path)
|
||||
local_dataset = MLRunner(config, azure_config=runner_config).mount_or_download_dataset(config.azure_dataset_id,
|
||||
config.local_dataset)
|
||||
assert local_dataset is not None
|
||||
runner = MLRunner(config, azure_config=runner_config)
|
||||
local_dataset = runner.download_or_use_existing_dataset(config.azure_dataset_id, config.local_dataset)
|
||||
dataframe = pd.read_csv(local_dataset / DATASET_CSV_FILE_NAME)
|
||||
normalizer_config = NormalizeAndVisualizeConfig(**args)
|
||||
actual_mask_channel = None if normalizer_config.ignore_mask else config.mask_id
|
||||
|
|
|
@ -50,6 +50,8 @@
|
|||
"import sys\n",
|
||||
"if str(innereye_path) not in sys.path:\n",
|
||||
" sys.path.append(str(innereye_path))\n",
|
||||
"from InnerEye.Common.fixed_paths import add_submodules_to_path\n",
|
||||
"add_submodules_to_path()\n",
|
||||
"\n",
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
|
|
|
@ -56,6 +56,8 @@
|
|||
"import sys\n",
|
||||
"if str(innereye_path) not in sys.path:\n",
|
||||
" sys.path.append(str(innereye_path))\n",
|
||||
"from InnerEye.Common.fixed_paths import add_submodules_to_path\n",
|
||||
"add_submodules_to_path()\n",
|
||||
"\n",
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
|
|
|
@ -45,6 +45,9 @@
|
|||
"\n",
|
||||
"if str(innereye_path) not in sys.path:\n",
|
||||
" sys.path.append(str(innereye_path))\n",
|
||||
"from InnerEye.Common.fixed_paths import add_submodules_to_path\n",
|
||||
"add_submodules_to_path()\n",
|
||||
"\n",
|
||||
"from InnerEye.ML.reports.segmentation_report import plot_scores_for_csv\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings(\"ignore\")\n",
|
||||
|
@ -159,4 +162,4 @@
|
|||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
}
|
|
@ -21,12 +21,12 @@ from pytorch_lightning.core.datamodule import LightningDataModule
|
|||
from torch.utils.data import DataLoader
|
||||
|
||||
from InnerEye.Azure import azure_util
|
||||
from InnerEye.Azure.azure_config import AzureConfig, INPUT_DATA_KEY
|
||||
from InnerEye.Azure.azure_runner import ENVIRONMENT_VERSION, ENV_OMPI_COMM_WORLD_RANK, get_git_tags
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Azure.azure_runner import ENV_OMPI_COMM_WORLD_RANK, get_git_tags
|
||||
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, \
|
||||
EFFECTIVE_RANDOM_SEED_KEY_NAME, IS_ENSEMBLE_KEY_NAME, MODEL_ID_KEY_NAME, PARENT_RUN_CONTEXT, \
|
||||
PARENT_RUN_ID_KEY_NAME, RUN_CONTEXT, RUN_RECOVERY_FROM_ID_KEY_NAME, RUN_RECOVERY_ID_KEY_NAME, \
|
||||
create_run_recovery_id, get_all_environment_files, is_offline_run_context, merge_conda_files
|
||||
get_all_environment_files, is_offline_run_context
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.Common.common_util import BASELINE_COMPARISONS_FOLDER, BASELINE_WILCOXON_RESULTS_FILE, \
|
||||
CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME, FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, \
|
||||
|
@ -35,11 +35,12 @@ from InnerEye.Common.common_util import BASELINE_COMPARISONS_FOLDER, BASELINE_WI
|
|||
change_working_directory, get_best_epoch_results_path, is_windows, logging_section, logging_to_file, \
|
||||
print_exception, remove_file_or_directory
|
||||
from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME, LOG_FILE_NAME, PYTHON_ENVIRONMENT_NAME
|
||||
from InnerEye.Common.type_annotations import PathOrString
|
||||
from InnerEye.ML.baselines_util import compare_folders_and_run_outputs
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
from InnerEye.ML.config import SegmentationModelBase
|
||||
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, DeepLearningConfig, FINAL_ENSEMBLE_MODEL_FOLDER, \
|
||||
FINAL_MODEL_FOLDER, ModelCategory, MultiprocessingStartMethod, load_checkpoint, EXTRA_RUN_SUBFOLDER
|
||||
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, DeepLearningConfig, EXTRA_RUN_SUBFOLDER, \
|
||||
FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER, ModelCategory, MultiprocessingStartMethod, load_checkpoint
|
||||
from InnerEye.ML.lightning_base import InnerEyeContainer
|
||||
from InnerEye.ML.lightning_container import InnerEyeInference, LightningContainer
|
||||
from InnerEye.ML.metrics import InferenceMetrics, InferenceMetricsForSegmentation
|
||||
|
@ -57,26 +58,14 @@ from InnerEye.ML.utils.run_recovery import RunRecovery
|
|||
from InnerEye.ML.visualizers import activation_maps
|
||||
from InnerEye.ML.visualizers.plot_cross_validation import \
|
||||
get_config_and_results_for_offline_runs, plot_cross_validation_from_files
|
||||
from health.azure.azure_util import ENVIRONMENT_VERSION, create_run_recovery_id, merge_conda_files
|
||||
from health.azure.datasets import get_or_create_dataset
|
||||
from health.azure.himl import AzureRunInfo
|
||||
|
||||
ModelDeploymentHookSignature = Callable[[LightningContainer, AzureConfig, Model, ModelProcessing], Any]
|
||||
PostCrossValidationHookSignature = Callable[[ModelConfigBase, Path], None]
|
||||
|
||||
|
||||
def try_to_mount_input_dataset(dataset_index: int = 0) -> Optional[Path]:
|
||||
"""
|
||||
Checks if the AzureML run context has a field for input datasets. If yes, the dataset stored there is
|
||||
returned as a Path. Returns None if no input dataset was found.
|
||||
|
||||
:param dataset_index: suffix of AML dataset name, return path to INPUT_DATA_KEY_idx dataset
|
||||
"""
|
||||
if hasattr(RUN_CONTEXT, "input_datasets"):
|
||||
try:
|
||||
return Path(RUN_CONTEXT.input_datasets[f"{INPUT_DATA_KEY}_{dataset_index}"])
|
||||
except KeyError:
|
||||
logging.warning(f"Run context field input_datasets has no {INPUT_DATA_KEY}_{dataset_index} entry.")
|
||||
return None
|
||||
|
||||
|
||||
def download_dataset(azure_dataset_id: str,
|
||||
target_folder: Path,
|
||||
dataset_csv: str,
|
||||
|
@ -95,7 +84,9 @@ def download_dataset(azure_dataset_id: str,
|
|||
:return: A path on the local machine that contains the dataset.
|
||||
"""
|
||||
logging.info("Trying to download dataset via AzureML datastore now.")
|
||||
azure_dataset = azure_config.get_or_create_dataset(azure_dataset_id)
|
||||
azure_dataset = get_or_create_dataset(workspace=azure_config.get_workspace(),
|
||||
datastore_name=azure_config.azureml_datastore,
|
||||
dataset_name=azure_dataset_id)
|
||||
if not isinstance(azure_dataset, FileDataset):
|
||||
raise ValueError(f"Expected to get a FileDataset, but got {type(azure_dataset)}")
|
||||
# The downloaded dataset may already exist from a previous run.
|
||||
|
@ -122,6 +113,20 @@ def download_dataset(azure_dataset_id: str,
|
|||
return expected_dataset_path
|
||||
|
||||
|
||||
def check_dataset_folder_exists(local_dataset: PathOrString) -> Path:
|
||||
"""
|
||||
Checks if a folder with a local dataset exists. If it does exist, return the argument converted to a Path instance.
|
||||
If it does not exist, raise a FileNotFoundError.
|
||||
:param local_dataset: The dataset folder to check.
|
||||
:return: The local_dataset argument, converted to a Path.
|
||||
"""
|
||||
expected_dir = Path(local_dataset)
|
||||
if not expected_dir.is_dir():
|
||||
raise FileNotFoundError(f"The model uses a dataset in {expected_dir}, but that does not exist.")
|
||||
logging.info(f"Model training will use the local dataset provided in {expected_dir}")
|
||||
return expected_dir
|
||||
|
||||
|
||||
def log_metrics(metrics: Dict[ModelExecutionMode, InferenceMetrics],
|
||||
run_context: Run) -> None:
|
||||
"""
|
||||
|
@ -185,36 +190,40 @@ class MLRunner:
|
|||
self.output_subfolder = output_subfolder
|
||||
self._has_setup_run = False
|
||||
|
||||
def setup(self, use_mount_or_download_dataset: bool = True) -> None:
|
||||
def setup(self, azure_run_info: Optional[AzureRunInfo] = None) -> None:
|
||||
"""
|
||||
If the present object is using one of the InnerEye built-in models, create a (fake) container for it
|
||||
and call the setup method. It sets the random seeds, and then creates the actual Lightning modules.
|
||||
:param use_mount_or_download_dataset: If True, try to download or mount the dataset that is used by the model.
|
||||
If False, assume that the dataset is already available (this should only be used for unit tests).
|
||||
:param azure_run_info: When running in AzureML or on a local VM, this contains the paths to the datasets.
|
||||
This can be missing when running in unit tests, where the local dataset paths are already populated.
|
||||
"""
|
||||
if self._has_setup_run:
|
||||
return
|
||||
if (not self.azure_config.only_register_model) and use_mount_or_download_dataset:
|
||||
if (not self.azure_config.only_register_model) and azure_run_info:
|
||||
dataset_index = 0
|
||||
# Set local_dataset to the mounted path specified in azure_runner.py, if any, or download it if that fails
|
||||
# and config.local_dataset was not already set.
|
||||
# This must happen before container setup because that could already read datasets.
|
||||
mounted_dataset = self.mount_or_download_dataset(self.container.azure_dataset_id,
|
||||
self.container.local_dataset)
|
||||
if mounted_dataset is not None:
|
||||
if self.container.azure_dataset_id:
|
||||
mounted_dataset = azure_run_info.input_datasets[dataset_index]
|
||||
if mounted_dataset is None:
|
||||
mounted_dataset = self.download_or_use_existing_dataset(self.container.azure_dataset_id,
|
||||
self.container.local_dataset)
|
||||
self.container.local_dataset = mounted_dataset
|
||||
|
||||
extra_locals = []
|
||||
if self.is_offline_run and len(self.container.extra_local_dataset_paths) != 0:
|
||||
for local in self.container.extra_local_dataset_paths:
|
||||
extra_local_dataset = self.mount_or_download_dataset(None, local)
|
||||
assert extra_local_dataset is not None # for mypy
|
||||
extra_locals.append(extra_local_dataset)
|
||||
elif len(self.container.extra_azure_dataset_ids) != 0:
|
||||
for i, azure_id in enumerate(self.container.extra_azure_dataset_ids, 1):
|
||||
extra_local_dataset = self.mount_or_download_dataset(azure_id, None, dataset_index=i)
|
||||
assert extra_local_dataset is not None # for mypy
|
||||
extra_locals.append(extra_local_dataset)
|
||||
self.container.extra_local_dataset_paths = extra_locals
|
||||
dataset_index += 1
|
||||
if self.container.extra_azure_dataset_ids:
|
||||
num_extra_local_datasets = len(self.container.extra_local_dataset_paths)
|
||||
extra_locals: List[Path] = []
|
||||
for i, extra_azure_id in enumerate(self.container.extra_azure_dataset_ids):
|
||||
if num_extra_local_datasets > 0 and i < (num_extra_local_datasets - 1):
|
||||
raise ValueError(f"The model refers to an Azure dataset '{extra_azure_id}' at index {i}, "
|
||||
f"but there are not enough local datasets given ")
|
||||
mounted_dataset = azure_run_info.input_datasets[dataset_index]
|
||||
if mounted_dataset is None:
|
||||
local = None if num_extra_local_datasets == 0 else self.container.extra_local_dataset_paths[i]
|
||||
mounted_dataset = self.download_or_use_existing_dataset(extra_azure_id, local_dataset=local)
|
||||
extra_locals.append(mounted_dataset)
|
||||
self.container.extra_local_dataset_paths = extra_locals
|
||||
# Ensure that we use fixed seeds before initializing the PyTorch models
|
||||
seed_everything(self.container.get_effective_random_seed())
|
||||
# Creating the folder structure must happen before the LightningModule is created, because the output
|
||||
|
@ -235,10 +244,11 @@ class MLRunner:
|
|||
|
||||
if self.container.pretraining_run_recovery_id is not None:
|
||||
run_to_recover = self.azure_config.fetch_run(self.container.pretraining_run_recovery_id.strip())
|
||||
only_return_path = not is_global_rank_zero()
|
||||
run_recovery_object = RunRecovery.download_all_checkpoints_from_run(self.container,
|
||||
run_to_recover,
|
||||
EXTRA_RUN_SUBFOLDER,
|
||||
only_return_path=not is_global_rank_zero())
|
||||
only_return_path=only_return_path)
|
||||
self.container.pretraining_run_checkpoints = run_recovery_object
|
||||
|
||||
# A lot of the code for the built-in InnerEye models expects the output paths directly in the config files.
|
||||
|
@ -543,54 +553,30 @@ class MLRunner:
|
|||
activation_maps.extract_activation_maps(self.innereye_config) # type: ignore
|
||||
logging.info("Successfully extracted and saved activation maps")
|
||||
|
||||
def mount_or_download_dataset(self,
|
||||
azure_dataset_id: Optional[str],
|
||||
local_dataset: Optional[Path],
|
||||
dataset_index: int = 0) -> Optional[Path]:
|
||||
def download_or_use_existing_dataset(self,
|
||||
azure_dataset_id: Optional[str],
|
||||
local_dataset: Optional[Path]) -> Path:
|
||||
"""
|
||||
Makes the dataset that the model uses available on the executing machine. If the present training run is outside
|
||||
of AzureML, it expects that the model has a `local_dataset` field set, in which case no action will be
|
||||
taken. If a dataset is specified in `azure_dataset_id`, it will attempt to download the dataset from Azure
|
||||
into the local repository, in the "datasets" folder.
|
||||
If the training run is inside of AzureML, the dataset that was specified at job submission time will be
|
||||
mounted or downloaded.
|
||||
:param azure_dataset_id: id of the dataset in AML workspace
|
||||
:param local_dataset: alternatively local path for this dataset
|
||||
:param dataset_index: index of the dataset processed
|
||||
:returns: the path of the dataset on the executing machine.
|
||||
:returns: The path of the dataset on the executing machine.
|
||||
"""
|
||||
if self.is_offline_run:
# A dataset, either local or in Azure, is required for the built-in InnerEye models. When models are
# specified via a LightningContainer, these dataset fields are optional, because the container datasets
# could be downloaded even from the web.
is_dataset_required = isinstance(self.container, InnerEyeContainer)
# The present run is outside of AzureML: If local_dataset is set, use that as the path to the data.
# Otherwise, download the dataset specified by the azure_dataset_id
if is_dataset_required:
if (not azure_dataset_id) and (local_dataset is None):
raise ValueError("The model must contain either local_dataset or azure_dataset_id.")
if local_dataset:
expected_dir = Path(local_dataset)
if not expected_dir.is_dir():
raise FileNotFoundError(f"The model uses a dataset in {expected_dir}, but that does not exist.")
logging.info(f"Model training will use the local dataset provided in {expected_dir}")
return expected_dir
if azure_dataset_id:
dataset_csv = ""
if isinstance(self.model_config, DeepLearningConfig):
dataset_csv = self.model_config.dataset_csv
return download_dataset(azure_dataset_id=azure_dataset_id,
target_folder=self.project_root / fixed_paths.DATASETS_DIR_NAME,
dataset_csv=dataset_csv, azure_config=self.azure_config)
return None

# Inside of AzureML, datasets can be either mounted or downloaded.
if not self.is_offline_run:
raise ValueError("This function should only be called in runs outside AzureML.")
if local_dataset:
return check_dataset_folder_exists(local_dataset)
if azure_dataset_id:
mounted = try_to_mount_input_dataset(dataset_index)
if not mounted:
raise ValueError("Unable to mount or download input dataset.")
return mounted
return None
dataset_csv = ""
if isinstance(self.model_config, DeepLearningConfig):
dataset_csv = self.model_config.dataset_csv
return download_dataset(azure_dataset_id=azure_dataset_id,
target_folder=self.project_root / fixed_paths.DATASETS_DIR_NAME,
dataset_csv=dataset_csv, azure_config=self.azure_config)
raise ValueError("The model must contain either local_dataset or azure_dataset_id")

def set_multiprocessing_start_method(self) -> None:
"""

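For context, a minimal sketch of how the renamed dataset helper is typically exercised in an offline run; this mirrors the test code further down and is illustrative only (the `runner` object and its container fields are assumed to be set up in the usual way):

# Illustrative sketch, not part of this commit: resolve the dataset for an offline run.
# Assumes `runner` is an MLRunner whose container defines azure_dataset_id / local_dataset.
local_path = runner.download_or_use_existing_dataset(runner.container.azure_dataset_id,
                                                     runner.container.local_dataset)
print(f"Training will read data from {local_path}")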
@ -2,12 +2,12 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import logging
import os
import sys
import warnings
import uuid
from pathlib import Path

import matplotlib
from typing import Optional, Tuple

# Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress
# individual warnings only.

@ -21,29 +21,29 @@ if (innereye_root / "InnerEye").is_dir():
if innereye_root_str not in sys.path:
print(f"Adding InnerEye folder to sys.path: {innereye_root_str}")
sys.path.insert(0, innereye_root_str)
# We change the current working directory before starting the actual training. However, this throws off starting
# the child training threads because sys.argv[0] is a relative path when running in AzureML. Turn that into an absolute
# path.
runner_path = Path(sys.argv[0])
if not runner_path.is_absolute():
sys.argv[0] = str(runner_path.absolute())

import logging
from typing import Optional, Tuple
from InnerEye.Common import fixed_paths
fixed_paths.add_submodules_to_path()

from azureml._base_sdk_common import user_agent
from azureml.core import Run
from azureml.core import Run, ScriptRunConfig
from health.azure.himl import AzureRunInfo, submit_to_azure_if_needed
from health.azure.azure_util import create_run_recovery_id, merge_conda_files, to_azure_friendly_string
import matplotlib

from InnerEye.Azure.tensorboard_monitor import AMLTensorBoardMonitorConfig, monitor
from InnerEye.Azure import azure_util
from InnerEye.Azure.azure_config import AzureConfig, ParserResult, SourceConfig
from InnerEye.Azure.azure_runner import create_runner_parser, get_git_tags, parse_args_and_add_yaml_variables, \
parse_arguments, set_environment_variables_for_multi_node, submit_to_azureml
from InnerEye.Azure.azure_util import RUN_CONTEXT, get_all_environment_files, is_offline_run_context, \
is_run_and_child_runs_completed
from InnerEye.Azure.azure_runner import (DEFAULT_DOCKER_BASE_IMAGE, create_dataset_configs, create_experiment_name,
create_runner_parser,
get_git_tags,
parse_args_and_add_yaml_variables,
parse_arguments, additional_run_tags,
set_environment_variables_for_multi_node)
from InnerEye.Azure.azure_util import (RUN_CONTEXT, RUN_RECOVERY_ID_KEY_NAME, get_all_environment_files,
is_offline_run_context)
from InnerEye.Azure.run_pytest import download_pytest_result, run_pytest
from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, \
append_to_amlignore, disable_logging_to_file, is_linux, logging_to_stdout
from InnerEye.Common.common_util import (FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE,
disable_logging_to_file, is_linux, logging_to_stdout)
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.ML.common import DATASET_CSV_FILE_NAME
from InnerEye.ML.deep_learning_config import DeepLearningConfig

@ -54,6 +54,13 @@ from InnerEye.ML.run_ml import MLRunner, ModelDeploymentHookSignature, PostCross
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.lightning_container import LightningContainer

# We change the current working directory before starting the actual training. However, this throws off starting
# the child training threads because sys.argv[0] is a relative path when running in AzureML. Turn that into an absolute
# path.
runner_path = Path(sys.argv[0])
if not runner_path.is_absolute():
sys.argv[0] = str(runner_path.absolute())


def initialize_rpdb() -> None:
"""

@ -185,7 +192,7 @@ class Runner:
logging.info("extra_code_directory is unset")
return parser_result

def run(self) -> Tuple[Optional[DeepLearningConfig], Optional[Run]]:
def run(self) -> Tuple[Optional[DeepLearningConfig], AzureRunInfo]:
"""
The main entry point for training and testing models from the commandline. This chooses a model to train
via a commandline argument, runs training or testing, and writes all required info to disk and logs.

@ -201,64 +208,130 @@ class Runner:
if self.lightning_container.perform_cross_validation:
# force hyperdrive usage if performing cross validation
self.azure_config.hyperdrive = True
run_object: Optional[Run] = None
if self.azure_config.azureml:
run_object = self.submit_to_azureml()
else:
self.run_in_situ()
azure_run_info = self.submit_to_azureml_if_needed()
self.run_in_situ(azure_run_info)
if self.model_config is None:
return self.lightning_container, run_object
return self.model_config, run_object
return self.lightning_container, azure_run_info
return self.model_config, azure_run_info

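# A short sketch of exercising this entry point programmatically, mirroring how the tests further down
# drive it; illustrative only and not part of this commit. The model name is an example taken from the
# test code, and the settings file is the default one used throughout the repo.
from unittest import mock
from InnerEye.Common import fixed_paths
from InnerEye.ML.runner import Runner

runner = Runner(project_root=fixed_paths.repository_root_directory(),
                yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)
with mock.patch("sys.argv", ["", "--model=HelloContainer"]):
    loaded_config, azure_run_info = runner.run()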
def submit_to_azureml(self) -> Run:
|
||||
def submit_to_azureml_if_needed(self) -> AzureRunInfo:
|
||||
"""
|
||||
Submit a job to AzureML, returning the resulting Run object, or exiting if we were asked to wait for
|
||||
completion and the Run did not succeed.
|
||||
"""
|
||||
# The adal package creates a logging.info line each time it gets an authentication token, avoid that.
|
||||
logging.getLogger('adal-python').setLevel(logging.WARNING)
|
||||
# Azure core prints full HTTP requests even in INFO mode
|
||||
logging.getLogger('azure').setLevel(logging.WARNING)
|
||||
# PyJWT prints out warnings that are beyond our control
|
||||
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||
if isinstance(self.model_config, DeepLearningConfig) and not self.lightning_container.azure_dataset_id:
|
||||
if self.azure_config.azureml and isinstance(self.model_config, DeepLearningConfig) \
|
||||
and not self.lightning_container.azure_dataset_id:
|
||||
raise ValueError("When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
|
||||
"property must be set.")
|
||||
source_config = SourceConfig(
|
||||
root_folder=self.project_root,
|
||||
entry_script=Path(sys.argv[0]).resolve(),
|
||||
script_params=sys.argv[1:],
|
||||
conda_dependencies_files=get_all_environment_files(self.project_root),
|
||||
hyperdrive_config_func=(self.model_config.get_hyperdrive_config if self.model_config
|
||||
else self.lightning_container.get_hyperdrive_config),
|
||||
# For large jobs, upload of results can time out because of large checkpoint files. Default is 600
|
||||
upload_timeout_seconds=86400
|
||||
)
|
||||
source_config.set_script_params_except_submit_flag()
|
||||
# Reduce the size of the snapshot by adding unused folders to amlignore. The Test* subfolders are only needed
|
||||
# when running pytest.
|
||||
ignored_folders = []
|
||||
if not self.azure_config.pytest_mark:
|
||||
ignored_folders.extend(["Tests", "TestsOutsidePackage", "TestSubmodule"])
|
||||
ignored_folders.extend(["Tests", "TestsOutsidePackage"])
|
||||
if not self.lightning_container.regression_test_folder:
|
||||
ignored_folders.append("RegressionTestResults")
|
||||
with append_to_amlignore(ignored_folders):
|
||||
azure_run = submit_to_azureml(self.azure_config, source_config,
|
||||
self.lightning_container.all_azure_dataset_ids(),
|
||||
self.lightning_container.all_dataset_mountpoints())
|
||||
logging.info("Job submission to AzureML done.")
|
||||
if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
|
||||
# The AzureML job can optionally run pytest. Attempt to download it to the current directory.
|
||||
# A build step will pick up that file and publish it to Azure DevOps.
|
||||
# If pytest_mark is set, this file must exist.
|
||||
logging.info("Downloading pytest result file.")
|
||||
download_pytest_result(azure_run)
|
||||
else:
|
||||
logging.info("No pytest_mark present, hence not downloading the pytest result file.")
|
||||
# For PR builds where we wait for job completion, the job must have ended in a COMPLETED state.
|
||||
if self.azure_config.wait_for_completion and not is_run_and_child_runs_completed(azure_run):
|
||||
raise ValueError(f"Run {azure_run.id} in experiment {azure_run.experiment.name} or one of its child "
|
||||
"runs failed.")
|
||||
return azure_run
|
||||
|
||||
input_datasets = create_dataset_configs(self.azure_config,
|
||||
all_azure_dataset_ids=self.lightning_container.all_azure_dataset_ids(),
|
||||
all_dataset_mountpoints=self.lightning_container.all_dataset_mountpoints())
|
||||
|
||||
|
||||
def after_submission_hook(azure_run: Run) -> None:
|
||||
"""
|
||||
A function that will be called right after job submission.
|
||||
"""
|
||||
# Add an extra tag that depends on the run that was actually submitted. This is used for later filtering
# runs in cross validation analysis
recovery_id = create_run_recovery_id(azure_run)
|
||||
azure_run.tag(RUN_RECOVERY_ID_KEY_NAME, recovery_id)
|
||||
print("If this run fails, re-start runner.py and supply these additional arguments: "
|
||||
f"--run_recovery_id={recovery_id}")
|
||||
if self.azure_config.tensorboard:
|
||||
print("Starting TensorBoard now because you specified --tensorboard")
|
||||
monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[azure_run.id]),
|
||||
azure_config=self.azure_config)
|
||||
else:
|
||||
print(f"To monitor this run locally using TensorBoard, run the script: "
|
||||
f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}")
|
||||
|
||||
if self.azure_config.wait_for_completion:
|
||||
# We want the job output to be visible on the console, but the program should not exit if the
|
||||
# job fails because we need to download the pytest result file.
|
||||
azure_run.wait_for_completion(show_output=True, raise_on_error=False)
|
||||
if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
|
||||
# The AzureML job can optionally run pytest. Attempt to download it to the current directory.
|
||||
# A build step will pick up that file and publish it to Azure DevOps.
|
||||
# If pytest_mark is set, this file must exist.
|
||||
logging.info("Downloading pytest result file.")
|
||||
download_pytest_result(azure_run)
|
||||
|
||||
hyperdrive_config = None
|
||||
if self.azure_config.hyperdrive:
|
||||
hyperdrive_config = self.lightning_container.get_hyperdrive_config(ScriptRunConfig(source_directory=""))
|
||||
|
||||
# Create a temporary file for the merged conda file, that will be removed after submission of the job.
|
||||
temp_conda: Optional[Path] = None
|
||||
try:
|
||||
if len(source_config.conda_dependencies_files) > 1:
|
||||
temp_conda = source_config.root_folder / f"temp_environment-{uuid.uuid4().hex[:8]}.yml"
|
||||
# Merge the project-specific dependencies with the packages that InnerEye itself needs. This should not
|
||||
# be necessary if the innereye package is installed. It is necessary when working with an outer project
|
||||
# and InnerEye as a git submodule and submitting jobs from the local machine.
|
||||
# In case of version conflicts, the package version in the outer project is given priority.
|
||||
merge_conda_files(source_config.conda_dependencies_files, temp_conda)
|
||||
|
||||
# Calls like `self.azure_config.get_workspace()` will fail if we have no AzureML credentials set up, and so
|
||||
# we should only attempt them if we intend to elevate this to AzureML
|
||||
if self.azure_config.azureml:
|
||||
if not self.azure_config.cluster:
raise ValueError("self.azure_config.cluster not set, but we need a compute_cluster_name to submit "
"the script to run in AzureML")
azure_run_info = submit_to_azure_if_needed(
|
||||
entry_script=source_config.entry_script,
|
||||
snapshot_root_directory=source_config.root_folder,
|
||||
script_params=source_config.script_params,
|
||||
conda_environment_file=temp_conda or source_config.conda_dependencies_files[0],
|
||||
aml_workspace=self.azure_config.get_workspace(),
|
||||
compute_cluster_name=self.azure_config.cluster,
|
||||
environment_variables=source_config.environment_variables,
|
||||
default_datastore=self.azure_config.azureml_datastore,
|
||||
experiment_name=to_azure_friendly_string(create_experiment_name(self.azure_config)),
|
||||
max_run_duration=self.azure_config.max_run_duration,
|
||||
input_datasets=input_datasets,
|
||||
num_nodes=self.azure_config.num_nodes,
|
||||
wait_for_completion=False,
|
||||
ignored_folders=ignored_folders,
|
||||
pip_extra_index_url=self.azure_config.pip_extra_index_url,
|
||||
submit_to_azureml=self.azure_config.azureml,
|
||||
docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,
|
||||
docker_shm_size=self.azure_config.docker_shm_size,
|
||||
tags=additional_run_tags(
|
||||
azure_config=self.azure_config,
|
||||
commandline_args=" ".join(source_config.script_params)),
|
||||
after_submission=after_submission_hook,
|
||||
hyperdrive_config=hyperdrive_config)
|
||||
else:
|
||||
# compute_cluster_name is a required parameter in early versions of the HI-ML package
|
||||
azure_run_info = submit_to_azure_if_needed(
|
||||
input_datasets=input_datasets,
|
||||
submit_to_azureml=False,
|
||||
compute_cluster_name="")
|
||||
finally:
|
||||
if temp_conda:
|
||||
temp_conda.unlink()
|
||||
# submit_to_azure_if_needed calls sys.exit after submitting to AzureML. We only reach this when running
|
||||
# the script locally or in AzureML.
|
||||
return azure_run_info
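# A minimal, self-contained sketch of driving the hi-ml submission entry point directly; illustrative
# only and not part of this commit. The conda file and cluster name below are placeholder assumptions,
# all other argument names appear in the code above.
from pathlib import Path
from health.azure.himl import submit_to_azure_if_needed

run_info = submit_to_azure_if_needed(entry_script=Path("InnerEye/ML/runner.py"),
                                     snapshot_root_directory=Path.cwd(),
                                     conda_environment_file=Path("environment.yml"),  # assumed file name
                                     compute_cluster_name="training-nc12",  # assumed cluster name
                                     submit_to_azureml=False)  # set True to actually submit the job
print(run_info.is_running_in_azure)
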
def print_git_tags(self) -> None:
|
||||
"""
|
||||
|
@ -278,9 +351,11 @@ class Runner:
|
|||
for key, value in tags_to_print.items():
|
||||
logging.info(f" {key:20}: {value}")
|
||||
|
||||
def run_in_situ(self) -> None:
|
||||
def run_in_situ(self, azure_run_info: AzureRunInfo) -> None:
|
||||
"""
|
||||
Actually run the AzureML job; this method will typically run on an Azure VM.
|
||||
:param azure_run_info: Contains all information about the present run in AzureML, in particular where the
|
||||
datasets are mounted.
|
||||
"""
|
||||
# Only set the logging level now. Usually, when we set logging to DEBUG, we want diagnostics about the model
|
||||
# build itself, but not the tons of debug information that AzureML submissions create.
|
||||
|
@ -305,7 +380,7 @@ class Runner:
|
|||
# if it detects that it is not in a multi-node environment.
|
||||
set_environment_variables_for_multi_node()
|
||||
ml_runner = self.create_ml_runner()
|
||||
ml_runner.setup()
|
||||
ml_runner.setup(azure_run_info)
|
||||
ml_runner.start_logging_to_file()
|
||||
try:
|
||||
ml_runner.run()
|
||||
|
|
|
@ -11,15 +11,15 @@ from typing import Dict, List, Optional
|
|||
|
||||
import param
|
||||
import requests
|
||||
from azureml.core import Experiment, Model
|
||||
from azureml.core import Model, ScriptRunConfig
|
||||
|
||||
from InnerEye.Azure.azure_config import AzureConfig, SourceConfig
|
||||
from InnerEye.Azure.azure_runner import create_run_config
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Common.common_util import logging_to_stdout
|
||||
from InnerEye.Common.fixed_paths import DEFAULT_DATA_FOLDER, DEFAULT_RESULT_IMAGE_NAME, DEFAULT_RESULT_ZIP_DICOM_NAME, \
|
||||
DEFAULT_TEST_IMAGE_NAME, DEFAULT_TEST_ZIP_NAME, ENVIRONMENT_YAML_FILE_NAME, RUN_SCORING_SCRIPT, SCORE_SCRIPT, \
|
||||
SETTINGS_YAML_FILE, repository_root_directory, PYTHON_ENVIRONMENT_NAME
|
||||
DEFAULT_TEST_IMAGE_NAME, DEFAULT_TEST_ZIP_NAME, ENVIRONMENT_YAML_FILE_NAME, PYTHON_ENVIRONMENT_NAME, \
|
||||
RUN_SCORING_SCRIPT, SCORE_SCRIPT, SETTINGS_YAML_FILE, repository_root_directory
|
||||
from InnerEye.Common.generic_parsing import GenericConfig
|
||||
from health.azure.himl import create_run_configuration, submit_run
|
||||
|
||||
|
||||
class SubmitForInferenceConfig(GenericConfig):
|
||||
|
@ -29,8 +29,9 @@ class SubmitForInferenceConfig(GenericConfig):
|
|||
experiment_name: str = param.String(default="model_inference",
|
||||
doc="Name of experiment the run should belong to")
|
||||
model_id: str = param.String(doc="Id of model, e.g. Prostate:123. Mandatory.")
|
||||
image_file: Path = param.ClassSelector(class_=Path, doc="Image file to segment, ending in .nii.gz if use_dicom=False, "
|
||||
"or zip of a DICOM series otherwise. Mandatory.")
|
||||
image_file: Path = param.ClassSelector(class_=Path,
|
||||
doc="Image file to segment, ending in .nii.gz if use_dicom=False, "
|
||||
"or zip of a DICOM series otherwise. Mandatory.")
|
||||
settings: Path = param.ClassSelector(class_=Path,
|
||||
doc="File containing Azure settings (typically your settings.yml). If not "
|
||||
"provided, use the default settings file.")
|
||||
|
@ -66,7 +67,8 @@ class SubmitForInferenceConfig(GenericConfig):
|
|||
def copy_image_file(image: Path, destination_folder: Path, use_dicom: bool) -> Path:
|
||||
"""
|
||||
Copy the source image file into the given folder destination_folder.
|
||||
:param image: image file, must be Gzipped Nifti format with name ending .nii.gz if use_dicom=False or .zip otherwise.
|
||||
:param image: image file, must be Gzipped Nifti format with name ending .nii.gz if use_dicom=False or .zip
|
||||
otherwise.
|
||||
:param destination_folder: top-level directory to copy image into (as test.nii.gz or test.zip)
|
||||
:param use_dicom: True to treat as a zip file.
|
||||
:return: The full path of the image in the destination_folder
|
||||
|
@ -154,43 +156,45 @@ def submit_for_inference(args: SubmitForInferenceConfig, azure_config: AzureConf
|
|||
# clash.
|
||||
temp_folder = source_directory_path / "temp_for_scoring"
|
||||
conda_files = download_files_from_model(model_sas_urls, ENVIRONMENT_YAML_FILE_NAME, dir_path=temp_folder)
|
||||
if not conda_files:
|
||||
raise ValueError("At least 1 Conda environment definition must exist in the model.")
|
||||
if len(conda_files) != 1:
|
||||
raise ValueError("Exactly 1 Conda environment definition must exist in the model.")
|
||||
# Retrieve the name of the Python environment that the training run used. This environment should have been
# registered. If no such environment exists, it will be re-created from the Conda files provided.
python_environment_name = model.tags.get(PYTHON_ENVIRONMENT_NAME, "")
|
||||
if not python_environment_name:
|
||||
raise ValueError(f"The model did not contain tag {PYTHON_ENVIRONMENT_NAME} for the AzureML environment to use.")
|
||||
# Copy the scoring script from the repository. This will start the model download from Azure, and invoke the
|
||||
# scoring script.
|
||||
entry_script = source_directory_path / Path(RUN_SCORING_SCRIPT).name
|
||||
shutil.copyfile(str(repository_root_directory(RUN_SCORING_SCRIPT)),
|
||||
str(entry_script))
|
||||
source_config = SourceConfig(
|
||||
root_folder=source_directory_path,
|
||||
entry_script=entry_script,
|
||||
script_params=["--model-folder", ".",
|
||||
"--model-id", model_id,
|
||||
SCORE_SCRIPT,
|
||||
# The data folder must be relative to the root folder of the AzureML job. test_image_files
|
||||
# is then just the file relative to the data_folder
|
||||
"--data_folder", image.parent.name,
|
||||
"--image_files", image.name,
|
||||
"--use_dicom", str(args.use_dicom),
|
||||
"--model_id", model_id],
|
||||
conda_dependencies_files=conda_files,
|
||||
run_config = create_run_configuration(workspace=azure_config.get_workspace(),
|
||||
compute_cluster_name=azure_config.cluster,
|
||||
aml_environment_name=python_environment_name)
|
||||
script_run_config = ScriptRunConfig(
|
||||
source_directory=str(source_directory_path),
|
||||
script=entry_script.relative_to(source_directory_path),
|
||||
arguments=["--model-folder", ".",
|
||||
"--model-id", model_id,
|
||||
SCORE_SCRIPT,
|
||||
# The data folder must be relative to the root folder of the AzureML
|
||||
# job. image_files is then just the file relative to the data_folder
|
||||
"--data_folder", image.parent.name,
|
||||
"--image_files", image.name,
|
||||
"--use_dicom", str(args.use_dicom),
|
||||
"--model_id", model_id],
|
||||
run_config=run_config
|
||||
)
|
||||
run_config = create_run_config(azure_config, source_config, environment_name=python_environment_name,
|
||||
all_azure_dataset_ids=[], all_dataset_mountpoints=[])
|
||||
exp = Experiment(workspace=workspace, name=args.experiment_name)
|
||||
run = exp.submit(run_config)
|
||||
logging.info(f"Submitted run {run.id} in experiment {run.experiment.name}")
|
||||
logging.info(f"Run URL: {run.get_portal_url()}")
|
||||
|
||||
run = submit_run(workspace=workspace,
|
||||
experiment_name=args.experiment_name,
|
||||
script_run_config=script_run_config,
|
||||
wait_for_completion=True)
|
||||
if not args.keep_upload_folder:
|
||||
source_directory.cleanup()
|
||||
logging.info(f"Deleted submission directory {source_directory_path}")
|
||||
if args.download_folder is None:
|
||||
return None
|
||||
logging.info("Awaiting run completion")
|
||||
run.wait_for_completion()
|
||||
logging.info(f"Run has completed with status {run.get_status()}")
|
||||
download_file = DEFAULT_RESULT_ZIP_DICOM_NAME if args.use_dicom else DEFAULT_RESULT_IMAGE_NAME
|
||||
download_path = choose_download_path(download_file, args.download_folder)
|
||||
|
|
|
@ -20,9 +20,9 @@ import numpy as np
|
|||
import pytest
|
||||
from azureml._restclient.constants import RunStatus
|
||||
from azureml.core import Model, Run
|
||||
from health.azure.himl import RUN_RECOVERY_FILE
|
||||
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Azure.azure_runner import RUN_RECOVERY_FILE
|
||||
from InnerEye.Azure.azure_util import MODEL_ID_KEY_NAME, download_run_output_file, download_run_outputs_by_prefix, \
|
||||
get_comparison_baseline_paths, \
|
||||
is_running_on_azure_agent, to_azure_friendly_string
|
||||
|
@ -101,6 +101,7 @@ def get_most_recent_model_id(fallback_run_id_for_local_execution: str = FALLBACK
|
|||
azure_config = AzureConfig.from_yaml(fixed_paths.SETTINGS_YAML_FILE,
|
||||
project_root=fixed_paths.repository_root_directory())
|
||||
run = azure_config.fetch_run(most_recent_run)
|
||||
assert run.status == "Completed", f"AzureML run {run.id} did not complete successfully."
|
||||
tags = run.get_tags()
|
||||
model_id = tags.get(MODEL_ID_KEY_NAME, None)
|
||||
assert model_id, f"No model_id tag was found on run {most_recent_run}"
|
||||
|
@ -209,40 +210,41 @@ def test_check_dataset_mountpoint(test_output_dirs: OutputFolderForTests) -> Non
|
|||
|
||||
|
||||
@pytest.mark.inference
|
||||
@pytest.mark.parametrize("use_dicom", [False, True])
|
||||
def test_submit_for_inference(use_dicom: bool, test_output_dirs: OutputFolderForTests) -> None:
|
||||
def test_submit_for_inference(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Execute the submit_for_inference script on the model that was recently trained. This starts an AzureML job,
|
||||
and downloads the segmentation. Then check if the segmentation was actually produced.
|
||||
|
||||
:param use_dicom: True to test DICOM in/out, False otherwise.
|
||||
:param test_output_dirs: Test output directories.
|
||||
"""
|
||||
model = get_most_recent_model(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
|
||||
assert PYTHON_ENVIRONMENT_NAME in model.tags, "Environment name not present in model properties"
|
||||
if use_dicom:
|
||||
size = (64, 64, 64)
|
||||
spacing = (1., 1., 2.5)
|
||||
image_file = test_output_dirs.root_dir / "temp_pack_dicom_series" / "dicom_series.zip"
|
||||
scratch_folder = test_output_dirs.root_dir / "temp_dicom_series"
|
||||
zip_random_dicom_series(size, spacing, image_file, scratch_folder)
|
||||
else:
|
||||
image_file = fixed_paths_for_tests.full_ml_test_data_path() / "train_and_test_data" / "id1_channel1.nii.gz"
|
||||
assert image_file.exists(), f"Image file not found: {image_file}"
|
||||
settings_file = fixed_paths.SETTINGS_YAML_FILE
|
||||
assert settings_file.exists(), f"Settings file not found: {settings_file}"
|
||||
args = ["--image_file", str(image_file),
|
||||
"--model_id", model.id,
|
||||
"--settings", str(settings_file),
|
||||
"--download_folder", str(test_output_dirs.root_dir),
|
||||
"--cluster", "training-nc12",
|
||||
"--experiment", get_experiment_name_from_environment() or "model_inference",
|
||||
"--use_dicom", str(use_dicom)]
|
||||
download_file = DEFAULT_RESULT_ZIP_DICOM_NAME if use_dicom else DEFAULT_RESULT_IMAGE_NAME
|
||||
seg_path = test_output_dirs.root_dir / download_file
|
||||
assert not seg_path.exists(), f"Result file {seg_path} should not yet exist"
|
||||
submit_for_inference.main(args, project_root=fixed_paths.repository_root_directory())
|
||||
assert seg_path.exists(), f"Result file {seg_path} was not created"
|
||||
# Both parts of this test rely on the same model that was trained in a previous run. If these tests are executed
|
||||
# independently (via pytest.mark.parametrize), get_most_recent_model would pick up the AML run that the
|
||||
# previously executed part of this test submitted.
|
||||
for use_dicom in [False, True]:
|
||||
if use_dicom:
|
||||
size = (64, 64, 64)
|
||||
spacing = (1., 1., 2.5)
|
||||
image_file = test_output_dirs.root_dir / "temp_pack_dicom_series" / "dicom_series.zip"
|
||||
scratch_folder = test_output_dirs.root_dir / "temp_dicom_series"
|
||||
zip_random_dicom_series(size, spacing, image_file, scratch_folder)
|
||||
else:
|
||||
image_file = fixed_paths_for_tests.full_ml_test_data_path() / "train_and_test_data" / "id1_channel1.nii.gz"
|
||||
assert image_file.exists(), f"Image file not found: {image_file}"
|
||||
settings_file = fixed_paths.SETTINGS_YAML_FILE
|
||||
assert settings_file.exists(), f"Settings file not found: {settings_file}"
|
||||
args = ["--image_file", str(image_file),
|
||||
"--model_id", model.id,
|
||||
"--settings", str(settings_file),
|
||||
"--download_folder", str(test_output_dirs.root_dir),
|
||||
"--cluster", "training-nc12",
|
||||
"--experiment", get_experiment_name_from_environment() or "model_inference",
|
||||
"--use_dicom", str(use_dicom)]
|
||||
download_file = DEFAULT_RESULT_ZIP_DICOM_NAME if use_dicom else DEFAULT_RESULT_IMAGE_NAME
|
||||
seg_path = test_output_dirs.root_dir / download_file
|
||||
assert not seg_path.exists(), f"Result file {seg_path} should not yet exist"
|
||||
submit_for_inference.main(args, project_root=fixed_paths.repository_root_directory())
|
||||
assert seg_path.exists(), f"Result file {seg_path} was not created"
|
||||
|
||||
|
||||
def _check_presence_cross_val_metrics_file(split: str, mode: ModelExecutionMode, available_files: List[str]) -> bool:
|
||||
|
@ -375,9 +377,7 @@ def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None:
|
|||
# Check diagnostic messages that show if DDP was set up correctly. This could fail if Lightning
|
||||
# changes its diagnostic outputs.
|
||||
assert "initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/4" in log0_txt
|
||||
assert "initializing ddp: GLOBAL_RANK: 1, MEMBER: 2/4" in log0_txt
|
||||
assert "initializing ddp: GLOBAL_RANK: 2, MEMBER: 3/4" in log1_txt
|
||||
assert "initializing ddp: GLOBAL_RANK: 3, MEMBER: 4/4" in log1_txt
|
||||
|
||||
|
||||
@pytest.mark.skip("The recovery job hangs after completing on AML")
|
||||
|
@ -395,8 +395,10 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None:
|
|||
"--tag", "recovery_on_2_nodes"
|
||||
]
|
||||
script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py")
|
||||
with mock.patch("sys.argv", [script] + args_list):
|
||||
main()
|
||||
# Submission of the recovery job will try to exit the process, catch that and check the submitted run.
|
||||
with pytest.raises(SystemExit):
|
||||
with mock.patch("sys.argv", [script] + args_list):
|
||||
main()
|
||||
run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)
|
||||
assert run.status == RunStatus.COMPLETED
|
||||
files = run.get_file_names()
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
import pytest
|
||||
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Azure.azure_runner import create_dataset_consumptions
|
||||
from InnerEye.Azure.azure_runner import create_dataset_configs
|
||||
from Tests.ML.util import get_default_azure_config
|
||||
|
||||
|
||||
|
@ -20,7 +20,7 @@ def test_dataset_consumption1() -> None:
|
|||
Test that an empty dataset ID will not produce any dataset consumption.
|
||||
"""
|
||||
azure_config = get_default_azure_config()
|
||||
assert len(create_dataset_consumptions(azure_config, [""], [""])) == 0
|
||||
assert len(create_dataset_configs(azure_config, [""], [""])) == 0
|
||||
|
||||
|
||||
def test_dataset_consumption2() -> None:
|
||||
|
@ -29,7 +29,7 @@ def test_dataset_consumption2() -> None:
|
|||
"""
|
||||
azure_config = get_default_azure_config()
|
||||
with pytest.raises(ValueError) as ex:
|
||||
create_dataset_consumptions(azure_config, [""], ["foo"])
|
||||
create_dataset_configs(azure_config, [""], ["foo"])
|
||||
assert "but a mount point has been provided" in str(ex)
|
||||
|
||||
|
||||
|
@ -38,7 +38,7 @@ def test_dataset_consumption3() -> None:
|
|||
Test that a matching number of mount points is created.
|
||||
"""
|
||||
azure_config = get_default_azure_config()
|
||||
assert len(create_dataset_consumptions(azure_config, ["test-dataset", "test-dataset"], [])) == 2
|
||||
assert len(create_dataset_configs(azure_config, ["test-dataset", "test-dataset"], [])) == 2
|
||||
|
||||
|
||||
def test_dataset_consumption4() -> None:
|
||||
|
@ -47,15 +47,5 @@ def test_dataset_consumption4() -> None:
|
|||
"""
|
||||
azure_config = get_default_azure_config()
|
||||
with pytest.raises(ValueError) as ex:
|
||||
create_dataset_consumptions(azure_config, ["test-dataset", "test-dataset"], ["foo"])
|
||||
create_dataset_configs(azure_config, ["test-dataset", "test-dataset"], ["foo"])
|
||||
assert "must equal the number of Azure dataset IDs" in str(ex)
|
||||
|
||||
|
||||
def test_dataset_consumption5() -> None:
|
||||
"""
|
||||
Test error handling for empty dataset IDs.
|
||||
"""
|
||||
azure_config = get_default_azure_config()
|
||||
with pytest.raises(ValueError) as ex:
|
||||
azure_config.get_or_create_dataset("")
|
||||
assert "No dataset ID provided" in str(ex)
|
||||
|
|
|
@ -2,25 +2,23 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Iterator, Set
|
||||
|
||||
import pytest
|
||||
from azureml.core import Run
|
||||
from azureml.core.workspace import Workspace
|
||||
|
||||
from InnerEye.Azure.azure_config import AzureConfig, SourceConfig
|
||||
from InnerEye.Azure.azure_runner import create_experiment_name, get_or_create_python_environment
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Azure.azure_runner import create_experiment_name
|
||||
from InnerEye.Azure.azure_util import DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, fetch_child_runs, fetch_run, \
|
||||
get_cross_validation_split_index, is_cross_validation_child_run, is_run_and_child_runs_completed, \
|
||||
merge_conda_dependencies, merge_conda_files, to_azure_friendly_container_path
|
||||
from InnerEye.Common.common_util import logging_to_stdout, is_linux
|
||||
get_cross_validation_split_index, is_cross_validation_child_run, \
|
||||
to_azure_friendly_container_path
|
||||
from InnerEye.Common.common_util import logging_to_stdout
|
||||
from InnerEye.Common.fixed_paths import PRIVATE_SETTINGS_FILE, PROJECT_SECRETS_FILE, \
|
||||
get_environment_yaml_file, repository_root_directory
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
repository_root_directory
|
||||
from Tests.AfterTraining.test_after_training import FALLBACK_ENSEMBLE_RUN, get_most_recent_run, get_most_recent_run_id
|
||||
from Tests.ML.util import get_default_azure_config, get_default_workspace
|
||||
from Tests.ML.util import get_default_workspace
|
||||
from health.azure.azure_util import is_run_and_child_runs_completed
|
||||
|
||||
|
||||
def test_os_path_to_azure_friendly_container_path() -> None:
|
||||
|
@ -85,66 +83,6 @@ def test_is_cross_validation_child_run_ensemble_run() -> None:
|
|||
assert all([is_cross_validation_child_run(x) for x in fetch_child_runs(run)])
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_linux(), reason="Spurious file read/write errors on linux build agents.")
|
||||
def test_merge_conda(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Tests the logic for merging Conda environment files.
|
||||
"""
|
||||
env1 = """
|
||||
channels:
|
||||
- defaults
|
||||
- pytorch
|
||||
dependencies:
|
||||
- conda1=1.0
|
||||
- conda2=2.0
|
||||
- conda_both=3.0
|
||||
- pip:
|
||||
- azureml-sdk==1.7.0
|
||||
- foo==1.0
|
||||
"""
|
||||
env2 = """
|
||||
channels:
|
||||
- defaults
|
||||
dependencies:
|
||||
- conda1=1.1
|
||||
- conda_both=3.0
|
||||
- pip:
|
||||
- azureml-sdk==1.6.0
|
||||
- bar==2.0
|
||||
"""
|
||||
# Spurious test failures on Linux build agents, saying that they can't write the file. Wait a bit.
|
||||
time.sleep(0.5)
|
||||
file1 = test_output_dirs.root_dir / "env1.yml"
|
||||
file1.write_text(env1)
|
||||
file2 = test_output_dirs.root_dir / "env2.yml"
|
||||
file2.write_text(env2)
|
||||
# Spurious test failures on Linux build agents, saying that they can't read the file. Wait a bit.
|
||||
time.sleep(0.5)
|
||||
files = [file1, file2]
|
||||
merged_file = test_output_dirs.root_dir / "merged.yml"
|
||||
merge_conda_files(files, merged_file)
|
||||
assert merged_file.read_text().splitlines() == """channels:
|
||||
- defaults
|
||||
- pytorch
|
||||
dependencies:
|
||||
- conda1=1.0
|
||||
- conda1=1.1
|
||||
- conda2=2.0
|
||||
- conda_both=3.0
|
||||
- pip:
|
||||
- azureml-sdk==1.6.0
|
||||
- azureml-sdk==1.7.0
|
||||
- bar==2.0
|
||||
- foo==1.0
|
||||
""".splitlines()
|
||||
conda_dep, _ = merge_conda_dependencies(files)
|
||||
# We expect to see the union of channels.
|
||||
assert list(conda_dep.conda_channels) == ["defaults", "pytorch"]
|
||||
# Package version conflicts are not resolved, both versions are retained.
|
||||
assert list(conda_dep.conda_packages) == ["conda1=1.0", "conda1=1.1", "conda2=2.0", "conda_both=3.0"]
|
||||
assert list(conda_dep.pip_packages) == ["azureml-sdk==1.6.0", "azureml-sdk==1.7.0", "bar==2.0", "foo==1.0"]
|
||||
|
||||
|
||||
def test_experiment_name() -> None:
|
||||
c = AzureConfig()
|
||||
c.build_branch = "branch"
|
||||
|
@ -196,29 +134,3 @@ def test_amlignore() -> None:
|
|||
test_variables = repository_root_directory(PROJECT_SECRETS_FILE)
|
||||
if test_variables.is_file():
|
||||
assert PROJECT_SECRETS_FILE in ignored, f"{PROJECT_SECRETS_FILE} is not in .amlignore"
|
||||
|
||||
|
||||
def test_create_python_env() -> None:
|
||||
"""
|
||||
Checks if environment variables in the SourceConfig are correctly passed through to the Python environment.
|
||||
Environment variables in SourceConfig are only used in the internal InnerEye repo.
|
||||
:return:
|
||||
"""
|
||||
foo = "foo"
|
||||
bar = "bar"
|
||||
entry_script = Path("something.py")
|
||||
conda_file = get_environment_yaml_file()
|
||||
s = SourceConfig(root_folder=Path(""), entry_script=entry_script, conda_dependencies_files=[conda_file],
|
||||
environment_variables={foo: bar})
|
||||
env = get_or_create_python_environment(source_config=s,
|
||||
azure_config=get_default_azure_config(),
|
||||
register_environment=False)
|
||||
assert foo in env.environment_variables
|
||||
assert env.environment_variables[foo] == bar
|
||||
|
||||
# Check that some of the basic packages that we expect to always exist are picked up correctly in the Conda env
|
||||
def remove_version_number(items: Iterator[str]) -> Set[str]:
|
||||
return set(c.split("=")[0] for c in items)
|
||||
|
||||
assert "pytorch" in remove_version_number(env.python.conda_dependencies.conda_packages)
|
||||
assert "pytorch-lightning" in remove_version_number(env.python.conda_dependencies.pip_packages)
|
||||
|
|
|
@ -3,13 +3,12 @@
|
|||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
|
||||
from InnerEye.Azure.azure_config import AZURECONFIG_SUBMIT_TO_AZUREML, AzureConfig, SourceConfig
|
||||
from InnerEye.Azure.azure_config import AZURECONFIG_SUBMIT_TO_AZUREML, AzureConfig
|
||||
from InnerEye.Azure.azure_runner import create_runner_parser, parse_args_and_add_yaml_variables, \
|
||||
run_duration_string_to_seconds
|
||||
from InnerEye.Azure.parser_util import _is_empty_or_empty_string_list
|
||||
|
@ -134,37 +133,6 @@ def test_azureml_submit_constant() -> None:
|
|||
assert hasattr(azure_config, AZURECONFIG_SUBMIT_TO_AZUREML)
|
||||
|
||||
|
||||
def test_source_config_set_params() -> None:
|
||||
"""
|
||||
Check that commandline arguments are set correctly when submitting the script to AzureML.
|
||||
In particular, the azureml flag should be omitted, irrespective of how the argument is written.
|
||||
"""
|
||||
s = SourceConfig(root_folder=Path(""), entry_script=Path("something.py"), conda_dependencies_files=[])
|
||||
|
||||
def assert_has_params(expected_args: str) -> None:
|
||||
assert s.script_params is not None
|
||||
# Arguments are in the keys of the dictionary only, and should have been added in the right order
|
||||
assert " ".join(s.script_params) == expected_args
|
||||
|
||||
with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}=True", "--more"]):
|
||||
s.set_script_params_except_submit_flag()
|
||||
assert_has_params("some --param 1 --more")
|
||||
with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}", "False", "--more"]):
|
||||
s.set_script_params_except_submit_flag()
|
||||
assert_has_params("some --param 1 --more")
|
||||
# Using the new syntax for boolean flags
|
||||
with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}", "--more"]):
|
||||
s.set_script_params_except_submit_flag()
|
||||
assert_has_params("some --param 1 --more")
|
||||
with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}"]):
|
||||
s.set_script_params_except_submit_flag()
|
||||
assert_has_params("some --param 1")
|
||||
# Arguments where azureml is just the prefix should not be removed.
|
||||
with mock.patch("sys.argv", ["", "some", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}foo", "False", "--more"]):
|
||||
s.set_script_params_except_submit_flag()
|
||||
assert_has_params(f"some --{AZURECONFIG_SUBMIT_TO_AZUREML}foo False --more")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(["s", "expected"],
|
||||
[
|
||||
("1s", 1),
|
||||
|
|
|
@ -15,6 +15,7 @@ from InnerEye.ML.config import PhotometricNormalizationMethod, SegmentationModel
|
|||
from InnerEye.ML.deep_learning_config import DeepLearningConfig
|
||||
from InnerEye.ML.runner import Runner
|
||||
from Tests.ML.configs.DummyModel import DummyModel
|
||||
from health.azure.himl import AzureRunInfo
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_container", [True, False])
|
||||
|
@ -55,12 +56,19 @@ def test_create_ml_runner_args(is_container: bool,
|
|||
with mock.patch("sys.argv", [""] + args_list):
|
||||
with mock.patch("InnerEye.ML.deep_learning_config.is_offline_run_context", return_value=is_offline_run):
|
||||
with mock.patch("InnerEye.ML.run_ml.MLRunner.run", return_value=None):
|
||||
with mock.patch("InnerEye.ML.run_ml.MLRunner.mount_or_download_dataset", return_value=dataset_folder):
|
||||
with mock.patch("InnerEye.ML.run_ml.MLRunner.download_or_use_existing_dataset",
|
||||
return_value=dataset_folder):
|
||||
runner = Runner(project_root=project_root, yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)
|
||||
runner.parse_and_load_model()
|
||||
# Only when calling config.create_filesystem we expect to see the correct paths, and this happens
|
||||
# inside run_in_situ
|
||||
runner.run_in_situ()
|
||||
azure_run_info = AzureRunInfo(input_datasets=[None],
|
||||
output_datasets=[None],
|
||||
run=None,
|
||||
is_running_in_azure=False,
|
||||
output_folder=Path.cwd(),
|
||||
logs_folder=Path.cwd())
|
||||
runner.run_in_situ(azure_run_info)
|
||||
azure_config = runner.azure_config
|
||||
container_or_legacy_config = runner.lightning_container if is_container else runner.model_config
|
||||
assert azure_config.model == model_name
|
||||
|
|
|
@ -3,13 +3,15 @@
|
|||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from InnerEye.Common import common_util, fixed_paths
|
||||
from InnerEye.Common.common_util import (append_to_amlignore, change_working_directory, check_is_any_of,
|
||||
from InnerEye.Common import common_util
|
||||
from InnerEye.Common.common_util import (change_working_directory, check_is_any_of,
|
||||
is_private_field_name, namespace_to_path, path_to_namespace, print_exception)
|
||||
from InnerEye.Common.fixed_paths import add_submodules_to_path, repository_root_directory
|
||||
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path, tests_root_directory
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
|
||||
|
@ -125,19 +127,15 @@ def test_change_dir(test_output_dirs: OutputFolderForTests) -> None:
|
|||
assert (new_dir / "bar.txt").is_file()
|
||||
|
||||
|
||||
def test_modify_amlignore() -> None:
|
||||
"""
|
||||
Test that we can change the .AMLignore file and change it back to what it was before.
|
||||
"""
|
||||
folder1 = "Added1"
|
||||
folder2 = "Added2"
|
||||
added_folders = [folder1, folder2]
|
||||
amlignore = fixed_paths.repository_root_directory(".amlignore")
|
||||
old_contents = amlignore.read_text()
|
||||
for f in added_folders:
|
||||
assert f not in old_contents
|
||||
with append_to_amlignore(added_folders):
|
||||
new_contents = amlignore.read_text()
|
||||
for f in added_folders:
|
||||
assert f in new_contents
|
||||
assert amlignore.read_text() == old_contents
|
||||
def test_add_submodules_to_path() -> None:
|
||||
original_sys_path = sys.path
|
||||
try:
|
||||
fastmri_folder = repository_root_directory() / "fastMRI"
|
||||
fastmri_str = str(fastmri_folder)
|
||||
assert fastmri_folder.is_dir()
|
||||
if fastmri_str in sys.path:
|
||||
sys.path.remove(fastmri_str)
|
||||
add_submodules_to_path()
|
||||
assert fastmri_str in sys.path
|
||||
finally:
|
||||
sys.path = original_sys_path
|
||||
|
|
|
@ -13,18 +13,13 @@ from typing import Any, Optional
|
|||
from _pytest.monkeypatch import MonkeyPatch
|
||||
from pytorch_lightning import LightningDataModule, LightningModule
|
||||
|
||||
from InnerEye.Common.common_util import add_folder_to_sys_path_if_needed
|
||||
from InnerEye.ML.configs.other.fastmri_varnet import VarNetWithImageLogging
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
|
||||
add_folder_to_sys_path_if_needed("fastMRI")
|
||||
|
||||
from fastMRI.tests.create_temp_data import create_temp_data
|
||||
from fastmri.data import SliceDataset
|
||||
from fastmri.data.subsample import create_mask_for_mask_type
|
||||
from fastmri.data.transforms import VarNetDataTransform
|
||||
from fastmri.pl_modules import FastMriDataModule
|
||||
# This import can fail if written as "from tests.create_temp_data", even though fastMRI is already in the path.
from fastMRI.tests.create_temp_data import create_temp_data
|
||||
|
||||
|
||||
class FastMriRandomData(FastMriDataModule):
|
||||
|
|
|
@ -7,11 +7,9 @@ from typing import List
|
|||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from azureml.core import Run
|
||||
|
||||
from InnerEye.Common.common_util import logging_to_stdout, namespace_to_path
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
from InnerEye.ML.utils.config_loader import ModelConfigLoader
|
||||
from InnerEye.ML.utils.model_util import create_model_with_temperature_scaling, generate_and_print_model_summary
|
||||
from Tests.ML.configs.DummyModel import DummyModel
|
||||
|
@ -28,7 +26,8 @@ def find_models() -> List[str]:
|
|||
path = namespace_to_path(ModelConfigLoader.get_default_search_module())
|
||||
folders = [path / "segmentation", path / "classification", path / "regression"]
|
||||
names = [str(f.stem) for folder in folders for f in folder.glob("*.py") if folder.exists()]
|
||||
return [name for name in names if not (name.endswith("Base") or name.endswith("Paper")) and not name.startswith("__")]
|
||||
return [name for name in names if
|
||||
not (name.endswith("Base") or name.endswith("Paper")) and not name.startswith("__")]
|
||||
|
||||
|
||||
def test_any_models_found() -> None:
|
||||
|
@ -128,27 +127,6 @@ class MockDatasetConsumption:
|
|||
name = "dummy"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("container_name", ["DummyContainerWithAzureDataset",
|
||||
"DummyContainerWithoutDataset",
|
||||
"DummyContainerWithLocalDataset",
|
||||
"DummyContainerWithAzureAndLocalDataset"])
|
||||
def test_submit_container_to_azureml(container_name: str) -> None:
|
||||
"""
|
||||
Test if we can get the config loader to load a Lightning container model, and get it through the AzureML
|
||||
submission process.
|
||||
"""
|
||||
runner = default_runner()
|
||||
mock_run = Run.get_context()
|
||||
args = ["", f"--model={container_name}", "--azureml=True", "--model_configs_namespace=Tests.ML.configs"]
|
||||
with mock.patch("sys.argv", args):
|
||||
with mock.patch("InnerEye.Azure.azure_config.AzureConfig.get_dataset_consumption",
|
||||
return_value=MockDatasetConsumption):
|
||||
with mock.patch("azureml.core.Experiment.submit", return_value=mock_run):
|
||||
loaded_config, actual_run = runner.run()
|
||||
assert actual_run == mock_run
|
||||
assert isinstance(runner.lightning_container, LightningContainer)
|
||||
|
||||
|
||||
def test_load_container_with_arguments() -> None:
|
||||
"""
|
||||
Test if we can load a container and override a value in it via the commandline. Parameters can only be set at
|
||||
|
|
|
@ -377,7 +377,7 @@ def test_runner_restart(test_output_dirs: OutputFolderForTests) -> None:
|
|||
model_config.recovery_checkpoint_save_interval = 1
|
||||
model_config.recovery_checkpoints_save_last_k = -1
|
||||
runner = MLRunner(model_config=model_config)
|
||||
runner.setup(use_mount_or_download_dataset=False)
|
||||
runner.setup()
|
||||
# Epochs are 0 based for saving
|
||||
create_model_and_store_checkpoint(model_config,
|
||||
runner.container.checkpoint_folder / f"{RECOVERY_CHECKPOINT_FILE_NAME}_epoch="
|
||||
|
|
|
@ -396,14 +396,16 @@ def test_cross_validation_for_lighting_container_models_is_supported() -> None:
|
|||
to azure_runner's submit_to_azureml method.
|
||||
"""
|
||||
args_list = ["--model=HelloContainer", "--number_of_cross_validation_splits=5", "--azureml=True"]
|
||||
mock_run = mock.MagicMock()
|
||||
mock_run.id = "foo"
|
||||
mock_run.experiment.name = "bar"
|
||||
with mock.patch("sys.argv", [""] + args_list):
|
||||
runner = Runner(project_root=fixed_paths.repository_root_directory(),
|
||||
yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)
|
||||
with mock.patch("InnerEye.Azure.azure_runner.create_and_submit_experiment",
|
||||
return_value=None) as create_and_submit_experiment_patch:
|
||||
runner.run()
|
||||
with mock.patch("health.azure.himl.submit_run", return_value=mock_run) as create_and_submit_experiment_patch:
|
||||
with pytest.raises(SystemExit):
|
||||
runner.run()
|
||||
assert runner.lightning_container.model_name == 'HelloContainer'
|
||||
assert runner.lightning_container.number_of_cross_validation_splits == 5
|
||||
args, _ = create_and_submit_experiment_patch.call_args
|
||||
script_run_config = args[1]
|
||||
assert isinstance(script_run_config, HyperDriveConfig)
|
||||
script_run_config_arg = create_and_submit_experiment_patch.call_args[1]["script_run_config"]
|
||||
assert isinstance(script_run_config_arg, HyperDriveConfig)
|
||||
|
|
|
@ -25,6 +25,7 @@ from Tests.AfterTraining.test_after_training import FALLBACK_ENSEMBLE_RUN, FALLB
|
|||
from Tests.ML.configs.DummyModel import DummyModel
|
||||
from Tests.ML.configs.lightning_test_containers import DummyContainerWithDatasets
|
||||
from Tests.ML.util import get_default_azure_config
|
||||
from health.azure.himl import AzureRunInfo
|
||||
|
||||
logging_to_stdout(logging.DEBUG)
|
||||
|
||||
|
@ -89,26 +90,27 @@ def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> Non
|
|||
# creation may need access to the dataset.
|
||||
with pytest.raises(ValueError) as ex:
|
||||
runner.setup()
|
||||
assert ex.value.args[0] == "The model must contain either local_dataset or azure_dataset_id."
|
||||
assert ex.value.args[0] == "Expecting that a dataset is available here."
|
||||
runner.project_root = test_output_dirs.root_dir
|
||||
|
||||
# Pointing the model to a dataset folder that does not exist should raise an Exception
|
||||
fake_folder = runner.project_root / "foo"
|
||||
runner.container.local_dataset = fake_folder
|
||||
with pytest.raises(FileNotFoundError):
|
||||
runner.mount_or_download_dataset(runner.container.azure_dataset_id, runner.container.local_dataset)
|
||||
runner.download_or_use_existing_dataset(runner.container.azure_dataset_id, runner.container.local_dataset)
|
||||
|
||||
# If the local dataset folder exists, download_or_use_existing_dataset should not do anything.
fake_folder.mkdir()
|
||||
local_dataset = runner.mount_or_download_dataset(runner.container.azure_dataset_id, runner.container.local_dataset)
|
||||
local_dataset = runner.download_or_use_existing_dataset(runner.container.azure_dataset_id,
|
||||
runner.container.local_dataset)
|
||||
assert local_dataset == fake_folder
|
||||
|
||||
# Pointing the model to a dataset in Azure should trigger a download
|
||||
runner.container.local_dataset = None
|
||||
runner.container.azure_dataset_id = dataset_name
|
||||
with logging_section("Starting download"):
|
||||
result_path = runner.mount_or_download_dataset(runner.container.azure_dataset_id,
|
||||
runner.container.local_dataset)
|
||||
result_path = runner.download_or_use_existing_dataset(runner.container.azure_dataset_id,
|
||||
runner.container.local_dataset)
|
||||
# Download goes into <project_root> / "datasets" / "test_dataset"
|
||||
expected_path = runner.project_root / fixed_paths.DATASETS_DIR_NAME / dataset_name
|
||||
assert result_path == expected_path
|
||||
|
@ -156,17 +158,23 @@ def _test_mount_for_lightning_container(test_output_dirs: OutputFolderForTests,
|
|||
|
||||
with mock.patch("InnerEye.ML.run_ml.MLRunner.is_offline_run", is_offline_run):
|
||||
with mock.patch("InnerEye.ML.run_ml.download_dataset", return_value=download_path):
|
||||
with mock.patch("InnerEye.ML.run_ml.try_to_mount_input_dataset", return_value=mount_path):
|
||||
runner = MLRunner(config, container=container,
|
||||
azure_config=None, project_root=test_output_dirs.root_dir)
|
||||
runner.setup()
|
||||
return runner.container
|
||||
runner = MLRunner(config, container=container,
|
||||
azure_config=None, project_root=test_output_dirs.root_dir)
|
||||
path_from_aml: List[Optional[Path]] = [None] if is_offline_run else [mount_path]
|
||||
runner.setup(azure_run_info=AzureRunInfo(input_datasets=path_from_aml,
|
||||
output_datasets=[],
|
||||
run=None,
|
||||
is_running_in_azure=False,
|
||||
output_folder=Path(),
|
||||
logs_folder=Path()
|
||||
))
|
||||
return runner.container
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("is_lightning_model", "expected_error"),
|
||||
[
|
||||
# A built-in InnerEye model must have either local dataset or azure dataset provided.
|
||||
(False, "The model must contain either local_dataset or azure_dataset_id"),
|
||||
(False, "Expecting that a dataset is available here."),
|
||||
# ... but this is OK for Lightning container models. A Lightning container could simply
|
||||
# download its data from the web before training.
|
||||
(True, "")
|
||||
|
|
|
@ -9,9 +9,9 @@ from unittest import mock
import pandas as pd
import pytest
from pytorch_lightning import LightningModule
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from pytorch_lightning import LightningModule

from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.common import ModelExecutionMode

@ -21,9 +21,10 @@ from InnerEye.ML.lightning_container import LightningContainer
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.run_ml import MLRunner
from Tests.ML.configs.DummyModel import DummyModel
from Tests.ML.configs.lightning_test_containers import DummyContainerWithAzureDataset, DummyContainerWithHooks, DummyContainerWithModel, \
DummyContainerWithPlainLightning
from Tests.ML.configs.lightning_test_containers import (DummyContainerWithAzureDataset, DummyContainerWithHooks,
DummyContainerWithModel, DummyContainerWithPlainLightning)
from Tests.ML.util import default_runner
from health.azure.himl import AzureRunInfo


def test_run_container_in_situ(test_output_dirs: OutputFolderForTests) -> None:

@ -36,8 +37,7 @@ def test_run_container_in_situ(test_output_dirs: OutputFolderForTests) -> None:
args = ["", "--model=DummyContainerWithModel", "--model_configs_namespace=Tests.ML.configs",
f"--output_to={test_output_dirs.root_dir}", f"--local_dataset={local_dataset}"]
with mock.patch("sys.argv", args):
loaded_config, actual_run = runner.run()
assert actual_run is None
runner.run()
assert isinstance(runner.lightning_container, DummyContainerWithModel)
# Test if the outputs folder is relative to the folder that we specified via the commandline
runner.lightning_container.outputs_folder.relative_to(test_output_dirs.root_dir)

@ -81,8 +81,7 @@ def test_run_container_with_plain_lightning_in_situ(test_output_dirs: OutputFold
args = ["", "--model=DummyContainerWithPlainLightning", "--model_configs_namespace=Tests.ML.configs",
f"--output_to={test_output_dirs.root_dir}", f"--local_dataset={local_dataset}"]
with mock.patch("sys.argv", args):
loaded_config, actual_run = runner.run()
assert actual_run is None
runner.run()
assert isinstance(runner.lightning_container, DummyContainerWithPlainLightning)
# Test if the outputs folder is relative to the folder that we specified via the commandline
runner.lightning_container.outputs_folder.relative_to(test_output_dirs.root_dir)

@ -143,8 +142,8 @@ def test_run_fastmri_container(test_output_dirs: OutputFolderForTests) -> None:
f"--output_to={test_output_dirs.root_dir}",
"--model_configs_namespace=Tests.ML.configs"]
with mock.patch("sys.argv", args):
loaded_config, actual_run = runner.run()
assert actual_run is None
loaded_config, run_info = runner.run()
assert isinstance(run_info, AzureRunInfo)
from Tests.ML.configs.fastmri_random import FastMriOnRandomData
assert isinstance(runner.lightning_container, FastMriOnRandomData)

@ -39,7 +39,7 @@ def test_regression_test(test_output_dirs: OutputFolderForTests) -> None:
container.local_dataset = test_output_dirs.root_dir
container.regression_test_folder = Path(str(uuid.uuid4().hex))
runner = MLRunner(container=container)
runner.setup(use_mount_or_download_dataset=False)
runner.setup()
with pytest.raises(ValueError) as ex:
runner.run()
assert "Folder with expected files does not exist" in str(ex)

@ -279,7 +279,7 @@ def model_train_unittest(config: Optional[DeepLearningConfig],
# It will also set random seeds correctly. Later we use so initialized container.
# For all tests running in AzureML, we need to skip the downloading of datasets that would otherwise happen,
# because all unit test configs come with their own local dataset already.
runner.setup(use_mount_or_download_dataset=False)
runner.setup()
if checkpoint_handler is None:
azure_config = get_default_azure_config()
checkpoint_handler = CheckpointHandler(azure_config=azure_config,

@ -33,7 +33,8 @@ steps:
displayName: Install InnerEye (Dev) Package

# First run all tests that require the InnerEye package. All code should be consumed via the InnerEye package,
# hence don't set PYTHONPATH
# hence don't set PYTHONPATH to InnerEye - but do set it to hi-ml if that has been included as a submodule for dev
# work on the package
- bash: |
source activate InnerEye
pytest ./Tests/ -m "not (gpu or azureml or after_training_single_run or after_training_ensemble_run or inference or after_training_2node or after_training_glaucoma_cv_run or after_training_hello_container)" --doctest-modules --junitxml=junit/test-results.xml --cov=. --cov-config=.coveragerc --cov-report=xml -n 2 --dist=loadscope --verbose

@ -49,7 +50,6 @@ steps:
source activate InnerEye
pytest ./TestsOutsidePackage -n 2 --dist=loadscope --verbose --junitxml=junit/test-outside-package-results.xml
env:
PYTHONPATH: $(Build.SourcesDirectory)/
APPLICATION_KEY: $(InnerEyeDeepLearningServicePrincipalKey)
failOnStderr: false
condition: succeededOrFailed()

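The `-m "not (...)"` expression in the pipeline deselects tests that carry custom pytest markers such as `gpu` or `azureml`. A minimal sketch of what such a marked test looks like on the authoring side (the test name and body are illustrative, not taken from the repository):

```python
import pytest


@pytest.mark.azureml
def test_submission_to_azureml() -> None:
    # Carries the "azureml" marker, so the PR build's
    # -m "not (gpu or azureml or ...)" filter skips it.
    assert True
```
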
@ -13,8 +13,12 @@ from typing import Generator

import pytest

from InnerEye.Common.output_directories import OutputFolderForTests, remove_and_create_folder
from InnerEye.Common.fixed_paths import add_submodules_to_path
from InnerEye.Common.fixed_paths_for_tests import TEST_OUTPUTS_PATH
from InnerEye.Common.output_directories import OutputFolderForTests, remove_and_create_folder

# This needs to be right at the start of conftest, so that already test collection has access to all submodules
add_submodules_to_path()


@pytest.fixture(autouse=True, scope='session')

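The body of `add_submodules_to_path` is not part of this diff; a plausible sketch, assuming it does essentially what the `runner.py` snippet in the docs further below does, namely put the `hi-ml` submodule's source folder on `sys.path` before test collection starts (paths and behaviour here are assumptions, not the repository's actual implementation):

```python
import sys
from pathlib import Path


def add_submodules_to_path() -> None:
    # Assumed behaviour only: make an optional hi-ml submodule importable.
    # The real helper lives in InnerEye.Common.fixed_paths.
    repo_root = Path(__file__).absolute().parent
    himl_src = repo_root / "hi-ml" / "src"
    if himl_src.is_dir() and str(himl_src) not in sys.path:
        sys.path.insert(0, str(himl_src))
```
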
@ -13,7 +13,20 @@ This document describes guidelines for contributing to the InnerEye-DeepLearning
- DO NOT mix independent and unrelated changes in one PR.

## Coding Style
The coding style is enforced via flake8 and mypy
The coding style is enforced via `flake8` and `mypy`. Before pushing any changes to a PR, run both tools on
your dev box:
* `flake8`
* `python mypy_runner.py`

## Unit testing
- DO write unit tests for each new function or class that you add.
- DO extend unit tests for existing functions or classes if you change their core behaviour.
- DO ensure that your tests are designed in a way that they can pass on the local box, even if they are relying on
specific cloud features.
- DO run all unit tests on your dev box before submitting your changes. The test suite is designed to pass completely
also outside of cloud builds.
- DO NOT rely only on the test builds in the cloud. Cloud builds trigger AzureML runs on GPU
machines that have a higher CO2 footprint than your dev box.

## Creating Issues
- DO use a descriptive title that identifies the issue or the requested feature.

@ -188,3 +188,36 @@ You can do this, for example, to force an update of the `azureml-sdk` and all it
If you want to take the second route:
1. Use `conda env update -f environment.yml --prune` to refresh if you make changes in environment.yml
1. To update packages use `conda update --all` and `pip-review --local --interactive`


## Using the hi-ml package

To work on the `hi-ml` package at the same time as `InnerEye-DeepLearning`, it can help to add the `hi-ml` package
as a submodule, rather than a package from pypi. Any change to the package will require a full new docker image build,
and that costs 20min per run.

* In the repository root, run `git submodule add https://github.com/microsoft/hi-ml`
* In PyCharm's project browser, mark the folder `hi-ml/src` as Sources Root
* Remove the entry for the `hi-ml` package from `environment.yml`
* Modify the start of `InnerEye/ML/runner.py` to look like this:
```python
print(f"Starting InnerEye runner at {sys.argv[0]}")
innereye_root = Path(__file__).absolute().parent.parent.parent
if (innereye_root / "InnerEye").is_dir():
innereye_root_str = str(innereye_root)
if innereye_root_str not in sys.path:
print(f"Adding InnerEye folder to sys.path: {innereye_root_str}")
sys.path.insert(0, innereye_root_str)
sys.path.append(str(innereye_root / "hi-ml" / "src"))
```

Alternatively, you can consume a developer version of `hi-ml` from `test.pypi`:
* Remove the entry for the `hi-ml` package from `environment.yml`
* Add a section like this to `environment.yml`, to point pip to `test.pypi`, and a specific version of the package:
```
...
- pip:
- --extra-index-url https://test.pypi.org/simple/
- hi-ml==0.1.0.post236
...
```

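With either route, it can be worth confirming which copy of `hi-ml` actually gets imported. A small check, assuming the `health.azure.himl` module path used by the tests above (the interpretation in the comment is just a rule of thumb):

```python
# A submodule checkout typically resolves to a path under hi-ml/src/health/azure/,
# whereas a pypi or test.pypi install resolves to site-packages.
import health.azure.himl as himl

print(f"hi-ml loaded from: {himl.__file__}")
```
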
@ -12,9 +12,9 @@ dependencies:
- git+https://github.com/analysiscenter/radio.git@6d53e25#egg=radio
- azure-mgmt-resource==12.1.0
- azure-mgmt-datafactory==1.1.0
- azureml-mlflow==1.23.0
- azureml-sdk==1.23.0
- azureml-tensorboard==1.23.0
- azureml-mlflow==1.32.0
- azureml-sdk==1.32.0
- azureml-tensorboard==1.32.0
- conda-merge==0.1.5
- cryptography==3.3.2
- dataclasses-json==0.5.2

@ -23,6 +23,7 @@ dependencies:
- gitpython==3.1.7
- gputil==1.4.0
- h5py==2.10.0
- hi-ml==0.1.2
- InnerEye-DICOM-RT==1.0.1
- joblib==0.16.0
- jupyter==1.0.0

@ -30,7 +31,7 @@ dependencies:
- lightning-bolts==0.3.4
- matplotlib==3.3.0
- mlflow==1.17.0
- mypy==0.812
- mypy==0.910
- mypy-extensions==0.4.3
- numba==0.51.2
- numpy==1.19.1

@ -37,7 +37,7 @@ def run_mypy(files: List[str], mypy_executable_path: str) -> int:
else:
print("Skipping.")
if mypy_args:
command = [mypy_executable_path, "--config=mypy.ini", *mypy_args]
command = [mypy_executable_path, "--install-types", "--non-interactive", "--config=mypy.ini", *mypy_args]
# We pipe stdout and then print it, otherwise lines can appear in the wrong order in builds.
process = subprocess.run(command)
return_code = max(return_code, process.returncode)
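`--install-types --non-interactive` tells mypy to download any missing third-party type stubs without prompting; the flags exist from mypy 0.900 onwards, which lines up with the bump to `mypy==0.910` in `environment.yml` above. A standalone sketch of the command that `run_mypy` assembles (executable path and target file are illustrative):

```python
import subprocess

# Illustrative invocation only; run_mypy builds the real argument list dynamically.
command = ["mypy", "--install-types", "--non-interactive", "--config=mypy.ini", "InnerEye/ML/runner.py"]
process = subprocess.run(command)
print(f"mypy exited with code {process.returncode}")
```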