Moving InnerEye's Azure code to hi-ml package (#548)
Moving InnerEye to use the new HI-ML package. See issue 62 in the HI-ML repository.
Parent
b3516f3a00
Commit
b35399fb84
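
The dataset plumbing that this commit deletes from InnerEye (get_or_create_dataset, get_dataset_consumption) is replaced by the hi-ml package. For orientation only, a minimal sketch of the new API, pieced together from the imports and call sites changed in the diff below; the workspace handle, dataset name, mount folder and datastore name are placeholders:

from azureml.core import Workspace
from health.azure.datasets import DatasetConfig, get_or_create_dataset

workspace = Workspace.from_config()  # any AzureML workspace handle would do here

# Describe a dataset that hi-ml should mount or download for an AzureML run
dataset = DatasetConfig(name="my_dataset",               # placeholder dataset name
                        target_folder="my_mount_point",  # placeholder mount point
                        use_mounting=True,
                        datastore="my_datastore")        # placeholder datastore name

# Retrieve the dataset directly, creating and registering it if it does not yet exist
azureml_dataset = get_or_create_dataset(workspace=workspace,
                                        datastore_name="my_datastore",
                                        dataset_name="my_dataset")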
@ -13,6 +1,7 @@ jobs:
- uses: actions/checkout@v2
  with:
    lfs: true
    submodules: true

- name: flake8
  run: |
@ -58,7 +59,8 @@ jobs:
- uses: actions/checkout@v2
  with:
    lfs: true
    submodules: true

- uses: conda-incubator/setup-miniconda@v2
  with:
    activate-environment: InnerEye
@ -131,6 +131,7 @@ dmypy.json

# other
.vscode/
.devcontainer/
/InnerEye/ML/src/aml_config
*.exe
@ -11,8 +11,9 @@
|
|||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="InnerEye/ML/runner.py" />
|
||||
<option name="PARAMETERS" value="--azureml --model=HelloContainer --train" />
|
||||
<option name="PARAMETERS" value="--azureml --model=HelloContainer --cluster=training-nc12" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="pytest all simple tests" type="tests" factoryName="py.test">
|
||||
<module name="InnerEye-DeepLearning" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/Tests" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="_new_keywords" value="""" />
|
||||
<option name="_new_parameters" value="""" />
|
||||
<option name="_new_additionalArguments" value=""-m \"not (gpu or azureml or after_training_single_run or after_training_ensemble_run or inference or after_training_2node or after_training_glaucoma_cv_run or after_training_hello_container)\""" />
|
||||
<option name="_new_target" value=""$PROJECT_DIR$/Tests"" />
|
||||
<option name="_new_targetType" value=""PATH"" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
@ -28,6 +28,8 @@ jobs that run in AzureML.
- ([#533](https://github.com/microsoft/InnerEye-DeepLearning/pull/533)) Better defaults for inference on ensemble children.
- ([#536](https://github.com/microsoft/InnerEye-DeepLearning/pull/536)) Inference will not run on the validation set by default; this can be turned on
  via the `--inference_on_val_set` flag.
- ([#548](https://github.com/microsoft/InnerEye-DeepLearning/pull/548)) Many Azure-related functions have been moved
  out of the toolbox, into the separate hi-ml Python package.
- ([#502](https://github.com/microsoft/InnerEye-DeepLearning/pull/502)) Renamed command line option 'perform_training_set_inference' to 'inference_on_train_set'. Replaced command line option 'perform_validation_and_test_set_inference' with the pair of options 'inference_on_val_set' and 'inference_on_test_set'.
- ([#496](https://github.com/microsoft/InnerEye-DeepLearning/pull/496)) All plots are now saved as PNG, rather than JPG.
- ([#497](https://github.com/microsoft/InnerEye-DeepLearning/pull/497)) Reducing the size of the code snapshot that
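
As a concrete illustration of the renamed options above, a hypothetical runner invocation (model, cluster and flags taken from the run configurations and changelog entries in this commit) would now read:
python InnerEye/ML/runner.py --model=HelloContainer --azureml --cluster=training-nc12 --inference_on_test_set --inference_on_val_set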
@ -6,20 +6,17 @@ from __future__ import annotations
|
|||
|
||||
import getpass
|
||||
import logging
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional, Union
|
||||
|
||||
import param
|
||||
from azureml.core import Dataset, Datastore, Run, ScriptRunConfig, Workspace
|
||||
from azureml.core import Run, ScriptRunConfig, Workspace
|
||||
from azureml.core.authentication import InteractiveLoginAuthentication, ServicePrincipalAuthentication
|
||||
from azureml.data import FileDataset
|
||||
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
|
||||
from azureml.train.hyperdrive import HyperDriveConfig
|
||||
from git import Repo
|
||||
|
||||
from InnerEye.Azure.azure_util import fetch_run, is_offline_run_context, remove_arg
|
||||
from InnerEye.Azure.azure_util import fetch_run, is_offline_run_context
|
||||
from InnerEye.Azure.secrets_handling import SecretsHandling, read_all_settings
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.Common.generic_parsing import GenericConfig
|
||||
|
@ -240,64 +237,6 @@ class AzureConfig(GenericConfig):
|
|||
"""
|
||||
return fetch_run(workspace=self.get_workspace(), run_recovery_id=run_recovery_id)
|
||||
|
||||
def get_or_create_dataset(self, azure_dataset_id: str) -> FileDataset:
|
||||
"""
|
||||
Looks in the AzureML datastore for a dataset of the given name. If there is no such dataset, a dataset is
|
||||
created and registered, assuming that the files are in a folder that has the same name as the dataset.
|
||||
For example, if azure_dataset_id is 'foo', then the 'foo' dataset should be pointing to the folder
|
||||
<container_root>/datasets/foo/
|
||||
"""
|
||||
if not self.azureml_datastore:
|
||||
raise ValueError("No value set for 'azureml_datastore' (name of the datastore in the AzureML workspace)")
|
||||
if not azure_dataset_id:
|
||||
raise ValueError("No dataset ID provided.")
|
||||
workspace = self.get_workspace()
|
||||
logging.info(f"Retrieving datastore '{self.azureml_datastore}' from AzureML workspace {workspace.name}")
|
||||
datastore = Datastore.get(workspace, self.azureml_datastore)
|
||||
try:
|
||||
logging.info(f"Trying to retrieve AzureML Dataset '{azure_dataset_id}'")
|
||||
azureml_dataset = Dataset.get_by_name(workspace, name=azure_dataset_id)
|
||||
logging.info("Dataset found.")
|
||||
except:
|
||||
logging.info(f"Dataset does not yet exist, creating a new one from data in folder '{azure_dataset_id}'")
|
||||
# Ensure that there is a / at the end of the file path, otherwise folders that share a prefix could create
|
||||
# trouble (for example, folders foo and foo_bar exist, and I'm trying to create a dataset from "foo")
|
||||
azureml_dataset = Dataset.File.from_files(path=(datastore, azure_dataset_id + "/"))
|
||||
logging.info("Registering the dataset for future use.")
|
||||
azureml_dataset.register(workspace, name=azure_dataset_id)
|
||||
return azureml_dataset
|
||||
|
||||
def get_dataset_consumption(self,
|
||||
azure_dataset_id: str,
|
||||
dataset_index: int,
|
||||
mountpoint: str) -> DatasetConsumptionConfig:
|
||||
"""
|
||||
Creates a configuration for using an AzureML dataset inside of an AzureML run. This will make the AzureML
|
||||
dataset with given name available as a named input, using INPUT_DATA_KEY as the key.
|
||||
:param mountpoint: The path at which the dataset should be made available.
|
||||
:param azure_dataset_id: The name of the dataset in blob storage to be used for this run. This can be an empty
|
||||
string to not use any datasets.
|
||||
:param dataset_index: suffix for the dataset name, dataset name will be set to INPUT_DATA_KEY_idx
|
||||
"""
|
||||
status = f"Dataset {azure_dataset_id} (index {dataset_index}) will be "
|
||||
azureml_dataset = self.get_or_create_dataset(azure_dataset_id=azure_dataset_id)
|
||||
if not azureml_dataset:
|
||||
raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
|
||||
named_input = azureml_dataset.as_named_input(f"{INPUT_DATA_KEY}_{dataset_index}")
|
||||
path_on_compute = mountpoint or None
|
||||
if self.use_dataset_mount:
|
||||
status += "mounted at "
|
||||
result = named_input.as_mount(path_on_compute)
|
||||
else:
|
||||
status += "downloaded to "
|
||||
result = named_input.as_download(path_on_compute)
|
||||
if path_on_compute:
|
||||
status += f"{path_on_compute}."
|
||||
else:
|
||||
status += "a randomly chosen folder."
|
||||
logging.info(status)
|
||||
return result
|
||||
|
||||
|
||||
@dataclass
|
||||
class SourceConfig:
|
||||
|
@ -313,13 +252,6 @@ class SourceConfig:
|
|||
upload_timeout_seconds: int = 36000
|
||||
environment_variables: Optional[Dict[str, str]] = None
|
||||
|
||||
def set_script_params_except_submit_flag(self) -> None:
|
||||
"""
|
||||
Populates the script_param field of the present object from the arguments in sys.argv, with the exception
|
||||
of the "azureml" flag.
|
||||
"""
|
||||
self.script_params = remove_arg(AZURECONFIG_SUBMIT_TO_AZUREML, sys.argv[1:])
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParserResult:
|
||||
|
|
|
@ -4,37 +4,25 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
import argparse
|
||||
import getpass
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
from argparse import ArgumentError, ArgumentParser, Namespace
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from azureml.core import Environment, Experiment, Run, ScriptRunConfig
|
||||
from azureml.core.runconfig import MpiConfiguration, RunConfiguration
|
||||
from azureml.core.workspace import WORKSPACE_DEFAULT_BLOB_STORE_NAME
|
||||
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
|
||||
|
||||
from InnerEye.Azure import azure_util
|
||||
from InnerEye.Azure.azure_config import AzureConfig, ParserResult, SourceConfig
|
||||
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME, \
|
||||
RUN_RECOVERY_ID_KEY_NAME, is_offline_run_context, merge_conda_dependencies
|
||||
from InnerEye.Azure.azure_config import AzureConfig, ParserResult
|
||||
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME
|
||||
from InnerEye.Azure.secrets_handling import read_all_settings
|
||||
from InnerEye.Azure.tensorboard_monitor import AMLTensorBoardMonitorConfig, monitor
|
||||
from InnerEye.Common.generic_parsing import GenericConfig
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
from InnerEye.ML.utils.config_loader import ModelConfigLoader
|
||||
from health.azure.datasets import DatasetConfig
|
||||
|
||||
SLEEP_TIME_SECONDS = 30
|
||||
|
||||
RUN_RECOVERY_FILE = "most_recent_run.txt"
|
||||
# The version to use when creating an AzureML Python environment. We create all environments with a unique hashed
|
||||
# name, hence version will always be fixed
|
||||
ENVIRONMENT_VERSION = "1"
|
||||
DEFAULT_DOCKER_BASE_IMAGE = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04"
|
||||
|
||||
# Environment variables used for multi-node training
|
||||
ENV_AZ_BATCHAI_MPI_MASTER_NODE = "AZ_BATCHAI_MPI_MASTER_NODE"
|
||||
|
@ -47,47 +35,6 @@ ENV_GLOBAL_RANK = "GLOBAL_RANK"
|
|||
ENV_LOCAL_RANK = "LOCAL_RANK"
|
||||
|
||||
|
||||
def submit_to_azureml(azure_config: AzureConfig,
|
||||
source_config: SourceConfig,
|
||||
all_azure_dataset_ids: List[str],
|
||||
all_dataset_mountpoints: List[str]) -> Run:
|
||||
"""
|
||||
The main entry point when submitting the runner script to AzureML.
|
||||
It creates an AzureML workspace if needed, submits an experiment using the code
|
||||
as specified in source_config, and waits for completion if needed.
|
||||
:param azure_config: azure related configurations to set up a valid workspace
|
||||
:param source_config: The information about which code should be submitted, and which arguments should be used.
|
||||
:param all_azure_dataset_ids: The name of all datasets on blob storage that will be used for this run.
|
||||
:param all_dataset_mountpoints: When using mounted datasets in AzureML, these are the per-dataset mount points.
|
||||
The list must have the same length as all_azure_dataset_ids.
|
||||
"""
|
||||
azure_run: Optional[Run] = None
|
||||
|
||||
# When running as part of the PR build, jobs frequently get interrupted by new pushes to the repository.
|
||||
# In this case, we'd like to cancel the current AzureML run before exiting, to reduce cost.
|
||||
# However, at present, this does NOT work; the SIGINT is not propagated through.
|
||||
def interrupt_handler(signal: int, _: Any) -> None:
|
||||
logging.info('Process interrupted via signal {}'.format(str(signal)))
|
||||
if azure_run:
|
||||
logging.info('Trying to terminate the AzureML job now.')
|
||||
azure_run.cancel()
|
||||
sys.exit(0)
|
||||
|
||||
for s in [signal.SIGINT, signal.SIGTERM]:
|
||||
signal.signal(s, interrupt_handler)
|
||||
# create train/test experiment
|
||||
script_run_config = create_run_config(azure_config, source_config, all_azure_dataset_ids, all_dataset_mountpoints)
|
||||
commandline_args = " ".join(source_config.script_params)
|
||||
azure_run = create_and_submit_experiment(azure_config, script_run_config, commandline_args=commandline_args)
|
||||
|
||||
if azure_config.wait_for_completion:
|
||||
# We want the job output to be visible on the console, but the program should not exit if the
|
||||
# job fails because we need to download the pytest result file.
|
||||
azure_run.wait_for_completion(show_output=True, raise_on_error=False)
|
||||
|
||||
return azure_run
|
||||
|
||||
|
||||
def get_git_tags(azure_config: AzureConfig) -> Dict[str, str]:
|
||||
"""
|
||||
Creates a dictionary with git-related information, like branch and commit ID. The dictionary key is a string
|
||||
|
@ -107,27 +54,25 @@ def get_git_tags(azure_config: AzureConfig) -> Dict[str, str]:
|
|||
}
|
||||
|
||||
|
||||
def set_run_tags(run: Run, azure_config: AzureConfig, commandline_args: str) -> None:
|
||||
def additional_run_tags(azure_config: AzureConfig, commandline_args: str) -> Dict[str, str]:
|
||||
"""
|
||||
Set metadata for the run
|
||||
:param run: Run to set metadata for.
|
||||
Gets the set of tags that will be added to the AzureML run as metadata, like git status and user name.
|
||||
:param azure_config: The configurations for the present AzureML job
|
||||
:param commandline_args: A string that holds all commandline arguments that were used for the present run.
|
||||
"""
|
||||
git_information = get_git_tags(azure_config)
|
||||
run.set_tags({
|
||||
return {
|
||||
"tag": azure_config.tag,
|
||||
"model_name": azure_config.model,
|
||||
"execution_mode": ModelExecutionMode.TRAIN.value if azure_config.train else ModelExecutionMode.TEST.value,
|
||||
RUN_RECOVERY_ID_KEY_NAME: azure_util.create_run_recovery_id(run=run),
|
||||
RUN_RECOVERY_FROM_ID_KEY_NAME: azure_config.run_recovery_id,
|
||||
"build_number": str(azure_config.build_number),
|
||||
"build_user": azure_config.build_user,
|
||||
"build_user_email": azure_config.build_user_email,
|
||||
**git_information,
|
||||
"commandline_args": commandline_args,
|
||||
CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY: -1,
|
||||
})
|
||||
CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY: "-1",
|
||||
}
|
||||
|
||||
|
||||
def create_experiment_name(azure_config: AzureConfig) -> str:
|
||||
|
@ -144,136 +89,19 @@ def create_experiment_name(azure_config: AzureConfig) -> str:
|
|||
return branch or getpass.getuser() + f"_local_branch_{date.today().strftime('%Y%m')}"
|
||||
|
||||
|
||||
def create_and_submit_experiment(azure_config: AzureConfig,
|
||||
script_run_config: ScriptRunConfig,
|
||||
commandline_args: str) -> Run:
|
||||
"""
|
||||
Creates an AzureML experiment in the workspace and submits it for execution.
|
||||
:param azure_config: azure related configurations to setup a valid workspace.
|
||||
:param script_run_config: The configuration for the script that should be run inside of AzureML.
|
||||
:param commandline_args: A string with all commandline arguments that were provided to the runner. These are only
|
||||
used to set a tag on the submitted AzureML run.
|
||||
:returns: Run object for the submitted AzureML run
|
||||
"""
|
||||
workspace = azure_config.get_workspace()
|
||||
experiment_name = create_experiment_name(azure_config)
|
||||
exp = Experiment(workspace=workspace, name=azure_util.to_azure_friendly_string(experiment_name))
|
||||
|
||||
# submit a training/testing run associated with the experiment
|
||||
run: Run = exp.submit(script_run_config)
|
||||
|
||||
if is_offline_run_context(run):
|
||||
# This codepath will only be executed in unit tests, when exp.submit is mocked.
|
||||
return run
|
||||
|
||||
# Set metadata for the run.
|
||||
set_run_tags(run, azure_config, commandline_args=commandline_args)
|
||||
|
||||
print("\n==============================================================================")
|
||||
print(f"Successfully queued new run {run.id} in experiment: {exp.name}")
|
||||
|
||||
if azure_config.run_recovery_id:
|
||||
print(f"\nRecovered from: {azure_config.run_recovery_id}")
|
||||
|
||||
recovery_id = azure_util.create_run_recovery_id(run)
|
||||
recovery_file = Path(RUN_RECOVERY_FILE)
|
||||
if recovery_file.exists():
|
||||
recovery_file.unlink()
|
||||
recovery_file.write_text(recovery_id)
|
||||
|
||||
print("Experiment URL: {}".format(exp.get_portal_url()))
|
||||
print("Run URL: {}".format(run.get_portal_url()))
|
||||
print("If this run fails, re-start runner.py and supply these additional arguments: "
|
||||
f"--run_recovery_id={recovery_id}")
|
||||
print(f"The run recovery ID has been written to this file: {recovery_file}")
|
||||
print("==============================================================================")
|
||||
if azure_config.tensorboard and azure_config.azureml:
|
||||
print("Starting TensorBoard now because you specified --tensorboard")
|
||||
monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[run.id]), azure_config=azure_config)
|
||||
else:
|
||||
print(f"To monitor this run locally using TensorBoard, run the script: "
|
||||
f"InnerEye/Azure/tensorboard_monitor.py --run_ids={run.id}")
|
||||
print("==============================================================================")
|
||||
return run
|
||||
|
||||
|
||||
def get_or_create_python_environment(azure_config: AzureConfig,
|
||||
source_config: SourceConfig,
|
||||
environment_name: str = "",
|
||||
register_environment: bool = True) -> Environment:
|
||||
"""
|
||||
Creates a description for the Python execution environment in AzureML, based on the Conda environment
|
||||
definition files that are specified in `source_config`. If an environment with this Conda definition already
|
||||
exists, it is retrieved, otherwise created afresh.
|
||||
:param azure_config: azure related configurations to use for model scale-out behaviour
|
||||
:param source_config: configurations for model execution, such as name and execution mode
|
||||
:param environment_name: If specified, try to retrieve the existing Python environment with this name. If that
|
||||
is not found, create one from the Conda files provided. This parameter is meant to be used when running
|
||||
inference for an existing model.
|
||||
:param register_environment: If True, the Python environment will be registered in the AzureML workspace. If
|
||||
False, it will only be created, but not registered. Use this for unit testing.
|
||||
"""
|
||||
# Merge the project-specific dependencies with the packages that InnerEye itself needs. This should not be
|
||||
# necessary if the innereye package is installed. It is necessary when working with an outer project and
|
||||
# InnerEye as a git submodule and submitting jobs from the local machine.
|
||||
# In case of version conflicts, the package version in the outer project is given priority.
|
||||
conda_dependencies, merged_yaml = merge_conda_dependencies(source_config.conda_dependencies_files) # type: ignore
|
||||
if azure_config.pip_extra_index_url:
|
||||
# When an extra-index-url is supplied, swap the order in which packages are searched for.
|
||||
# This is necessary if we need to consume packages from extra-index that clash with names of packages on
|
||||
# pypi
|
||||
conda_dependencies.set_pip_option(f"--index-url {azure_config.pip_extra_index_url}")
|
||||
conda_dependencies.set_pip_option("--extra-index-url https://pypi.org/simple")
|
||||
env_variables = {
|
||||
"AZUREML_OUTPUT_UPLOAD_TIMEOUT_SEC": str(source_config.upload_timeout_seconds),
|
||||
# Occasionally uploading data during the run takes too long, and makes the job fail. Default is 300.
|
||||
"AZUREML_RUN_KILL_SIGNAL_TIMEOUT_SEC": "900",
|
||||
"MKL_SERVICE_FORCE_INTEL": "1",
|
||||
# Switching to a new software stack in AML for mounting datasets
|
||||
"RSLEX_DIRECT_VOLUME_MOUNT": "true",
|
||||
"RSLEX_DIRECT_VOLUME_MOUNT_MAX_CACHE_SIZE": "1",
|
||||
**(source_config.environment_variables or {})
|
||||
}
|
||||
base_image = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04"
|
||||
# Create a name for the environment that will likely uniquely identify it. AzureML does hashing on top of that,
|
||||
# and will re-use existing environments even if they don't have the same name.
|
||||
# Hashing should include everything that can reasonably change. Rely on hashlib here, because the built-in
|
||||
# hash function gives different results for the same string in different python instances.
|
||||
hash_string = "\n".join([merged_yaml, azure_config.docker_shm_size, base_image, str(env_variables)])
|
||||
sha1 = hashlib.sha1(hash_string.encode("utf8"))
|
||||
overall_hash = sha1.hexdigest()[:32]
|
||||
unique_env_name = f"InnerEye-{overall_hash}"
|
||||
try:
|
||||
env_name_to_find = environment_name or unique_env_name
|
||||
env = Environment.get(azure_config.get_workspace(), name=env_name_to_find, version=ENVIRONMENT_VERSION)
|
||||
logging.info(f"Using existing Python environment '{env.name}'.")
|
||||
return env
|
||||
except Exception:
|
||||
logging.info(f"Python environment '{unique_env_name}' does not yet exist, creating and registering it.")
|
||||
env = Environment(name=unique_env_name)
|
||||
env.docker.enabled = True
|
||||
env.docker.shm_size = azure_config.docker_shm_size
|
||||
env.python.conda_dependencies = conda_dependencies
|
||||
env.docker.base_image = base_image
|
||||
env.environment_variables = env_variables
|
||||
if register_environment:
|
||||
env.register(azure_config.get_workspace())
|
||||
return env
|
||||
|
||||
|
||||
def create_dataset_consumptions(azure_config: AzureConfig,
|
||||
all_azure_dataset_ids: List[str],
|
||||
all_dataset_mountpoints: List[str]) -> List[DatasetConsumptionConfig]:
|
||||
def create_dataset_configs(azure_config: AzureConfig,
|
||||
all_azure_dataset_ids: List[str],
|
||||
all_dataset_mountpoints: List[str]) -> List[DatasetConfig]:
|
||||
"""
|
||||
Sets up all the dataset consumption objects for the datasets provided. Datasets that have an empty name will be
|
||||
skipped.
|
||||
:param azure_config: azure related configurations to use for model scale-out behaviour
|
||||
:param all_azure_dataset_ids: The name of all datasets on blob storage that will be used for this run.
|
||||
:param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
|
||||
:return: A list of DatasetConsumptionConfig, in the same order as datasets were provided in all_azure_dataset_ids,
|
||||
:return: A list of DatasetConfig objects, in the same order as datasets were provided in all_azure_dataset_ids,
|
||||
omitting datasets with an empty name.
|
||||
"""
|
||||
dataset_consumptions: List[DatasetConsumptionConfig] = []
|
||||
datasets: List[DatasetConfig] = []
|
||||
if len(all_dataset_mountpoints) > 0:
|
||||
if len(all_azure_dataset_ids) != len(all_dataset_mountpoints):
|
||||
raise ValueError(f"The number of dataset mount points ({len(all_dataset_mountpoints)}) "
|
||||
|
@ -282,64 +110,14 @@ def create_dataset_consumptions(azure_config: AzureConfig,
|
|||
all_dataset_mountpoints = [""] * len(all_azure_dataset_ids)
|
||||
for i, (dataset_id, mount_point) in enumerate(zip(all_azure_dataset_ids, all_dataset_mountpoints)):
|
||||
if dataset_id:
|
||||
dataset_consumption = azure_config.get_dataset_consumption(dataset_id, i, mount_point)
|
||||
dataset_consumptions.append(dataset_consumption)
|
||||
datasets.append(DatasetConfig(name=dataset_id,
|
||||
target_folder=mount_point,
|
||||
use_mounting=azure_config.use_dataset_mount,
|
||||
datastore=azure_config.azureml_datastore))
|
||||
elif mount_point:
|
||||
raise ValueError(f"Inconsistent setup: Dataset name at index {i} is empty, but a mount point has "
|
||||
f"been provided ('{mount_point}')")
|
||||
return dataset_consumptions
|
||||
|
||||
|
||||
def create_run_config(azure_config: AzureConfig,
|
||||
source_config: SourceConfig,
|
||||
all_azure_dataset_ids: List[str],
|
||||
all_dataset_mountpoints: List[str],
|
||||
environment_name: str = "") -> ScriptRunConfig:
|
||||
"""
|
||||
Creates a configuration to run the InnerEye training script in AzureML.
|
||||
:param azure_config: azure related configurations to use for model scale-out behaviour
|
||||
:param source_config: configurations for model execution, such as name and execution mode
|
||||
:param all_azure_dataset_ids: The name of all datasets on blob storage that will be used for this run.
|
||||
:param all_dataset_mountpoints: When using the datasets in AzureML, these are the per-dataset mount points.
|
||||
:param environment_name: If specified, try to retrieve the existing Python environment with this name. If that
|
||||
is not found, create one from the Conda files provided in `source_config`. This parameter is meant to be used
|
||||
when running inference for an existing model.
|
||||
:return: The configured script run.
|
||||
"""
|
||||
dataset_consumptions = create_dataset_consumptions(azure_config, all_azure_dataset_ids, all_dataset_mountpoints)
|
||||
# AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
|
||||
entry_script_relative_path = source_config.entry_script.relative_to(source_config.root_folder).as_posix()
|
||||
logging.info(f"Entry script {entry_script_relative_path} ({source_config.entry_script} relative to "
|
||||
f"source directory {source_config.root_folder})")
|
||||
max_run_duration = None
|
||||
if azure_config.max_run_duration:
|
||||
max_run_duration = run_duration_string_to_seconds(azure_config.max_run_duration)
|
||||
workspace = azure_config.get_workspace()
|
||||
run_config = RunConfiguration(
|
||||
script=entry_script_relative_path,
|
||||
arguments=source_config.script_params,
|
||||
)
|
||||
run_config.environment = get_or_create_python_environment(azure_config, source_config,
|
||||
environment_name=environment_name)
|
||||
run_config.target = azure_config.cluster
|
||||
run_config.max_run_duration_seconds = max_run_duration
|
||||
if azure_config.num_nodes > 1:
|
||||
distributed_job_config = MpiConfiguration(node_count=azure_config.num_nodes)
|
||||
run_config.mpi = distributed_job_config
|
||||
run_config.framework = "Python"
|
||||
run_config.communicator = "IntelMpi"
|
||||
run_config.node_count = distributed_job_config.node_count
|
||||
if len(dataset_consumptions) > 0:
|
||||
run_config.data = {dataset.name: dataset for dataset in dataset_consumptions}
|
||||
# Use blob storage for storing the source, rather than the FileShares section of the storage account.
|
||||
run_config.source_directory_data_store = workspace.datastores.get(WORKSPACE_DEFAULT_BLOB_STORE_NAME).name
|
||||
script_run_config = ScriptRunConfig(
|
||||
source_directory=str(source_config.root_folder),
|
||||
run_config=run_config,
|
||||
)
|
||||
if azure_config.hyperdrive:
|
||||
script_run_config = source_config.hyperdrive_config_func(script_run_config) # type: ignore
|
||||
return script_run_config
|
||||
return datasets
|
||||
|
||||
|
||||
def create_runner_parser(model_config_class: type = None) -> argparse.ArgumentParser:
|
||||
|
|
|
@ -5,19 +5,15 @@
|
|||
import logging
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Generator, List, Optional, Tuple
|
||||
|
||||
import conda_merge
|
||||
import ruamel.yaml
|
||||
from azureml._restclient.constants import RunStatus
|
||||
from azureml.core import Experiment, Run, Workspace, get_run
|
||||
from azureml.core.conda_dependencies import CondaDependencies
|
||||
from azureml.exceptions import UserErrorException
|
||||
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME
|
||||
from health.azure.azure_util import create_run_recovery_id
|
||||
|
||||
DEFAULT_CROSS_VALIDATION_SPLIT_INDEX = -1
|
||||
EXPERIMENT_RUN_SEPARATOR = ":"
|
||||
|
@ -45,16 +41,6 @@ INNEREYE_SDK_NAME = "innereye"
|
|||
INNEREYE_SDK_VERSION = "1.0"
|
||||
|
||||
|
||||
def create_run_recovery_id(run: Run) -> str:
|
||||
"""
|
||||
Creates a recovery id for a run so that its checkpoints can be recovered for training/testing
|
||||
|
||||
:param run: an instantiated run.
|
||||
:return: recovery id for a given run in format: [experiment name]:[run id]
|
||||
"""
|
||||
return str(run.experiment.name + EXPERIMENT_RUN_SEPARATOR + run.id)
|
||||
|
||||
|
||||
def split_recovery_id(id: str) -> Tuple[str, str]:
|
||||
"""
|
||||
Splits a run ID into the experiment name and the actual run.
|
||||
|
@ -149,12 +135,13 @@ def fetch_child_runs(run: Run, status: Optional[str] = None,
|
|||
run = PARENT_RUN_CONTEXT
|
||||
children_runs = list(run.get_children(tags=RUN_RECOVERY_ID_KEY_NAME))
|
||||
if 0 < expected_number_cross_validation_splits != len(children_runs):
|
||||
logging.warning(
|
||||
f"The expected number of child runs was {expected_number_cross_validation_splits}."
|
||||
f"Fetched only: {len(children_runs)} runs. Now trying to fetch them manually.")
|
||||
run_ids_to_evaluate = [f"{create_run_recovery_id(run)}_{i}"
|
||||
for i in range(expected_number_cross_validation_splits)]
|
||||
children_runs = [fetch_run(run.experiment.workspace, id) for id in run_ids_to_evaluate]
|
||||
if 0 < expected_number_cross_validation_splits != len(children_runs):
|
||||
logging.warning(
|
||||
f"The expected number of child runs was {expected_number_cross_validation_splits}."
|
||||
f"Fetched only: {len(children_runs)} runs. Now trying to fetch them manually.")
|
||||
run_ids_to_evaluate = [f"{create_run_recovery_id(run)}_{i}"
|
||||
for i in range(expected_number_cross_validation_splits)]
|
||||
children_runs = [fetch_run(run.experiment.workspace, id) for id in run_ids_to_evaluate]
|
||||
if status is not None:
|
||||
children_runs = [child_run for child_run in children_runs if child_run.get_status() == status]
|
||||
return children_runs
|
||||
|
@ -237,53 +224,6 @@ def strip_prefix(string: str, prefix: str) -> str:
|
|||
return string
|
||||
|
||||
|
||||
def _log_conda_dependencies_stats(conda: CondaDependencies, message_prefix: str) -> None:
|
||||
"""
|
||||
Write number of conda and pip packages to logs.
|
||||
:param conda: A conda dependencies object
|
||||
:param message_prefix: A message to prefix to the log string.
|
||||
"""
|
||||
conda_packages_count = len(list(conda.conda_packages))
|
||||
pip_packages_count = len(list(conda.pip_packages))
|
||||
logging.info(f"{message_prefix}: {conda_packages_count} conda packages, {pip_packages_count} pip packages")
|
||||
logging.debug(" Conda packages:")
|
||||
for p in conda.conda_packages:
|
||||
logging.debug(f" {p}")
|
||||
logging.debug(" Pip packages:")
|
||||
for p in conda.pip_packages:
|
||||
logging.debug(f" {p}")
|
||||
|
||||
|
||||
def merge_conda_files(files: List[Path], result_file: Path) -> None:
|
||||
"""
|
||||
Merges the given Conda environment files using the conda_merge package, and writes the merged file to disk.
|
||||
:param files: The Conda environment files to read.
|
||||
:param result_file: The location where the merge results should be written.
|
||||
"""
|
||||
# This code is a slightly modified version of conda_merge. That code can't be re-used easily
|
||||
# because it defaults to writing to stdout.
|
||||
env_definitions = [conda_merge.read_file(str(f)) for f in files]
|
||||
unified_definition = {}
|
||||
NAME = "name"
|
||||
CHANNELS = "channels"
|
||||
DEPENDENCIES = "dependencies"
|
||||
name = conda_merge.merge_names(env.get(NAME) for env in env_definitions)
|
||||
if name:
|
||||
unified_definition[NAME] = name
|
||||
try:
|
||||
channels = conda_merge.merge_channels(env.get(CHANNELS) for env in env_definitions)
|
||||
except conda_merge.MergeError:
|
||||
logging.error("Failed to merge channel priorities.")
|
||||
raise
|
||||
if channels:
|
||||
unified_definition[CHANNELS] = channels
|
||||
deps = conda_merge.merge_dependencies(env.get(DEPENDENCIES) for env in env_definitions)
|
||||
if deps:
|
||||
unified_definition[DEPENDENCIES] = deps
|
||||
with result_file.open("w") as f:
|
||||
ruamel.yaml.dump(unified_definition, f, indent=2, default_flow_style=False)
|
||||
|
||||
|
||||
def get_all_environment_files(project_root: Path) -> List[Path]:
|
||||
"""
|
||||
Returns a list of all Conda environment files that should be used. This is firstly the InnerEye conda file,
|
||||
|
@ -299,27 +239,6 @@ def get_all_environment_files(project_root: Path) -> List[Path]:
|
|||
return files
|
||||
|
||||
|
||||
def merge_conda_dependencies(files: List[Path]) -> Tuple[CondaDependencies, str]:
|
||||
"""
|
||||
Creates a CondaDependencies object from the Conda environments specified in one or more files.
|
||||
The resulting object contains the union of the Conda and pip packages in the files, where merging
|
||||
is done via the conda_merge package.
|
||||
:param files: The Conda environment files to read.
|
||||
:return: Tuple of (CondaDependencies object that contains packages from all the files,
|
||||
string contents of the merge Conda environment)
|
||||
"""
|
||||
for file in files:
|
||||
_log_conda_dependencies_stats(CondaDependencies(file), f"Conda environment in {file}")
|
||||
temp_merged_file = tempfile.NamedTemporaryFile(delete=False)
|
||||
merged_file_path = Path(temp_merged_file.name)
|
||||
merge_conda_files(files, result_file=merged_file_path)
|
||||
merged_dependencies = CondaDependencies(temp_merged_file.name)
|
||||
_log_conda_dependencies_stats(merged_dependencies, "Merged Conda environment")
|
||||
merged_file_contents = merged_file_path.read_text()
|
||||
temp_merged_file.close()
|
||||
return merged_dependencies, merged_file_contents
|
||||
|
||||
|
||||
def tag_values_all_distinct(runs: List[Run], tag: str) -> bool:
|
||||
"""
|
||||
Returns True iff the runs all have the specified tag and all the values are different.
|
||||
|
@ -395,26 +314,6 @@ def is_running_on_azure_agent() -> bool:
|
|||
return bool(os.environ.get("AGENT_OS", None))
|
||||
|
||||
|
||||
def is_run_and_child_runs_completed(run: Run) -> bool:
|
||||
"""
|
||||
Checks if the given run has successfully completed. If the run has child runs, it also checks if the child runs
|
||||
completed successfully.
|
||||
:param run: The AzureML run to check.
|
||||
:return: True if the run and all child runs completed successfully.
|
||||
"""
|
||||
|
||||
def is_completed(run: Run) -> bool:
|
||||
status = run.get_status()
|
||||
if run.status == RunStatus.COMPLETED:
|
||||
return True
|
||||
logging.info(f"Run {run.id} in experiment {run.experiment.name} finished with status {status}.")
|
||||
return False
|
||||
|
||||
runs = list(run.get_children())
|
||||
runs.append(run)
|
||||
return all(is_completed(run) for run in runs)
|
||||
|
||||
|
||||
def get_comparison_baseline_paths(outputs_folder: Path,
|
||||
blob_path: Path, run: Run,
|
||||
dataset_csv_file_name: str) -> \
|
||||
|
@ -458,43 +357,3 @@ def step_up_directories(path: Path) -> Generator[Path, None, None]:
|
|||
if parent == path:
|
||||
break
|
||||
path = parent
|
||||
|
||||
|
||||
def remove_arg(arg: str, args: List[str]) -> List[str]:
|
||||
"""
|
||||
Remove an argument from a list of arguments. The argument list is assumed to contain
|
||||
elements of the form:
|
||||
"-a", "--arg1", "--arg2", "value2", or "--arg3=value"
|
||||
If there is an item matching "--arg" then it will be removed from the list.
|
||||
|
||||
:param arg: Argument to look for.
|
||||
:param args: List of arguments to scan.
|
||||
:return: List of arguments with --arg removed, if present.
|
||||
"""
|
||||
arg_opt = f"--{arg}"
|
||||
no_arg_opt = f"--no-{arg}"
|
||||
retained_args = []
|
||||
i = 0
|
||||
while i < len(args):
|
||||
arg = args[i]
|
||||
if arg.startswith(arg_opt):
|
||||
if len(arg) == len(arg_opt):
|
||||
# The commandline argument is "--arg", with something possibly following: This can either be
|
||||
# "--arg_opt value" or "--arg_opt --some_other_param"
|
||||
if i < (len(args) - 1):
|
||||
# If the next argument starts with a "-" then assume that it does not belong to the --arg
|
||||
# argument. If there is no "-", assume it belongs to the --arg_opt argument, and skip both
|
||||
if not args[i + 1].startswith("-"):
|
||||
i = i + 1
|
||||
elif arg[len(arg_opt)] == "=":
|
||||
# The commandline argument is "--arg=value": Continue with next arg
|
||||
pass
|
||||
else:
|
||||
# The argument list contains an argument like "--arg_other_param": Keep that.
|
||||
retained_args.append(arg)
|
||||
elif arg == no_arg_opt:
|
||||
pass
|
||||
else:
|
||||
retained_args.append(arg)
|
||||
i = i + 1
|
||||
return retained_args
|
||||
|
|
|
@ -14,7 +14,6 @@ from functools import wraps
|
|||
from pathlib import Path
|
||||
from typing import Any, Callable, Generator, Iterable, List, Optional, Union
|
||||
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.Common.fixed_paths import repository_root_directory
|
||||
from InnerEye.Common.type_annotations import PathOrString
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
|
@ -25,7 +24,6 @@ MAX_PATH_LENGTH = 260
|
|||
empty_string_to_none = lambda x: None if (x is None or len(x.strip()) == 0) else x
|
||||
string_to_path = lambda x: None if (x is None or len(x.strip()) == 0) else Path(x)
|
||||
|
||||
|
||||
SUBJECT_METRICS_FILE_NAME = "metrics.csv"
|
||||
EPOCH_METRICS_FILE_NAME = "epoch_metrics.csv"
|
||||
METRICS_AGGREGATES_FILE = "metrics_aggregates.csv"
|
||||
|
@ -390,20 +388,6 @@ def remove_file_or_directory(pth: Path) -> None:
|
|||
pth.unlink()
|
||||
|
||||
|
||||
def add_folder_to_sys_path_if_needed(folder_under_repo_root: str) -> None:
|
||||
"""
|
||||
Checks if the Python paths in sys.path already contain the given folder, which is expected to be relative
|
||||
to the repository root. If that folder is not yet in sys.path, add it.
|
||||
"""
|
||||
full_folder = repository_root_directory() / folder_under_repo_root
|
||||
for path_str in sys.path:
|
||||
path = Path(path_str)
|
||||
if path == full_folder:
|
||||
return
|
||||
print(f"Adding {full_folder} to sys.path")
|
||||
sys.path.append(str(full_folder))
|
||||
|
||||
|
||||
@contextmanager
|
||||
def change_working_directory(path_or_str: PathOrString) -> Generator:
|
||||
"""
|
||||
|
@ -414,16 +398,3 @@ def change_working_directory(path_or_str: PathOrString) -> Generator:
|
|||
os.chdir(new_path)
|
||||
yield
|
||||
os.chdir(old_path)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def append_to_amlignore(lines_to_append: List[str]) -> Generator:
|
||||
"""
|
||||
Context manager that appends lines to the .amlignore file, and reverts to the previous contents after.
|
||||
"""
|
||||
amlignore = fixed_paths.repository_root_directory(".amlignore")
|
||||
old_contents = amlignore.read_text()
|
||||
new_contents = old_contents.splitlines() + lines_to_append
|
||||
amlignore.write_text("\n".join(new_contents))
|
||||
yield
|
||||
amlignore.write_text(old_contents)
|
||||
|
|
|
@ -2,7 +2,9 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
@ -91,3 +93,28 @@ def get_environment_yaml_file() -> Path:
|
|||
raise ValueError(f"File {ENVIRONMENT_YAML_FILE_NAME} was not found not found in the package folder "
|
||||
f"{INNEREYE_PACKAGE_ROOT}, and not in the repository root {repository_root_directory()}.")
|
||||
return env
|
||||
|
||||
|
||||
def add_submodules_to_path() -> None:
|
||||
"""
|
||||
This function adds all submodules that the code uses to sys.path and to the environment variables. This is
|
||||
necessary to make the code work without any further changes when switching from/to using hi-ml as a package
|
||||
or as a submodule for development.
|
||||
It also adds the InnerEye root folder to sys.path. The latter is necessary to make AzureML and Pytorch Lightning
|
||||
work together: When spawning additional processes for DDP, the working directory is not correctly picked
|
||||
up in sys.path.
|
||||
"""
|
||||
innereye_root = repository_root_directory()
|
||||
folders_to_add = [(innereye_root, "InnerEye"),
|
||||
(innereye_root / "fastMRI", "fastmri"),
|
||||
(innereye_root / "hi-ml" / "src", "health")]
|
||||
for (folder, subfolder_that_must_exist) in folders_to_add:
|
||||
if (folder / subfolder_that_must_exist).is_dir():
|
||||
folder_str = str(folder)
|
||||
if folder_str not in sys.path:
|
||||
logging.debug(f"Adding folder {folder} to sys.path")
|
||||
sys.path.insert(0, folder_str)
|
||||
else:
|
||||
logging.debug(f"Not adding folder {folder} because it is already in sys.path")
|
||||
else:
|
||||
logging.debug(f"Not adding folder {folder} because it does not have subfolder {subfolder_that_must_exist}")
|
||||
|
|
|
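
The report notebooks and the fastMRI wrapper changed below now rely on the add_submodules_to_path helper added above instead of ad-hoc sys.path editing. The call pattern, exactly as it appears in the updated notebooks, is:

from InnerEye.Common.fixed_paths import add_submodules_to_path
add_submodules_to_path()  # makes the fastMRI and hi-ml submodule sources importable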
@ -16,11 +16,7 @@ import torch
|
|||
from pytorch_lightning import LightningDataModule, LightningModule
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
from InnerEye.Common.common_util import add_folder_to_sys_path_if_needed
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
|
||||
add_folder_to_sys_path_if_needed("fastMRI")
|
||||
|
||||
from fastmri.data.subsample import create_mask_for_mask_type
|
||||
from fastmri.data.transforms import VarNetDataTransform
|
||||
from fastmri.pl_modules import FastMriDataModule, VarNetModule
|
||||
|
|
|
@ -73,9 +73,8 @@ def main(yaml_file_path: Path) -> None:
|
|||
In addition, the arguments '--image_channel' and '--gt_channel' must be specified (see below).
|
||||
"""
|
||||
config, runner_config, args = get_configs(SegmentationModelBase(should_validate=False), yaml_file_path)
|
||||
local_dataset = MLRunner(config, azure_config=runner_config).mount_or_download_dataset(config.azure_dataset_id,
|
||||
config.local_dataset)
|
||||
assert local_dataset is not None
|
||||
runner = MLRunner(config, azure_config=runner_config)
|
||||
local_dataset = runner.download_or_use_existing_dataset(config.azure_dataset_id, config.local_dataset)
|
||||
dataframe = pd.read_csv(local_dataset / DATASET_CSV_FILE_NAME)
|
||||
normalizer_config = NormalizeAndVisualizeConfig(**args)
|
||||
actual_mask_channel = None if normalizer_config.ignore_mask else config.mask_id
|
||||
|
|
|
@ -50,6 +50,8 @@
|
|||
"import sys\n",
|
||||
"if str(innereye_path) not in sys.path:\n",
|
||||
" sys.path.append(str(innereye_path))\n",
|
||||
"from InnerEye.Common.fixed_paths import add_submodules_to_path\n",
|
||||
"add_submodules_to_path()\n",
|
||||
"\n",
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
|
|
|
@ -56,6 +56,8 @@
|
|||
"import sys\n",
|
||||
"if str(innereye_path) not in sys.path:\n",
|
||||
" sys.path.append(str(innereye_path))\n",
|
||||
"from InnerEye.Common.fixed_paths import add_submodules_to_path\n",
|
||||
"add_submodules_to_path()\n",
|
||||
"\n",
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
|
|
|
@ -45,6 +45,9 @@
|
|||
"\n",
|
||||
"if str(innereye_path) not in sys.path:\n",
|
||||
" sys.path.append(str(innereye_path))\n",
|
||||
"from InnerEye.Common.fixed_paths import add_submodules_to_path\n",
|
||||
"add_submodules_to_path()\n",
|
||||
"\n",
|
||||
"from InnerEye.ML.reports.segmentation_report import plot_scores_for_csv\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings(\"ignore\")\n",
|
||||
|
@ -159,4 +162,4 @@
|
|||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
}
|
|
@ -21,12 +21,12 @@ from pytorch_lightning.core.datamodule import LightningDataModule
|
|||
from torch.utils.data import DataLoader
|
||||
|
||||
from InnerEye.Azure import azure_util
|
||||
from InnerEye.Azure.azure_config import AzureConfig, INPUT_DATA_KEY
|
||||
from InnerEye.Azure.azure_runner import ENVIRONMENT_VERSION, ENV_OMPI_COMM_WORLD_RANK, get_git_tags
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Azure.azure_runner import ENV_OMPI_COMM_WORLD_RANK, get_git_tags
|
||||
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, \
|
||||
EFFECTIVE_RANDOM_SEED_KEY_NAME, IS_ENSEMBLE_KEY_NAME, MODEL_ID_KEY_NAME, PARENT_RUN_CONTEXT, \
|
||||
PARENT_RUN_ID_KEY_NAME, RUN_CONTEXT, RUN_RECOVERY_FROM_ID_KEY_NAME, RUN_RECOVERY_ID_KEY_NAME, \
|
||||
create_run_recovery_id, get_all_environment_files, is_offline_run_context, merge_conda_files
|
||||
get_all_environment_files, is_offline_run_context
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.Common.common_util import BASELINE_COMPARISONS_FOLDER, BASELINE_WILCOXON_RESULTS_FILE, \
|
||||
CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME, FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, \
|
||||
|
@ -35,11 +35,12 @@ from InnerEye.Common.common_util import BASELINE_COMPARISONS_FOLDER, BASELINE_WI
|
|||
change_working_directory, get_best_epoch_results_path, is_windows, logging_section, logging_to_file, \
|
||||
print_exception, remove_file_or_directory
|
||||
from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME, LOG_FILE_NAME, PYTHON_ENVIRONMENT_NAME
|
||||
from InnerEye.Common.type_annotations import PathOrString
|
||||
from InnerEye.ML.baselines_util import compare_folders_and_run_outputs
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
from InnerEye.ML.config import SegmentationModelBase
|
||||
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, DeepLearningConfig, FINAL_ENSEMBLE_MODEL_FOLDER, \
|
||||
FINAL_MODEL_FOLDER, ModelCategory, MultiprocessingStartMethod, load_checkpoint, EXTRA_RUN_SUBFOLDER
|
||||
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, DeepLearningConfig, EXTRA_RUN_SUBFOLDER, \
|
||||
FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER, ModelCategory, MultiprocessingStartMethod, load_checkpoint
|
||||
from InnerEye.ML.lightning_base import InnerEyeContainer
|
||||
from InnerEye.ML.lightning_container import InnerEyeInference, LightningContainer
|
||||
from InnerEye.ML.metrics import InferenceMetrics, InferenceMetricsForSegmentation
|
||||
|
@ -57,26 +58,14 @@ from InnerEye.ML.utils.run_recovery import RunRecovery
|
|||
from InnerEye.ML.visualizers import activation_maps
|
||||
from InnerEye.ML.visualizers.plot_cross_validation import \
|
||||
get_config_and_results_for_offline_runs, plot_cross_validation_from_files
|
||||
from health.azure.azure_util import ENVIRONMENT_VERSION, create_run_recovery_id, merge_conda_files
|
||||
from health.azure.datasets import get_or_create_dataset
|
||||
from health.azure.himl import AzureRunInfo
|
||||
|
||||
ModelDeploymentHookSignature = Callable[[LightningContainer, AzureConfig, Model, ModelProcessing], Any]
|
||||
PostCrossValidationHookSignature = Callable[[ModelConfigBase, Path], None]
|
||||
|
||||
|
||||
def try_to_mount_input_dataset(dataset_index: int = 0) -> Optional[Path]:
|
||||
"""
|
||||
Checks if the AzureML run context has a field for input datasets. If yes, the dataset stored there is
|
||||
returned as a Path. Returns None if no input dataset was found.
|
||||
|
||||
:param dataset_index: suffix of AML dataset name, return path to INPUT_DATA_KEY_idx dataset
|
||||
"""
|
||||
if hasattr(RUN_CONTEXT, "input_datasets"):
|
||||
try:
|
||||
return Path(RUN_CONTEXT.input_datasets[f"{INPUT_DATA_KEY}_{dataset_index}"])
|
||||
except KeyError:
|
||||
logging.warning(f"Run context field input_datasets has no {INPUT_DATA_KEY}_{dataset_index} entry.")
|
||||
return None
|
||||
|
||||
|
||||
def download_dataset(azure_dataset_id: str,
|
||||
target_folder: Path,
|
||||
dataset_csv: str,
|
||||
|
@ -95,7 +84,9 @@ def download_dataset(azure_dataset_id: str,
|
|||
:return: A path on the local machine that contains the dataset.
|
||||
"""
|
||||
logging.info("Trying to download dataset via AzureML datastore now.")
|
||||
azure_dataset = azure_config.get_or_create_dataset(azure_dataset_id)
|
||||
azure_dataset = get_or_create_dataset(workspace=azure_config.get_workspace(),
|
||||
datastore_name=azure_config.azureml_datastore,
|
||||
dataset_name=azure_dataset_id)
|
||||
if not isinstance(azure_dataset, FileDataset):
|
||||
raise ValueError(f"Expected to get a FileDataset, but got {type(azure_dataset)}")
|
||||
# The downloaded dataset may already exist from a previous run.
|
||||
|
@ -122,6 +113,20 @@ def download_dataset(azure_dataset_id: str,
|
|||
return expected_dataset_path
|
||||
|
||||
|
||||
def check_dataset_folder_exists(local_dataset: PathOrString) -> Path:
|
||||
"""
|
||||
Checks if a folder with a local dataset exists. If it does exist, return the argument converted to a Path instance.
|
||||
If it does not exist, raise a FileNotFoundError.
|
||||
:param local_dataset: The dataset folder to check.
|
||||
:return: The local_dataset argument, converted to a Path.
|
||||
"""
|
||||
expected_dir = Path(local_dataset)
|
||||
if not expected_dir.is_dir():
|
||||
raise FileNotFoundError(f"The model uses a dataset in {expected_dir}, but that does not exist.")
|
||||
logging.info(f"Model training will use the local dataset provided in {expected_dir}")
|
||||
return expected_dir
|
||||
|
||||
|
||||
def log_metrics(metrics: Dict[ModelExecutionMode, InferenceMetrics],
|
||||
run_context: Run) -> None:
|
||||
"""
|
||||
|
@ -185,36 +190,40 @@ class MLRunner:
|
|||
self.output_subfolder = output_subfolder
|
||||
self._has_setup_run = False
|
||||
|
||||
def setup(self, use_mount_or_download_dataset: bool = True) -> None:
|
||||
def setup(self, azure_run_info: Optional[AzureRunInfo] = None) -> None:
|
||||
"""
|
||||
If the present object is using one of the InnerEye built-in models, create a (fake) container for it
|
||||
and call the setup method. It sets the random seeds, and then creates the actual Lightning modules.
|
||||
:param use_mount_or_download_dataset: If True, try to download or mount the dataset that is used by the model.
|
||||
If False, assume that the dataset is already available (this should only be used for unit tests).
|
||||
:param azure_run_info: When running in AzureML or on a local VM, this contains the paths to the datasets.
|
||||
This can be missing when running in unit tests, where the local dataset paths are already populated.
|
||||
"""
|
||||
if self._has_setup_run:
|
||||
return
|
||||
if (not self.azure_config.only_register_model) and use_mount_or_download_dataset:
|
||||
if (not self.azure_config.only_register_model) and azure_run_info:
|
||||
dataset_index = 0
|
||||
# Set local_dataset to the mounted path specified in azure_runner.py, if any, or download it if that fails
|
||||
# and config.local_dataset was not already set.
|
||||
# This must happen before container setup because that could already read datasets.
|
||||
mounted_dataset = self.mount_or_download_dataset(self.container.azure_dataset_id,
|
||||
self.container.local_dataset)
|
||||
if mounted_dataset is not None:
|
||||
if self.container.azure_dataset_id:
|
||||
mounted_dataset = azure_run_info.input_datasets[dataset_index]
|
||||
if mounted_dataset is None:
|
||||
mounted_dataset = self.download_or_use_existing_dataset(self.container.azure_dataset_id,
|
||||
self.container.local_dataset)
|
||||
self.container.local_dataset = mounted_dataset
|
||||
|
||||
extra_locals = []
|
||||
if self.is_offline_run and len(self.container.extra_local_dataset_paths) != 0:
|
||||
for local in self.container.extra_local_dataset_paths:
|
||||
extra_local_dataset = self.mount_or_download_dataset(None, local)
|
||||
assert extra_local_dataset is not None # for mypy
|
||||
extra_locals.append(extra_local_dataset)
|
||||
elif len(self.container.extra_azure_dataset_ids) != 0:
|
||||
for i, azure_id in enumerate(self.container.extra_azure_dataset_ids, 1):
|
||||
extra_local_dataset = self.mount_or_download_dataset(azure_id, None, dataset_index=i)
|
||||
assert extra_local_dataset is not None # for mypy
|
||||
extra_locals.append(extra_local_dataset)
|
||||
self.container.extra_local_dataset_paths = extra_locals
|
||||
dataset_index += 1
|
||||
if self.container.extra_azure_dataset_ids:
|
||||
num_extra_local_datasets = len(self.container.extra_local_dataset_paths)
|
||||
extra_locals: List[Path] = []
|
||||
for i, extra_azure_id in enumerate(self.container.extra_azure_dataset_ids):
|
||||
if num_extra_local_datasets > 0 and i < (num_extra_local_datasets - 1):
|
||||
raise ValueError(f"The model refers to an Azure dataset '{extra_azure_id}' at index {i}, "
|
||||
f"but there are not enough local datasets given ")
|
||||
mounted_dataset = azure_run_info.input_datasets[dataset_index]
|
||||
if mounted_dataset is None:
|
||||
local = None if num_extra_local_datasets == 0 else self.container.extra_local_dataset_paths[i]
|
||||
mounted_dataset = self.download_or_use_existing_dataset(extra_azure_id, local_dataset=local)
|
||||
extra_locals.append(mounted_dataset)
|
||||
self.container.extra_local_dataset_paths = extra_locals
|
||||
# Ensure that we use fixed seeds before initializing the PyTorch models
|
||||
seed_everything(self.container.get_effective_random_seed())
|
||||
# Creating the folder structure must happen before the LightningModule is created, because the output
|
||||
|
@ -235,10 +244,11 @@ class MLRunner:
|
|||
|
||||
if self.container.pretraining_run_recovery_id is not None:
|
||||
run_to_recover = self.azure_config.fetch_run(self.container.pretraining_run_recovery_id.strip())
|
||||
only_return_path = not is_global_rank_zero()
|
||||
run_recovery_object = RunRecovery.download_all_checkpoints_from_run(self.container,
|
||||
run_to_recover,
|
||||
EXTRA_RUN_SUBFOLDER,
|
||||
only_return_path=not is_global_rank_zero())
|
||||
only_return_path=only_return_path)
|
||||
self.container.pretraining_run_checkpoints = run_recovery_object
|
||||
|
||||
# A lot of the code for the built-in InnerEye models expects the output paths directly in the config files.
|
||||
|
@ -543,54 +553,30 @@ class MLRunner:
|
|||
activation_maps.extract_activation_maps(self.innereye_config) # type: ignore
|
||||
logging.info("Successfully extracted and saved activation maps")
|
||||
|
||||
def mount_or_download_dataset(self,
|
||||
azure_dataset_id: Optional[str],
|
||||
local_dataset: Optional[Path],
|
||||
dataset_index: int = 0) -> Optional[Path]:
|
||||
def download_or_use_existing_dataset(self,
|
||||
azure_dataset_id: Optional[str],
|
||||
local_dataset: Optional[Path]) -> Path:
|
||||
"""
|
||||
Makes the dataset that the model uses available on the executing machine. If the present training run is outside
|
||||
of AzureML, it expects that the model has a `local_dataset` field set, in which case no action will be
|
||||
taken. If a dataset is specified in `azure_dataset_id`, it will attempt to download the dataset from Azure
|
||||
into the local repository, in the "datasets" folder.
|
||||
If the training run is inside of AzureML, the dataset that was specified at job submission time will be
|
||||
mounted or downloaded.
|
||||
:param azure_dataset_id: id of the dataset in AML workspace
|
||||
:param local_dataset: alternatively local path for this dataset
|
||||
:param dataset_index: index of the dataset processed
|
||||
:returns: the path of the dataset on the executing machine.
|
||||
:returns: The path of the dataset on the executing machine.
|
||||
"""
|
||||
if self.is_offline_run:
# A dataset, either local or in Azure, is required for the built-in InnerEye models. When models are
# specified via a LightningContainer, these dataset fields are optional, because the container datasets
# could be downloaded even from the web.
is_dataset_required = isinstance(self.container, InnerEyeContainer)
# The present run is outside of AzureML: If local_dataset is set, use that as the path to the data.
# Otherwise, download the dataset specified by the azure_dataset_id
if is_dataset_required:
if (not azure_dataset_id) and (local_dataset is None):
raise ValueError("The model must contain either local_dataset or azure_dataset_id.")
if local_dataset:
expected_dir = Path(local_dataset)
if not expected_dir.is_dir():
raise FileNotFoundError(f"The model uses a dataset in {expected_dir}, but that does not exist.")
logging.info(f"Model training will use the local dataset provided in {expected_dir}")
return expected_dir
if azure_dataset_id:
dataset_csv = ""
if isinstance(self.model_config, DeepLearningConfig):
dataset_csv = self.model_config.dataset_csv
return download_dataset(azure_dataset_id=azure_dataset_id,
target_folder=self.project_root / fixed_paths.DATASETS_DIR_NAME,
dataset_csv=dataset_csv, azure_config=self.azure_config)
return None

# Inside of AzureML, datasets can be either mounted or downloaded.
if not self.is_offline_run:
raise ValueError("This function should only be called in runs outside AzureML.")
if local_dataset:
return check_dataset_folder_exists(local_dataset)
if azure_dataset_id:
mounted = try_to_mount_input_dataset(dataset_index)
if not mounted:
raise ValueError("Unable to mount or download input dataset.")
return mounted
return None
dataset_csv = ""
if isinstance(self.model_config, DeepLearningConfig):
dataset_csv = self.model_config.dataset_csv
return download_dataset(azure_dataset_id=azure_dataset_id,
target_folder=self.project_root / fixed_paths.DATASETS_DIR_NAME,
dataset_csv=dataset_csv, azure_config=self.azure_config)
raise ValueError("The model must contain either local_dataset or azure_dataset_id")

def set_multiprocessing_start_method(self) -> None:
"""

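For context, a minimal sketch of how the renamed dataset helper is typically exercised in an offline run; this mirrors the test code further down and is illustrative only (the `runner` object and its container fields are assumed to be set up in the usual way):

# Illustrative sketch, not part of this commit: resolve the dataset for an offline run.
# Assumes `runner` is an MLRunner whose container defines azure_dataset_id / local_dataset.
local_path = runner.download_or_use_existing_dataset(runner.container.azure_dataset_id,
                                                     runner.container.local_dataset)
print(f"Training will read data from {local_path}")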
@ -2,12 +2,12 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import logging
import os
import sys
import warnings
import uuid
from pathlib import Path

import matplotlib
from typing import Optional, Tuple

# Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress
# individual warnings only.

@ -21,29 +21,29 @@ if (innereye_root / "InnerEye").is_dir():
if innereye_root_str not in sys.path:
print(f"Adding InnerEye folder to sys.path: {innereye_root_str}")
sys.path.insert(0, innereye_root_str)
# We change the current working directory before starting the actual training. However, this throws off starting
# the child training threads because sys.argv[0] is a relative path when running in AzureML. Turn that into an absolute
# path.
runner_path = Path(sys.argv[0])
if not runner_path.is_absolute():
sys.argv[0] = str(runner_path.absolute())

import logging
from typing import Optional, Tuple
from InnerEye.Common import fixed_paths
fixed_paths.add_submodules_to_path()

from azureml._base_sdk_common import user_agent
from azureml.core import Run
from azureml.core import Run, ScriptRunConfig
from health.azure.himl import AzureRunInfo, submit_to_azure_if_needed
from health.azure.azure_util import create_run_recovery_id, merge_conda_files, to_azure_friendly_string
import matplotlib

from InnerEye.Azure.tensorboard_monitor import AMLTensorBoardMonitorConfig, monitor
from InnerEye.Azure import azure_util
from InnerEye.Azure.azure_config import AzureConfig, ParserResult, SourceConfig
from InnerEye.Azure.azure_runner import create_runner_parser, get_git_tags, parse_args_and_add_yaml_variables, \
parse_arguments, set_environment_variables_for_multi_node, submit_to_azureml
from InnerEye.Azure.azure_util import RUN_CONTEXT, get_all_environment_files, is_offline_run_context, \
is_run_and_child_runs_completed
from InnerEye.Azure.azure_runner import (DEFAULT_DOCKER_BASE_IMAGE, create_dataset_configs, create_experiment_name,
create_runner_parser,
get_git_tags,
parse_args_and_add_yaml_variables,
parse_arguments, additional_run_tags,
set_environment_variables_for_multi_node)
from InnerEye.Azure.azure_util import (RUN_CONTEXT, RUN_RECOVERY_ID_KEY_NAME, get_all_environment_files,
is_offline_run_context)
from InnerEye.Azure.run_pytest import download_pytest_result, run_pytest
from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, \
append_to_amlignore, disable_logging_to_file, is_linux, logging_to_stdout
from InnerEye.Common.common_util import (FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE,
disable_logging_to_file, is_linux, logging_to_stdout)
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.ML.common import DATASET_CSV_FILE_NAME
from InnerEye.ML.deep_learning_config import DeepLearningConfig

@ -54,6 +54,13 @@ from InnerEye.ML.run_ml import MLRunner, ModelDeploymentHookSignature, PostCross
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.lightning_container import LightningContainer

# We change the current working directory before starting the actual training. However, this throws off starting
# the child training threads because sys.argv[0] is a relative path when running in AzureML. Turn that into an absolute
# path.
runner_path = Path(sys.argv[0])
if not runner_path.is_absolute():
sys.argv[0] = str(runner_path.absolute())


def initialize_rpdb() -> None:
"""

@ -185,7 +192,7 @@ class Runner:
logging.info("extra_code_directory is unset")
return parser_result

def run(self) -> Tuple[Optional[DeepLearningConfig], Optional[Run]]:
def run(self) -> Tuple[Optional[DeepLearningConfig], AzureRunInfo]:
"""
The main entry point for training and testing models from the commandline. This chooses a model to train
via a commandline argument, runs training or testing, and writes all required info to disk and logs.

@ -201,64 +208,130 @@ class Runner:
if self.lightning_container.perform_cross_validation:
# force hyperdrive usage if performing cross validation
self.azure_config.hyperdrive = True
run_object: Optional[Run] = None
if self.azure_config.azureml:
run_object = self.submit_to_azureml()
else:
self.run_in_situ()
azure_run_info = self.submit_to_azureml_if_needed()
self.run_in_situ(azure_run_info)
if self.model_config is None:
return self.lightning_container, run_object
return self.model_config, run_object
return self.lightning_container, azure_run_info
return self.model_config, azure_run_info

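# A short sketch of exercising this entry point programmatically, mirroring how the tests further down
# drive it; illustrative only and not part of this commit. The model name is an example taken from the
# test code, and the settings file is the default one used throughout the repo.
from unittest import mock
from InnerEye.Common import fixed_paths
from InnerEye.ML.runner import Runner

runner = Runner(project_root=fixed_paths.repository_root_directory(),
                yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)
with mock.patch("sys.argv", ["", "--model=HelloContainer"]):
    loaded_config, azure_run_info = runner.run()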
def submit_to_azureml(self) -> Run:
|
||||
def submit_to_azureml_if_needed(self) -> AzureRunInfo:
|
||||
"""
|
||||
Submit a job to AzureML, returning the resulting Run object, or exiting if we were asked to wait for
|
||||
completion and the Run did not succeed.
|
||||
"""
|
||||
# The adal package creates a logging.info line each time it gets an authentication token, avoid that.
|
||||
logging.getLogger('adal-python').setLevel(logging.WARNING)
|
||||
# Azure core prints full HTTP requests even in INFO mode
|
||||
logging.getLogger('azure').setLevel(logging.WARNING)
|
||||
# PyJWT prints out warnings that are beyond our control
|
||||
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||
if isinstance(self.model_config, DeepLearningConfig) and not self.lightning_container.azure_dataset_id:
|
||||
if self.azure_config.azureml and isinstance(self.model_config, DeepLearningConfig) \
|
||||
and not self.lightning_container.azure_dataset_id:
|
||||
raise ValueError("When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
|
||||
"property must be set.")
|
||||
source_config = SourceConfig(
|
||||
root_folder=self.project_root,
|
||||
entry_script=Path(sys.argv[0]).resolve(),
|
||||
script_params=sys.argv[1:],
|
||||
conda_dependencies_files=get_all_environment_files(self.project_root),
|
||||
hyperdrive_config_func=(self.model_config.get_hyperdrive_config if self.model_config
|
||||
else self.lightning_container.get_hyperdrive_config),
|
||||
# For large jobs, upload of results can time out because of large checkpoint files. Default is 600
|
||||
upload_timeout_seconds=86400
|
||||
)
|
||||
source_config.set_script_params_except_submit_flag()
|
||||
# Reduce the size of the snapshot by adding unused folders to amlignore. The Test* subfolders are only needed
|
||||
# when running pytest.
|
||||
ignored_folders = []
|
||||
if not self.azure_config.pytest_mark:
|
||||
ignored_folders.extend(["Tests", "TestsOutsidePackage", "TestSubmodule"])
|
||||
ignored_folders.extend(["Tests", "TestsOutsidePackage"])
|
||||
if not self.lightning_container.regression_test_folder:
|
||||
ignored_folders.append("RegressionTestResults")
|
||||
with append_to_amlignore(ignored_folders):
|
||||
azure_run = submit_to_azureml(self.azure_config, source_config,
|
||||
self.lightning_container.all_azure_dataset_ids(),
|
||||
self.lightning_container.all_dataset_mountpoints())
|
||||
logging.info("Job submission to AzureML done.")
|
||||
if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
|
||||
# The AzureML job can optionally run pytest. Attempt to download it to the current directory.
|
||||
# A build step will pick up that file and publish it to Azure DevOps.
|
||||
# If pytest_mark is set, this file must exist.
|
||||
logging.info("Downloading pytest result file.")
|
||||
download_pytest_result(azure_run)
|
||||
else:
|
||||
logging.info("No pytest_mark present, hence not downloading the pytest result file.")
|
||||
# For PR builds where we wait for job completion, the job must have ended in a COMPLETED state.
|
||||
if self.azure_config.wait_for_completion and not is_run_and_child_runs_completed(azure_run):
|
||||
raise ValueError(f"Run {azure_run.id} in experiment {azure_run.experiment.name} or one of its child "
|
||||
"runs failed.")
|
||||
return azure_run
|
||||
|
||||
input_datasets = create_dataset_configs(self.azure_config,
|
||||
all_azure_dataset_ids=self.lightning_container.all_azure_dataset_ids(),
|
||||
all_dataset_mountpoints=self.lightning_container.all_dataset_mountpoints())
|
||||
|
||||
|
||||
def after_submission_hook(azure_run: Run) -> None:
|
||||
"""
|
||||
A function that will be called right after job submission.
|
||||
"""
|
||||
# Add an extra tag that depends on the run that was actually submitted. This is used for later filtering
# runs in cross validation analysis
recovery_id = create_run_recovery_id(azure_run)
|
||||
azure_run.tag(RUN_RECOVERY_ID_KEY_NAME, recovery_id)
|
||||
print("If this run fails, re-start runner.py and supply these additional arguments: "
|
||||
f"--run_recovery_id={recovery_id}")
|
||||
if self.azure_config.tensorboard:
|
||||
print("Starting TensorBoard now because you specified --tensorboard")
|
||||
monitor(monitor_config=AMLTensorBoardMonitorConfig(run_ids=[azure_run.id]),
|
||||
azure_config=self.azure_config)
|
||||
else:
|
||||
print(f"To monitor this run locally using TensorBoard, run the script: "
|
||||
f"InnerEye/Azure/tensorboard_monitor.py --run_ids={azure_run.id}")
|
||||
|
||||
if self.azure_config.wait_for_completion:
|
||||
# We want the job output to be visible on the console, but the program should not exit if the
|
||||
# job fails because we need to download the pytest result file.
|
||||
azure_run.wait_for_completion(show_output=True, raise_on_error=False)
|
||||
if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
|
||||
# The AzureML job can optionally run pytest. Attempt to download it to the current directory.
|
||||
# A build step will pick up that file and publish it to Azure DevOps.
|
||||
# If pytest_mark is set, this file must exist.
|
||||
logging.info("Downloading pytest result file.")
|
||||
download_pytest_result(azure_run)
|
||||
|
||||
hyperdrive_config = None
|
||||
if self.azure_config.hyperdrive:
|
||||
hyperdrive_config = self.lightning_container.get_hyperdrive_config(ScriptRunConfig(source_directory=""))
|
||||
|
||||
# Create a temporary file for the merged conda file, that will be removed after submission of the job.
|
||||
temp_conda: Optional[Path] = None
|
||||
try:
|
||||
if len(source_config.conda_dependencies_files) > 1:
|
||||
temp_conda = source_config.root_folder / f"temp_environment-{uuid.uuid4().hex[:8]}.yml"
|
||||
# Merge the project-specific dependencies with the packages that InnerEye itself needs. This should not
|
||||
# be necessary if the innereye package is installed. It is necessary when working with an outer project
|
||||
# and InnerEye as a git submodule and submitting jobs from the local machine.
|
||||
# In case of version conflicts, the package version in the outer project is given priority.
|
||||
merge_conda_files(source_config.conda_dependencies_files, temp_conda)
|
||||
|
||||
# Calls like `self.azure_config.get_workspace()` will fail if we have no AzureML credentials set up, and so
|
||||
# we should only attempt them if we intend to elevate this to AzureML
|
||||
if self.azure_config.azureml:
|
||||
if not self.azure_config.cluster:
raise ValueError("self.azure_config.cluster not set, but we need a compute_cluster_name to submit "
"the script to run in AzureML")
azure_run_info = submit_to_azure_if_needed(
|
||||
entry_script=source_config.entry_script,
|
||||
snapshot_root_directory=source_config.root_folder,
|
||||
script_params=source_config.script_params,
|
||||
conda_environment_file=temp_conda or source_config.conda_dependencies_files[0],
|
||||
aml_workspace=self.azure_config.get_workspace(),
|
||||
compute_cluster_name=self.azure_config.cluster,
|
||||
environment_variables=source_config.environment_variables,
|
||||
default_datastore=self.azure_config.azureml_datastore,
|
||||
experiment_name=to_azure_friendly_string(create_experiment_name(self.azure_config)),
|
||||
max_run_duration=self.azure_config.max_run_duration,
|
||||
input_datasets=input_datasets,
|
||||
num_nodes=self.azure_config.num_nodes,
|
||||
wait_for_completion=False,
|
||||
ignored_folders=ignored_folders,
|
||||
pip_extra_index_url=self.azure_config.pip_extra_index_url,
|
||||
submit_to_azureml=self.azure_config.azureml,
|
||||
docker_base_image=DEFAULT_DOCKER_BASE_IMAGE,
|
||||
docker_shm_size=self.azure_config.docker_shm_size,
|
||||
tags=additional_run_tags(
|
||||
azure_config=self.azure_config,
|
||||
commandline_args=" ".join(source_config.script_params)),
|
||||
after_submission=after_submission_hook,
|
||||
hyperdrive_config=hyperdrive_config)
|
||||
else:
|
||||
# compute_cluster_name is a required parameter in early versions of the HI-ML package
|
||||
azure_run_info = submit_to_azure_if_needed(
|
||||
input_datasets=input_datasets,
|
||||
submit_to_azureml=False,
|
||||
compute_cluster_name="")
|
||||
finally:
|
||||
if temp_conda:
|
||||
temp_conda.unlink()
|
||||
# submit_to_azure_if_needed calls sys.exit after submitting to AzureML. We only reach this when running
|
||||
# the script locally or in AzureML.
|
||||
return azure_run_info
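# A minimal, self-contained sketch of driving the hi-ml submission entry point directly; illustrative
# only and not part of this commit. The conda file and cluster name below are placeholder assumptions,
# all other argument names appear in the code above.
from pathlib import Path
from health.azure.himl import submit_to_azure_if_needed

run_info = submit_to_azure_if_needed(entry_script=Path("InnerEye/ML/runner.py"),
                                     snapshot_root_directory=Path.cwd(),
                                     conda_environment_file=Path("environment.yml"),  # assumed file name
                                     compute_cluster_name="training-nc12",  # assumed cluster name
                                     submit_to_azureml=False)  # set True to actually submit the job
print(run_info.is_running_in_azure)
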
def print_git_tags(self) -> None:
|
||||
"""
|
||||
|
@ -278,9 +351,11 @@ class Runner:
|
|||
for key, value in tags_to_print.items():
|
||||
logging.info(f" {key:20}: {value}")
|
||||
|
||||
def run_in_situ(self) -> None:
|
||||
def run_in_situ(self, azure_run_info: AzureRunInfo) -> None:
|
||||
"""
|
||||
Actually run the AzureML job; this method will typically run on an Azure VM.
|
||||
:param azure_run_info: Contains all information about the present run in AzureML, in particular where the
|
||||
datasets are mounted.
|
||||
"""
|
||||
# Only set the logging level now. Usually, when we set logging to DEBUG, we want diagnostics about the model
|
||||
# build itself, but not the tons of debug information that AzureML submissions create.
|
||||
|
@ -305,7 +380,7 @@ class Runner:
|
|||
# if it detects that it is not in a multi-node environment.
|
||||
set_environment_variables_for_multi_node()
|
||||
ml_runner = self.create_ml_runner()
|
||||
ml_runner.setup()
|
||||
ml_runner.setup(azure_run_info)
|
||||
ml_runner.start_logging_to_file()
|
||||
try:
|
||||
ml_runner.run()
|
||||
|
|
|
@ -11,15 +11,15 @@ from typing import Dict, List, Optional
|
|||
|
||||
import param
|
||||
import requests
|
||||
from azureml.core import Experiment, Model
|
||||
from azureml.core import Model, ScriptRunConfig
|
||||
|
||||
from InnerEye.Azure.azure_config import AzureConfig, SourceConfig
|
||||
from InnerEye.Azure.azure_runner import create_run_config
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Common.common_util import logging_to_stdout
|
||||
from InnerEye.Common.fixed_paths import DEFAULT_DATA_FOLDER, DEFAULT_RESULT_IMAGE_NAME, DEFAULT_RESULT_ZIP_DICOM_NAME, \
|
||||
DEFAULT_TEST_IMAGE_NAME, DEFAULT_TEST_ZIP_NAME, ENVIRONMENT_YAML_FILE_NAME, RUN_SCORING_SCRIPT, SCORE_SCRIPT, \
|
||||
SETTINGS_YAML_FILE, repository_root_directory, PYTHON_ENVIRONMENT_NAME
|
||||
DEFAULT_TEST_IMAGE_NAME, DEFAULT_TEST_ZIP_NAME, ENVIRONMENT_YAML_FILE_NAME, PYTHON_ENVIRONMENT_NAME, \
|
||||
RUN_SCORING_SCRIPT, SCORE_SCRIPT, SETTINGS_YAML_FILE, repository_root_directory
|
||||
from InnerEye.Common.generic_parsing import GenericConfig
|
||||
from health.azure.himl import create_run_configuration, submit_run
|
||||
|
||||
|
||||
class SubmitForInferenceConfig(GenericConfig):
|
||||
|
@ -29,8 +29,9 @@ class SubmitForInferenceConfig(GenericConfig):
|
|||
experiment_name: str = param.String(default="model_inference",
|
||||
doc="Name of experiment the run should belong to")
|
||||
model_id: str = param.String(doc="Id of model, e.g. Prostate:123. Mandatory.")
|
||||
image_file: Path = param.ClassSelector(class_=Path, doc="Image file to segment, ending in .nii.gz if use_dicom=False, "
|
||||
"or zip of a DICOM series otherwise. Mandatory.")
|
||||
image_file: Path = param.ClassSelector(class_=Path,
|
||||
doc="Image file to segment, ending in .nii.gz if use_dicom=False, "
|
||||
"or zip of a DICOM series otherwise. Mandatory.")
|
||||
settings: Path = param.ClassSelector(class_=Path,
|
||||
doc="File containing Azure settings (typically your settings.yml). If not "
|
||||
"provided, use the default settings file.")
|
||||
|
@ -66,7 +67,8 @@ class SubmitForInferenceConfig(GenericConfig):
|
|||
def copy_image_file(image: Path, destination_folder: Path, use_dicom: bool) -> Path:
|
||||
"""
|
||||
Copy the source image file into the given folder destination_folder.
|
||||
:param image: image file, must be Gzipped Nifti format with name ending .nii.gz if use_dicom=False or .zip otherwise.
|
||||
:param image: image file, must be Gzipped Nifti format with name ending .nii.gz if use_dicom=False or .zip
|
||||
otherwise.
|
||||
:param destination_folder: top-level directory to copy image into (as test.nii.gz or test.zip)
|
||||
:param use_dicom: True to treat as a zip file.
|
||||
:return: The full path of the image in the destination_folder
|
||||
|
@ -154,43 +156,45 @@ def submit_for_inference(args: SubmitForInferenceConfig, azure_config: AzureConf
|
|||
# clash.
|
||||
temp_folder = source_directory_path / "temp_for_scoring"
|
||||
conda_files = download_files_from_model(model_sas_urls, ENVIRONMENT_YAML_FILE_NAME, dir_path=temp_folder)
|
||||
if not conda_files:
|
||||
raise ValueError("At least 1 Conda environment definition must exist in the model.")
|
||||
if len(conda_files) != 1:
|
||||
raise ValueError("Exactly 1 Conda environment definition must exist in the model.")
|
||||
# Retrieve the name of the Python environment that the training run used. This environment should have been
# registered. If no such environment exists, it will be re-created from the Conda files provided.
python_environment_name = model.tags.get(PYTHON_ENVIRONMENT_NAME, "")
|
||||
if not python_environment_name:
|
||||
raise ValueError(f"The model did not contain tag {PYTHON_ENVIRONMENT_NAME} for the AzureML environment to use.")
|
||||
# Copy the scoring script from the repository. This will start the model download from Azure, and invoke the
|
||||
# scoring script.
|
||||
entry_script = source_directory_path / Path(RUN_SCORING_SCRIPT).name
|
||||
shutil.copyfile(str(repository_root_directory(RUN_SCORING_SCRIPT)),
|
||||
str(entry_script))
|
||||
source_config = SourceConfig(
|
||||
root_folder=source_directory_path,
|
||||
entry_script=entry_script,
|
||||
script_params=["--model-folder", ".",
|
||||
"--model-id", model_id,
|
||||
SCORE_SCRIPT,
|
||||
# The data folder must be relative to the root folder of the AzureML job. test_image_files
|
||||
# is then just the file relative to the data_folder
|
||||
"--data_folder", image.parent.name,
|
||||
"--image_files", image.name,
|
||||
"--use_dicom", str(args.use_dicom),
|
||||
"--model_id", model_id],
|
||||
conda_dependencies_files=conda_files,
|
||||
run_config = create_run_configuration(workspace=azure_config.get_workspace(),
|
||||
compute_cluster_name=azure_config.cluster,
|
||||
aml_environment_name=python_environment_name)
|
||||
script_run_config = ScriptRunConfig(
|
||||
source_directory=str(source_directory_path),
|
||||
script=entry_script.relative_to(source_directory_path),
|
||||
arguments=["--model-folder", ".",
|
||||
"--model-id", model_id,
|
||||
SCORE_SCRIPT,
|
||||
# The data folder must be relative to the root folder of the AzureML
|
||||
# job. image_files is then just the file relative to the data_folder
|
||||
"--data_folder", image.parent.name,
|
||||
"--image_files", image.name,
|
||||
"--use_dicom", str(args.use_dicom),
|
||||
"--model_id", model_id],
|
||||
run_config=run_config
|
||||
)
|
||||
run_config = create_run_config(azure_config, source_config, environment_name=python_environment_name,
|
||||
all_azure_dataset_ids=[], all_dataset_mountpoints=[])
|
||||
exp = Experiment(workspace=workspace, name=args.experiment_name)
|
||||
run = exp.submit(run_config)
|
||||
logging.info(f"Submitted run {run.id} in experiment {run.experiment.name}")
|
||||
logging.info(f"Run URL: {run.get_portal_url()}")
|
||||
|
||||
run = submit_run(workspace=workspace,
|
||||
experiment_name=args.experiment_name,
|
||||
script_run_config=script_run_config,
|
||||
wait_for_completion=True)
|
||||
if not args.keep_upload_folder:
|
||||
source_directory.cleanup()
|
||||
logging.info(f"Deleted submission directory {source_directory_path}")
|
||||
if args.download_folder is None:
|
||||
return None
|
||||
logging.info("Awaiting run completion")
|
||||
run.wait_for_completion()
|
||||
logging.info(f"Run has completed with status {run.get_status()}")
|
||||
download_file = DEFAULT_RESULT_ZIP_DICOM_NAME if args.use_dicom else DEFAULT_RESULT_IMAGE_NAME
|
||||
download_path = choose_download_path(download_file, args.download_folder)
|
||||
|
|
|
@ -20,9 +20,9 @@ import numpy as np
|
|||
import pytest
|
||||
from azureml._restclient.constants import RunStatus
|
||||
from azureml.core import Model, Run
|
||||
from health.azure.himl import RUN_RECOVERY_FILE
|
||||
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Azure.azure_runner import RUN_RECOVERY_FILE
|
||||
from InnerEye.Azure.azure_util import MODEL_ID_KEY_NAME, download_run_output_file, download_run_outputs_by_prefix, \
|
||||
get_comparison_baseline_paths, \
|
||||
is_running_on_azure_agent, to_azure_friendly_string
|
||||
|
@ -101,6 +101,7 @@ def get_most_recent_model_id(fallback_run_id_for_local_execution: str = FALLBACK
|
|||
azure_config = AzureConfig.from_yaml(fixed_paths.SETTINGS_YAML_FILE,
|
||||
project_root=fixed_paths.repository_root_directory())
|
||||
run = azure_config.fetch_run(most_recent_run)
|
||||
assert run.status == "Completed", f"AzureML run {run.id} did not complete successfully."
|
||||
tags = run.get_tags()
|
||||
model_id = tags.get(MODEL_ID_KEY_NAME, None)
|
||||
assert model_id, f"No model_id tag was found on run {most_recent_run}"
|
||||
|
@ -209,40 +210,41 @@ def test_check_dataset_mountpoint(test_output_dirs: OutputFolderForTests) -> Non
|
|||
|
||||
|
||||
@pytest.mark.inference
|
||||
@pytest.mark.parametrize("use_dicom", [False, True])
|
||||
def test_submit_for_inference(use_dicom: bool, test_output_dirs: OutputFolderForTests) -> None:
|
||||
def test_submit_for_inference(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Execute the submit_for_inference script on the model that was recently trained. This starts an AzureML job,
|
||||
and downloads the segmentation. Then check if the segmentation was actually produced.
|
||||
|
||||
:param use_dicom: True to test DICOM in/out, False otherwise.
|
||||
:param test_output_dirs: Test output directories.
|
||||
"""
|
||||
model = get_most_recent_model(fallback_run_id_for_local_execution=FALLBACK_SINGLE_RUN)
|
||||
assert PYTHON_ENVIRONMENT_NAME in model.tags, "Environment name not present in model properties"
|
||||
if use_dicom:
|
||||
size = (64, 64, 64)
|
||||
spacing = (1., 1., 2.5)
|
||||
image_file = test_output_dirs.root_dir / "temp_pack_dicom_series" / "dicom_series.zip"
|
||||
scratch_folder = test_output_dirs.root_dir / "temp_dicom_series"
|
||||
zip_random_dicom_series(size, spacing, image_file, scratch_folder)
|
||||
else:
|
||||
image_file = fixed_paths_for_tests.full_ml_test_data_path() / "train_and_test_data" / "id1_channel1.nii.gz"
|
||||
assert image_file.exists(), f"Image file not found: {image_file}"
|
||||
settings_file = fixed_paths.SETTINGS_YAML_FILE
|
||||
assert settings_file.exists(), f"Settings file not found: {settings_file}"
|
||||
args = ["--image_file", str(image_file),
|
||||
"--model_id", model.id,
|
||||
"--settings", str(settings_file),
|
||||
"--download_folder", str(test_output_dirs.root_dir),
|
||||
"--cluster", "training-nc12",
|
||||
"--experiment", get_experiment_name_from_environment() or "model_inference",
|
||||
"--use_dicom", str(use_dicom)]
|
||||
download_file = DEFAULT_RESULT_ZIP_DICOM_NAME if use_dicom else DEFAULT_RESULT_IMAGE_NAME
|
||||
seg_path = test_output_dirs.root_dir / download_file
|
||||
assert not seg_path.exists(), f"Result file {seg_path} should not yet exist"
|
||||
submit_for_inference.main(args, project_root=fixed_paths.repository_root_directory())
|
||||
assert seg_path.exists(), f"Result file {seg_path} was not created"
|
||||
# Both parts of this test rely on the same model that was trained in a previous run. If these tests are executed
|
||||
# independently (via pytest.mark.parametrize), get_most_recent_model would pick up the AML run that the
|
||||
# previously executed part of this test submitted.
|
||||
for use_dicom in [False, True]:
|
||||
if use_dicom:
|
||||
size = (64, 64, 64)
|
||||
spacing = (1., 1., 2.5)
|
||||
image_file = test_output_dirs.root_dir / "temp_pack_dicom_series" / "dicom_series.zip"
|
||||
scratch_folder = test_output_dirs.root_dir / "temp_dicom_series"
|
||||
zip_random_dicom_series(size, spacing, image_file, scratch_folder)
|
||||
else:
|
||||
image_file = fixed_paths_for_tests.full_ml_test_data_path() / "train_and_test_data" / "id1_channel1.nii.gz"
|
||||
assert image_file.exists(), f"Image file not found: {image_file}"
|
||||
settings_file = fixed_paths.SETTINGS_YAML_FILE
|
||||
assert settings_file.exists(), f"Settings file not found: {settings_file}"
|
||||
args = ["--image_file", str(image_file),
|
||||
"--model_id", model.id,
|
||||
"--settings", str(settings_file),
|
||||
"--download_folder", str(test_output_dirs.root_dir),
|
||||
"--cluster", "training-nc12",
|
||||
"--experiment", get_experiment_name_from_environment() or "model_inference",
|
||||
"--use_dicom", str(use_dicom)]
|
||||
download_file = DEFAULT_RESULT_ZIP_DICOM_NAME if use_dicom else DEFAULT_RESULT_IMAGE_NAME
|
||||
seg_path = test_output_dirs.root_dir / download_file
|
||||
assert not seg_path.exists(), f"Result file {seg_path} should not yet exist"
|
||||
submit_for_inference.main(args, project_root=fixed_paths.repository_root_directory())
|
||||
assert seg_path.exists(), f"Result file {seg_path} was not created"
|
||||
|
||||
|
||||
def _check_presence_cross_val_metrics_file(split: str, mode: ModelExecutionMode, available_files: List[str]) -> bool:
|
||||
|
@ -375,9 +377,7 @@ def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None:
|
|||
# Check diagnostic messages that show if DDP was set up correctly. This could fail if Lightning
|
||||
# changes its diagnostic outputs.
|
||||
assert "initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/4" in log0_txt
|
||||
assert "initializing ddp: GLOBAL_RANK: 1, MEMBER: 2/4" in log0_txt
|
||||
assert "initializing ddp: GLOBAL_RANK: 2, MEMBER: 3/4" in log1_txt
|
||||
assert "initializing ddp: GLOBAL_RANK: 3, MEMBER: 4/4" in log1_txt
|
||||
|
||||
|
||||
@pytest.mark.skip("The recovery job hangs after completing on AML")
|
||||
|
@ -395,8 +395,10 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None:
|
|||
"--tag", "recovery_on_2_nodes"
|
||||
]
|
||||
script = str(repository_root_directory() / "InnerEye" / "ML" / "runner.py")
|
||||
with mock.patch("sys.argv", [script] + args_list):
|
||||
main()
|
||||
# Submission of the recovery job will try to exit the process, catch that and check the submitted run.
|
||||
with pytest.raises(SystemExit):
|
||||
with mock.patch("sys.argv", [script] + args_list):
|
||||
main()
|
||||
run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)
|
||||
assert run.status == RunStatus.COMPLETED
|
||||
files = run.get_file_names()
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
import pytest
|
||||
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Azure.azure_runner import create_dataset_consumptions
|
||||
from InnerEye.Azure.azure_runner import create_dataset_configs
|
||||
from Tests.ML.util import get_default_azure_config
|
||||
|
||||
|
||||
|
@ -20,7 +20,7 @@ def test_dataset_consumption1() -> None:
|
|||
Test that an empty dataset ID will not produce any dataset consumption.
|
||||
"""
|
||||
azure_config = get_default_azure_config()
|
||||
assert len(create_dataset_consumptions(azure_config, [""], [""])) == 0
|
||||
assert len(create_dataset_configs(azure_config, [""], [""])) == 0
|
||||
|
||||
|
||||
def test_dataset_consumption2() -> None:
|
||||
|
@ -29,7 +29,7 @@ def test_dataset_consumption2() -> None:
|
|||
"""
|
||||
azure_config = get_default_azure_config()
|
||||
with pytest.raises(ValueError) as ex:
|
||||
create_dataset_consumptions(azure_config, [""], ["foo"])
|
||||
create_dataset_configs(azure_config, [""], ["foo"])
|
||||
assert "but a mount point has been provided" in str(ex)
|
||||
|
||||
|
||||
|
@ -38,7 +38,7 @@ def test_dataset_consumption3() -> None:
|
|||
Test that a matching number of mount points is created.
|
||||
"""
|
||||
azure_config = get_default_azure_config()
|
||||
assert len(create_dataset_consumptions(azure_config, ["test-dataset", "test-dataset"], [])) == 2
|
||||
assert len(create_dataset_configs(azure_config, ["test-dataset", "test-dataset"], [])) == 2
|
||||
|
||||
|
||||
def test_dataset_consumption4() -> None:
|
||||
|
@ -47,15 +47,5 @@ def test_dataset_consumption4() -> None:
|
|||
"""
|
||||
azure_config = get_default_azure_config()
|
||||
with pytest.raises(ValueError) as ex:
|
||||
create_dataset_consumptions(azure_config, ["test-dataset", "test-dataset"], ["foo"])
|
||||
create_dataset_configs(azure_config, ["test-dataset", "test-dataset"], ["foo"])
|
||||
assert "must equal the number of Azure dataset IDs" in str(ex)
|
||||
|
||||
|
||||
def test_dataset_consumption5() -> None:
|
||||
"""
|
||||
Test error handling for empty dataset IDs.
|
||||
"""
|
||||
azure_config = get_default_azure_config()
|
||||
with pytest.raises(ValueError) as ex:
|
||||
azure_config.get_or_create_dataset("")
|
||||
assert "No dataset ID provided" in str(ex)
|
||||
|
|
|
@ -2,25 +2,23 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Iterator, Set
|
||||
|
||||
import pytest
|
||||
from azureml.core import Run
|
||||
from azureml.core.workspace import Workspace
|
||||
|
||||
from InnerEye.Azure.azure_config import AzureConfig, SourceConfig
|
||||
from InnerEye.Azure.azure_runner import create_experiment_name, get_or_create_python_environment
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Azure.azure_runner import create_experiment_name
|
||||
from InnerEye.Azure.azure_util import DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, fetch_child_runs, fetch_run, \
|
||||
get_cross_validation_split_index, is_cross_validation_child_run, is_run_and_child_runs_completed, \
|
||||
merge_conda_dependencies, merge_conda_files, to_azure_friendly_container_path
|
||||
from InnerEye.Common.common_util import logging_to_stdout, is_linux
|
||||
get_cross_validation_split_index, is_cross_validation_child_run, \
|
||||
to_azure_friendly_container_path
|
||||
from InnerEye.Common.common_util import logging_to_stdout
|
||||
from InnerEye.Common.fixed_paths import PRIVATE_SETTINGS_FILE, PROJECT_SECRETS_FILE, \
|
||||
get_environment_yaml_file, repository_root_directory
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
repository_root_directory
|
||||
from Tests.AfterTraining.test_after_training import FALLBACK_ENSEMBLE_RUN, get_most_recent_run, get_most_recent_run_id
|
||||
from Tests.ML.util import get_default_azure_config, get_default_workspace
|
||||
from Tests.ML.util import get_default_workspace
|
||||
from health.azure.azure_util import is_run_and_child_runs_completed
|
||||
|
||||
|
||||
def test_os_path_to_azure_friendly_container_path() -> None:
|
||||
|
@ -85,66 +83,6 @@ def test_is_cross_validation_child_run_ensemble_run() -> None:
|
|||
assert all([is_cross_validation_child_run(x) for x in fetch_child_runs(run)])
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_linux(), reason="Spurious file read/write errors on linux build agents.")
|
||||
def test_merge_conda(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Tests the logic for merging Conda environment files.
|
||||
"""
|
||||
env1 = """
|
||||
channels:
|
||||
- defaults
|
||||
- pytorch
|
||||
dependencies:
|
||||
- conda1=1.0
|
||||
- conda2=2.0
|
||||
- conda_both=3.0
|
||||
- pip:
|
||||
- azureml-sdk==1.7.0
|
||||
- foo==1.0
|
||||
"""
|
||||
env2 = """
|
||||
channels:
|
||||
- defaults
|
||||
dependencies:
|
||||
- conda1=1.1
|
||||
- conda_both=3.0
|
||||
- pip:
|
||||
- azureml-sdk==1.6.0
|
||||
- bar==2.0
|
||||
"""
|
||||
# Spurious test failures on Linux build agents, saying that they can't write the file. Wait a bit.
|
||||
time.sleep(0.5)
|
||||
file1 = test_output_dirs.root_dir / "env1.yml"
|
||||
file1.write_text(env1)
|
||||
file2 = test_output_dirs.root_dir / "env2.yml"
|
||||
file2.write_text(env2)
|
||||
# Spurious test failures on Linux build agents, saying that they can't read the file. Wait a bit.
|
||||
time.sleep(0.5)
|
||||
files = [file1, file2]
|
||||
merged_file = test_output_dirs.root_dir / "merged.yml"
|
||||
merge_conda_files(files, merged_file)
|
||||
assert merged_file.read_text().splitlines() == """channels:
|
||||
- defaults
|
||||
- pytorch
|
||||
dependencies:
|
||||
- conda1=1.0
|
||||
- conda1=1.1
|
||||
- conda2=2.0
|
||||
- conda_both=3.0
|
||||
- pip:
|
||||
- azureml-sdk==1.6.0
|
||||
- azureml-sdk==1.7.0
|
||||
- bar==2.0
|
||||
- foo==1.0
|
||||
""".splitlines()
|
||||
conda_dep, _ = merge_conda_dependencies(files)
|
||||
# We expect to see the union of channels.
|
||||
assert list(conda_dep.conda_channels) == ["defaults", "pytorch"]
|
||||
# Package version conflicts are not resolved, both versions are retained.
|
||||
assert list(conda_dep.conda_packages) == ["conda1=1.0", "conda1=1.1", "conda2=2.0", "conda_both=3.0"]
|
||||
assert list(conda_dep.pip_packages) == ["azureml-sdk==1.6.0", "azureml-sdk==1.7.0", "bar==2.0", "foo==1.0"]
|
||||
|
||||
|
||||
def test_experiment_name() -> None:
|
||||
c = AzureConfig()
|
||||
c.build_branch = "branch"
|
||||
|
@ -196,29 +134,3 @@ def test_amlignore() -> None:
|
|||
test_variables = repository_root_directory(PROJECT_SECRETS_FILE)
|
||||
if test_variables.is_file():
|
||||
assert PROJECT_SECRETS_FILE in ignored, f"{PROJECT_SECRETS_FILE} is not in .amlignore"
|
||||
|
||||
|
||||
def test_create_python_env() -> None:
|
||||
"""
|
||||
Checks if environment variables in the SourceConfig are correctly passed through to the Python environment.
|
||||
Environment variables in SourceConfig are only used in the internal InnerEye repo.
|
||||
:return:
|
||||
"""
|
||||
foo = "foo"
|
||||
bar = "bar"
|
||||
entry_script = Path("something.py")
|
||||
conda_file = get_environment_yaml_file()
|
||||
s = SourceConfig(root_folder=Path(""), entry_script=entry_script, conda_dependencies_files=[conda_file],
|
||||
environment_variables={foo: bar})
|
||||
env = get_or_create_python_environment(source_config=s,
|
||||
azure_config=get_default_azure_config(),
|
||||
register_environment=False)
|
||||
assert foo in env.environment_variables
|
||||
assert env.environment_variables[foo] == bar
|
||||
|
||||
# Check that some of the basic packages that we expect to always exist are picked up correctly in the Conda env
|
||||
def remove_version_number(items: Iterator[str]) -> Set[str]:
|
||||
return set(c.split("=")[0] for c in items)
|
||||
|
||||
assert "pytorch" in remove_version_number(env.python.conda_dependencies.conda_packages)
|
||||
assert "pytorch-lightning" in remove_version_number(env.python.conda_dependencies.pip_packages)
|
||||
|
|
|
@ -3,13 +3,12 @@
|
|||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
|
||||
from InnerEye.Azure.azure_config import AZURECONFIG_SUBMIT_TO_AZUREML, AzureConfig, SourceConfig
|
||||
from InnerEye.Azure.azure_config import AZURECONFIG_SUBMIT_TO_AZUREML, AzureConfig
|
||||
from InnerEye.Azure.azure_runner import create_runner_parser, parse_args_and_add_yaml_variables, \
|
||||
run_duration_string_to_seconds
|
||||
from InnerEye.Azure.parser_util import _is_empty_or_empty_string_list
|
||||
|
@ -134,37 +133,6 @@ def test_azureml_submit_constant() -> None:
|
|||
assert hasattr(azure_config, AZURECONFIG_SUBMIT_TO_AZUREML)
|
||||
|
||||
|
||||
def test_source_config_set_params() -> None:
|
||||
"""
|
||||
Check that commandline arguments are set correctly when submitting the script to AzureML.
|
||||
In particular, the azureml flag should be omitted, irrespective of how the argument is written.
|
||||
"""
|
||||
s = SourceConfig(root_folder=Path(""), entry_script=Path("something.py"), conda_dependencies_files=[])
|
||||
|
||||
def assert_has_params(expected_args: str) -> None:
|
||||
assert s.script_params is not None
|
||||
# Arguments are in the keys of the dictionary only, and should have been added in the right order
|
||||
assert " ".join(s.script_params) == expected_args
|
||||
|
||||
with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}=True", "--more"]):
|
||||
s.set_script_params_except_submit_flag()
|
||||
assert_has_params("some --param 1 --more")
|
||||
with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}", "False", "--more"]):
|
||||
s.set_script_params_except_submit_flag()
|
||||
assert_has_params("some --param 1 --more")
|
||||
# Using the new syntax for boolean flags
|
||||
with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}", "--more"]):
|
||||
s.set_script_params_except_submit_flag()
|
||||
assert_has_params("some --param 1 --more")
|
||||
with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}"]):
|
||||
s.set_script_params_except_submit_flag()
|
||||
assert_has_params("some --param 1")
|
||||
# Arguments where azureml is just the prefix should not be removed.
|
||||
with mock.patch("sys.argv", ["", "some", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}foo", "False", "--more"]):
|
||||
s.set_script_params_except_submit_flag()
|
||||
assert_has_params(f"some --{AZURECONFIG_SUBMIT_TO_AZUREML}foo False --more")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(["s", "expected"],
|
||||
[
|
||||
("1s", 1),
|
||||
|
|
|
@ -15,6 +15,7 @@ from InnerEye.ML.config import PhotometricNormalizationMethod, SegmentationModel
|
|||
from InnerEye.ML.deep_learning_config import DeepLearningConfig
|
||||
from InnerEye.ML.runner import Runner
|
||||
from Tests.ML.configs.DummyModel import DummyModel
|
||||
from health.azure.himl import AzureRunInfo
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_container", [True, False])
|
||||
|
@ -55,12 +56,19 @@ def test_create_ml_runner_args(is_container: bool,
|
|||
with mock.patch("sys.argv", [""] + args_list):
|
||||
with mock.patch("InnerEye.ML.deep_learning_config.is_offline_run_context", return_value=is_offline_run):
|
||||
with mock.patch("InnerEye.ML.run_ml.MLRunner.run", return_value=None):
|
||||
with mock.patch("InnerEye.ML.run_ml.MLRunner.mount_or_download_dataset", return_value=dataset_folder):
|
||||
with mock.patch("InnerEye.ML.run_ml.MLRunner.download_or_use_existing_dataset",
|
||||
return_value=dataset_folder):
|
||||
runner = Runner(project_root=project_root, yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)
|
||||
runner.parse_and_load_model()
|
||||
# Only when calling config.create_filesystem we expect to see the correct paths, and this happens
|
||||
# inside run_in_situ
|
||||
runner.run_in_situ()
|
||||
azure_run_info = AzureRunInfo(input_datasets=[None],
|
||||
output_datasets=[None],
|
||||
run=None,
|
||||
is_running_in_azure=False,
|
||||
output_folder=Path.cwd(),
|
||||
logs_folder=Path.cwd())
|
||||
runner.run_in_situ(azure_run_info)
|
||||
azure_config = runner.azure_config
|
||||
container_or_legacy_config = runner.lightning_container if is_container else runner.model_config
|
||||
assert azure_config.model == model_name
|
||||
|
|
|
@ -3,13 +3,15 @@
|
|||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from InnerEye.Common import common_util, fixed_paths
|
||||
from InnerEye.Common.common_util import (append_to_amlignore, change_working_directory, check_is_any_of,
|
||||
from InnerEye.Common import common_util
|
||||
from InnerEye.Common.common_util import (change_working_directory, check_is_any_of,
|
||||
is_private_field_name, namespace_to_path, path_to_namespace, print_exception)
|
||||
from InnerEye.Common.fixed_paths import add_submodules_to_path, repository_root_directory
|
||||
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path, tests_root_directory
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
|
||||
|
@ -125,19 +127,15 @@ def test_change_dir(test_output_dirs: OutputFolderForTests) -> None:
|
|||
assert (new_dir / "bar.txt").is_file()
|
||||
|
||||
|
||||
def test_modify_amlignore() -> None:
|
||||
"""
|
||||
Test that we can change the .AMLignore file and change it back to what it was before.
|
||||
"""
|
||||
folder1 = "Added1"
|
||||
folder2 = "Added2"
|
||||
added_folders = [folder1, folder2]
|
||||
amlignore = fixed_paths.repository_root_directory(".amlignore")
|
||||
old_contents = amlignore.read_text()
|
||||
for f in added_folders:
|
||||
assert f not in old_contents
|
||||
with append_to_amlignore(added_folders):
|
||||
new_contents = amlignore.read_text()
|
||||
for f in added_folders:
|
||||
assert f in new_contents
|
||||
assert amlignore.read_text() == old_contents
|
||||
def test_add_submodules_to_path() -> None:
|
||||
original_sys_path = sys.path
|
||||
try:
|
||||
fastmri_folder = repository_root_directory() / "fastMRI"
|
||||
fastmri_str = str(fastmri_folder)
|
||||
assert fastmri_folder.is_dir()
|
||||
if fastmri_str in sys.path:
|
||||
sys.path.remove(fastmri_str)
|
||||
add_submodules_to_path()
|
||||
assert fastmri_str in sys.path
|
||||
finally:
|
||||
sys.path = original_sys_path
|
||||
|
|
|
@ -13,18 +13,13 @@ from typing import Any, Optional
|
|||
from _pytest.monkeypatch import MonkeyPatch
|
||||
from pytorch_lightning import LightningDataModule, LightningModule
|
||||
|
||||
from InnerEye.Common.common_util import add_folder_to_sys_path_if_needed
|
||||
from InnerEye.ML.configs.other.fastmri_varnet import VarNetWithImageLogging
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
|
||||
add_folder_to_sys_path_if_needed("fastMRI")
|
||||
|
||||
from fastMRI.tests.create_temp_data import create_temp_data
|
||||
from fastmri.data import SliceDataset
|
||||
from fastmri.data.subsample import create_mask_for_mask_type
|
||||
from fastmri.data.transforms import VarNetDataTransform
|
||||
from fastmri.pl_modules import FastMriDataModule
|
||||
# This import can fail if written as "from tests.create_temp_data", even though fastMRI is already in the path.
from fastMRI.tests.create_temp_data import create_temp_data
|
||||
|
||||
|
||||
class FastMriRandomData(FastMriDataModule):
|
||||
|
|
|
@ -7,11 +7,9 @@ from typing import List
|
|||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from azureml.core import Run
|
||||
|
||||
from InnerEye.Common.common_util import logging_to_stdout, namespace_to_path
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
from InnerEye.ML.utils.config_loader import ModelConfigLoader
|
||||
from InnerEye.ML.utils.model_util import create_model_with_temperature_scaling, generate_and_print_model_summary
|
||||
from Tests.ML.configs.DummyModel import DummyModel
|
||||
|
@ -28,7 +26,8 @@ def find_models() -> List[str]:
|
|||
path = namespace_to_path(ModelConfigLoader.get_default_search_module())
|
||||
folders = [path / "segmentation", path / "classification", path / "regression"]
|
||||
names = [str(f.stem) for folder in folders for f in folder.glob("*.py") if folder.exists()]
|
||||
return [name for name in names if not (name.endswith("Base") or name.endswith("Paper")) and not name.startswith("__")]
|
||||
return [name for name in names if
|
||||
not (name.endswith("Base") or name.endswith("Paper")) and not name.startswith("__")]
|
||||
|
||||
|
||||
def test_any_models_found() -> None:
|
||||
|
@ -128,27 +127,6 @@ class MockDatasetConsumption:
|
|||
name = "dummy"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("container_name", ["DummyContainerWithAzureDataset",
|
||||
"DummyContainerWithoutDataset",
|
||||
"DummyContainerWithLocalDataset",
|
||||
"DummyContainerWithAzureAndLocalDataset"])
|
||||
def test_submit_container_to_azureml(container_name: str) -> None:
|
||||
"""
|
||||
Test if we can get the config loader to load a Lightning container model, and get it through the AzureML
|
||||
submission process.
|
||||
"""
|
||||
runner = default_runner()
|
||||
mock_run = Run.get_context()
|
||||
args = ["", f"--model={container_name}", "--azureml=True", "--model_configs_namespace=Tests.ML.configs"]
|
||||
with mock.patch("sys.argv", args):
|
||||
with mock.patch("InnerEye.Azure.azure_config.AzureConfig.get_dataset_consumption",
|
||||
return_value=MockDatasetConsumption):
|
||||
with mock.patch("azureml.core.Experiment.submit", return_value=mock_run):
|
||||
loaded_config, actual_run = runner.run()
|
||||
assert actual_run == mock_run
|
||||
assert isinstance(runner.lightning_container, LightningContainer)
|
||||
|
||||
|
||||
def test_load_container_with_arguments() -> None:
|
||||
"""
|
||||
Test if we can load a container and override a value in it via the commandline. Parameters can only be set at
|
||||
|
|
|
@ -377,7 +377,7 @@ def test_runner_restart(test_output_dirs: OutputFolderForTests) -> None:
|
|||
model_config.recovery_checkpoint_save_interval = 1
|
||||
model_config.recovery_checkpoints_save_last_k = -1
|
||||
runner = MLRunner(model_config=model_config)
|
||||
runner.setup(use_mount_or_download_dataset=False)
|
||||
runner.setup()
|
||||
# Epochs are 0 based for saving
|
||||
create_model_and_store_checkpoint(model_config,
|
||||
runner.container.checkpoint_folder / f"{RECOVERY_CHECKPOINT_FILE_NAME}_epoch="
|
||||
|
|
|
@ -396,14 +396,16 @@ def test_cross_validation_for_lighting_container_models_is_supported() -> None:
|
|||
to azure_runner's submit_to_azureml method.
|
||||
"""
|
||||
args_list = ["--model=HelloContainer", "--number_of_cross_validation_splits=5", "--azureml=True"]
|
||||
mock_run = mock.MagicMock()
|
||||
mock_run.id = "foo"
|
||||
mock_run.experiment.name = "bar"
|
||||
with mock.patch("sys.argv", [""] + args_list):
|
||||
runner = Runner(project_root=fixed_paths.repository_root_directory(),
|
||||
yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)
|
||||
with mock.patch("InnerEye.Azure.azure_runner.create_and_submit_experiment",
|
||||
return_value=None) as create_and_submit_experiment_patch:
|
||||
runner.run()
|
||||
with mock.patch("health.azure.himl.submit_run", return_value=mock_run) as create_and_submit_experiment_patch:
|
||||
with pytest.raises(SystemExit):
|
||||
runner.run()
|
||||
assert runner.lightning_container.model_name == 'HelloContainer'
|
||||
assert runner.lightning_container.number_of_cross_validation_splits == 5
|
||||
args, _ = create_and_submit_experiment_patch.call_args
|
||||
script_run_config = args[1]
|
||||
assert isinstance(script_run_config, HyperDriveConfig)
|
||||
script_run_config_arg = create_and_submit_experiment_patch.call_args[1]["script_run_config"]
|
||||
assert isinstance(script_run_config_arg, HyperDriveConfig)
|
||||
|
|
|
@ -25,6 +25,7 @@ from Tests.AfterTraining.test_after_training import FALLBACK_ENSEMBLE_RUN, FALLB
|
|||
from Tests.ML.configs.DummyModel import DummyModel
|
||||
from Tests.ML.configs.lightning_test_containers import DummyContainerWithDatasets
|
||||
from Tests.ML.util import get_default_azure_config
|
||||
from health.azure.himl import AzureRunInfo
|
||||
|
||||
logging_to_stdout(logging.DEBUG)
|
||||
|
||||
|
@ -89,26 +90,27 @@ def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> Non
|
|||
# creation may need access to the dataset.
|
||||
with pytest.raises(ValueError) as ex:
|
||||
runner.setup()
|
||||
assert ex.value.args[0] == "The model must contain either local_dataset or azure_dataset_id."
|
||||
assert ex.value.args[0] == "Expecting that a dataset is available here."
|
||||
runner.project_root = test_output_dirs.root_dir
|
||||
|
||||
# Pointing the model to a dataset folder that does not exist should raise an Exception
|
||||
fake_folder = runner.project_root / "foo"
|
||||
runner.container.local_dataset = fake_folder
|
||||
with pytest.raises(FileNotFoundError):
|
||||
runner.mount_or_download_dataset(runner.container.azure_dataset_id, runner.container.local_dataset)
|
||||
runner.download_or_use_existing_dataset(runner.container.azure_dataset_id, runner.container.local_dataset)
|
||||
|
||||
# If the local dataset folder exists, download_or_use_existing_dataset should not do anything.
fake_folder.mkdir()
|
||||
local_dataset = runner.mount_or_download_dataset(runner.container.azure_dataset_id, runner.container.local_dataset)
|
||||
local_dataset = runner.download_or_use_existing_dataset(runner.container.azure_dataset_id,
|
||||
runner.container.local_dataset)
|
||||
assert local_dataset == fake_folder
|
||||
|
||||
# Pointing the model to a dataset in Azure should trigger a download
|
||||
runner.container.local_dataset = None
|
||||
runner.container.azure_dataset_id = dataset_name
|
||||
with logging_section("Starting download"):
|
||||
result_path = runner.mount_or_download_dataset(runner.container.azure_dataset_id,
|
||||
runner.container.local_dataset)
|
||||
result_path = runner.download_or_use_existing_dataset(runner.container.azure_dataset_id,
|
||||
runner.container.local_dataset)
|
||||
# Download goes into <project_root> / "datasets" / "test_dataset"
|
||||
expected_path = runner.project_root / fixed_paths.DATASETS_DIR_NAME / dataset_name
|
||||
assert result_path == expected_path
|
||||
|
@ -156,17 +158,23 @@ def _test_mount_for_lightning_container(test_output_dirs: OutputFolderForTests,
|
|||
|
||||
with mock.patch("InnerEye.ML.run_ml.MLRunner.is_offline_run", is_offline_run):
|
||||
with mock.patch("InnerEye.ML.run_ml.download_dataset", return_value=download_path):
|
||||
with mock.patch("InnerEye.ML.run_ml.try_to_mount_input_dataset", return_value=mount_path):
|
||||
runner = MLRunner(config, container=container,
|
||||
azure_config=None, project_root=test_output_dirs.root_dir)
|
||||
runner.setup()
|
||||
return runner.container
|
||||
runner = MLRunner(config, container=container,
|
||||
azure_config=None, project_root=test_output_dirs.root_dir)
|
||||
path_from_aml: List[Optional[Path]] = [None] if is_offline_run else [mount_path]
|
||||
runner.setup(azure_run_info=AzureRunInfo(input_datasets=path_from_aml,
|
||||
output_datasets=[],
|
||||
run=None,
|
||||
is_running_in_azure=False,
|
||||
output_folder=Path(),
|
||||
logs_folder=Path()
|
||||
))
|
||||
return runner.container
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("is_lightning_model", "expected_error"),
|
||||
[
|
||||
# A built-in InnerEye model must have either local dataset or azure dataset provided.
|
||||
(False, "The model must contain either local_dataset or azure_dataset_id"),
|
||||
(False, "Expecting that a dataset is available here."),
|
||||
# ... but this is OK for Lightning container models. A Lightning container could simply
|
||||
# download its data from the web before training.
|
||||
(True, "")
|
||||
|
|
|
@ -9,9 +9,9 @@ from unittest import mock
import pandas as pd
import pytest
from pytorch_lightning import LightningModule
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from pytorch_lightning import LightningModule

from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.common import ModelExecutionMode

@ -21,9 +21,10 @@ from InnerEye.ML.lightning_container import LightningContainer
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.run_ml import MLRunner
from Tests.ML.configs.DummyModel import DummyModel
from Tests.ML.configs.lightning_test_containers import DummyContainerWithAzureDataset, DummyContainerWithHooks, DummyContainerWithModel, \
DummyContainerWithPlainLightning
from Tests.ML.configs.lightning_test_containers import (DummyContainerWithAzureDataset, DummyContainerWithHooks,
DummyContainerWithModel, DummyContainerWithPlainLightning)
from Tests.ML.util import default_runner
from health.azure.himl import AzureRunInfo


def test_run_container_in_situ(test_output_dirs: OutputFolderForTests) -> None:

@ -36,8 +37,7 @@ def test_run_container_in_situ(test_output_dirs: OutputFolderForTests) -> None:
args = ["", "--model=DummyContainerWithModel", "--model_configs_namespace=Tests.ML.configs",
f"--output_to={test_output_dirs.root_dir}", f"--local_dataset={local_dataset}"]
with mock.patch("sys.argv", args):
loaded_config, actual_run = runner.run()
assert actual_run is None
runner.run()
assert isinstance(runner.lightning_container, DummyContainerWithModel)
# Test if the outputs folder is relative to the folder that we specified via the commandline
runner.lightning_container.outputs_folder.relative_to(test_output_dirs.root_dir)

@ -81,8 +81,7 @@ def test_run_container_with_plain_lightning_in_situ(test_output_dirs: OutputFold
args = ["", "--model=DummyContainerWithPlainLightning", "--model_configs_namespace=Tests.ML.configs",
f"--output_to={test_output_dirs.root_dir}", f"--local_dataset={local_dataset}"]
with mock.patch("sys.argv", args):
loaded_config, actual_run = runner.run()
assert actual_run is None
runner.run()
assert isinstance(runner.lightning_container, DummyContainerWithPlainLightning)
# Test if the outputs folder is relative to the folder that we specified via the commandline
runner.lightning_container.outputs_folder.relative_to(test_output_dirs.root_dir)

@ -143,8 +142,8 @@ def test_run_fastmri_container(test_output_dirs: OutputFolderForTests) -> None:
f"--output_to={test_output_dirs.root_dir}",
"--model_configs_namespace=Tests.ML.configs"]
with mock.patch("sys.argv", args):
loaded_config, actual_run = runner.run()
assert actual_run is None
loaded_config, run_info = runner.run()
assert isinstance(run_info, AzureRunInfo)
from Tests.ML.configs.fastmri_random import FastMriOnRandomData
assert isinstance(runner.lightning_container, FastMriOnRandomData)

@ -39,7 +39,7 @@ def test_regression_test(test_output_dirs: OutputFolderForTests) -> None:
container.local_dataset = test_output_dirs.root_dir
container.regression_test_folder = Path(str(uuid.uuid4().hex))
runner = MLRunner(container=container)
runner.setup(use_mount_or_download_dataset=False)
runner.setup()
with pytest.raises(ValueError) as ex:
runner.run()
assert "Folder with expected files does not exist" in str(ex)

@ -279,7 +279,7 @@ def model_train_unittest(config: Optional[DeepLearningConfig],
# It will also set random seeds correctly. Later we use so initialized container.
# For all tests running in AzureML, we need to skip the downloading of datasets that would otherwise happen,
# because all unit test configs come with their own local dataset already.
runner.setup(use_mount_or_download_dataset=False)
runner.setup()
if checkpoint_handler is None:
azure_config = get_default_azure_config()
checkpoint_handler = CheckpointHandler(azure_config=azure_config,

@ -33,7 +33,8 @@ steps:
displayName: Install InnerEye (Dev) Package

# First run all tests that require the InnerEye package. All code should be consumed via the InnerEye package,
# hence don't set PYTHONPATH
# hence don't set PYTHONPATH to InnerEye - but do set it to hi-ml if that has been included as a submodule for dev
# work on the package
- bash: |
source activate InnerEye
pytest ./Tests/ -m "not (gpu or azureml or after_training_single_run or after_training_ensemble_run or inference or after_training_2node or after_training_glaucoma_cv_run or after_training_hello_container)" --doctest-modules --junitxml=junit/test-results.xml --cov=. --cov-config=.coveragerc --cov-report=xml -n 2 --dist=loadscope --verbose

@ -49,7 +50,6 @@ steps:
source activate InnerEye
pytest ./TestsOutsidePackage -n 2 --dist=loadscope --verbose --junitxml=junit/test-outside-package-results.xml
env:
PYTHONPATH: $(Build.SourcesDirectory)/
APPLICATION_KEY: $(InnerEyeDeepLearningServicePrincipalKey)
failOnStderr: false
condition: succeededOrFailed()

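The `-m "not (...)"` expression in the pipeline deselects tests that carry custom pytest markers such as `gpu` or `azureml`. A minimal sketch of what such a marked test looks like on the authoring side (the test name and body are illustrative, not taken from the repository):

```python
import pytest


@pytest.mark.azureml
def test_submission_to_azureml() -> None:
    # Carries the "azureml" marker, so the PR build's
    # -m "not (gpu or azureml or ...)" filter skips it.
    assert True
```
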
@ -13,8 +13,12 @@ from typing import Generator

import pytest

from InnerEye.Common.output_directories import OutputFolderForTests, remove_and_create_folder
from InnerEye.Common.fixed_paths import add_submodules_to_path
from InnerEye.Common.fixed_paths_for_tests import TEST_OUTPUTS_PATH
from InnerEye.Common.output_directories import OutputFolderForTests, remove_and_create_folder

# This needs to be right at the start of conftest, so that already test collection has access to all submodules
add_submodules_to_path()


@pytest.fixture(autouse=True, scope='session')

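The body of `add_submodules_to_path` is not part of this diff; a plausible sketch, assuming it does essentially what the `runner.py` snippet in the docs further below does, namely put the `hi-ml` submodule's source folder on `sys.path` before test collection starts (paths and behaviour here are assumptions, not the repository's actual implementation):

```python
import sys
from pathlib import Path


def add_submodules_to_path() -> None:
    # Assumed behaviour only: make an optional hi-ml submodule importable.
    # The real helper lives in InnerEye.Common.fixed_paths.
    repo_root = Path(__file__).absolute().parent
    himl_src = repo_root / "hi-ml" / "src"
    if himl_src.is_dir() and str(himl_src) not in sys.path:
        sys.path.insert(0, str(himl_src))
```
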
@ -13,7 +13,20 @@ This document describes guidelines for contributing to the InnerEye-DeepLearning
- DO NOT mix independent and unrelated changes in one PR.

## Coding Style
The coding style is enforced via flake8 and mypy
The coding style is enforced via `flake8` and `mypy`. Before pushing any changes to a PR, run both tools on
your dev box:
* `flake8`
* `python mypy_runner.py`

## Unit testing
- DO write unit tests for each new function or class that you add.
- DO extend unit tests for existing functions or classes if you change their core behaviour.
- DO ensure that your tests are designed in a way that they can pass on the local box, even if they are relying on
specific cloud features.
- DO run all unit tests on your dev box before submitting your changes. The test suite is designed to pass completely
also outside of cloud builds.
- DO NOT rely only on the test builds in the cloud. Cloud builds trigger AzureML runs on GPU
machines that have a higher CO2 footprint than your dev box.

## Creating Issues
- DO use a descriptive title that identifies the issue or the requested feature.

@ -188,3 +188,36 @@ You can do this, for example, to force an update of the `azureml-sdk` and all it
If you want to take the second route:
1. Use `conda env update -f environment.yml --prune` to refresh if you make changes in environment.yml
1. To update packages use `conda update --all` and `pip-review --local --interactive`


## Using the hi-ml package

To work on the `hi-ml` package at the same time as `InnerEye-DeepLearning`, it can help to add the `hi-ml` package
as a submodule, rather than a package from pypi. Any change to the package will require a full new docker image build,
and that costs 20min per run.

* In the repository root, run `git submodule add https://github.com/microsoft/hi-ml`
* In PyCharm's project browser, mark the folder `hi-ml/src` as Sources Root
* Remove the entry for the `hi-ml` package from `environment.yml`
* Modify the start of `InnerEye/ML/runner.py` to look like this:
```python
print(f"Starting InnerEye runner at {sys.argv[0]}")
innereye_root = Path(__file__).absolute().parent.parent.parent
if (innereye_root / "InnerEye").is_dir():
innereye_root_str = str(innereye_root)
if innereye_root_str not in sys.path:
print(f"Adding InnerEye folder to sys.path: {innereye_root_str}")
sys.path.insert(0, innereye_root_str)
sys.path.append(str(innereye_root / "hi-ml" / "src"))
```

Alternatively, you can consume a developer version of `hi-ml` from `test.pypi`:
* Remove the entry for the `hi-ml` package from `environment.yml`
* Add a section like this to `environment.yml`, to point pip to `test.pypi`, and a specific version of the package:
```
...
- pip:
- --extra-index-url https://test.pypi.org/simple/
- hi-ml==0.1.0.post236
...
```

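With either route, it can be worth confirming which copy of `hi-ml` actually gets imported. A small check, assuming the `health.azure.himl` module path used by the tests above (the interpretation in the comment is just a rule of thumb):

```python
# A submodule checkout typically resolves to a path under hi-ml/src/health/azure/,
# whereas a pypi or test.pypi install resolves to site-packages.
import health.azure.himl as himl

print(f"hi-ml loaded from: {himl.__file__}")
```
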
@ -12,9 +12,9 @@ dependencies:
- git+https://github.com/analysiscenter/radio.git@6d53e25#egg=radio
- azure-mgmt-resource==12.1.0
- azure-mgmt-datafactory==1.1.0
- azureml-mlflow==1.23.0
- azureml-sdk==1.23.0
- azureml-tensorboard==1.23.0
- azureml-mlflow==1.32.0
- azureml-sdk==1.32.0
- azureml-tensorboard==1.32.0
- conda-merge==0.1.5
- cryptography==3.3.2
- dataclasses-json==0.5.2

@ -23,6 +23,7 @@ dependencies:
- gitpython==3.1.7
- gputil==1.4.0
- h5py==2.10.0
- hi-ml==0.1.2
- InnerEye-DICOM-RT==1.0.1
- joblib==0.16.0
- jupyter==1.0.0

@ -30,7 +31,7 @@ dependencies:
- lightning-bolts==0.3.4
- matplotlib==3.3.0
- mlflow==1.17.0
- mypy==0.812
- mypy==0.910
- mypy-extensions==0.4.3
- numba==0.51.2
- numpy==1.19.1

@ -37,7 +37,7 @@ def run_mypy(files: List[str], mypy_executable_path: str) -> int:
else:
print("Skipping.")
if mypy_args:
command = [mypy_executable_path, "--config=mypy.ini", *mypy_args]
command = [mypy_executable_path, "--install-types", "--non-interactive", "--config=mypy.ini", *mypy_args]
# We pipe stdout and then print it, otherwise lines can appear in the wrong order in builds.
process = subprocess.run(command)
return_code = max(return_code, process.returncode)
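`--install-types --non-interactive` tells mypy to download any missing third-party type stubs without prompting; the flags exist from mypy 0.900 onwards, which lines up with the bump to `mypy==0.910` in `environment.yml` above. A standalone sketch of the command that `run_mypy` assembles (executable path and target file are illustrative):

```python
import subprocess

# Illustrative invocation only; run_mypy builds the real argument list dynamically.
command = ["mypy", "--install-types", "--non-interactive", "--config=mypy.ini", "InnerEye/ML/runner.py"]
process = subprocess.run(command)
print(f"mypy exited with code {process.returncode}")
```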