Enable Bring-your-own-Lightning-model (#417)
- Enable bringing arbitrary PyTorch Lightning models to the InnerEye toolbox
- Upgrade mypy and simplify the way we invoke it
Parent
780e420973
Commit
0d479ba3d8
.flake8
|
@ -2,3 +2,4 @@
|
|||
ignore = E226,E302,E41,W391, E701, W291, E722, W503, E128, E126, E127, E731, E401
|
||||
max-line-length = 160
|
||||
max-complexity = 25
|
||||
exclude = fastMRI/
|
||||
|
|
|
@ -45,6 +45,13 @@ jobs:
|
|||
PYTHONPATH: ${{ github.workspace }}
|
||||
if: always()
|
||||
|
||||
- name: Run HelloContainer model
|
||||
run: |
|
||||
$CONDA/envs/InnerEye/bin/python ./InnerEye/ML/runner.py --model=HelloContainer
|
||||
env:
|
||||
PYTHONPATH: ${{ github.workspace }}
|
||||
if: always()
|
||||
|
||||
windows:
|
||||
runs-on: windows-latest
|
||||
steps:
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
[submodule "fastMRI"]
|
||||
path = fastMRI
|
||||
url = https://github.com/facebookresearch/fastMRI
|
|
@ -13,6 +13,9 @@ created.
|
|||
|
||||
### Added
|
||||
|
||||
- ([#417](https://github.com/microsoft/InnerEye-DeepLearning/pull/417)) Added a generic way of adding PyTorch Lightning
|
||||
models to the toolbox. It is now possible to train almost any Lightning model with the InnerEye toolbox in AzureML,
|
||||
with only minimal code changes required. See [the documentation](docs/bring_your_own_model.md) for details.
|
||||
- ([#430](https://github.com/microsoft/InnerEye-DeepLearning/pull/430)) Updated the conversion to InnerEye-DICOM-RT 1.0.1 to
|
||||
add: manufacturer, SoftwareVersions, Interpreter and ROIInterpretedTypes.
|
||||
- ([#385](https://github.com/microsoft/InnerEye-DeepLearning/pull/385)) Add the ability to train a model on multiple
|
||||
|
@ -70,6 +73,7 @@ created.
|
|||
- ([#437](https://github.com/microsoft/InnerEye-DeepLearning/pull/437)) Fixed multi-node DDP bug in PL v1.2.8. Re-add
|
||||
end-to-end test for multi-node.
|
||||
### Removed
|
||||
- ([#417](https://github.com/microsoft/InnerEye-DeepLearning/pull/417)) Removed an output file that only contains metadata for a legacy consumer
|
||||
|
||||
### Deprecated
|
||||
|
||||
|
|
|
@ -275,19 +275,6 @@ class SourceConfig:
|
|||
self.script_params = retained_args
|
||||
|
||||
|
||||
@dataclass
|
||||
class ExperimentResultLocation:
|
||||
"""
|
||||
Information that is needed to recover where the results of an experiment reside.
|
||||
"""
|
||||
results_container_name: Optional[str] = None
|
||||
results_uri: Optional[str] = None
|
||||
dataset_folder: Optional[str] = None
|
||||
dataset_uri: Optional[str] = None
|
||||
azure_job_name: Optional[str] = None
|
||||
commandline_overrides: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParserResult:
|
||||
"""
|
||||
|
|
|
@ -19,17 +19,18 @@ from azureml.core.datastore import Datastore
|
|||
from azureml.core.runconfig import MpiConfiguration, RunConfiguration
|
||||
from azureml.core.workspace import WORKSPACE_DEFAULT_BLOB_STORE_NAME
|
||||
from azureml.data import FileDataset
|
||||
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
|
||||
|
||||
from InnerEye.Azure import azure_util
|
||||
from InnerEye.Azure.azure_config import AzureConfig, ParserResult, SourceConfig
|
||||
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME, \
|
||||
RUN_RECOVERY_ID_KEY_NAME, \
|
||||
merge_conda_dependencies
|
||||
is_offline_run_context, merge_conda_dependencies
|
||||
from InnerEye.Azure.secrets_handling import read_all_settings
|
||||
from InnerEye.Azure.tensorboard_monitor import AMLTensorBoardMonitorConfig, monitor
|
||||
from InnerEye.Common.generic_parsing import GenericConfig
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
from InnerEye.ML.utils.config_util import ModelConfigLoader
|
||||
from InnerEye.ML.utils.config_loader import ModelConfigLoader
|
||||
|
||||
SLEEP_TIME_SECONDS = 30
|
||||
INPUT_DATA_KEY = "input_data"
|
||||
|
@ -42,15 +43,12 @@ ENVIRONMENT_VERSION = "1"
|
|||
|
||||
def submit_to_azureml(azure_config: AzureConfig,
|
||||
source_config: SourceConfig,
|
||||
model_config_overrides: str,
|
||||
azure_dataset_id: str) -> Run:
|
||||
"""
|
||||
The main entry point. It creates an AzureML workspace if needed, submits an experiment using the code
|
||||
as specified in source_config, and waits for completion if needed.
|
||||
:param azure_config: Azure-related configurations needed to set up a valid workspace.
|
||||
:param source_config: The information about which code should be submitted, and which arguments should be used.
|
||||
:param model_config_overrides: A string that describes which model parameters were overwritten by commandline
|
||||
arguments in the present run. This is only used for diagnostic purposes (it is set as a Tag on the run).
|
||||
:param azure_dataset_id: The name of the dataset on blob storage to be used for this run.
|
||||
"""
|
||||
azure_run: Optional[Run] = None
|
||||
|
@ -68,8 +66,7 @@ def submit_to_azureml(azure_config: AzureConfig,
|
|||
for s in [signal.SIGINT, signal.SIGTERM]:
|
||||
signal.signal(s, interrupt_handler)
|
||||
# create train/test experiment
|
||||
azure_run = create_and_submit_experiment(azure_config, source_config, model_config_overrides,
|
||||
azure_dataset_id)
|
||||
azure_run = create_and_submit_experiment(azure_config, source_config, azure_dataset_id)
|
||||
|
||||
if azure_config.wait_for_completion:
|
||||
# We want the job output to be visible on the console, but the program should not exit if the
|
||||
|
@ -79,13 +76,12 @@ def submit_to_azureml(azure_config: AzureConfig,
|
|||
return azure_run
|
||||
|
||||
|
||||
def set_run_tags(run: Run, azure_config: AzureConfig, model_config_overrides: str) -> None:
|
||||
def set_run_tags(run: Run, azure_config: AzureConfig, commandline_args: str) -> None:
|
||||
"""
|
||||
Set metadata for the run
|
||||
:param run: Run to set metadata for.
|
||||
:param azure_config: The configurations for the present AzureML job
|
||||
:param model_config_overrides: A string that describes which model parameters were overwritten by commandline
|
||||
arguments in the present run.
|
||||
:param commandline_args: A string that holds all commandline arguments that were used for the present run.
|
||||
"""
|
||||
git_information = azure_config.get_git_information()
|
||||
run.set_tags({
|
||||
|
@ -103,7 +99,7 @@ def set_run_tags(run: Run, azure_config: AzureConfig, model_config_overrides: st
|
|||
"source_message": git_information.commit_message,
|
||||
"source_author": git_information.commit_author,
|
||||
"source_dirty": str(git_information.is_dirty),
|
||||
"overrides": model_config_overrides,
|
||||
"commandline_args": commandline_args,
|
||||
CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY: -1,
|
||||
})
|
||||
|
||||
|
@ -125,14 +121,11 @@ def create_experiment_name(azure_config: AzureConfig) -> str:
|
|||
def create_and_submit_experiment(
|
||||
azure_config: AzureConfig,
|
||||
source_config: SourceConfig,
|
||||
model_config_overrides: str,
|
||||
azure_dataset_id: str) -> Run:
|
||||
"""
|
||||
Creates an AzureML experiment in the workspace and submits it for execution.
|
||||
:param azure_config: Azure-related configurations needed to set up a valid workspace.
|
||||
:param source_config: The information about which code should be submitted, and which arguments should be used.
|
||||
:param model_config_overrides: A string that describes which model parameters were overwritten by commandline
|
||||
arguments in the present run. This is only used for diagnostic purposes (it is set as a Tag on the run).
|
||||
:param azure_dataset_id: The name of the dataset in blob storage to be used for this run.
|
||||
:returns: Run object for the submitted AzureML run
|
||||
"""
|
||||
|
@ -144,8 +137,12 @@ def create_and_submit_experiment(
|
|||
# submit a training/testing run associated with the experiment
|
||||
run: Run = exp.submit(script_run_config)
|
||||
|
||||
# set metadata for the run
|
||||
set_run_tags(run, azure_config, model_config_overrides)
|
||||
if is_offline_run_context(run):
|
||||
# This codepath will only be executed in unit tests, when exp.submit is mocked.
|
||||
return run
|
||||
|
||||
# Set metadata for the run.
|
||||
set_run_tags(run, azure_config, commandline_args=(" ".join(source_config.script_params)))
|
||||
|
||||
print("\n==============================================================================")
|
||||
print(f"Successfully queued new run {run.id} in experiment: {exp.name}")
|
||||
|
@ -276,6 +273,21 @@ def get_or_create_python_environment(azure_config: AzureConfig,
|
|||
return env
|
||||
|
||||
|
||||
def get_dataset_consumption(azure_config: AzureConfig, azure_dataset_id: str) -> DatasetConsumptionConfig:
|
||||
"""
|
||||
Creates a configuration for using an AzureML dataset inside of an AzureML run. This will make the AzureML
|
||||
dataset with given name available as a named input, using INPUT_DATA_KEY as the key.
|
||||
:param azure_config: azure related configurations to use for model scale-out behaviour
|
||||
:param azure_dataset_id: The name of the dataset in blob storage to be used for this run. This can be an empty
|
||||
string to not use any datasets.
|
||||
"""
|
||||
azureml_dataset = get_or_create_dataset(azure_config, azure_dataset_id=azure_dataset_id)
|
||||
if not azureml_dataset:
|
||||
raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
|
||||
named_input = azureml_dataset.as_named_input(INPUT_DATA_KEY)
|
||||
return named_input.as_mount() if azure_config.use_dataset_mount else named_input.as_download()
|
||||
|
||||
|
||||
def create_run_config(azure_config: AzureConfig,
|
||||
source_config: SourceConfig,
|
||||
azure_dataset_id: str = "",
|
||||
|
@ -292,11 +304,7 @@ def create_run_config(azure_config: AzureConfig,
|
|||
:return: The configured script run.
|
||||
"""
|
||||
if azure_dataset_id:
|
||||
azureml_dataset = get_or_create_dataset(azure_config, azure_dataset_id=azure_dataset_id)
|
||||
if not azureml_dataset:
|
||||
raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
|
||||
named_input = azureml_dataset.as_named_input(INPUT_DATA_KEY)
|
||||
dataset_consumption = named_input.as_mount() if azure_config.use_dataset_mount else named_input.as_download()
|
||||
dataset_consumption = get_dataset_consumption(azure_config, azure_dataset_id)
|
||||
else:
|
||||
dataset_consumption = None
|
||||
# AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
|
||||
|
@ -354,8 +362,7 @@ def create_runner_parser(model_config_class: type = None) -> argparse.ArgumentPa
|
|||
def parse_args_and_add_yaml_variables(parser: ArgumentParser,
|
||||
yaml_config_file: Optional[Path] = None,
|
||||
project_root: Optional[Path] = None,
|
||||
fail_on_unknown_args: bool = False,
|
||||
args: List[str] = None) -> ParserResult:
|
||||
fail_on_unknown_args: bool = False) -> ParserResult:
|
||||
"""
|
||||
Reads arguments from sys.argv, modifies them with secrets from local YAML files,
|
||||
and parses them using the given argument parser.
|
||||
|
@ -364,14 +371,12 @@ def parse_args_and_add_yaml_variables(parser: ArgumentParser,
|
|||
:param yaml_config_file: The path to the YAML file that contains values to supply into sys.argv.
|
||||
:param fail_on_unknown_args: If True, raise an exception if the parser encounters an argument that it does not
|
||||
recognize. If False, unrecognized arguments will be ignored, and added to the "unknown" field of the parser result.
|
||||
:param args: arguments to parse
|
||||
:return: The parsed arguments, and overrides
|
||||
"""
|
||||
settings_from_yaml = read_all_settings(yaml_config_file, project_root=project_root)
|
||||
return parse_arguments(parser,
|
||||
settings_from_yaml=settings_from_yaml,
|
||||
fail_on_unknown_args=fail_on_unknown_args,
|
||||
args=args)
|
||||
fail_on_unknown_args=fail_on_unknown_args)
|
||||
|
||||
|
||||
def _create_default_namespace(parser: ArgumentParser) -> Namespace:
|
||||
|
@ -471,7 +476,7 @@ def run_duration_string_to_seconds(s: str) -> Optional[int]:
|
|||
elif suffix == "d":
|
||||
multiplier = 24 * 60 * 60
|
||||
else:
|
||||
raise ArgumentError("s", f"Invalid suffix: Must be one of 's', 'm', 'h', 'd', but got: {s}")
|
||||
raise ArgumentError("s", f"Invalid suffix: Must be one of 's', 'm', 'h', 'd', but got: {s}") # type: ignore
|
||||
return int(float(s[:-1]) * multiplier)
|
||||
|
||||
|
||||
|
|
|
@ -45,15 +45,6 @@ INNEREYE_SDK_NAME = "innereye"
|
|||
INNEREYE_SDK_VERSION = "1.0"
|
||||
|
||||
|
||||
def get_results_blob_path(run_id: str) -> str:
|
||||
"""
|
||||
Creates the name of the top level folder that contains the results for a given AzureML run.
|
||||
:param run_id: The AzureML run ID for which the folder should be created.
|
||||
:return: A full Azure blob storage path, starting with the container name.
|
||||
"""
|
||||
return AZUREML_RUN_FOLDER + run_id
|
||||
|
||||
|
||||
def create_run_recovery_id(run: Run) -> str:
|
||||
"""
|
||||
Creates a recovery ID for a run so that its checkpoints can be recovered for training/testing
|
||||
|
@ -293,6 +284,21 @@ def merge_conda_files(files: List[Path], result_file: Path) -> None:
|
|||
ruamel.yaml.dump(unified_definition, f, indent=2, default_flow_style=False)
|
||||
|
||||
|
||||
def get_all_environment_files(project_root: Path) -> List[Path]:
|
||||
"""
|
||||
Returns a list of all Conda environment files that should be used. This is firstly the InnerEye conda file,
|
||||
and possibly a second environment.yml file that lives at the project root folder.
|
||||
:param project_root: The root folder of the code that starts the present training run.
|
||||
:return: A list with 1 or 2 entries that are conda environment files.
|
||||
"""
|
||||
innereye_yaml = fixed_paths.get_environment_yaml_file()
|
||||
project_yaml = project_root / fixed_paths.ENVIRONMENT_YAML_FILE_NAME
|
||||
files = [innereye_yaml]
|
||||
if innereye_yaml != project_yaml:
|
||||
files.append(project_yaml)
|
||||
return files
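# A hedged usage sketch: the list produced here is typically passed to merge_conda_dependencies
# (defined below), which combines the Conda specifications into a single CondaDependencies object:
#   files = get_all_environment_files(project_root)
#   conda_dependencies, _ = merge_conda_dependencies(files)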
|
||||
|
||||
|
||||
def merge_conda_dependencies(files: List[Path]) -> Tuple[CondaDependencies, str]:
|
||||
"""
|
||||
Creates a CondaDependencies object from the Conda environments specified in one or more files.
|
||||
|
|
|
@ -1,52 +0,0 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from InnerEye.Azure.azure_config import AzureConfig, ExperimentResultLocation
|
||||
|
||||
BUILDINFORMATION_JSON = "buildinformation.json"
|
||||
|
||||
|
||||
def build_information_to_dot_net_json(azure_config: AzureConfig, result_location: ExperimentResultLocation) -> str:
|
||||
"""
|
||||
Converts the build metadata to a JSON string.
|
||||
:param azure_config: Azure configuration file with build information.
|
||||
:param result_location: ExperimentResultLocation object with result locations.
|
||||
"""
|
||||
git_information = azure_config.get_git_information()
|
||||
return json.dumps({
|
||||
"BuildNumber": azure_config.build_number,
|
||||
"BuildRequestedFor": azure_config.build_user,
|
||||
"BuildSourceBranchName": git_information.branch,
|
||||
"BuildSourceVersion": git_information.commit_id,
|
||||
"BuildSourceAuthor": git_information.commit_author,
|
||||
"ModelName": azure_config.model,
|
||||
"ResultsContainerName": result_location.results_container_name,
|
||||
"ResultsUri": result_location.results_uri,
|
||||
"DatasetFolder": result_location.dataset_folder,
|
||||
"DatasetFolderUri": result_location.dataset_uri,
|
||||
"AzureBatchJobName": result_location.azure_job_name})
|
||||
|
||||
|
||||
def build_information_to_dot_net_json_file(azure_config: AzureConfig,
|
||||
result_location: ExperimentResultLocation,
|
||||
folder: Optional[Path] = None) -> None:
|
||||
"""
|
||||
Writes the build metadata to a file called buildinformation.json in the given folder.
|
||||
:param azure_config: Azure configuration file
|
||||
:param result_location: ExperimentResultLocation object with result locations.
|
||||
:param folder: Results are written to this folder, if not None. Else, results are written in the root folder.
|
||||
"""
|
||||
filename = Path(BUILDINFORMATION_JSON)
|
||||
|
||||
if folder is not None:
|
||||
if not folder.exists():
|
||||
folder.mkdir(parents=True)
|
||||
|
||||
full_file = filename if folder is None else folder / filename
|
||||
with full_file.open("w") as f:
|
||||
f.write(build_information_to_dot_net_json(azure_config, result_location))
|
|
@ -389,3 +389,29 @@ def remove_file_or_directory(pth: Path) -> None:
|
|||
pth.rmdir()
|
||||
elif pth.exists():
|
||||
pth.unlink()
|
||||
|
||||
|
||||
def add_folder_to_sys_path_if_needed(folder_under_repo_root: str) -> None:
|
||||
"""
|
||||
Checks if the Python paths in sys.path already contain the given folder, which is expected to be relative
|
||||
to the repository root. If that folder is not yet in sys.path, add it.
|
||||
"""
|
||||
full_folder = repository_root_directory() / folder_under_repo_root
|
||||
for path_str in sys.path:
|
||||
path = Path(path_str)
|
||||
if path == full_folder:
|
||||
return
|
||||
print(f"Adding {full_folder} to sys.path")
|
||||
sys.path.append(str(full_folder))
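# A hedged usage sketch, mirroring how the fastMRI submodule is made importable later in this
# commit (see the fastMRI container module below):
#   add_folder_to_sys_path_if_needed("fastMRI")
#   from fastmri.pl_modules import VarNetModule  # now resolves against <repo_root>/fastMRI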
|
||||
|
||||
|
||||
@contextmanager
|
||||
def change_working_directory(path_or_str: PathOrString) -> Generator:
|
||||
"""
|
||||
Context manager for changing the current working directory
|
||||
"""
|
||||
new_path = Path(path_or_str).expanduser()
|
||||
old_path = Path.cwd()
|
||||
os.chdir(new_path)
|
||||
yield
|
||||
os.chdir(old_path)
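# A minimal usage sketch of the context manager above (the folder name is hypothetical);
# the previous working directory is restored when the block exits normally:
#   with change_working_directory("/tmp/example_outputs"):
#       Path("results.txt").write_text("...")  # created inside /tmp/example_outputs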
|
||||
|
|
|
@ -34,6 +34,8 @@ DEFAULT_RESULT_ZIP_DICOM_NAME = "segmentation.dcm.zip"
|
|||
DEFAULT_AML_LOGS_DIR = "azureml-logs"
|
||||
|
||||
DEFAULT_LOGS_DIR_NAME = "logs"
|
||||
LOG_FILE_NAME = "stdout.txt"
|
||||
|
||||
DEFAULT_MODEL_SUMMARIES_DIR_PATH = Path(DEFAULT_LOGS_DIR_NAME) / "model_summaries"
|
||||
# The folder at the project root directory that holds datasets for local execution.
|
||||
DATASETS_DIR_NAME = "datasets"
|
||||
|
|
|
@ -32,17 +32,6 @@ def full_ml_test_data_path(path: str = "") -> Path:
|
|||
return _full_test_data_path("ML", path)
|
||||
|
||||
|
||||
def full_azure_test_data_path(path: str = "") -> Path:
|
||||
"""
|
||||
Takes a relative path inside of the Azure/tests/test_data folder, and returns its
|
||||
full absolute path.
|
||||
|
||||
:param path: A path relative to the Tests/Azure/test_data
|
||||
:return: The full absolute path of the argument.
|
||||
"""
|
||||
return _full_test_data_path("Azure", path)
|
||||
|
||||
|
||||
def _full_test_data_path(prefix: str, suffix: str) -> Path:
|
||||
root = tests_root_directory()
|
||||
return root / prefix / "test_data" / suffix
|
||||
|
|
|
@ -279,3 +279,21 @@ class GenericConfig(param.Parameterized):
|
|||
reason = f"parameter is {reason}"
|
||||
# We could raise an error here instead - to be discussed.
|
||||
logging.warning(f"Override {key}={desired} failed: {reason} in class {self.__class__.name}")
|
||||
|
||||
|
||||
def create_from_matching_params(from_object: param.Parameterized, cls_: Type[T]) -> T:
|
||||
"""
|
||||
Creates an object of the given target class, and then copies all attributes from the `from_object` to
|
||||
the newly created object, if there is a matching attribute. The target class must be a subclass of
|
||||
param.Parameterized.
|
||||
:param from_object: The object to read attributes from.
|
||||
:param cls_: The class of the newly created object.
|
||||
:return: An instance of cls_
|
||||
"""
|
||||
c = cls_()
|
||||
if not isinstance(c, param.Parameterized):
|
||||
raise ValueError(f"The created object must be a subclass of param.Parameterized, but got {type(c)}")
|
||||
for param_name, p in c.params().items():
|
||||
if not p.constant and not p.readonly:
|
||||
setattr(c, param_name, getattr(from_object, param_name))
|
||||
return c
|
||||
|
|
|
@ -9,16 +9,17 @@ from dataclasses import dataclass
|
|||
from enum import Enum, unique
|
||||
from math import isclose
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
|
||||
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import param
|
||||
from azureml.core import ScriptRunConfig
|
||||
from azureml.core import Model, ScriptRunConfig
|
||||
from azureml.train.hyperdrive import HyperDriveConfig
|
||||
from pandas import DataFrame
|
||||
|
||||
from InnerEye.Common.common_util import any_pairwise_larger, any_smaller_or_equal_than, check_is_any_of
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Common.common_util import ModelProcessing, any_pairwise_larger, any_smaller_or_equal_than, check_is_any_of
|
||||
from InnerEye.Common.generic_parsing import IntTuple
|
||||
from InnerEye.Common.type_annotations import TupleFloat2, TupleFloat3, TupleInt3, TupleStringOptionalFloat
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
|
@ -264,7 +265,9 @@ class SegmentationModelBase(ModelConfigBase):
|
|||
|
||||
#: The number of image levels used in Unet (in encoding and decoding paths).
|
||||
num_downsampling_paths: int = param.Integer(4, bounds=(1, None),
|
||||
instantiate=False, doc="The number of levels used in a UNet architecture in encoding and decoding paths.")
|
||||
instantiate=False,
|
||||
doc="The number of levels used in a UNet architecture in encoding and "
|
||||
"decoding paths.")
|
||||
|
||||
#: The size of the random crops that will be drawn from the input images during training. This is also the
|
||||
#: input size of the model.
|
||||
|
@ -666,7 +669,7 @@ class SegmentationModelBase(ModelConfigBase):
|
|||
"""
|
||||
Loads a dataset from the dataset_csv file, and stores it in the present object.
|
||||
"""
|
||||
assert self.local_dataset is not None # for mypy
|
||||
assert self.local_dataset is not None, "The dataset must be provided in self.local_dataset"
|
||||
self.dataset_data_frame = pd.read_csv(self.local_dataset / self.dataset_csv,
|
||||
dtype=str,
|
||||
converters=self.col_type_converters,
|
||||
|
@ -793,3 +796,7 @@ class SegmentationModelBase(ModelConfigBase):
|
|||
By default no transformation is performed.
|
||||
"""
|
||||
return ModelTransformsPerExecutionMode()
|
||||
|
||||
|
||||
PostCrossValidationHookSignature = Callable[[ModelConfigBase, Path], None]
|
||||
ModelDeploymentHookSignature = Callable[[SegmentationModelBase, AzureConfig, Model, ModelProcessing], Any]
|
||||
|
|
|
@ -0,0 +1,140 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from pytorch_lightning import LightningDataModule, LightningModule
|
||||
from torch.optim import Adam, Optimizer
|
||||
from torch.optim.lr_scheduler import StepLR, _LRScheduler
|
||||
from torch.utils.data import DataLoader, Dataset
|
||||
|
||||
from InnerEye.Common import fixed_paths_for_tests
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
|
||||
|
||||
class HelloDataset(Dataset):
|
||||
"""
|
||||
A simple 1-dim regression task, read from a data file stored in the test data folder.
|
||||
"""
|
||||
# Creating the data file:
|
||||
# import numpy as np
|
||||
# import torch
|
||||
#
|
||||
# N = 100
|
||||
# x = torch.rand((N, 1)) * 10
|
||||
# y = 0.2 * x + 0.1 * torch.randn(x.size())
|
||||
# xy = torch.cat((x, y), dim=1)
|
||||
# np.savetxt("Tests/ML/test_data/hellocontainer.csv", xy.numpy(), delimiter=",")
|
||||
def __init__(self, root_folder: Path, start_index: int, end_index: int) -> None:
|
||||
"""
|
||||
Creates the 1-dim regression dataset.
|
||||
:param root_folder: The folder in which the data file lives ("hellocontainer.csv")
|
||||
:param start_index: The first row to read.
|
||||
:param end_index: The last row to read (exclusive)
|
||||
"""
|
||||
super().__init__()
|
||||
raw_data = np.loadtxt(root_folder / "hellocontainer.csv", delimiter=",")[start_index:end_index]
|
||||
self.data = torch.tensor(raw_data, dtype=torch.float)
|
||||
|
||||
def __len__(self) -> int:
|
||||
return self.data.shape[0]
|
||||
|
||||
def __getitem__(self, item: int) -> Dict[str, torch.Tensor]:
|
||||
return {'x': self.data[item][0:1], 'y': self.data[item][1:2]}
|
||||
|
||||
|
||||
class HelloDataModule(LightningDataModule):
|
||||
"""
|
||||
A data module that gives the training, validation and test data for a simple 1-dim regression task.
|
||||
"""
|
||||
def __init__(self, root_folder: Path) -> None:
|
||||
super().__init__()
|
||||
self.train = HelloDataset(root_folder, start_index=0, end_index=50)
|
||||
self.val = HelloDataset(root_folder, start_index=50, end_index=70)
|
||||
self.test = HelloDataset(root_folder, start_index=70, end_index=100)
|
||||
|
||||
def train_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
|
||||
return DataLoader(self.train, batch_size=5)
|
||||
|
||||
def val_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
|
||||
return DataLoader(self.val, batch_size=5)
|
||||
|
||||
def test_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
|
||||
return DataLoader(self.test, batch_size=5)
|
||||
|
||||
|
||||
class HelloRegression(LightningModule):
|
||||
"""
|
||||
A simple 1-dim regression model.
|
||||
"""
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.model = torch.nn.Linear(in_features=1, out_features=1, bias=True)
|
||||
self.test_mse: List[torch.Tensor] = []
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor: # type: ignore
|
||||
return self.model(x)
|
||||
|
||||
def training_step(self, batch: Dict[str, torch.Tensor], *args: Any, **kwargs: Any) -> torch.Tensor: # type: ignore
|
||||
input = batch["x"]
|
||||
target = batch["y"]
|
||||
prediction = self.forward(input)
|
||||
loss = torch.nn.functional.mse_loss(prediction, target)
|
||||
self.log("loss", loss, on_epoch=True, on_step=False)
|
||||
return loss
|
||||
|
||||
def configure_optimizers(self) -> Tuple[List[Optimizer], List[_LRScheduler]]:
|
||||
optimizer = Adam(self.parameters(), lr=1e-1)
|
||||
scheduler = StepLR(optimizer, step_size=20, gamma=0.5)
|
||||
return [optimizer], [scheduler]
|
||||
|
||||
def on_test_epoch_start(self) -> None:
|
||||
self.test_mse = []
|
||||
|
||||
def test_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor: # type: ignore
|
||||
input = batch["x"]
|
||||
target = batch["y"]
|
||||
prediction = self.forward(input)
|
||||
loss = torch.nn.functional.mse_loss(prediction, target)
|
||||
self.test_mse.append(loss)
|
||||
return loss
|
||||
|
||||
def on_test_epoch_end(self) -> None:
|
||||
average_mse = torch.mean(torch.stack(self.test_mse))
|
||||
Path("test_mse.txt").write_text(str(average_mse.item()))
|
||||
|
||||
|
||||
class HelloContainer(LightningContainer):
|
||||
"""
|
||||
An example of using the InnerEye functionality to "bring your own Lightning model". This container has methods
|
||||
to generate the actual Lightning model, and read out the datamodule that will be used for training.
|
||||
The number of training epochs is controlled at container level.
|
||||
You can train this model by running `python InnerEye/ML/runner.py --model=HelloContainer` on the local box,
|
||||
or via `python InnerEye/ML/runner.py --model=HelloContainer --azureml=True` in AzureML
|
||||
"""
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.local_dataset = fixed_paths_for_tests.full_ml_test_data_path()
|
||||
self.num_epochs = 20
|
||||
|
||||
# This method must be overridden by any subclass of LightningContainer
|
||||
def create_model(self) -> LightningModule:
|
||||
return HelloRegression()
|
||||
|
||||
# This method must be overridden by any subclass of LightningContainer
|
||||
def get_data_module(self) -> LightningDataModule:
|
||||
assert self.local_dataset is not None
|
||||
return HelloDataModule(root_folder=self.local_dataset) # type: ignore
|
||||
|
||||
# This is an optional override: This report creation method can read out any files that were written during
|
||||
# training, and cook them into a nice looking report. Here, the report is a simple text file.
|
||||
def create_report(self) -> None:
|
||||
# This just prints out the test MSE, but you could also generate a Jupyter notebook here, for example.
|
||||
test_mse = float(Path("test_mse.txt").read_text())
|
||||
report = f"Performance on test set: MSE = {test_mse}"
|
||||
print(report)
|
||||
Path("report.txt").write_text(report)
|
|
@ -0,0 +1,68 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
|
||||
# Suppress all flake8 errors here because the imports below come after code, which triggers many
# warnings; we cannot suppress those warnings individually.
|
||||
# flake8: noqa
|
||||
from typing import Optional
|
||||
|
||||
import param
|
||||
import torch
|
||||
from pytorch_lightning import LightningDataModule, LightningModule
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
|
||||
from InnerEye.Common.common_util import add_folder_to_sys_path_if_needed
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
|
||||
add_folder_to_sys_path_if_needed("fastMRI")
|
||||
|
||||
from fastmri.data.subsample import create_mask_for_mask_type
|
||||
from fastmri.data.transforms import VarNetDataTransform
|
||||
from fastmri.pl_modules import FastMriDataModule, VarNetModule
|
||||
|
||||
|
||||
class VarNetWithImageLogging(VarNetModule):
|
||||
"""
|
||||
A clone of the VarNet model that logs images only to the TensorBoard loggers. The original VarNet hardcodes
|
||||
a single logger that must be Tensorboard.
|
||||
"""
|
||||
|
||||
def log_image(self, name: str, image: torch.Tensor) -> None:
|
||||
experiments = self.logger.experiment if isinstance(self.logger.experiment, list) \
|
||||
else [self.logger.experiment]
|
||||
for experiment in experiments:
|
||||
if isinstance(experiment, SummaryWriter):
|
||||
experiment.add_image(name, image, global_step=self.global_step)
|
||||
|
||||
|
||||
class FastMri(LightningContainer):
|
||||
# All fields that are declared here will be automatically available as commandline arguments.
|
||||
challenge: str = param.String(default="multicoil", doc="Chooses between the singlecoil or multicoil "
|
||||
"acquisition setup.")
|
||||
sample_rate: Optional[float] = param.Number(default=None, doc="Fraction of slices of the training data split to "
|
||||
"use. Default: 1.0")
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.azure_dataset_id = "fastmrimini_brain"
|
||||
|
||||
def create_model(self) -> LightningModule:
|
||||
return VarNetWithImageLogging()
|
||||
|
||||
def get_data_module(self) -> LightningDataModule:
|
||||
mask = create_mask_for_mask_type(mask_type_str="equispaced",
|
||||
center_fractions=[0.08],
|
||||
accelerations=[4])
|
||||
# use random masks for train transform, fixed masks for val transform
|
||||
train_transform = VarNetDataTransform(mask_func=mask, use_seed=False)
|
||||
val_transform = VarNetDataTransform(mask_func=mask)
|
||||
test_transform = VarNetDataTransform()
|
||||
|
||||
return FastMriDataModule(data_path=self.local_dataset,
|
||||
challenge=self.challenge,
|
||||
sample_rate=self.sample_rate,
|
||||
train_transform=train_transform,
|
||||
val_transform=val_transform,
|
||||
test_transform=test_transform)
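# A hedged usage note: because `challenge` and `sample_rate` are declared as param fields on this
# container, they become commandline arguments of the runner, so a run could hypothetically be
# started via:
#   python InnerEye/ML/runner.py --model=FastMri --challenge=singlecoil --sample_rate=0.5 --azureml=True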
|
|
@ -7,7 +7,7 @@ from typing import Any
|
|||
|
||||
from azureml.core import ScriptRunConfig
|
||||
from azureml.train.hyperdrive import BanditPolicy, HyperDriveConfig, PrimaryMetricGoal, RandomParameterSampling, uniform
|
||||
from networkx.tests.test_convert_pandas import pd
|
||||
import pandas as pd
|
||||
|
||||
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
|
||||
from InnerEye.Common.metrics_constants import TrackedMetrics
|
||||
|
|
|
@ -183,39 +183,186 @@ class DeepLearningFileSystemConfig(Parameterized):
|
|||
logs_folder=logs_folder,
|
||||
project_root=self.project_root
|
||||
)
|
||||
raise ValueError("This method should only be called for offline runs, when the logs folder is inside the "
|
||||
"outputs folder.")
|
||||
raise ValueError("This method should only be called for runs outside AzureML, when the logs folder is "
|
||||
"inside the outputs folder.")
|
||||
|
||||
|
||||
class DeepLearningConfig(GenericConfig, CudaAwareConfig):
|
||||
class WorkflowParams(param.Parameterized):
|
||||
"""
|
||||
A class that holds all settings that are shared across segmentation models and regression/classification models.
|
||||
This class contains all parameters that affect how the whole training and testing workflow is executed.
|
||||
"""
|
||||
_model_category: ModelCategory = param.ClassSelector(class_=ModelCategory,
|
||||
doc="The high-level model category described by this config.")
|
||||
_model_name: str = param.String(None, doc="The human readable name of the model (for example, Liver). This is "
|
||||
"usually set from the class name.")
|
||||
|
||||
random_seed: int = param.Integer(42, doc="The seed to use for all random number generators.")
|
||||
azure_dataset_id: str = param.String(doc="If provided, the ID of the dataset to use. This dataset must exist as a "
|
||||
"folder of the same name in the 'datasets' "
|
||||
"container in the datasets storage account.")
|
||||
local_dataset: Optional[Path] = param.ClassSelector(class_=Path,
|
||||
default=None,
|
||||
allow_None=True,
|
||||
doc="The path of the dataset to use, when training is running "
|
||||
"outside Azure.")
|
||||
num_dataload_workers: int = param.Integer(8, bounds=(0, None),
|
||||
doc="The number of data loading workers (processes). When set to 0,"
|
||||
"data loading is running in the same process (no process startup "
|
||||
"cost, hence good for use in unit testing. However, it "
|
||||
"does not give the same result as running with 1 worker process)")
|
||||
shuffle: bool = param.Boolean(True, doc="If true, the dataset will be shuffled randomly during training.")
|
||||
num_epochs: int = param.Integer(100, bounds=(1, None), doc="Number of epochs to train.")
|
||||
start_epoch: int = param.Integer(0, bounds=(0, None), doc="The first epoch to train. Set to 0 to start a new "
|
||||
"training. Set to a value larger than zero for starting"
|
||||
" from a checkpoint.")
|
||||
number_of_cross_validation_splits: int = param.Integer(0, bounds=(0, None),
|
||||
doc="Number of cross validation splits for k-fold cross "
|
||||
"validation")
|
||||
cross_validation_split_index: int = param.Integer(DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, bounds=(-1, None),
|
||||
doc="The index of the cross validation fold this model is "
|
||||
"associated with when performing k-fold cross validation")
|
||||
perform_training_set_inference: bool = \
|
||||
param.Boolean(False,
|
||||
doc="If True, run full image inference on the training set at the end of training. If False and "
|
||||
"perform_validation_and_test_set_inference is True (default), only run inference on "
|
||||
"validation and test set. If both flags are False do not run inference.")
|
||||
perform_validation_and_test_set_inference: bool = \
|
||||
param.Boolean(True,
|
||||
doc="If True (default), run full image inference on validation and test set after training.")
|
||||
weights_url: str = param.String(doc="If provided, a url from which weights will be downloaded and used for model "
|
||||
"initialization.")
|
||||
local_weights_path: Optional[Path] = param.ClassSelector(class_=Path,
|
||||
default=None,
|
||||
allow_None=True,
|
||||
doc="The path to the weights to use for model "
|
||||
"initialization, when training outside AzureML.")
|
||||
generate_report: bool = param.Boolean(default=True,
|
||||
doc="If True (default), write a modelling report in HTML format. If False,"
|
||||
"do not write that report.")
|
||||
# The default multiprocessing start_method in both PyTorch and the Python standard library is "fork" for Linux and
|
||||
# "spawn" (the only available method) for Windows. There is some evidence that using "forkserver" on Linux
|
||||
# can reduce the chance of stuck jobs.
|
||||
multiprocessing_start_method: MultiprocessingStartMethod = \
|
||||
param.ClassSelector(class_=MultiprocessingStartMethod,
|
||||
default=(MultiprocessingStartMethod.spawn if is_windows()
|
||||
else MultiprocessingStartMethod.fork),
|
||||
doc="Method to be used to start child processes in pytorch. Should be one of forkserver, "
|
||||
"fork or spawn. If not specified, fork is used on Linux and spawn on Windows. "
|
||||
"Set to forkserver as a possible remedy for stuck jobs.")
|
||||
monitoring_interval_seconds: int = param.Integer(0, doc="Seconds delay between logging GPU/CPU resource "
|
||||
"statistics. If 0 or less, do not log any resource "
|
||||
"statistics.")
|
||||
|
||||
def validate(self) -> None:
|
||||
if self.weights_url and self.local_weights_path:
|
||||
raise ValueError("Cannot specify both local_weights_path and weights_url.")
|
||||
|
||||
if self.number_of_cross_validation_splits == 1:
|
||||
raise ValueError("At least two splits required to perform cross validation, but got "
|
||||
f"{self.number_of_cross_validation_splits}. To train without cross validation, set "
|
||||
"number_of_cross_validation_splits=0.")
|
||||
if 0 < self.number_of_cross_validation_splits <= self.cross_validation_split_index:
|
||||
raise ValueError(f"Cross validation split index is out of bounds: {self.cross_validation_split_index}, "
|
||||
f"which is invalid for CV with {self.number_of_cross_validation_splits} splits.")
|
||||
elif self.number_of_cross_validation_splits == 0 and self.cross_validation_split_index != -1:
|
||||
raise ValueError(f"Cross validation split index must be -1 for a non cross validation run, "
|
||||
f"found number_of_cross_validation_splits = {self.number_of_cross_validation_splits} "
|
||||
f"and cross_validation_split_index={self.cross_validation_split_index}")
|
||||
|
||||
@property
|
||||
def is_offline_run(self) -> bool:
|
||||
"""
|
||||
Returns True if the run is executing outside AzureML, or False if inside AzureML.
|
||||
"""
|
||||
return is_offline_run_context(RUN_CONTEXT)
|
||||
|
||||
@property
|
||||
def perform_cross_validation(self) -> bool:
|
||||
"""
|
||||
True if cross validation will be performed as part of the training procedure.
|
||||
:return:
|
||||
"""
|
||||
return self.number_of_cross_validation_splits > 1
|
||||
|
||||
def get_effective_random_seed(self) -> int:
|
||||
"""
|
||||
Returns the random seed set as part of this configuration. If the configuration corresponds
|
||||
to a cross validation split, then the cross validation fold index will be added to the
|
||||
set random seed in order to return the effective random seed.
|
||||
:return:
|
||||
"""
|
||||
seed = self.random_seed
|
||||
if self.perform_cross_validation:
|
||||
# offset the random seed based on the cross validation split index so each
|
||||
# fold has a different initial random state.
|
||||
seed += self.cross_validation_split_index
|
||||
return seed
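# A small worked example of get_effective_random_seed: with the default random_seed=42 and
# cross_validation_split_index=3 in a 5-fold cross validation run, the effective seed is 42 + 3 = 45;
# outside cross validation the seed stays at 42.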
|
||||
|
||||
|
||||
class DatasetParams(param.Parameterized):
|
||||
azure_dataset_id: str = param.String(doc="If provided, the ID of the dataset to use when running in AzureML. "
|
||||
"This dataset must exist as a folder of the same name in the 'datasets' "
|
||||
"container in the datasets storage account. This dataset will be mounted "
|
||||
"and made available at the 'local_dataset' path when running in AzureML.")
|
||||
local_dataset: Optional[Path] = \
|
||||
param.ClassSelector(class_=Path, default=None, allow_None=True,
|
||||
doc="The path of the dataset to use, when training is running outside Azure.")
|
||||
|
||||
|
||||
class OutputParams(param.Parameterized):
|
||||
output_to: str = param.String(default="",
|
||||
doc="If provided, the run outputs will be written to the given folder. If not "
|
||||
"provided, outputs will go into a subfolder of the project root folder.")
|
||||
file_system_config: DeepLearningFileSystemConfig = param.ClassSelector(default=DeepLearningFileSystemConfig(),
|
||||
class_=DeepLearningFileSystemConfig,
|
||||
instantiate=False,
|
||||
doc="File system related configs")
|
||||
_model_name: str = param.String("", doc="The human readable name of the model (for example, Liver). This is "
|
||||
"usually set from the class name.")
|
||||
|
||||
@property
|
||||
def model_name(self) -> str:
|
||||
"""
|
||||
Gets the human readable name of the model (e.g., Liver). This is usually set from the class name.
|
||||
:return: A model name as a string.
|
||||
"""
|
||||
return self._model_name
|
||||
|
||||
def set_output_to(self, output_to: PathOrString) -> None:
|
||||
"""
|
||||
Adjusts the file system settings in the present object such that all outputs are written to the given folder.
|
||||
:param output_to: The absolute path to a folder that should contain the outputs.
|
||||
"""
|
||||
if isinstance(output_to, Path):
|
||||
output_to = str(output_to)
|
||||
self.output_to = output_to
|
||||
self.create_filesystem()
|
||||
|
||||
def create_filesystem(self, project_root: Path = fixed_paths.repository_root_directory()) -> None:
|
||||
"""
|
||||
Creates new file system settings (outputs folder, logs folder) based on the information stored in the
|
||||
present object. If any of the folders do not yet exist, they are created.
|
||||
:param project_root: The root folder for the codebase that triggers the training run.
|
||||
"""
|
||||
self.file_system_config = DeepLearningFileSystemConfig.create(
|
||||
project_root=project_root,
|
||||
model_name=self.model_name,
|
||||
is_offline_run=is_offline_run_context(RUN_CONTEXT),
|
||||
output_to=self.output_to
|
||||
)
|
||||
|
||||
@property
|
||||
def outputs_folder(self) -> Path:
|
||||
"""Gets the full path in which the model outputs should be stored."""
|
||||
return self.file_system_config.outputs_folder
|
||||
|
||||
@property
|
||||
def logs_folder(self) -> Path:
|
||||
"""Gets the full path in which the model logs should be stored."""
|
||||
return self.file_system_config.logs_folder
|
||||
|
||||
@property
|
||||
def checkpoint_folder(self) -> Path:
|
||||
"""Gets the full path in which the model checkpoints should be stored during training."""
|
||||
return self.outputs_folder / CHECKPOINT_FOLDER
|
||||
|
||||
@property
|
||||
def visualization_folder(self) -> Path:
|
||||
"""Gets the full path in which the visualizations notebooks should be saved during training."""
|
||||
return self.outputs_folder / VISUALIZATION_FOLDER
|
||||
|
||||
def get_path_to_checkpoint(self) -> Path:
|
||||
"""
|
||||
Returns the full path to a recovery checkpoint.
|
||||
"""
|
||||
return create_recovery_checkpoint_path(self.checkpoint_folder)
|
||||
|
||||
def get_path_to_best_checkpoint(self) -> Path:
|
||||
"""
|
||||
Returns the full path to a checkpoint file that was found to be best during training, whatever criterion
|
||||
was applied there.
|
||||
"""
|
||||
return get_best_checkpoint_path(self.checkpoint_folder)
|
||||
|
||||
|
||||
class OptimizerParams(param.Parameterized):
|
||||
l_rate: float = param.Number(1e-4, doc="The initial learning rate", bounds=(0, None))
|
||||
_min_l_rate: float = param.Number(0.0, doc="The minimum learning rate for the Polynomial and Cosine schedulers.",
|
||||
bounds=(0.0, None))
|
||||
|
@ -256,35 +403,87 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
|
|||
doc="The betas parameter of Adam, default is (0.9, 0.999)")
|
||||
momentum: float = param.Number(0.6, doc="The momentum parameter of the optimizers")
|
||||
weight_decay: float = param.Number(1e-4, doc="The weight decay used to control L2 regularization")
|
||||
|
||||
def validate(self) -> None:
|
||||
if len(self.adam_betas) < 2:
|
||||
raise ValueError(
|
||||
"The adam_betas parameter should be the coefficients used for computing running averages of "
|
||||
"gradient and its square")
|
||||
|
||||
if self.l_rate_scheduler == LRSchedulerType.MultiStep:
|
||||
if not self.l_rate_multi_step_milestones:
|
||||
raise ValueError("Must specify l_rate_multi_step_milestones to use LR scheduler MultiStep")
|
||||
if sorted(set(self.l_rate_multi_step_milestones)) != self.l_rate_multi_step_milestones:
|
||||
raise ValueError("l_rate_multi_step_milestones must be a strictly increasing list")
|
||||
if self.l_rate_multi_step_milestones[0] <= 0:
|
||||
raise ValueError("l_rate_multi_step_milestones cannot be negative or 0.")
|
||||
|
||||
@property
|
||||
def min_l_rate(self) -> float:
|
||||
return self._min_l_rate
|
||||
|
||||
@min_l_rate.setter
|
||||
def min_l_rate(self, value: float) -> None:
|
||||
if value > self.l_rate:
|
||||
raise ValueError("l_rate must be >= min_l_rate, found: {}, {}".format(self.l_rate, value))
|
||||
self._min_l_rate = value
|
||||
|
||||
|
||||
class TrainerParams(CudaAwareConfig):
|
||||
num_epochs: int = param.Integer(100, bounds=(1, None), doc="Number of epochs to train.")
|
||||
recovery_checkpoint_save_interval: int = param.Integer(10, bounds=(0, None),
|
||||
doc="Save epoch checkpoints when epoch number is a multiple "
|
||||
"of recovery_checkpoint_save_interval. The intended use "
|
||||
"is to allow restore training from failed runs.")
|
||||
train_batch_size: int = param.Integer(4, bounds=(0, None),
|
||||
doc="The number of crops that make up one minibatch during training.")
|
||||
detect_anomaly: bool = param.Boolean(False, doc="If true, test gradients for anomalies (NaN or Inf) during "
|
||||
"training.")
|
||||
use_mixed_precision: bool = param.Boolean(False, doc="If true, mixed precision training is activated during "
|
||||
"training.")
|
||||
max_num_gpus: int = param.Integer(default=-1, doc="The maximum number of GPUS to use. If set to a value < 0, use"
|
||||
"all available GPUs.")
|
||||
pl_progress_bar_refresh_rate: Optional[int] = \
|
||||
param.Integer(default=None,
|
||||
doc="PyTorch Lightning trainer flag 'progress_bar_refresh_rate': How often to refresh progress "
|
||||
"bar (in steps). Value 0 disables progress bar. Value None chooses automatically.")
|
||||
pl_num_sanity_val_steps: int = \
|
||||
param.Integer(default=0,
|
||||
doc="PyTorch Lightning trainer flag 'num_sanity_val_steps': Number of validation "
|
||||
"steps to run before training, to identify possible problems")
|
||||
pl_deterministic: bool = \
|
||||
param.Integer(default=True,
|
||||
doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If "
|
||||
"'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, but "
|
||||
"you may see training speed increases.")
|
||||
start_epoch: int = param.Integer(0, bounds=(0, None), doc="The first epoch to train. Set to 0 to start a new "
|
||||
"training. Set to a value larger than zero for starting"
|
||||
" from a checkpoint.")
|
||||
|
||||
|
||||
class DeepLearningConfig(WorkflowParams,
|
||||
DatasetParams,
|
||||
OutputParams,
|
||||
OptimizerParams,
|
||||
TrainerParams,
|
||||
CudaAwareConfig,
|
||||
GenericConfig):
|
||||
"""
|
||||
A class that holds all settings that are shared across segmentation models and regression/classification models.
|
||||
"""
|
||||
_model_category: ModelCategory = param.ClassSelector(class_=ModelCategory,
|
||||
doc="The high-level model category described by this config.")
|
||||
|
||||
num_dataload_workers: int = param.Integer(8, bounds=(0, None),
|
||||
doc="The number of data loading workers (processes). When set to 0,"
|
||||
"data loading is running in the same process (no process startup "
|
||||
"cost, hence good for use in unit testing. However, it "
|
||||
"does not give the same result as running with 1 worker process)")
|
||||
shuffle: bool = param.Boolean(True, doc="If true, the dataset will be shuffled randomly during training.")
|
||||
train_batch_size: int = param.Integer(4, bounds=(0, None),
|
||||
doc="The number of crops that make up one minibatch during training.")
|
||||
use_model_parallel: bool = param.Boolean(False, doc="If true, neural network model is partitioned across all "
|
||||
"available GPUs to fit in a large model. It shall not be used "
|
||||
"together with data parallel.")
|
||||
monitoring_interval_seconds: int = param.Integer(0, doc="Seconds delay between logging GPU/CPU resource "
|
||||
"statistics. If 0 or less, do not log any resource "
|
||||
"statistics.")
|
||||
number_of_cross_validation_splits: int = param.Integer(0, bounds=(0, None),
|
||||
doc="Number of cross validation splits for k-fold cross "
|
||||
"validation")
|
||||
cross_validation_split_index: int = param.Integer(DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, bounds=(-1, None),
|
||||
doc="The index of the cross validation fold this model is "
|
||||
"associated with when performing k-fold cross validation")
|
||||
file_system_config: DeepLearningFileSystemConfig = param.ClassSelector(default=DeepLearningFileSystemConfig(),
|
||||
class_=DeepLearningFileSystemConfig,
|
||||
instantiate=False,
|
||||
doc="File system related configs")
|
||||
pin_memory: bool = param.Boolean(True, doc="Value of pin_memory argument to DataLoader")
|
||||
_overrides: Dict[str, Any] = param.Dict(instantiate=True,
|
||||
doc="Model config properties that were overridden from the commandline")
|
||||
restrict_subjects: Optional[str] = \
|
||||
param.String(doc="Use at most this number of subjects for train, val, or test set (must be > 0 or None). "
|
||||
"If None, do not modify the train, val, or test sets. If a string of the form 'i,j,k' where "
|
||||
|
@ -294,14 +493,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
|
|||
"limit test set to 5. If any of i,j,k is '+', discarded members of the other sets are added "
|
||||
"to that set.",
|
||||
allow_None=True)
|
||||
perform_training_set_inference: bool = \
|
||||
param.Boolean(False,
|
||||
doc="If True, run full image inference on the training set at the end of training. If False and "
|
||||
"perform_validation_and_test_set_inference is True (default), only run inference on "
|
||||
"validation and test set. If both flags are False do not run inference.")
|
||||
perform_validation_and_test_set_inference: bool = \
|
||||
param.Boolean(True,
|
||||
doc="If True (default), run full image inference on validation and test set after training.")
|
||||
_dataset_data_frame: Optional[DataFrame] = \
|
||||
param.DataFrame(default=None,
|
||||
doc="The dataframe that contains the dataset for the model. This is usually read from disk "
|
||||
|
@ -315,19 +506,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
|
|||
"on Linux, inference is currently disabled as the data loaders hang. "
|
||||
"If False, use the default data loader logic that starts new processes for "
|
||||
"each epoch.")
|
||||
# The default multiprocessing start_method in both PyTorch and the Python standard library is "fork" for Linux and
|
||||
# "spawn" (the only available method) for Windows. There is some evidence that using "forkserver" on Linux
|
||||
# can reduce the chance of stuck jobs.
|
||||
multiprocessing_start_method: MultiprocessingStartMethod = \
|
||||
param.ClassSelector(class_=MultiprocessingStartMethod,
|
||||
default=(MultiprocessingStartMethod.spawn if is_windows()
|
||||
else MultiprocessingStartMethod.fork),
|
||||
doc="Method to be used to start child processes in pytorch. Should be one of forkserver, "
|
||||
"fork or spawn. If not specified, fork is used on Linux and spawn on Windows. "
|
||||
"Set to forkserver as a possible remedy for stuck jobs.")
|
||||
output_to: str = param.String(default="",
|
||||
doc="If provided, the run outputs will be written to the given folder. If not "
|
||||
"provided, outputs will go into a subfolder of the project root folder.")
|
||||
max_batch_grad_cam: int = param.Integer(default=0, doc="Max number of validation batches for which "
|
||||
"to save gradCam images. By default "
|
||||
"visualizations are saved for all images "
|
||||
|
@ -336,7 +514,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
|
|||
doc="Target smoothing value for label smoothing")
|
||||
log_to_parent_run: bool = param.Boolean(default=False, doc="If true, hyperdrive child runs will log their metrics"
|
||||
"to their parent run.")
|
||||
|
||||
use_imbalanced_sampler_for_training: bool = param.Boolean(default=False,
|
||||
doc="If True, use an imbalanced sampler during training.")
|
||||
drop_last_batch_in_training: bool = param.Boolean(default=False,
|
||||
|
@ -358,28 +535,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
|
|||
"weights are updated using mean_teacher_"
|
||||
"weight = alpha * (mean_teacher_weight) "
|
||||
" + (1-alpha) * (current_student_weights). ")
|
||||
weights_url: str = param.String(doc="If provided, a url from which weights will be downloaded and used for model "
|
||||
"initialization.")
|
||||
local_weights_path: Optional[Path] = param.ClassSelector(class_=Path,
|
||||
default=None,
|
||||
allow_None=True,
|
||||
doc="The path to the weights to use for model "
|
||||
"initialization, "
|
||||
"when training is running outside Azure.")
|
||||
max_num_gpus: int = param.Integer(default=-1, doc="The maximum number of GPUS to use. If set to a value < 0, use"
|
||||
"all available GPUs.")
|
||||
generate_report: bool = param.Boolean(default=True,
|
||||
doc="If True (default), write a modelling report in HTML format. If False,"
|
||||
"do not write that report.")
|
||||
pl_num_sanity_val_steps: int = \
|
||||
param.Integer(default=0, doc="PyTorch Lightning trainer flag 'num_sanity_val_steps': Number of validation "
|
||||
"steps to run before training, to identify possible problems")
|
||||
pl_deterministic: bool = \
|
||||
param.Integer(default=True,
|
||||
doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If "
|
||||
"'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, but "
|
||||
"you may see training speed increases.")
|
||||
|
||||
#: Name of the csv file providing information on the dataset to be used.
|
||||
dataset_csv: str = param.String(
|
||||
DATASET_CSV_FILE_NAME,
|
||||
|
@ -394,49 +549,19 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
|
|||
super().__init__(throw_if_unknown_param=True, **params)
|
||||
logging.info("Creating the default output folder structure.")
|
||||
self.create_filesystem(fixed_paths.repository_root_directory())
|
||||
# Disable the PL progress bar because all InnerEye models have their own console output
|
||||
self.pl_progress_bar_refresh_rate = 0
|
||||
|
||||
def validate(self) -> None:
|
||||
"""
|
||||
Validates the parameters stored in the present object.
|
||||
"""
|
||||
if len(self.adam_betas) < 2:
|
||||
raise ValueError(
|
||||
"The adam_betas parameter should be the coefficients used for computing running averages of "
|
||||
"gradient and its square")
|
||||
WorkflowParams.validate(self)
|
||||
OptimizerParams.validate(self)
|
||||
|
||||
if self.azure_dataset_id is None and self.local_dataset is None:
|
||||
raise ValueError("Either of local_dataset or azure_dataset_id must be set.")
|
||||
|
||||
if self.weights_url and self.local_weights_path:
|
||||
raise ValueError("Cannot specify both local_weights_path and weights_url.")
|
||||
|
||||
if self.number_of_cross_validation_splits == 1:
|
||||
raise ValueError(f"At least two splits required to perform cross validation found "
|
||||
f"number_of_cross_validation_splits={self.number_of_cross_validation_splits}")
|
||||
if 0 < self.number_of_cross_validation_splits <= self.cross_validation_split_index:
|
||||
raise ValueError(f"Cross validation split index is out of bounds: {self.cross_validation_split_index}, "
|
||||
f"which is invalid for CV with {self.number_of_cross_validation_splits} splits.")
|
||||
elif self.number_of_cross_validation_splits == 0 and self.cross_validation_split_index != -1:
|
||||
raise ValueError(f"Cross validation split index must be -1 for a non cross validation run, "
|
||||
f"found number_of_cross_validation_splits = {self.number_of_cross_validation_splits} "
|
||||
f"and cross_validation_split_index={self.cross_validation_split_index}")
|
||||
|
||||
if self.l_rate_scheduler == LRSchedulerType.MultiStep:
|
||||
if not self.l_rate_multi_step_milestones:
|
||||
raise ValueError("Must specify l_rate_multi_step_milestones to use LR scheduler MultiStep")
|
||||
if sorted(set(self.l_rate_multi_step_milestones)) != self.l_rate_multi_step_milestones:
|
||||
raise ValueError("l_rate_multi_step_milestones must be a strictly increasing list")
|
||||
if self.l_rate_multi_step_milestones[0] <= 0:
|
||||
raise ValueError("l_rate_multi_step_milestones cannot be negative or 0.")
|
||||
|
||||
@property
|
||||
def model_name(self) -> str:
|
||||
"""
|
||||
Gets the human readable name of the model (e.g., Liver). This is usually set from the class name.
|
||||
:return: A model name as a string.
|
||||
"""
|
||||
return self._model_name
|
||||
|
||||
@property
|
||||
def model_category(self) -> ModelCategory:
|
||||
"""
|
||||
|
@ -463,48 +588,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
|
|||
def compute_grad_cam(self) -> bool:
|
||||
return self.max_batch_grad_cam > 0
|
||||
|
||||
@property
|
||||
def min_l_rate(self) -> float:
|
||||
return self._min_l_rate
|
||||
|
||||
@min_l_rate.setter
|
||||
def min_l_rate(self, value: float) -> None:
|
||||
if value > self.l_rate:
|
||||
raise ValueError("l_rate must be >= min_l_rate, found: {}, {}".format(self.l_rate, value))
|
||||
self._min_l_rate = value
|
||||
|
||||
@property
|
||||
def outputs_folder(self) -> Path:
|
||||
"""Gets the full path in which the model outputs should be stored."""
|
||||
return self.file_system_config.outputs_folder
|
||||
|
||||
@property
|
||||
def logs_folder(self) -> Path:
|
||||
"""Gets the full path in which the model logs should be stored."""
|
||||
return self.file_system_config.logs_folder
|
||||
|
||||
@property
|
||||
def checkpoint_folder(self) -> Path:
|
||||
"""Gets the full path in which the model checkpoints should be stored during training."""
|
||||
return self.outputs_folder / CHECKPOINT_FOLDER
|
||||
|
||||
@property
|
||||
def visualization_folder(self) -> Path:
|
||||
"""Gets the full path in which the visualizations notebooks should be saved during training."""
|
||||
return self.outputs_folder / VISUALIZATION_FOLDER
|
||||
|
||||
@property
|
||||
def perform_cross_validation(self) -> bool:
|
||||
"""
|
||||
True if cross validation will be performed as part of the training procedure.
|
||||
:return:
|
||||
"""
|
||||
return self.number_of_cross_validation_splits > 1
|
||||
|
||||
@property
|
||||
def overrides(self) -> Optional[Dict[str, Any]]:
|
||||
return self._overrides
|
||||
|
||||
@property
|
||||
def dataset_data_frame(self) -> Optional[DataFrame]:
|
||||
"""
|
||||
|
@ -521,29 +604,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
|
|||
"""
|
||||
self._dataset_data_frame = data_frame
|
||||
|
||||
def set_output_to(self, output_to: PathOrString) -> None:
|
||||
"""
|
||||
Adjusts the file system settings in the present object such that all outputs are written to the given folder.
|
||||
:param output_to: The absolute path to a folder that should contain the outputs.
|
||||
"""
|
||||
if isinstance(output_to, Path):
|
||||
output_to = str(output_to)
|
||||
self.output_to = output_to
|
||||
self.create_filesystem()
|
||||
|
||||
def create_filesystem(self, project_root: Path = fixed_paths.repository_root_directory()) -> None:
|
||||
"""
|
||||
Creates new file system settings (outputs folder, logs folder) based on the information stored in the
|
||||
present object. If any of the folders do not yet exist, they are created.
|
||||
:param project_root: The root folder for the codebase that triggers the training run.
|
||||
"""
|
||||
self.file_system_config = DeepLearningFileSystemConfig.create(
|
||||
project_root=project_root,
|
||||
model_name=self.model_name,
|
||||
is_offline_run=self.is_offline_run,
|
||||
output_to=self.output_to
|
||||
)
|
||||
|
||||
def get_train_epochs(self) -> List[int]:
|
||||
"""
|
||||
Returns the epochs for which training will be performed.
|
||||
|
@ -565,34 +625,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
|
|||
"""
|
||||
return self.get_total_number_of_training_epochs()
|
||||
|
||||
def get_path_to_checkpoint(self) -> Path:
|
||||
"""
|
||||
Returns the full path to a recovery checkpoint.
:return: The path to the recovery checkpoint.
|
||||
"""
|
||||
return create_recovery_checkpoint_path(self.checkpoint_folder)
|
||||
|
||||
def get_path_to_best_checkpoint(self) -> Path:
|
||||
"""
|
||||
Returns the full path to the best checkpoint.
:return: The path to the best checkpoint.
|
||||
"""
|
||||
return get_best_checkpoint_path(self.checkpoint_folder)
|
||||
|
||||
def get_effective_random_seed(self) -> int:
|
||||
"""
|
||||
Returns the random seed set as part of this configuration. If the configuration corresponds
|
||||
to a cross validation split, then the cross validation fold index will be added to the
|
||||
set random seed in order to return the effective random seed.
|
||||
:return:
|
||||
"""
|
||||
seed = self.random_seed
|
||||
if self.perform_cross_validation:
|
||||
# offset the random seed based on the cross validation split index so each
|
||||
# fold has a different initial random state.
|
||||
seed += self.cross_validation_split_index
|
||||
return seed
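# Illustrative example (not from the original source): with random_seed=42 in a 5-fold cross validation run,
# the child run with cross_validation_split_index=2 would use an effective seed of 42 + 2 = 44, so that every
# fold starts training from a different random state.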
|
||||
|
||||
@property # type: ignore
|
||||
def use_gpu(self) -> bool: # type: ignore
|
||||
"""
|
||||
|
@ -617,30 +649,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
|
|||
raise ValueError("Can't set use_gpu to True if there is not CUDA capable GPU present.")
|
||||
self._use_gpu = value
|
||||
|
||||
def write_args_file(self) -> None:
|
||||
"""
|
||||
Writes the current config to disk in the default output folder.
|
||||
"""
|
||||
self.outputs_folder.mkdir(exist_ok=True, parents=True)
|
||||
dst = self.outputs_folder / ARGS_TXT
|
||||
dst.write_text(data=str(self))
|
||||
|
||||
def should_wait_for_other_cross_val_child_runs(self) -> bool:
|
||||
"""
|
||||
Returns True if the current run is an online run and is the 0th cross validation split.
|
||||
In this case, this will be the run that will wait for all other child runs to finish in order
|
||||
to aggregate their results.
|
||||
:return:
|
||||
"""
|
||||
return (not self.is_offline_run) and self.cross_validation_split_index == 0
|
||||
|
||||
@property
|
||||
def is_offline_run(self) -> bool:
|
||||
"""
|
||||
Returns True if the run is executing outside AzureML, or False if inside AzureML.
|
||||
"""
|
||||
return is_offline_run_context(RUN_CONTEXT)
|
||||
|
||||
@property
|
||||
def compute_mean_teacher_model(self) -> bool:
|
||||
"""
|
||||
|
@ -654,7 +662,7 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
|
|||
# Avoid callable params, the bindings that are printed out can be humongous.
|
||||
# Avoid dataframes
|
||||
skip_params = {name for name, value in self.param.params().items()
|
||||
if isinstance(value, (param.Callable, DataFrame))}
|
||||
if isinstance(value, (param.Callable, param.DataFrame))}
|
||||
for key, value in self.param.get_param_values():
|
||||
if key not in skip_params:
|
||||
arguments_str += f"\t{key:40}: {value}\n"
|
||||
|
@ -677,7 +685,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
|
|||
See https://pytorch.org/tutorials/beginner/saving_loading_models.html#warmstarting-model-using-parameters
|
||||
-from-a-different-model
|
||||
for an explanation on why strict=False is useful when loading parameters from other models.
|
||||
|
||||
:param path_to_checkpoint: Path to the checkpoint file.
|
||||
:return: Dictionary with model and optimizer state dicts. The dict should have at least the following keys:
|
||||
1. Key ModelAndInfo.MODEL_STATE_DICT_KEY and value set to the model state dict.
|
||||
|
@ -685,7 +692,15 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
|
|||
Other (optional) entries corresponding to keys ModelAndInfo.OPTIMIZER_STATE_DICT_KEY and
|
||||
ModelAndInfo.MEAN_TEACHER_STATE_DICT_KEY are also supported.
|
||||
"""
|
||||
import torch
|
||||
map_location = None if self.use_gpu else 'cpu'
|
||||
checkpoint = torch.load(str(path_to_checkpoint), map_location=map_location)
|
||||
return checkpoint
|
||||
return load_checkpoint(path_to_checkpoint=path_to_checkpoint, use_gpu=self.use_gpu)
|
||||
|
||||
|
||||
def load_checkpoint(path_to_checkpoint: Path, use_gpu: bool = True) -> Dict[str, Any]:
|
||||
"""
|
||||
Loads a Torch checkpoint from the given file. If use_gpu==False, map all parameters to the CPU;
otherwise leave the device of all parameters unchanged.
|
||||
"""
|
||||
import torch
|
||||
map_location = None if use_gpu else 'cpu'
|
||||
checkpoint = torch.load(str(path_to_checkpoint), map_location=map_location)
|
||||
return checkpoint
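# Illustrative sketch only, not part of this change: loading a checkpoint onto the CPU and inspecting the stored
# keys. The checkpoint path is hypothetical, and pathlib.Path is assumed to be imported in this module.
def _example_load_checkpoint_on_cpu() -> None:
    state = load_checkpoint(Path("outputs/checkpoints/recovery.ckpt"), use_gpu=False)
    print(sorted(state.keys()))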
@ -4,20 +4,25 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
import logging
|
||||
import numbers
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import param
|
||||
import torch
|
||||
from pytorch_lightning import LightningDataModule, LightningModule
|
||||
from pytorch_lightning.utilities import rank_zero_only
|
||||
from torch.optim import Optimizer
|
||||
from torch.optim.lr_scheduler import _LRScheduler
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.utils.data import DataLoader, Dataset
|
||||
|
||||
from InnerEye.Common.common_util import EPOCH_METRICS_FILE_NAME
|
||||
from InnerEye.Common.common_util import EPOCH_METRICS_FILE_NAME, logging_section
|
||||
from InnerEye.Common.metrics_constants import LoggingColumns, MetricType, TRAIN_PREFIX, VALIDATION_PREFIX
|
||||
from InnerEye.Common.type_annotations import DictStrFloat
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
from InnerEye.ML.deep_learning_config import DeepLearningConfig
|
||||
from InnerEye.ML.config import SegmentationModelBase
|
||||
from InnerEye.ML.deep_learning_config import DatasetParams, DeepLearningConfig, WorkflowParams, OutputParams, \
|
||||
TrainerParams
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
from InnerEye.ML.lightning_loggers import StoringLogger
|
||||
from InnerEye.ML.metrics import EpochTimers, MAX_ITEM_LOAD_TIME_SEC, store_epoch_metrics
|
||||
from InnerEye.ML.metrics_dict import DataframeLogger
|
||||
|
@ -25,20 +30,43 @@ from InnerEye.ML.model_config_base import ModelConfigBase
|
|||
from InnerEye.ML.utils import model_util
|
||||
from InnerEye.ML.utils.device_aware_module import DeviceAwareModule
|
||||
from InnerEye.ML.utils.lr_scheduler import SchedulerWithWarmUp
|
||||
from InnerEye.ML.utils.ml_util import RandomStateSnapshot, set_random_seed
|
||||
from InnerEye.ML.utils.ml_util import RandomStateSnapshot, set_random_seed, validate_dataset_paths
|
||||
from InnerEye.ML.utils.model_util import generate_and_print_model_summary
|
||||
from InnerEye.ML.visualizers.patch_sampling import visualize_random_crops_for_dataset
|
||||
|
||||
|
||||
class TrainingAndValidationDataLightning(LightningDataModule):
|
||||
class TrainAndValDataLightning(LightningDataModule):
|
||||
"""
|
||||
A class that wraps training and validation data from an InnerEye model configuration to a Lightning data module.
|
||||
When doing inference on the trained models, we use InferenceDataLightning. This is particularly important for
segmentation models, where training and validation happen on equal-sized patches, but inference runs on
images of arbitrary size.
|
||||
"""
|
||||
|
||||
def _init__(self, config: ModelConfigBase) -> None:
|
||||
def __init__(self, config: ModelConfigBase) -> None:
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.data_loaders: Dict[ModelExecutionMode, DataLoader] = {}
|
||||
|
||||
def prepare_data(self, *args: Any, **kwargs: Any) -> None:
|
||||
"""
|
||||
Writes the dataset files for later use in cross validation analysis. This is only executed once per
|
||||
distributed training run.
|
||||
"""
|
||||
# Save the dataset files for later use in cross validation analysis
|
||||
self.config.write_dataset_files()
|
||||
|
||||
def setup(self, stage: Optional[str] = None) -> None:
|
||||
"""
|
||||
Checks if the dataset folder is present, and the dataset file exists. This is executed on each node in
distributed training.
|
||||
"""
|
||||
# Check for existing dataset.csv file in the correct locations. Skip that if a dataset has already been
|
||||
# loaded (typically only during tests)
|
||||
if self.config.dataset_data_frame is None:
|
||||
assert self.config.local_dataset is not None
|
||||
validate_dataset_paths(self.config.local_dataset, self.config.dataset_csv)
|
||||
self.config.read_dataset_if_needed()
|
||||
self.data_loaders = self.config.create_data_loaders()
|
||||
|
||||
def train_dataloader(self) -> DataLoader: # type: ignore
|
||||
|
@ -48,7 +76,91 @@ class TrainingAndValidationDataLightning(LightningDataModule):
|
|||
return self.data_loaders[ModelExecutionMode.VAL]
|
||||
|
||||
def test_dataloader(self) -> DataLoader: # type: ignore
|
||||
raise NotImplementedError("For segmentation models, the test dataset should not be evaluated patch-wise.")
|
||||
raise NotImplementedError("There is no test dataset stored here, because this object is only meant to be "
|
||||
"used for training and validation.")
|
||||
|
||||
|
||||
class InferenceDataLightning(LightningDataModule):
|
||||
"""
|
||||
A class that wraps data for running model inference on InnerEye models, as a Lightning data module.
|
||||
Note that training and validation data is handled by TrainAndValDataLightning.
|
||||
"""
|
||||
|
||||
def __init__(self, config: ModelConfigBase) -> None:
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self.train_data: Dataset = Dataset()
|
||||
self.val_data: Dataset = Dataset()
|
||||
self.test_data: Dataset = Dataset()
|
||||
|
||||
def setup(self, stage: Optional[str] = None) -> None:
|
||||
"""
|
||||
Initializes the datasets stored in the present object, by calling the config object to
|
||||
prepare the torch Dataset objects for train/val/test.
|
||||
"""
|
||||
self.train_data = self.config.get_torch_dataset_for_inference(ModelExecutionMode.TRAIN)
|
||||
self.val_data = self.config.get_torch_dataset_for_inference(ModelExecutionMode.VAL)
|
||||
self.test_data = self.config.get_torch_dataset_for_inference(ModelExecutionMode.TEST)
|
||||
|
||||
def train_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
|
||||
return DataLoader(self.train_data)
|
||||
|
||||
def val_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
|
||||
return DataLoader(self.val_data)
|
||||
|
||||
def test_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
|
||||
return DataLoader(self.test_data)
|
||||
|
||||
def prepare_data(self, *args: Any, **kwargs: Any) -> None:
|
||||
pass
|
||||
|
||||
|
||||
class InnerEyeContainer(LightningContainer):
|
||||
"""
|
||||
A container that wraps the creation of Lightning datasets for the built-in InnerEye models.
|
||||
"""
|
||||
|
||||
def __init__(self, config: ModelConfigBase):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
self._model_name = config.model_name
|
||||
# Fields like cross validation index are defined at container level, but the InnerEye models define them
|
||||
# at model level. Copy everything over.
|
||||
for type_to_copy in [WorkflowParams, DatasetParams, TrainerParams, OutputParams]:
|
||||
assert issubclass(type_to_copy, param.Parameterized)
|
||||
self.apply_overrides({p: getattr(config, p) for p in type_to_copy.params()}, # type: ignore
|
||||
should_validate=False)
|
||||
|
||||
def setup(self) -> None:
|
||||
"""
|
||||
This hook reads the dataset file, and possibly sets required pre-processing objects, like one-hot encoder
|
||||
for categorical features, that need to be available before creating the model.
|
||||
"""
|
||||
self.config.read_dataset_if_needed()
|
||||
|
||||
def create_model(self) -> LightningModule: # type: ignore
|
||||
from InnerEye.ML.lightning_models import create_lightning_model
|
||||
return create_lightning_model(self.config)
|
||||
|
||||
def get_data_module(self) -> LightningDataModule:
|
||||
return TrainAndValDataLightning(self.config) # type: ignore
|
||||
|
||||
def get_inference_data_module(self) -> LightningDataModule:
|
||||
return InferenceDataLightning(self.config) # type: ignore
|
||||
|
||||
def before_training_on_rank_zero(self) -> None:
|
||||
# Save the dataset files for later use in cross validation analysis
|
||||
self.config.write_dataset_files()
|
||||
if isinstance(self.config, SegmentationModelBase):
|
||||
with logging_section("Visualizing the effect of sampling random crops for training"):
|
||||
visualize_random_crops_for_dataset(self.config)
|
||||
|
||||
# Print out a detailed breakdown of layers, memory consumption and time.
|
||||
assert isinstance(self.model, InnerEyeLightning)
|
||||
generate_and_print_model_summary(self.config, self.model.model)
|
||||
|
||||
def load_checkpoint_and_modify(self, path_to_checkpoint: Path) -> Dict[str, Any]:
|
||||
return self.config.load_checkpoint_and_modify(path_to_checkpoint=path_to_checkpoint)
|
||||
|
||||
|
||||
class InnerEyeLightning(LightningModule):
|
||||
|
@ -61,6 +173,7 @@ class InnerEyeLightning(LightningModule):
|
|||
def __init__(self, config: DeepLearningConfig, *args: Any, **kwargs: Any) -> None:
|
||||
super().__init__(*args, **kwargs)
|
||||
self.outputs_folder = config.outputs_folder
|
||||
self.checkpoint_folder = config.checkpoint_folder
|
||||
self.model: DeviceAwareModule = DeviceAwareModule()
|
||||
# These two will be set later in set_optimizer_and_scheduler.
|
||||
# The ddp_spawn accelerator only works if the model configuration object is
|
||||
|
@ -85,20 +198,17 @@ class InnerEyeLightning(LightningModule):
|
|||
fixed_columns=fixed_logger_columns)
|
||||
self.val_epoch_metrics_logger = DataframeLogger(self.val_metrics_folder / EPOCH_METRICS_FILE_NAME,
|
||||
fixed_columns=fixed_logger_columns)
|
||||
# Fields to store diagnostics for unit testing
|
||||
self.train_diagnostics: List[Any] = []
|
||||
self.val_diagnostics: List[Any] = []
|
||||
# Stores information the checkpoint that created this model, if any.
|
||||
self.checkpoint_loading_message = ""
|
||||
|
||||
def set_optimizer_and_scheduler(self, config: DeepLearningConfig) -> None:
|
||||
self.optimizer = model_util.create_optimizer(config, self.model.parameters())
|
||||
self.l_rate_scheduler = SchedulerWithWarmUp(config, self.optimizer)
|
||||
self.l_rate_scheduler = SchedulerWithWarmUp(config, self.optimizer, num_epochs=config.num_epochs)
|
||||
|
||||
def configure_optimizers(self) -> Tuple[List[Optimizer], List[_LRScheduler]]:
|
||||
return [self.optimizer], [self.l_rate_scheduler] # type: ignore
|
||||
|
||||
def close_all_loggers(self) -> None:
|
||||
def on_fit_end(self) -> None:
|
||||
"""
|
||||
Flushes all logger objects that the present object holds.
|
||||
"""
|
@ -0,0 +1,292 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import abc
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Iterator, List, Optional, Tuple
|
||||
|
||||
import param
|
||||
import torch
|
||||
from pytorch_lightning import LightningDataModule, LightningModule
|
||||
from torch.optim import Optimizer
|
||||
from torch.optim.lr_scheduler import _LRScheduler
|
||||
|
||||
from InnerEye.Common.generic_parsing import GenericConfig, create_from_matching_params
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
from InnerEye.ML.deep_learning_config import DatasetParams, OptimizerParams, OutputParams, TrainerParams, \
|
||||
WorkflowParams, load_checkpoint
|
||||
from InnerEye.ML.utils import model_util
|
||||
from InnerEye.ML.utils.lr_scheduler import SchedulerWithWarmUp
|
||||
|
||||
|
||||
class InnerEyeInference(abc.ABC):
|
||||
"""
|
||||
A base class that defines the methods that need to be present for doing inference on a trained model. This
|
||||
form of inference is slightly different from what PyTorch Lightning does in its `Trainer.test` method. In
|
||||
particular, this inference can be executed on any of the training, validation, or test sets.
|
||||
|
||||
The inference code calls the methods in this order:
|
||||
|
||||
model.on_inference_start()
|
||||
for dataset_split in [Train, Val, Test]
|
||||
model.on_inference_epoch_start(dataset_split, is_ensemble_model=False)
|
||||
for batch_idx, item in enumerate(dataloader[dataset_split])):
|
||||
model_outputs = model.forward(item)
|
||||
model.inference_step(item, batch_idx, model_outputs)
|
||||
model.on_inference_epoch_end()
|
||||
model.on_inference_end()
|
||||
"""
|
||||
|
||||
def on_inference_start(self) -> None:
|
||||
"""
|
||||
Runs initialization for everything that inference might require. This can initialize
|
||||
output files, set up metric computation, etc. This is run only once.
|
||||
"""
|
||||
pass
|
||||
|
||||
def on_inference_epoch_start(self, dataset_split: ModelExecutionMode, is_ensemble_model: bool) -> None:
|
||||
"""
|
||||
Runs initialization for inference, when starting inference on a new dataset split (train/val/test).
|
||||
Depending on the settings, this can be called anywhere between 0 (no inference at all) and 3 times (inference
on all of the train/val/test splits).
|
||||
:param dataset_split: Indicates whether the item comes from the training, validation or test set.
|
||||
:param is_ensemble_model: If False, the model_outputs come from an individual model. If True, the model
|
||||
outputs come from multiple models.
|
||||
"""
|
||||
pass
|
||||
|
||||
def inference_step(self, batch: Any, batch_idx: int, model_output: torch.Tensor) -> None:
|
||||
"""
|
||||
This hook is called when the model has finished making a prediction. It can write the results to a file,
|
||||
or compute metrics and store them.
|
||||
:param batch: The batch of data for which the model made a prediction.
|
||||
:param model_output: The model outputs. This would usually be a torch.Tensor, but can be any datatype.
|
||||
"""
|
||||
# We don't want abstract methods here, it avoids class creation for unit tests, and we also want this
|
||||
# method to be left optional (it should be possible to also use Lightning's native test_step method)
|
||||
raise NotImplementedError("Method on_inference_start must be overwritten in a derived class.")
|
||||
|
||||
def on_inference_epoch_end(self) -> None:
|
||||
"""
|
||||
Called when the inference on one of the dataset splits (train/val/test) has finished.
|
||||
Depending on the settings, this can be called anywhere between 0 (no inference at all) and 3 times (inference
on all of the train/val/test splits).
|
||||
"""
|
||||
pass
|
||||
|
||||
def on_inference_end(self) -> None:
|
||||
"""
|
||||
Called when all inference epochs have finished. This can write all metrics to disk, for example. This method
|
||||
is called exactly once.
|
||||
"""
|
||||
pass
|
||||
|
||||
def aggregate_ensemble_model_outputs(self, model_outputs: Iterator[torch.Tensor]) -> torch.Tensor:
|
||||
"""
|
||||
Aggregates the outputs of multiple models when using an ensemble model. In the default implementation,
|
||||
this averages the tensors coming from all the models.
|
||||
:param model_outputs: An iterator over the model outputs for all ensemble members.
|
||||
:return: The aggregate model outputs.
|
||||
"""
|
||||
aggregate_output: Optional[torch.Tensor] = None
|
||||
count = 0
|
||||
for m in model_outputs:
|
||||
count += 1
|
||||
if aggregate_output is None:
|
||||
aggregate_output = m
|
||||
else:
|
||||
aggregate_output += m
|
||||
if count == 0 or aggregate_output is None:
|
||||
raise ValueError("There were no results to aggregate.")
|
||||
aggregate_output /= count
|
||||
return aggregate_output
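# Illustrative sketch only, not part of this change: a hypothetical model that implements the InnerEyeInference
# hooks above. It collects one line per batch and writes a per-split file once the split has been processed.
# All names (DummyInferenceModel, the output file name) are assumptions made for this example.
class DummyInferenceModel(LightningModule, InnerEyeInference):
    def __init__(self) -> None:
        super().__init__()
        self.linear = torch.nn.Linear(1, 1)
        self._lines: List[str] = []
        self._split = ""

    def forward(self, item: torch.Tensor) -> torch.Tensor:  # type: ignore
        return self.linear(item)

    def on_inference_epoch_start(self, dataset_split: ModelExecutionMode, is_ensemble_model: bool) -> None:
        self._split = dataset_split.value
        self._lines = []

    def inference_step(self, batch: Any, batch_idx: int, model_output: torch.Tensor) -> None:
        self._lines.append(f"{self._split},{batch_idx},{model_output.mean().item()}")

    def on_inference_epoch_end(self) -> None:
        Path(f"inference_{self._split}.csv").write_text("\n".join(self._lines))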
|
||||
|
||||
|
||||
class LightningModuleWithOptimizer(LightningModule):
|
||||
"""
|
||||
A base class that supplies a method to configure optimizers and LR schedulers. To use this in your model,
|
||||
inherit from this class instead of from LightningModule.
|
||||
If this class is used, all configuration options for the optimizers and LR schedulers will also be available as
commandline arguments (for example, you can supply the InnerEye runner with "--l_rate=1e-2" to change the learning
rate).
|
||||
"""
|
||||
# These fields will be set by the LightningContainer when the model is created.
|
||||
_optimizer_params = OptimizerParams()
|
||||
_trainer_params = TrainerParams()
|
||||
|
||||
def configure_optimizers(self) -> Tuple[List[Optimizer], List[_LRScheduler]]:
|
||||
"""
|
||||
This is the default implementation of the method that provides the optimizer and LR scheduler for
|
||||
PyTorch Lightning. It reads out the optimizer and scheduler settings from the model fields,
|
||||
and creates the two objects.
|
||||
Override this method for full flexibility to define any optimizer and scheduler.
|
||||
:return: A tuple of (list of optimizers, list of LR schedulers).
|
||||
"""
|
||||
optimizer = model_util.create_optimizer(self._optimizer_params, self.parameters())
|
||||
l_rate_scheduler = SchedulerWithWarmUp(self._optimizer_params, optimizer,
|
||||
num_epochs=self._trainer_params.num_epochs)
|
||||
return [optimizer], [l_rate_scheduler]
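# Illustrative sketch only, not part of this change: a hypothetical single-layer regression model that inherits
# the optimizer/scheduler wiring above, so only forward and training_step need to be written. The batch layout
# (a dict with "x" and "y" tensors) is an assumption made for this example.
class ToyRegression(LightningModuleWithOptimizer):
    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(in_features=1, out_features=1, bias=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore
        return self.layer(x)

    def training_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor:  # type: ignore
        loss = torch.nn.functional.mse_loss(self.forward(batch["x"]), batch["y"])
        self.log("loss", loss, on_epoch=True, on_step=False)
        return loss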
|
||||
|
||||
|
||||
class LightningContainer(GenericConfig,
|
||||
WorkflowParams,
|
||||
DatasetParams,
|
||||
OutputParams,
|
||||
TrainerParams,
|
||||
OptimizerParams):
|
||||
"""
|
||||
A LightningContainer contains all information to train a user-specified PyTorch Lightning model. The model that
|
||||
should be trained is returned by the `create_model` method. The training data must be returned in the form of
|
||||
a LightningDataModule, by the `get_data_module` method.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self._model: Optional[LightningModule] = None
|
||||
self._model_name = type(self).__name__
|
||||
|
||||
def validate(self) -> None:
|
||||
WorkflowParams.validate(self)
|
||||
OptimizerParams.validate(self)
|
||||
|
||||
def setup(self) -> None:
|
||||
"""
|
||||
This method is called as one of the first operations of the training/testing workflow, before any other
|
||||
operations on the present object. At the point when called, the dataset is already available in
|
||||
the location given by self.local_dataset. Use this method to prepare datasets or data loaders, for example.
|
||||
"""
|
||||
pass
|
||||
|
||||
def create_model(self) -> LightningModule:
|
||||
"""
|
||||
This method must create the actual Lightning model that will be trained. It can read out parameters from the
|
||||
container and pass them into the model, for example.
|
||||
"""
|
||||
pass
|
||||
|
||||
def get_data_module(self) -> LightningDataModule:
|
||||
"""
|
||||
Gets the data that is used for the training, validation, and test steps.
|
||||
This should read a dataset from the self.local_dataset folder or download from a web location.
|
||||
The format of the data is not specified any further.
|
||||
The method must take cross validation into account, and ensure that the logic to create training and validation
sets correctly handles cross validation with the given number of splits.
|
||||
:return: A LightningDataModule
|
||||
"""
|
||||
return None # type: ignore
|
||||
|
||||
def get_inference_data_module(self) -> LightningDataModule:
|
||||
"""
|
||||
Gets the data that is used to evaluate the trained model. By default, this returns the value
|
||||
of get_data_module(), but you can override this to get for example full image datasets for
|
||||
segmentation models.
|
||||
This should read a dataset from the self.local_dataset folder or download from a web location.
|
||||
The format of the data is not specified any further.
|
||||
The method must take cross validation into account, and ensure that the logic to create training and validation
sets correctly handles cross validation with the given number of splits.
|
||||
:return: A LightningDataModule
|
||||
"""
|
||||
# You can override this if inference uses different data, for example segmentation models use
|
||||
# full images rather than equal sized crops.
|
||||
return self.get_data_module()
|
||||
|
||||
def get_trainer_arguments(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Gets additional parameters that will be passed on to the PyTorch Lightning trainer.
|
||||
"""
|
||||
return dict()
|
||||
|
||||
def create_report(self) -> None:
|
||||
"""
|
||||
This method is called after training and testing has been completed. It can aggregate all files that were
|
||||
written during training and testing, and compile them into some helpful overarching output.
|
||||
The report should be written to self.
|
||||
"""
|
||||
pass
|
||||
|
||||
def before_training_on_rank_zero(self) -> None:
|
||||
"""
|
||||
A hook that will be called before starting model training, before creating the Lightning Trainer object.
|
||||
In distributed training, this is only run on rank zero. It is executed after the before_training_on_all_ranks
|
||||
hook.
|
||||
"""
|
||||
pass
|
||||
|
||||
def before_training_on_all_ranks(self) -> None:
|
||||
"""
|
||||
A hook that will be called before starting model training.
|
||||
In distributed training, this hook will be called on all ranks. It is executed before the
before_training_on_rank_zero hook.
|
||||
"""
|
||||
pass
|
||||
|
||||
def load_checkpoint_and_modify(self, path_to_checkpoint: Path) -> Dict[str, Any]:
|
||||
"""
|
||||
This method is called when a file with weights for network initialization is supplied at container level,
|
||||
in the self.weights_url or self.local_weights_path fields. It can load that file as a Torch checkpoint,
|
||||
and rename parameters.
|
||||
|
||||
By default, uses torch.load to read and return the state dict from the checkpoint file, and does no modification
|
||||
of the checkpoint file.
|
||||
|
||||
Overloading this function:
|
||||
When weights_url or local_weights_path is set, the file downloaded may not be in the exact
|
||||
format expected by the model's load_state_dict() - for example, pretrained Imagenet weights for networks
|
||||
may have mismatched layer names in different implementations.
|
||||
In such cases, you can overload this function to extract the state dict from the checkpoint.
|
||||
|
||||
NOTE: The model checkpoint will be loaded using the torch function load_state_dict() with argument strict=False,
|
||||
so extra care needs to be taken to check that the state dict is valid.
|
||||
Check the logs for warnings related to missing and unexpected keys.
|
||||
See https://pytorch.org/tutorials/beginner/saving_loading_models.html#warmstarting-model-using-parameters
|
||||
-from-a-different-model
|
||||
for an explanation on why strict=False is useful when loading parameters from other models.
|
||||
:param path_to_checkpoint: Path to the checkpoint file.
|
||||
:return: Dictionary with model and optimizer state dicts. The dict should have at least the following keys:
|
||||
1. Key ModelAndInfo.MODEL_STATE_DICT_KEY and value set to the model state dict.
|
||||
2. Key ModelAndInfo.EPOCH_KEY and value set to the checkpoint epoch.
|
||||
Other (optional) entries corresponding to keys ModelAndInfo.OPTIMIZER_STATE_DICT_KEY and
|
||||
ModelAndInfo.MEAN_TEACHER_STATE_DICT_KEY are also supported.
|
||||
"""
|
||||
return load_checkpoint(path_to_checkpoint=path_to_checkpoint, use_gpu=self.use_gpu)
|
||||
|
||||
# The code from here on does not need to be modified.
|
||||
|
||||
@property
|
||||
def model(self) -> LightningModule:
|
||||
"""
|
||||
Returns the PyTorch Lightning module that the present container object manages.
|
||||
:return: A PyTorch Lightning module
|
||||
"""
|
||||
if self._model is None:
|
||||
raise ValueError("No Lightning module has been set yet.")
|
||||
return self._model
|
||||
|
||||
def create_lightning_module_and_store(self) -> None:
|
||||
"""
|
||||
Creates the Lightning model by calling `create_model` and stores it in the `model`
property.
|
||||
"""
|
||||
self._model = self.create_model()
|
||||
if isinstance(self._model, LightningModuleWithOptimizer):
|
||||
self._model._optimizer_params = create_from_matching_params(self, OptimizerParams)
|
||||
self._model._trainer_params = create_from_matching_params(self, TrainerParams)
|
||||
|
||||
def __str__(self) -> str:
|
||||
"""Returns a string describing the present object, as a list of key: value strings."""
|
||||
arguments_str = "\nContainer:\n"
|
||||
# Avoid callable params, the bindings that are printed out can be humongous.
|
||||
# Avoid dataframes
|
||||
skip_params = {name for name, value in self.param.params().items()
|
||||
if isinstance(value, (param.Callable, param.DataFrame))}
|
||||
for key, value in self.param.get_param_values():
|
||||
if key not in skip_params:
|
||||
arguments_str += f"\t{key:40}: {value}\n"
|
||||
# Print out all other separate vars that are not under the guidance of the params library,
|
||||
# skipping the two that are introduced by params
|
||||
skip_vars = {"param", "initialized"}
|
||||
for key, value in vars(self).items():
|
||||
if key not in skip_vars and key[0] != "_":
|
||||
arguments_str += f"\t{key:40}: {value}\n"
|
||||
return arguments_str
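# Illustrative sketch only, not part of this change: a minimal bring-your-own-model container. ToyRegression is
# the sketch shown further up, ToyDataModule is a hypothetical LightningDataModule, and the dataset id is a
# placeholder that would have to exist in the AzureML datastore when running in the cloud.
class ToyContainer(LightningContainer):
    def __init__(self) -> None:
        super().__init__()
        self.azure_dataset_id = "toy_dataset"
        self.num_epochs = 20

    def create_model(self) -> LightningModule:
        return ToyRegression()

    def get_data_module(self) -> LightningDataModule:
        # self.local_dataset points to the downloaded dataset folder when the run executes in AzureML.
        return ToyDataModule(root_folder=self.local_dataset)  # type: ignore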
|
|
@ -8,31 +8,11 @@ from pathlib import Path
|
|||
import torch
|
||||
|
||||
from InnerEye.ML.lightning_base import InnerEyeLightning
|
||||
from InnerEye.ML.lightning_models import ScalarLightning, SegmentationLightning
|
||||
from InnerEye.ML.lightning_models import create_lightning_model
|
||||
from InnerEye.ML.model_config_base import ModelConfigBase
|
||||
from InnerEye.ML.models.architectures.base_model import BaseSegmentationModel
|
||||
|
||||
|
||||
def create_lightning_model(config: ModelConfigBase, set_optimizer_and_scheduler: bool = True) -> InnerEyeLightning:
|
||||
"""
|
||||
Creates a PyTorch Lightning model that matches the provided InnerEye model configuration object.
|
||||
The `optimizer` and `l_rate_scheduler` object of the Lightning model will also be populated.
|
||||
:param set_optimizer_and_scheduler: If True (default), initialize the optimizer and LR scheduler of the model.
|
||||
If False, skip that step (this is only meant to be used for unit tests.)
|
||||
:param config: An InnerEye model configuration object
|
||||
:return: A PyTorch Lightning model object.
|
||||
"""
|
||||
if config.is_segmentation_model:
|
||||
model: InnerEyeLightning = SegmentationLightning(config)
|
||||
elif config.is_scalar_model:
|
||||
model = ScalarLightning(config)
|
||||
else:
|
||||
raise NotImplementedError(f"Don't know how to handle config of type {type(config)}")
|
||||
if set_optimizer_and_scheduler:
|
||||
model.set_optimizer_and_scheduler(config)
|
||||
return model
|
||||
|
||||
|
||||
def load_from_lightning_checkpoint(config: ModelConfigBase, checkpoint_path: Path) -> InnerEyeLightning:
|
||||
"""
|
||||
Reads a PyTorch model from a checkpoint. First, a PyTorch Lightning model is created matching the InnerEye
|
@ -2,12 +2,13 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
from typing import Any, Dict, Iterable, Optional
|
||||
from typing import Any, Dict, Iterable, List, Optional
|
||||
|
||||
from pytorch_lightning.loggers import LightningLoggerBase
|
||||
from pytorch_lightning.utilities import rank_zero_only
|
||||
|
||||
from InnerEye.Azure.azure_util import RUN_CONTEXT, is_offline_run_context
|
||||
from InnerEye.Common.metrics_constants import TRAIN_PREFIX, VALIDATION_PREFIX
|
||||
from InnerEye.Common.type_annotations import DictStrFloat
|
||||
|
||||
|
||||
|
@ -21,6 +22,9 @@ class StoringLogger(LightningLoggerBase):
|
|||
super().__init__()
|
||||
self.results: Dict[int, DictStrFloat] = {}
|
||||
self.hyperparams: Any = None
|
||||
# Fields to store diagnostics for unit testing
|
||||
self.train_diagnostics: List[Any] = []
|
||||
self.val_diagnostics: List[Any] = []
|
||||
|
||||
@rank_zero_only
|
||||
def log_metrics(self, metrics: DictStrFloat, step: Optional[int] = None) -> None:
|
||||
|
@ -44,7 +48,7 @@ class StoringLogger(LightningLoggerBase):
|
|||
self.hyperparams = params
|
||||
|
||||
def experiment(self) -> Any:
|
||||
return ""
|
||||
return None
|
||||
|
||||
def name(self) -> Any:
|
||||
return ""
|
||||
|
@ -93,6 +97,48 @@ class StoringLogger(LightningLoggerBase):
|
|||
"""
|
||||
return {epoch: self.extract_by_prefix(epoch, prefix_filter) for epoch in self.epochs}
|
||||
|
||||
def get_metric(self, is_training: bool, metric_type: str) -> List[float]:
|
||||
"""
|
||||
Gets a scalar metric out of either the list of training or the list of validation results. This returns
|
||||
the value that a specific metric attains in all of the epochs.
|
||||
:param is_training: If True, read metrics that have a "train/" prefix, otherwise those that have a "val/"
|
||||
prefix.
|
||||
:param metric_type: The metric to extract.
|
||||
:return: A list of floating point numbers, with one entry per entry in the training or validation results.
|
||||
"""
|
||||
full_metric_name = (TRAIN_PREFIX if is_training else VALIDATION_PREFIX) + metric_type
|
||||
return [self.results[epoch][full_metric_name] for epoch in self.epochs]
|
||||
|
||||
def get_train_metric(self, metric_type: str) -> List[float]:
|
||||
"""
|
||||
Gets a scalar metric from the list of training results. This returns
|
||||
the value that a specific metric attains in all of the epochs.
|
||||
:param metric_type: The metric to extract.
|
||||
:return: A list of floating point numbers, with one entry per entry in the training results.
|
||||
"""
|
||||
return self.get_metric(is_training=True, metric_type=metric_type)
|
||||
|
||||
def get_val_metric(self, metric_type: str) -> List[float]:
|
||||
"""
|
||||
Gets a scalar metric from the list of validation results. This returns
|
||||
the value that a specific metric attains in all of the epochs.
|
||||
:param metric_type: The metric to extract.
|
||||
:return: A list of floating point numbers, with one entry per entry in the validation results.
|
||||
"""
|
||||
return self.get_metric(is_training=False, metric_type=metric_type)
|
||||
|
||||
def train_results_per_epoch(self) -> List[DictStrFloat]:
|
||||
"""
|
||||
Gets the full set of training metrics that the logger stores, as a list of dictionaries per epoch.
|
||||
"""
|
||||
return list(self.to_metrics_dicts(prefix_filter=TRAIN_PREFIX).values())
|
||||
|
||||
def val_results_per_epoch(self) -> List[DictStrFloat]:
|
||||
"""
|
||||
Gets the full set of validation metrics that the logger stores, as a list of dictionaries per epoch.
|
||||
"""
|
||||
return list(self.to_metrics_dicts(prefix_filter=VALIDATION_PREFIX).values())
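# Illustrative sketch only, not part of this change: reading per-epoch loss curves out of a StoringLogger after
# training has finished. The metric name "loss" is an assumption; any metric logged with a train/ or val/ prefix
# can be retrieved this way.
def _example_read_loss_curves(storing_logger: StoringLogger) -> Dict[str, List[float]]:
    return {"train": storing_logger.get_train_metric("loss"),
            "val": storing_logger.get_val_metric("loss")}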
|
||||
|
||||
|
||||
class AzureMLLogger(LightningLoggerBase):
|
||||
"""
|
||||
|
@ -115,7 +161,7 @@ class AzureMLLogger(LightningLoggerBase):
|
|||
pass
|
||||
|
||||
def experiment(self) -> Any:
|
||||
return ""
|
||||
return None
|
||||
|
||||
def name(self) -> Any:
|
||||
return ""
|
||||
|
|
|
@ -21,6 +21,7 @@ from InnerEye.ML.lightning_metrics import Accuracy05, AccuracyAtOptimalThreshold
|
|||
OptimalThreshold, ScalarMetricsBase
|
||||
from InnerEye.ML.metrics import compute_dice_across_patches
|
||||
from InnerEye.ML.metrics_dict import DataframeLogger, MetricsDict, SequenceMetricsDict
|
||||
from InnerEye.ML.model_config_base import ModelConfigBase
|
||||
from InnerEye.ML.scalar_config import ScalarModelBase
|
||||
from InnerEye.ML.sequence_config import SequenceModelBase
|
||||
from InnerEye.ML.utils import image_util, metrics_util, model_util
|
||||
|
@ -129,9 +130,9 @@ class SegmentationLightning(InnerEyeLightning):
|
|||
if isinstance(center_indices, torch.Tensor):
|
||||
center_indices = center_indices.cpu().numpy()
|
||||
if is_training:
|
||||
self.train_diagnostics.append(center_indices)
|
||||
self.storing_logger.train_diagnostics.append(center_indices)
|
||||
else:
|
||||
self.val_diagnostics.append(center_indices)
|
||||
self.storing_logger.val_diagnostics.append(center_indices)
|
||||
# if self.train_val_params.in_training_mode:
|
||||
# # store the sample train patch from this epoch for visualization
|
||||
# if batch_index == self.example_to_save and self.config.store_dataset_sample:
|
||||
|
@ -380,3 +381,23 @@ def transfer_batch_to_device(batch: Any, device: torch.device) -> Any:
|
|||
return batch
|
||||
else:
|
||||
return move_data_to_device(batch, device)
|
||||
|
||||
|
||||
def create_lightning_model(config: ModelConfigBase, set_optimizer_and_scheduler: bool = True) -> InnerEyeLightning:
|
||||
"""
|
||||
Creates a PyTorch Lightning model that matches the provided InnerEye model configuration object.
|
||||
The `optimizer` and `l_rate_scheduler` object of the Lightning model will also be populated.
|
||||
:param set_optimizer_and_scheduler: If True (default), initialize the optimizer and LR scheduler of the model.
|
||||
If False, skip that step (this is only meant to be used for unit tests.)
|
||||
:param config: An InnerEye model configuration object
|
||||
:return: A PyTorch Lightning model object.
|
||||
"""
|
||||
if config.is_segmentation_model:
|
||||
model: InnerEyeLightning = SegmentationLightning(config)
|
||||
elif config.is_scalar_model:
|
||||
model = ScalarLightning(config)
|
||||
else:
|
||||
raise NotImplementedError(f"Don't know how to handle config of type {type(config)}")
|
||||
if set_optimizer_and_scheduler:
|
||||
model.set_optimizer_and_scheduler(config)
|
||||
return model
|
||||
|
|
|
@ -8,7 +8,7 @@ import subprocess
|
|||
import sys
|
||||
from pathlib import Path
|
||||
from time import sleep
|
||||
from typing import Optional, Tuple, TypeVar
|
||||
from typing import Any, Dict, Optional, Tuple, TypeVar
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
@ -18,27 +18,18 @@ from pytorch_lightning.loggers import TensorBoardLogger
|
|||
from pytorch_lightning.plugins import DDPPlugin
|
||||
from pytorch_lightning.utilities.exceptions import MisconfigurationException
|
||||
|
||||
from InnerEye.Azure.azure_util import RUN_CONTEXT
|
||||
from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME, logging_section
|
||||
from InnerEye.Common.metrics_constants import TRAIN_PREFIX, VALIDATION_PREFIX
|
||||
from InnerEye.Azure.azure_util import RUN_CONTEXT, is_offline_run_context
|
||||
from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME, change_working_directory
|
||||
from InnerEye.Common.resource_monitor import ResourceMonitor
|
||||
from InnerEye.ML.common import ModelExecutionMode, RECOVERY_CHECKPOINT_FILE_NAME, cleanup_checkpoint_folder
|
||||
from InnerEye.ML.config import SegmentationModelBase
|
||||
from InnerEye.ML.deep_learning_config import VISUALIZATION_FOLDER
|
||||
from InnerEye.ML.lightning_base import TrainingAndValidationDataLightning
|
||||
from InnerEye.ML.lightning_helpers import create_lightning_model
|
||||
from InnerEye.ML.deep_learning_config import ARGS_TXT, VISUALIZATION_FOLDER
|
||||
from InnerEye.ML.lightning_base import InnerEyeContainer, InnerEyeLightning
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
from InnerEye.ML.lightning_loggers import AzureMLLogger, StoringLogger
|
||||
from InnerEye.ML.lightning_models import SUBJECT_OUTPUT_PER_RANK_PREFIX, ScalarLightning, \
|
||||
get_subject_output_file_per_rank
|
||||
from InnerEye.ML.model_config_base import ModelConfigBase
|
||||
from InnerEye.ML.utils import ml_util
|
||||
from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
|
||||
from InnerEye.ML.utils.model_util import generate_and_print_model_summary
|
||||
from InnerEye.ML.utils.training_util import ModelTrainingResults
|
||||
from InnerEye.ML.visualizers.patch_sampling import visualize_random_crops_for_dataset
|
||||
|
||||
MAX_ITEM_LOAD_TIME_SEC = 0.5
|
||||
MAX_LOAD_TIME_WARNINGS = 3
|
||||
TEMP_PREFIX = "temp/"
|
||||
|
||||
T = TypeVar('T')
|
||||
|
@ -68,23 +59,38 @@ def upload_output_file_as_temp(file_path: Path, outputs_folder: Path) -> None:
|
|||
upload_name = TEMP_PREFIX + str(file_path.relative_to(outputs_folder))
|
||||
RUN_CONTEXT.upload_file(upload_name, path_or_stream=str(file_path))
|
||||
|
||||
def create_lightning_trainer(config: ModelConfigBase,
|
||||
|
||||
def write_args_file(config: Any, outputs_folder: Path) -> None:
|
||||
"""
|
||||
Writes the given config to disk in plain text in the given output folder, and logs it.
|
||||
"""
|
||||
output = str(config)
|
||||
outputs_folder.mkdir(exist_ok=True, parents=True)
|
||||
dst = outputs_folder / ARGS_TXT
|
||||
dst.write_text(output)
|
||||
logging.info(output)
|
||||
|
||||
|
||||
def create_lightning_trainer(container: LightningContainer,
|
||||
resume_from_checkpoint: Optional[Path] = None,
|
||||
num_nodes: int = 1) -> Tuple[Trainer, StoringLogger]:
|
||||
num_nodes: int = 1,
|
||||
**kwargs: Dict[str, Any]) -> \
|
||||
Tuple[Trainer, Optional[StoringLogger]]:
|
||||
"""
|
||||
Creates a Pytorch Lightning Trainer object for the given model configuration. It creates checkpoint handlers
|
||||
and loggers. That includes a diagnostic logger for use in unit tests, which is also returned as the second
return value.
|
||||
:param config: The model configuration.
|
||||
:param container: The container with model and data.
|
||||
:param resume_from_checkpoint: If provided, training resumes from this checkpoint point.
|
||||
:param num_nodes: The number of nodes to use in distributed training.
|
||||
:param kwargs: Any additional keyword arguments will be passed to the constructor of Trainer.
|
||||
:return: A tuple [Trainer object, diagnostic logger]
|
||||
"""
|
||||
# For now, stick with the legacy behaviour of always saving only the last epoch checkpoint. For large segmentation
|
||||
# models, this still appears to be the best way of choosing them because validation loss on the relatively small
|
||||
# training patches is not stable enough. Going by the validation loss somehow works for the Prostate model, but
|
||||
# not for the HeadAndNeck model.
|
||||
best_checkpoint_callback = ModelCheckpoint(dirpath=str(config.checkpoint_folder),
|
||||
best_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
|
||||
# filename=BEST_CHECKPOINT_FILE_NAME,
|
||||
# monitor=f"{VALIDATION_PREFIX}{MetricType.LOSS.value}",
|
||||
# save_top_k=1,
|
||||
|
@ -93,144 +99,141 @@ def create_lightning_trainer(config: ModelConfigBase,
|
|||
# Store 1 recovery checkpoint every recovery_checkpoint_save_interval epochs. Due to a bug in Lightning, this
|
||||
# will still write alternate files recovery.ckpt and recovery-v0.ckpt, which are cleaned up later in
|
||||
# cleanup_checkpoint_folder
|
||||
recovery_checkpoint_callback = ModelCheckpoint(dirpath=str(config.checkpoint_folder),
|
||||
recovery_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
|
||||
filename=RECOVERY_CHECKPOINT_FILE_NAME,
|
||||
period=config.recovery_checkpoint_save_interval
|
||||
period=container.recovery_checkpoint_save_interval
|
||||
)
|
||||
|
||||
num_gpus = torch.cuda.device_count() if config.use_gpu else 0
|
||||
num_gpus = torch.cuda.device_count() if container.use_gpu else 0
|
||||
logging.info(f"Number of available GPUs: {num_gpus}")
|
||||
if config.max_num_gpus >= 0 and config.max_num_gpus < num_gpus:
|
||||
num_gpus = config.max_num_gpus
|
||||
if 0 <= container.max_num_gpus < num_gpus:
|
||||
num_gpus = container.max_num_gpus
|
||||
logging.info(f"Restricting the number of GPUs to {num_gpus}")
|
||||
# Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of GPU memory).
|
||||
# For unit tests, only "ddp_spawn" works
|
||||
accelerator = "ddp" if num_gpus * num_nodes > 1 else None
|
||||
plugins = [InnerEyeDDPPlugin(num_nodes=num_nodes, sync_batchnorm=True)] if num_gpus * num_nodes > 1 else None
|
||||
logging.info(f"Using {num_gpus} GPUs with accelerator '{accelerator}'")
|
||||
storing_logger = StoringLogger()
|
||||
tensorboard_logger = TensorBoardLogger(save_dir=str(config.logs_folder), name="Lightning", version="")
|
||||
loggers = [storing_logger, tensorboard_logger, AzureMLLogger()]
|
||||
# This leads to problems with run termination.
|
||||
# if not is_offline_run_context(RUN_CONTEXT):
|
||||
# mlflow_logger = MLFlowLogger(experiment_name=RUN_CONTEXT.experiment.name,
|
||||
# tracking_uri=RUN_CONTEXT.experiment.workspace.get_mlflow_tracking_uri())
|
||||
# # The MLFlow logger needs to get its ID from the AzureML run context, otherwise there will be two sets of
|
||||
# # results for each run, one from native AzureML and one from the MLFlow logger.
|
||||
# mlflow_logger._run_id = RUN_CONTEXT.id
|
||||
# loggers.append(mlflow_logger)
|
||||
tensorboard_logger = TensorBoardLogger(save_dir=str(container.logs_folder), name="Lightning", version="")
|
||||
loggers = [tensorboard_logger, AzureMLLogger()]
|
||||
storing_logger: Optional[StoringLogger]
|
||||
if isinstance(container, InnerEyeContainer):
|
||||
storing_logger = StoringLogger()
|
||||
loggers.append(storing_logger)
|
||||
else:
|
||||
storing_logger = None
|
||||
# Use 32bit precision when running on CPU. Otherwise, make it depend on use_mixed_precision flag.
|
||||
precision = 32 if num_gpus == 0 else 16 if config.use_mixed_precision else 32
|
||||
precision = 32 if num_gpus == 0 else 16 if container.use_mixed_precision else 32
|
||||
# The next two flags control the settings in torch.backends.cudnn.deterministic and torch.backends.cudnn.benchmark
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html
|
||||
# For the classification models, we observed only a small performance deterioration (increase in 10sec on total
|
||||
# training time of 22min) when switching to deterministic.
|
||||
if config.pl_deterministic:
|
||||
if container.pl_deterministic:
|
||||
deterministic = True
|
||||
benchmark = False
|
||||
else:
|
||||
deterministic = False
|
||||
benchmark = True
|
||||
trainer = Trainer(default_root_dir=str(config.outputs_folder),
|
||||
# Read out additional model-specific args here.
|
||||
# We probably want to keep essential ones like num_gpus and logging.
|
||||
trainer = Trainer(default_root_dir=str(container.outputs_folder),
|
||||
deterministic=deterministic,
|
||||
benchmark=benchmark,
|
||||
accelerator=accelerator,
|
||||
max_epochs=config.num_epochs,
|
||||
num_sanity_val_steps=config.pl_num_sanity_val_steps,
|
||||
max_epochs=container.num_epochs,
|
||||
num_sanity_val_steps=container.pl_num_sanity_val_steps,
|
||||
callbacks=[best_checkpoint_callback, recovery_checkpoint_callback],
|
||||
logger=loggers,
|
||||
progress_bar_refresh_rate=0, # Disable the progress bar completely
|
||||
progress_bar_refresh_rate=container.pl_progress_bar_refresh_rate,
|
||||
num_nodes=num_nodes,
|
||||
gpus=num_gpus,
|
||||
precision=precision,
|
||||
sync_batchnorm=True,
|
||||
terminate_on_nan=config.detect_anomaly,
|
||||
terminate_on_nan=container.detect_anomaly,
|
||||
resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None,
|
||||
plugins=plugins
|
||||
)
|
||||
plugins=plugins,
|
||||
**kwargs)
|
||||
return trainer, storing_logger
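# Illustrative sketch only, not part of this change: whatever a container returns from get_trainer_arguments()
# arrives here as **kwargs and is forwarded verbatim to the Trainer constructor. limit_train_batches is a
# standard Lightning flag, used purely as an example.
class _ExampleContainer(LightningContainer):
    def get_trainer_arguments(self) -> Dict[str, Any]:
        return {"limit_train_batches": 10}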
|
||||
|
||||
|
||||
def model_train(config: ModelConfigBase,
|
||||
checkpoint_handler: CheckpointHandler,
|
||||
num_nodes: int = 1) -> ModelTrainingResults:
|
||||
def start_resource_monitor(config: LightningContainer) -> ResourceMonitor:
|
||||
# initialize and start GPU monitoring
|
||||
gpu_tensorboard = config.logs_folder / "gpu_utilization"
|
||||
# Result file in CSV format should NOT live in the logs folder, the streaming upload that is
|
||||
# used for this folder might corrupt the file.
|
||||
gpu_csv = config.outputs_folder / "gpu_utilization"
|
||||
gpu_csv.mkdir(parents=True, exist_ok=True)
|
||||
logging.info(f"Starting resource monitor. GPU utilization will be written to Tensorboard in "
|
||||
f"{gpu_tensorboard}, aggregate metrics to {gpu_csv}")
|
||||
resource_monitor = ResourceMonitor(interval_seconds=config.monitoring_interval_seconds,
|
||||
tensorboard_folder=gpu_tensorboard,
|
||||
csv_results_folder=gpu_csv)
|
||||
resource_monitor.start()
|
||||
return resource_monitor
|
||||
|
||||
|
||||
def model_train(checkpoint_handler: CheckpointHandler,
|
||||
container: LightningContainer,
|
||||
num_nodes: int = 1) -> Tuple[Trainer, Optional[StoringLogger]]:
|
||||
"""
|
||||
The main training loop. It creates the Pytorch model based on the configuration options passed in,
|
||||
creates a Pytorch Lightning trainer, and trains the model.
|
||||
If a checkpoint was specified, then it loads the checkpoint before resuming training.
|
||||
:param config: The arguments which specify all required information.
|
||||
:param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
|
||||
:param num_nodes: The number of nodes to use in distributed training.
|
||||
:param container: A container object that holds the training data in PyTorch Lightning format
|
||||
and the model to train.
|
||||
:return: A tuple of [Trainer, StoringLogger]. Trainer is the Lightning Trainer object that was used for fitting
|
||||
the model. The StoringLogger object is returned when training an InnerEye built-in model; it is None when
fitting other models.
|
||||
"""
|
||||
# Get the path to the checkpoint to recover from
|
||||
checkpoint_path = checkpoint_handler.get_recovery_path_train()
|
||||
# This reads the dataset file, and possibly sets required pre-processing objects, like one-hot encoder
|
||||
# for categorical features, that need to be available before creating the model.
|
||||
config.read_dataset_if_needed()
|
||||
lightning_model = container.model
|
||||
|
||||
container.before_training_on_all_ranks()
|
||||
resource_monitor: Optional[ResourceMonitor] = None
|
||||
# Execute some bookkeeping tasks only once if running distributed:
|
||||
if is_rank_zero():
|
||||
logging.info(f"Model checkpoints are saved at {container.checkpoint_folder}")
|
||||
container.before_training_on_rank_zero()
|
||||
write_args_file(container.config if isinstance(container, InnerEyeContainer) else container,
|
||||
outputs_folder=container.outputs_folder)
|
||||
if container.monitoring_interval_seconds > 0:
|
||||
resource_monitor = start_resource_monitor(container)
|
||||
|
||||
# Create the trainer object. Back up the environment variables before doing that, in case we need to run a second
# training in the unit tests.
|
||||
old_environ = dict(os.environ)
|
||||
seed_everything(config.get_effective_random_seed())
|
||||
trainer, storing_logger = create_lightning_trainer(config, checkpoint_path, num_nodes=num_nodes)
|
||||
|
||||

    # Set random seeds just before training. For segmentation models, we have
    # something that changes the random seed in the before_training_on_rank_zero hook.
    seed_everything(container.get_effective_random_seed())
    trainer, storing_logger = create_lightning_trainer(container,
                                                       checkpoint_path,
                                                       num_nodes=num_nodes,
                                                       **container.get_trainer_arguments())
    logging.info(f"GLOBAL_RANK: {os.getenv('GLOBAL_RANK')}, LOCAL_RANK {os.getenv('LOCAL_RANK')}. "
                 f"trainer.global_rank: {trainer.global_rank}")
    logging.debug("Creating the PyTorch model.")
    lightning_model = create_lightning_model(config)
    lightning_model.storing_logger = storing_logger
    # InnerEye models use this logger for diagnostics
    if isinstance(lightning_model, InnerEyeLightning):
        if storing_logger is None:
            raise ValueError("InnerEye models require the storing_logger for diagnostics")
        lightning_model.storing_logger = storing_logger

    resource_monitor = None
    # Execute some bookkeeping tasks only once if running distributed:
    if is_rank_zero():
        config.write_args_file()
        logging.info(str(config))
        # Save the dataset files for later use in cross validation analysis
        config.write_dataset_files()
        logging.info(f"Model checkpoints are saved at {config.checkpoint_folder}")

        # set the random seed for all libraries
        ml_util.set_random_seed(config.get_effective_random_seed(), "Patch visualization")
        # Visualize how patches are sampled for segmentation models. This changes the random generator, but we don't
        # want training to depend on how many patients we visualized, and hence set the random seed again right after.
        if isinstance(config, SegmentationModelBase):
            with logging_section("Visualizing the effect of sampling random crops for training"):
                visualize_random_crops_for_dataset(config)

        # Print out a detailed breakdown of layers, memory consumption and time.
        generate_and_print_model_summary(config, lightning_model.model)

        if config.monitoring_interval_seconds > 0:
            # initialize and start GPU monitoring
            gpu_tensorboard = config.logs_folder / "gpu_utilization"
            # Result file in CSV format should NOT live in the logs folder, the streaming upload that is
            # used for this folder might corrupt the file.
            gpu_csv = config.outputs_folder / "gpu_utilization"
            gpu_csv.mkdir(parents=True, exist_ok=True)
            logging.info(f"Starting resource monitor. GPU utilization will be written to Tensorboard in "
                         f"{gpu_tensorboard}, aggregate metrics to {gpu_csv}")
            resource_monitor = ResourceMonitor(interval_seconds=config.monitoring_interval_seconds,
                                               tensorboard_folder=gpu_tensorboard,
                                               csv_results_folder=gpu_csv)
            resource_monitor.start()

    # Training loop
    logging.info("Starting training")

    lightning_data = TrainingAndValidationDataLightning(config)  # type: ignore
    # When trying to store the config object in the constructor, it does not appear to get stored at all, later
    # references to the object simply fail. Hence, have to set explicitly here.
    lightning_data.config = config
    trainer.fit(lightning_model, datamodule=lightning_data)
    trainer.logger.close()  # type: ignore
    lightning_model.close_all_loggers()
    # When training models that are not built-in InnerEye models, we have no guarantee that they write
    # files to the right folder. Best guess is to change the current working directory to where files should go.
    data_module = container.get_data_module()
    with change_working_directory(container.outputs_folder):
        trainer.fit(lightning_model, datamodule=data_module)
        trainer.logger.close()  # type: ignore
    world_size = getattr(trainer, "world_size", 0)
    is_azureml_run = not config.is_offline_run
    is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
    # Per-subject model outputs for regression models are written per rank, and need to be aggregated here.
    # Each thread per rank will come here, and upload its files to the run outputs. Rank 0 will later download them.
    if is_azureml_run and world_size > 1 and isinstance(lightning_model, ScalarLightning):
        upload_output_file_as_temp(lightning_model.train_subject_outputs_logger.csv_path, config.outputs_folder)
        upload_output_file_as_temp(lightning_model.val_subject_outputs_logger.csv_path, config.outputs_folder)
        upload_output_file_as_temp(lightning_model.train_subject_outputs_logger.csv_path, container.outputs_folder)
        upload_output_file_as_temp(lightning_model.val_subject_outputs_logger.csv_path, container.outputs_folder)
    # DDP will start multiple instances of the runner, one for each GPU. Those should terminate here after training.
    # We can now use the global_rank of the Lightning model, rather than environment variables, because DDP has set
    # all necessary properties.
@@ -239,7 +242,7 @@ def model_train(config: ModelConfigBase,
        sys.exit()

    logging.info("Choosing the best checkpoint and removing redundant files.")
    cleanup_checkpoint_folder(config.checkpoint_folder)
    cleanup_checkpoint_folder(container.checkpoint_folder)
    # Lightning modifies a ton of environment variables. If we first run training and then the test suite,
    # those environment variables will mislead the training runs in the test suite, and make them crash.
    # Hence, restore the original environment after training.

@@ -254,17 +257,9 @@ def model_train(config: ModelConfigBase,
        for rank in range(world_size):
            for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]:
                file = mode.value + "/" + get_subject_output_file_per_rank(rank)
                RUN_CONTEXT.download_file(name=TEMP_PREFIX + file, output_file_path=config.outputs_folder / file)
                RUN_CONTEXT.download_file(name=TEMP_PREFIX + file, output_file_path=container.outputs_folder / file)
        # Concatenate all temporary files per execution mode
        aggregate_and_create_subject_metrics_file(config.outputs_folder)

    model_training_results = ModelTrainingResults(
        train_results_per_epoch=list(storing_logger.to_metrics_dicts(prefix_filter=TRAIN_PREFIX).values()),
        val_results_per_epoch=list(storing_logger.to_metrics_dicts(prefix_filter=VALIDATION_PREFIX).values()),
        train_diagnostics=lightning_model.train_diagnostics,
        val_diagnostics=lightning_model.val_diagnostics,
        optimal_temperature_scale_values_per_checkpoint_epoch=[]
    )
        aggregate_and_create_subject_metrics_file(container.outputs_folder)

    logging.info("Finished training")

@@ -272,20 +267,20 @@ def model_train(config: ModelConfigBase,
    # checkpoints correctly.
    checkpoint_handler.additional_training_done()

    # Upload visualization directory to AML run context to be able to see it
    # in the Azure UI.
    if config.max_batch_grad_cam > 0 and config.visualization_folder.exists():
        RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER, path=str(config.visualization_folder))
    # Upload visualization directory to AML run context to be able to see it in the Azure UI.
    if isinstance(container, InnerEyeContainer):
        if container.config.max_batch_grad_cam > 0 and container.visualization_folder.exists():
            RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER, path=str(container.visualization_folder))

    if resource_monitor:
        logging.info("Shutting down the resource monitor process.")
        if not config.is_offline_run:
        if is_azureml_run:
            for gpu_name, metrics_per_gpu in resource_monitor.read_aggregate_metrics().items():
                # Log as a table, with GPU being the first column
                RUN_CONTEXT.log_row("GPU utilization", GPU=gpu_name, **metrics_per_gpu)
        resource_monitor.kill()

    return model_training_results
    return trainer, storing_logger
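To make the new container-based code path above concrete, here is a minimal, self-contained sketch (not part of this commit) of the kind of Lightning model and data module it is designed to fit. The hook names create_model and get_data_module on the container are assumptions based on the calls visible in this diff, and TinyRegression / TinyDataModule / TinyContainer are invented for illustration only.

# Illustrative sketch only; class and hook names are placeholders, not code from this commit.
import torch
from pytorch_lightning import LightningDataModule, LightningModule
from torch.utils.data import DataLoader, TensorDataset

from InnerEye.ML.lightning_container import LightningContainer


class TinyRegression(LightningModule):
    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(1, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layer(x)

    def training_step(self, batch, batch_idx):  # type: ignore
        x, y = batch
        return torch.nn.functional.mse_loss(self(x), y)

    def configure_optimizers(self):  # type: ignore
        return torch.optim.Adam(self.parameters(), lr=1e-2)


class TinyDataModule(LightningDataModule):
    def train_dataloader(self) -> DataLoader:
        x = torch.rand(64, 1)
        return DataLoader(TensorDataset(x, 2.0 * x), batch_size=8)


class TinyContainer(LightningContainer):
    # Hook names assumed; only get_data_module is visible in this commit's diff.
    def create_model(self) -> LightningModule:
        return TinyRegression()

    def get_data_module(self) -> LightningDataModule:
        return TinyDataModule()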

def aggregate_and_create_subject_metrics_file(outputs_folder: Path) -> None:
@@ -298,16 +293,15 @@ def aggregate_and_create_subject_metrics_file(outputs_folder: Path) -> None:
    for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]:
        temp_files = (outputs_folder / mode.value).rglob(SUBJECT_OUTPUT_PER_RANK_PREFIX + "*")
        result_file = outputs_folder / mode.value / SUBJECT_METRICS_FILE_NAME
        result_file = result_file.open("a")
        for i, file in enumerate(temp_files):
            temp_file_contents = file.read_text()
            if i == 0:
                # Copy the first file as-is, including the first line with the column headers
                result_file.write(temp_file_contents)
            else:
                # For all files but the first one, cut off the header line.
                result_file.write(os.linesep + os.linesep.join(temp_file_contents.splitlines()[1:]))
        result_file.close()
        with result_file.open("a") as f:
            for i, file in enumerate(temp_files):
                temp_file_contents = file.read_text()
                if i == 0:
                    # Copy the first file as-is, including the first line with the column headers
                    f.write(temp_file_contents)
                else:
                    # For all files but the first one, cut off the header line.
                    f.write(os.linesep + os.linesep.join(temp_file_contents.splitlines()[1:]))


class InnerEyeDDPPlugin(DDPPlugin):
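As a side note, the per-rank aggregation above boils down to concatenating CSV fragments while keeping only the first file's header row. A self-contained sketch of that behaviour (file names invented, not taken from this commit):

# Standalone illustration of the header-skipping concatenation used above.
import os
from pathlib import Path
from typing import List


def concatenate_keep_first_header(parts: List[Path], result: Path) -> None:
    with result.open("a") as out:
        for i, part in enumerate(parts):
            lines = part.read_text().splitlines()
            if i > 0:
                lines = lines[1:]  # drop the duplicate header of every file after the first
            out.write(os.linesep.join(lines) + os.linesep)

# Example: concatenate_keep_first_header([Path("rank0.csv"), Path("rank1.csv")], Path("metrics.csv"))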
@ -21,7 +21,7 @@ from InnerEye.ML.dataset.full_image_dataset import load_dataset_sources
|
|||
from InnerEye.ML.deep_learning_config import ARGS_TXT
|
||||
from InnerEye.ML.photometric_normalization import PhotometricNormalization
|
||||
from InnerEye.ML.run_ml import MLRunner
|
||||
from InnerEye.ML.utils.config_util import ModelConfigLoader
|
||||
from InnerEye.ML.utils.config_loader import ModelConfigLoader
|
||||
from InnerEye.ML.utils.io_util import load_images_from_dataset_source
|
||||
|
||||
|
||||
|
@ -73,7 +73,7 @@ def main(yaml_file_path: Path) -> None:
|
|||
In addition, the arguments '--image_channel' and '--gt_channel' must be specified (see below).
|
||||
"""
|
||||
config, runner_config, args = get_configs(SegmentationModelBase(should_validate=False), yaml_file_path)
|
||||
local_dataset = MLRunner(config, runner_config).mount_or_download_dataset()
|
||||
local_dataset = MLRunner(config, azure_config=runner_config).mount_or_download_dataset()
|
||||
assert local_dataset is not None
|
||||
dataframe = pd.read_csv(local_dataset / DATASET_CSV_FILE_NAME)
|
||||
normalizer_config = NormalizeAndVisualizeConfig(**args)
|
||||
|
|
|
@ -449,7 +449,7 @@ class InferenceBatch(CTImagesMaskedBatch):
|
|||
|
||||
@inbatch_parallel(init='indices', post='_post_custom_components', target='threads')
|
||||
def set_component(self, batch_idx: int, component: InferenceBatch.Components, data: np.ndarray) \
|
||||
-> Dict[InferenceBatch.Components, Any]:
|
||||
-> Dict[str, Any]:
|
||||
logging.debug("Updated data in pipeline component: {}, for batch: {}.".format(component.value, batch_idx))
|
||||
return {
|
||||
component.value: {'type': component.value, 'data': data}
|
||||
|
|
|
@ -16,6 +16,9 @@ from azureml._restclient.constants import RunStatus
|
|||
from azureml.core import Environment, Run
|
||||
from azureml.core.model import Model
|
||||
from azureml.data import FileDataset
|
||||
from pytorch_lightning import LightningModule, Trainer, seed_everything
|
||||
from pytorch_lightning.utilities.cloud_io import load as pl_load
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from InnerEye.Azure import azure_util
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
|
@ -23,30 +26,33 @@ from InnerEye.Azure.azure_runner import ENVIRONMENT_VERSION, INPUT_DATA_KEY, get
|
|||
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, \
|
||||
DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, EFFECTIVE_RANDOM_SEED_KEY_NAME, IS_ENSEMBLE_KEY_NAME, \
|
||||
MODEL_ID_KEY_NAME, PARENT_RUN_CONTEXT, PARENT_RUN_ID_KEY_NAME, RUN_CONTEXT, RUN_RECOVERY_FROM_ID_KEY_NAME, \
|
||||
RUN_RECOVERY_ID_KEY_NAME, create_run_recovery_id, get_results_blob_path, merge_conda_files
|
||||
RUN_RECOVERY_ID_KEY_NAME, create_run_recovery_id, is_offline_run_context, \
|
||||
merge_conda_files
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.Common.build_config import ExperimentResultLocation, build_information_to_dot_net_json_file
|
||||
from InnerEye.Common.common_util import BASELINE_COMPARISONS_FOLDER, BASELINE_WILCOXON_RESULTS_FILE, \
|
||||
CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME, FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, ModelProcessing, \
|
||||
CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME, FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, \
|
||||
ModelProcessing, \
|
||||
OTHER_RUNS_SUBDIR_NAME, SCATTERPLOTS_SUBDIR_NAME, SUBJECT_METRICS_FILE_NAME, \
|
||||
get_best_epoch_results_path, is_windows, logging_section, print_exception, remove_file_or_directory
|
||||
from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME, PYTHON_ENVIRONMENT_NAME
|
||||
change_working_directory, get_best_epoch_results_path, is_windows, logging_section, logging_to_file, \
|
||||
print_exception, remove_file_or_directory
|
||||
from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME, LOG_FILE_NAME, PYTHON_ENVIRONMENT_NAME
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
from InnerEye.ML.config import SegmentationModelBase
|
||||
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER, \
|
||||
ModelCategory, MultiprocessingStartMethod
|
||||
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, DeepLearningConfig, FINAL_ENSEMBLE_MODEL_FOLDER, \
|
||||
FINAL_MODEL_FOLDER, ModelCategory, MultiprocessingStartMethod
|
||||
from InnerEye.ML.lightning_base import InnerEyeContainer
|
||||
from InnerEye.ML.lightning_container import InnerEyeInference, LightningContainer
|
||||
from InnerEye.ML.metrics import InferenceMetrics, InferenceMetricsForSegmentation
|
||||
from InnerEye.ML.model_config_base import ModelConfigBase
|
||||
from InnerEye.ML.model_inference_config import ModelInferenceConfig
|
||||
from InnerEye.ML.model_testing import model_test
|
||||
from InnerEye.ML.model_training import model_train
|
||||
from InnerEye.ML.reports.notebook_report import get_ipynb_report_name, generate_classification_crossval_notebook, \
|
||||
from InnerEye.ML.model_training import create_lightning_trainer, model_train
|
||||
from InnerEye.ML.reports.notebook_report import generate_classification_crossval_notebook, \
|
||||
generate_classification_multilabel_notebook, generate_classification_notebook, generate_segmentation_notebook, \
|
||||
reports_folder
|
||||
get_ipynb_report_name, reports_folder
|
||||
from InnerEye.ML.runner import ModelDeploymentHookSignature, PostCrossValidationHookSignature, get_all_environment_files
|
||||
from InnerEye.ML.scalar_config import ScalarModelBase
|
||||
from InnerEye.ML.sequence_config import SequenceModelBase
|
||||
from InnerEye.ML.utils import ml_util
|
||||
from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
|
||||
from InnerEye.ML.visualizers import activation_maps
|
||||
from InnerEye.ML.visualizers.plot_cross_validation import \
|
||||
|
@ -78,7 +84,8 @@ def download_dataset(azure_dataset_id: str,
|
|||
contains a dataset csv file, no download is started.
|
||||
:param azure_dataset_id: The name of a dataset that is registered in the AzureML workspace.
|
||||
:param target_folder: The folder in which to download the dataset from Azure.
|
||||
:param dataset_csv: Name of the csv file describing the dataset.
|
||||
:param dataset_csv: Name of the csv file describing the dataset. This is only used to check if the dataset has been
|
||||
downloaded already.
|
||||
:param azure_config: All Azure-related configuration options.
|
||||
:return: A path on the local machine that contains the dataset.
|
||||
"""
|
||||
|
@ -88,11 +95,18 @@ def download_dataset(azure_dataset_id: str,
|
|||
raise ValueError(f"Expected to get a FileDataset, but got {type(azure_dataset)}")
|
||||
# The downloaded dataset may already exist from a previous run.
|
||||
expected_dataset_path = target_folder / azure_dataset_id
|
||||
expected_dataset_file = expected_dataset_path / dataset_csv
|
||||
logging.info(f"Model training will use dataset '{azure_dataset_id}' in Azure.")
|
||||
if expected_dataset_path.is_dir() and expected_dataset_file.is_file():
|
||||
logging.info(f"The dataset appears to be downloaded already in {expected_dataset_path}. Skipping.")
|
||||
return expected_dataset_path
|
||||
if expected_dataset_path.is_dir():
|
||||
if dataset_csv:
|
||||
if (expected_dataset_path / dataset_csv).is_file():
|
||||
logging.info(f"The file {dataset_csv} is already downloaded in {expected_dataset_path}. Skipping.")
|
||||
return expected_dataset_path
|
||||
else:
|
||||
existing_files = sum(1 for _ in expected_dataset_path.rglob("*"))
|
||||
if existing_files > 1:
|
||||
logging.info(f"There are already {existing_files} files in {expected_dataset_path}. Skipping.")
|
||||
return expected_dataset_path
|
||||
|
||||
logging.info("Starting to download the dataset - WARNING, this could take very long!")
|
||||
with logging_section("Downloading dataset"):
|
||||
t0 = time.perf_counter()
|
||||
|
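A hedged usage sketch of the download helper above: the dataset name and folders are placeholders, and the call assumes valid AzureML workspace settings in the AzureConfig.

# Placeholder names throughout; requires AzureML workspace credentials to actually run.
from pathlib import Path

from InnerEye.Azure.azure_config import AzureConfig

local_path = download_dataset(azure_dataset_id="my_dataset",
                              target_folder=Path("datasets"),
                              dataset_csv="dataset.csv",
                              azure_config=AzureConfig())
# A second call is a no-op once dataset.csv (or, if dataset_csv is empty, more than one file)
# already exists under datasets/my_dataset.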
@@ -121,15 +135,20 @@ def log_metrics(val_metrics: Optional[InferenceMetricsForSegmentation],
class MLRunner:

    def __init__(self,
                 model_config: ModelConfigBase,
                 model_config: Optional[DeepLearningConfig] = None,
                 container: Optional[LightningContainer] = None,
                 azure_config: Optional[AzureConfig] = None,
                 project_root: Optional[Path] = None,
                 post_cross_validation_hook: Optional[PostCrossValidationHookSignature] = None,
                 model_deployment_hook: Optional[ModelDeploymentHookSignature] = None) -> None:
                 model_deployment_hook: Optional[ModelDeploymentHookSignature] = None,
                 output_subfolder: str = "") -> None:
        """
        Driver class to run a ML experiment. Note that the project root argument MUST be supplied when using InnerEye
        as a package!
        :param model_config: Model related configurations
        :param model_config: If None, run the training as per the `container` argument (bring-your-own-model). If not
            None, this is the model configuration for a built-in InnerEye model.
        :param container: The LightningContainer object to use for training. If None, assume that the training is
            for a built-in InnerEye model.
        :param azure_config: Azure related configurations
        :param project_root: Project root. This should only be omitted if calling run_ml from the test suite. Supplying
            it is crucial when using InnerEye as a package or submodule!
@@ -138,50 +157,112 @@ class MLRunner:
        :param model_deployment_hook: an optional function for deploying a model in an application-specific way.
            If present, it should take a model config (SegmentationModelBase), an AzureConfig, and an AzureML
            Model as arguments, and return an optional Path and a further object of any type.
        :param output_subfolder: If provided, the output folder structure will have an additional subfolder,
            when running outside AzureML.
        """
        if model_config is not None and container is not None:
            raise ValueError("Only one of the two arguments 'model_config', 'container' must be provided.")
        self.model_config = model_config
        if container is None:
            assert isinstance(model_config, ModelConfigBase), \
                "When using a built-in InnerEye model, the configuration should be an instance of ModelConfigBase"
            container = InnerEyeContainer(model_config)
        self.container = container
        self.azure_config: AzureConfig = azure_config or AzureConfig()
        self.project_root: Path = project_root or fixed_paths.repository_root_directory()
        self.post_cross_validation_hook = post_cross_validation_hook
        self.model_deployment_hook = model_deployment_hook
        self.output_subfolder = output_subfolder
        self._has_setup_run = False
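The constructor above accepts either a built-in InnerEye config or a LightningContainer, never both. A short sketch of the two call patterns; SomeBuiltInConfig stands in for any built-in model configuration and TinyContainer is the hypothetical container sketched earlier, so neither name comes from this commit.

# Sketch only; SomeBuiltInConfig and TinyContainer are placeholders.
from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.ML.run_ml import MLRunner

# Built-in InnerEye model: pass a config object, the runner wraps it in an InnerEyeContainer.
runner = MLRunner(model_config=SomeBuiltInConfig(), azure_config=AzureConfig())

# Bring-your-own Lightning model: pass a container instead.
runner = MLRunner(container=TinyContainer(), azure_config=AzureConfig())

# Passing both model_config and container raises the ValueError shown above.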

    def setup(self, use_mount_or_download_dataset: bool = True) -> None:
        """
        If the present object is using one of the InnerEye built-in models, create a (fake) container for it
        and call the setup method. It sets the random seeds, and then creates the actual Lightning modules.
        :param use_mount_or_download_dataset: If True, try to download or mount the dataset that is used by the model.
            If False, assume that the dataset is already available (this should only be used for unit tests).
        """
        if self._has_setup_run:
            return
        if (not self.azure_config.only_register_model) and use_mount_or_download_dataset:
            # Set local_dataset to the mounted path specified in azure_runner.py, if any, or download it if that fails
            # and config.local_dataset was not already set.
            # This must happen before container setup because that could already read datasets.
            self.container.local_dataset = self.mount_or_download_dataset()
        # Ensure that we use fixed seeds before initializing the PyTorch models
        seed_everything(self.container.get_effective_random_seed())
        # Creating the folder structure must happen before the LightningModule is created, because the output
        # parameters of the container will be copied into the module.
        if self.output_subfolder:
            # This codepath is only executed for cross validation runs outside AzureML: The folder structure
            # uses an existing folder structure set by the caller, and just a subfolder is added.
            self.container.file_system_config = self.container.file_system_config.add_subfolder(self.output_subfolder)
        else:
            self.container.create_filesystem(self.project_root)
        # A lot of the code for the built-in InnerEye models expects the output paths directly in the config files.
        if isinstance(self.container, InnerEyeContainer):
            self.container.config.local_dataset = self.container.local_dataset
            self.container.config.file_system_config = self.container.file_system_config
        self.container.setup()
        self.container.create_lightning_module_and_store()
        self._has_setup_run = True

@property
|
||||
def is_offline_run(self) -> bool:
|
||||
"""
|
||||
Returns True if the present run is outside of AzureML, and False if it is inside of AzureML.
|
||||
:return:
|
||||
"""
|
||||
return is_offline_run_context(RUN_CONTEXT)
|
||||
|
||||
@property
|
||||
def innereye_config(self) -> DeepLearningConfig:
|
||||
"""
|
||||
Gets the model configuration object for all built-in InnerEye models. Raises an exception if the present
|
||||
object trains a LightningContainer that is not a built-in InnerEye model.
|
||||
"""
|
||||
if self.model_config is None or not isinstance(self.model_config, DeepLearningConfig):
|
||||
raise ValueError("This property should only be used with built-in InnerEye models, but model "
|
||||
f"configuration is of type {type(self.model_config)}")
|
||||
return self.model_config
|
||||
|
||||
def start_logging_to_file(self) -> None:
|
||||
if self.container is None:
|
||||
self.setup()
|
||||
logging_to_file(self.container.logs_folder / LOG_FILE_NAME)
|
||||
|
||||
def is_offline_cross_val_parent_run(self) -> bool:
|
||||
"""
|
||||
Returns true if the current run is an offline run with cross validation splits > 0
|
||||
and cross_validation_split_index == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX (ie: a parent)
|
||||
"""
|
||||
return self.model_config.cross_validation_split_index == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX and \
|
||||
self.model_config.perform_cross_validation and self.model_config.is_offline_run
|
||||
return self.container.cross_validation_split_index == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX and \
|
||||
self.container.perform_cross_validation and self.is_offline_run
|
||||
|
||||
def spawn_offline_cross_val_classification_child_runs(self) -> None:
|
||||
"""
|
||||
Trains and Tests k models based on their respective data splits sequentially.
|
||||
Stores the results on the Validation set to the outputs directory of the parent run.
|
||||
"""
|
||||
_config = self.model_config
|
||||
assert isinstance(_config, ScalarModelBase)
|
||||
parent_run_file_system = _config.file_system_config
|
||||
assert isinstance(self.innereye_config, ScalarModelBase)
|
||||
|
||||
def _spawn_run(cross_val_split_index: int) -> None:
|
||||
split_model_config = copy.deepcopy(_config)
|
||||
assert isinstance(split_model_config, ScalarModelBase)
|
||||
split_model_config.cross_validation_split_index = cross_val_split_index
|
||||
|
||||
_local_split_folder_name = str(cross_val_split_index)
|
||||
split_model_config.file_system_config = parent_run_file_system.add_subfolder(_local_split_folder_name)
|
||||
|
||||
split_config = copy.deepcopy(self.innereye_config)
|
||||
split_config.cross_validation_split_index = cross_val_split_index
|
||||
logging.info(f"Running model train and test on cross validation split: {cross_val_split_index}")
|
||||
split_ml_runner = MLRunner(model_config=split_model_config,
|
||||
split_ml_runner = MLRunner(model_config=split_config,
|
||||
container=None,
|
||||
azure_config=self.azure_config,
|
||||
project_root=self.project_root,
|
||||
post_cross_validation_hook=self.post_cross_validation_hook,
|
||||
model_deployment_hook=self.model_deployment_hook)
|
||||
model_deployment_hook=self.model_deployment_hook,
|
||||
output_subfolder=str(cross_val_split_index))
|
||||
split_ml_runner.run()
|
||||
|
||||
for i in range(_config.number_of_cross_validation_splits):
|
||||
for i in range(self.innereye_config.number_of_cross_validation_splits):
|
||||
_spawn_run(i)
|
||||
|
||||
config_and_files = get_config_and_results_for_offline_runs(self.model_config)
|
||||
config_and_files = get_config_and_results_for_offline_runs(self.innereye_config)
|
||||
plot_cross_validation_from_files(config_and_files, Path(config_and_files.config.outputs_directory))
|
||||
|
||||
def set_run_tags_from_parent(self) -> None:
|
||||
|
@ -208,8 +289,8 @@ class MLRunner:
|
|||
]
|
||||
new_tags = {tag: run_tags_parent.get(tag, "") for tag in tags_to_copy}
|
||||
new_tags[RUN_RECOVERY_ID_KEY_NAME] = create_run_recovery_id(run=RUN_CONTEXT)
|
||||
new_tags[CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY] = str(self.model_config.cross_validation_split_index)
|
||||
new_tags[EFFECTIVE_RANDOM_SEED_KEY_NAME] = str(self.model_config.get_effective_random_seed())
|
||||
new_tags[CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY] = str(self.container.cross_validation_split_index)
|
||||
new_tags[EFFECTIVE_RANDOM_SEED_KEY_NAME] = str(self.container.get_effective_random_seed())
|
||||
RUN_CONTEXT.set_tags(new_tags)
|
||||
|
||||
def run(self) -> None:
|
||||
|
@ -217,66 +298,113 @@ class MLRunner:
|
|||
Driver function to run a ML experiment. If an offline cross validation run is requested, then
|
||||
this function is recursively called for each cross validation split.
|
||||
"""
|
||||
self.setup()
|
||||
if self.is_offline_cross_val_parent_run():
|
||||
if self.model_config.is_segmentation_model:
|
||||
if self.innereye_config.is_segmentation_model:
|
||||
raise NotImplementedError("Offline cross validation is only supported for classification models.")
|
||||
self.spawn_offline_cross_val_classification_child_runs()
|
||||
return
|
||||
|
||||
# Get the AzureML context in which the script is running
|
||||
if not self.model_config.is_offline_run and PARENT_RUN_CONTEXT is not None:
|
||||
if not self.is_offline_run and PARENT_RUN_CONTEXT is not None:
|
||||
logging.info("Setting tags from parent run.")
|
||||
self.set_run_tags_from_parent()
|
||||
|
||||
self.save_build_info_for_dotnet_consumers()
|
||||
|
||||
# Set data loader start method
|
||||
self.set_multiprocessing_start_method()
|
||||
|
||||
# configure recovery container if provided
|
||||
checkpoint_handler = CheckpointHandler(model_config=self.model_config,
|
||||
checkpoint_handler = CheckpointHandler(container=self.container,
|
||||
azure_config=self.azure_config,
|
||||
project_root=self.project_root,
|
||||
run_context=RUN_CONTEXT)
|
||||
checkpoint_handler.download_recovery_checkpoints_or_weights()
|
||||
trainer: Optional[Trainer] = None
|
||||
# do training and inference, unless the "only register" switch is set (which requires a run_recovery
|
||||
# to be valid).
|
||||
if not self.azure_config.only_register_model:
|
||||
# Set local_dataset to the mounted path specified in azure_runner.py, if any, or download it if that fails
|
||||
# and config.local_dataset was not already set.
|
||||
self.model_config.local_dataset = self.mount_or_download_dataset()
|
||||
# Check for existing dataset.csv file in the correct locations. Skip that if a dataset has already been
|
||||
# loaded (typically only during tests)
|
||||
if self.model_config.dataset_data_frame is None:
|
||||
assert self.model_config.local_dataset is not None
|
||||
ml_util.validate_dataset_paths(
|
||||
self.model_config.local_dataset,
|
||||
self.model_config.dataset_csv)
|
||||
|
||||
# train a new model if required
|
||||
if self.azure_config.train:
|
||||
with logging_section("Model training"):
|
||||
model_train(self.model_config, checkpoint_handler, num_nodes=self.azure_config.num_nodes)
|
||||
else:
|
||||
self.model_config.write_dataset_files()
|
||||
trainer, _ = model_train(checkpoint_handler,
|
||||
container=self.container,
|
||||
num_nodes=self.azure_config.num_nodes)
|
||||
# log the number of epochs used for model training
|
||||
RUN_CONTEXT.log(name="Train epochs", value=self.container.num_epochs)
|
||||
elif isinstance(self.container, InnerEyeContainer):
|
||||
self.innereye_config.write_dataset_files()
|
||||
self.create_activation_maps()
|
||||
|
||||
# log the number of epochs used for model training
|
||||
RUN_CONTEXT.log(name="Train epochs", value=self.model_config.num_epochs)
|
||||
if isinstance(self.container, InnerEyeContainer):
|
||||
# Inference for the InnerEye built-in models
|
||||
# We specify the ModelProcessing as DEFAULT here even if the run_recovery points to an ensemble run, because
|
||||
# the current run is a single one. See the documentation of ModelProcessing for more details.
|
||||
self.run_inference_and_register_model(checkpoint_handler, ModelProcessing.DEFAULT)
|
||||
|
||||
# We specify the ModelProcessing as DEFAULT here even if the run_recovery points to an ensemble run, because
|
||||
# the current run is a single one. See the documentation of ModelProcessing for more details.
|
||||
self.run_inference_and_register_model(checkpoint_handler, ModelProcessing.DEFAULT)
|
||||
if self.container.generate_report:
|
||||
self.generate_report(ModelProcessing.DEFAULT)
|
||||
|
||||
if self.model_config.generate_report:
|
||||
self.generate_report(ModelProcessing.DEFAULT)
|
||||
# If this is a cross validation run, and the present run is child run 0, then wait for the sibling runs,
|
||||
# build the ensemble model, and write a report for that.
|
||||
if self.container.number_of_cross_validation_splits > 0:
|
||||
should_wait_for_other_child_runs = (not self.is_offline_run) and \
|
||||
self.container.cross_validation_split_index == 0
|
||||
if should_wait_for_other_child_runs:
|
||||
self.wait_for_runs_to_finish()
|
||||
self.create_ensemble_model_and_run_inference()
|
||||
else:
|
||||
# Inference for all models that are specified via LightningContainers.
|
||||
self.run_inference_for_lightning_models(checkpoint_handler.get_checkpoints_to_test(), trainer)
|
||||
# We can't enforce that files are written to the output folder, hence change the working directory manually
|
||||
with change_working_directory(self.container.outputs_folder):
|
||||
self.container.create_report()
|
||||
|
||||
# If this is a cross validation run, and the present run is child run 0, then wait for the sibling runs,
|
||||
# build the ensemble model, and write a report for that.
|
||||
if self.model_config.number_of_cross_validation_splits > 0:
|
||||
if self.model_config.should_wait_for_other_cross_val_child_runs():
|
||||
self.wait_for_runs_to_finish()
|
||||
self.create_ensemble_model()
|
||||
    def run_inference_for_lightning_models(self, checkpoint_paths: List[Path], trainer: Optional[Trainer]) -> None:
        """
        Run inference on the test set for all models that are specified via a LightningContainer.
        """
        if len(checkpoint_paths) != 1:
            raise ValueError(f"This method expects exactly 1 checkpoint for inference, but got {len(checkpoint_paths)}")
        lightning_model = self.container.model
        # Run the customized inference code only if the "inference" step has been overridden
        if isinstance(lightning_model, InnerEyeInference) and \
                type(lightning_model).inference_step != InnerEyeInference.inference_step:
            logging.info("Running inference via the InnerEyeInference.inference_step method")
            # Read the data modules before changing the working directory, in case the code relies on relative paths
            data = self.container.get_inference_data_module()
            dataloaders: List[Tuple[DataLoader, ModelExecutionMode]] = []
            if self.container.perform_validation_and_test_set_inference:
                dataloaders.append((data.test_dataloader(), ModelExecutionMode.TEST))  # type: ignore
                dataloaders.append((data.val_dataloader(), ModelExecutionMode.VAL))  # type: ignore
            if self.container.perform_training_set_inference:
                dataloaders.append((data.train_dataloader(), ModelExecutionMode.TRAIN))  # type: ignore
            map_location = "gpu" if self.container.use_gpu else "cpu"
            checkpoint = pl_load(checkpoint_paths[0], map_location=map_location)
            lightning_model.load_state_dict(checkpoint['state_dict'])
            lightning_model.eval()
            with change_working_directory(self.container.outputs_folder):
                lightning_model.on_inference_start()
                for loader, split in dataloaders:
                    logging.info(f"Starting inference on {split.value} set")
                    lightning_model.on_inference_epoch_start(dataset_split=split, is_ensemble_model=False)
                    for batch_idx, item in enumerate(loader):
                        model_output = lightning_model.forward(item[0])
                        lightning_model.inference_step(item, batch_idx, model_output=model_output)
                    lightning_model.on_inference_epoch_end()
                lightning_model.on_inference_end()
        elif type(lightning_model).test_step != LightningModule.test_step:
            # Run Lightning's built-in test procedure if the `test_step` method has been overridden
            logging.info("Running inference via the LightningModule.test_step method")
            trainer = trainer or create_lightning_trainer(self.container)[0]
            # When training models that are not built-in InnerEye models, we have no guarantee that they write
            # files to the right folder. Best guess is to change the current working directory to where files should go.
            with change_working_directory(self.container.outputs_folder):
                trainer.test(self.container.model,
                             test_dataloaders=self.container.get_data_module().test_dataloader(),
                             ckpt_path=str(checkpoint_paths[0]))
            logging.info("Finished inference.")
        else:
            logging.warning("None of the suitable test methods is overridden. Skipping inference completely.")

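In other words, inference for container models is dispatched by which method the LightningModule overrides: a custom InnerEyeInference.inference_step loop, Lightning's standard test_step via trainer.test, or nothing at all. A hedged sketch of the simplest route, a variant of the earlier TinyRegression example that only overrides test_step so the elif branch above runs trainer.test:

# Sketch only: overriding test_step is enough for the built-in Lightning test path to be used.
import torch
from pytorch_lightning import LightningModule


class TinyRegressionWithTest(LightningModule):
    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(1, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.layer(x)

    def test_step(self, batch, batch_idx):  # type: ignore
        x, y = batch
        loss = torch.nn.functional.mse_loss(self(x), y)
        self.log("test_mse", loss)
        return loss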
def run_inference_and_register_model(self, checkpoint_handler: CheckpointHandler,
|
||||
model_proc: ModelProcessing) -> None:
|
||||
|
@ -311,17 +439,17 @@ class MLRunner:
|
|||
model (from the run we recovered) should already have been registered, so we should only
|
||||
do so if this run is specifically for that purpose.
|
||||
"""
|
||||
if self.model_config.is_offline_run:
|
||||
if self.is_offline_run:
|
||||
return False
|
||||
return self.azure_config.train or self.azure_config.only_register_model
|
||||
|
||||
def create_activation_maps(self) -> None:
|
||||
if self.model_config.is_segmentation_model and self.model_config.activation_map_layers is not None:
|
||||
if self.innereye_config.is_segmentation_model and self.innereye_config.activation_map_layers is not None:
|
||||
logging.info("Extracting activation maps for layer")
|
||||
activation_maps.extract_activation_maps(self.model_config)
|
||||
activation_maps.extract_activation_maps(self.innereye_config) # type: ignore
|
||||
logging.info("Successfully extracted and saved activation maps")
|
||||
|
||||
def mount_or_download_dataset(self) -> Path:
|
||||
def mount_or_download_dataset(self) -> Optional[Path]:
|
||||
"""
|
||||
Makes the dataset that the model uses available on the executing machine. If the present training run is outside
|
||||
of AzureML, it expects that either the model has a `local_dataset` field set, in which case no action will be
|
||||
|
@ -331,53 +459,46 @@ class MLRunner:
|
|||
mounted or downloaded.
|
||||
Returns the path of the dataset on the executing machine.
|
||||
"""
|
||||
azure_dataset_id = self.model_config.azure_dataset_id
|
||||
|
||||
if self.model_config.is_offline_run:
|
||||
azure_dataset_id = self.container.azure_dataset_id
|
||||
local_dataset = self.container.local_dataset
|
||||
if self.is_offline_run:
|
||||
# A dataset, either local or in Azure, is required for the built-in InnerEye models. When models are
|
||||
# specified via a LightningContainer, these dataset fields are optional, because the container datasets
|
||||
# could be downloaded even from the web.
|
||||
is_dataset_required = isinstance(self.container, InnerEyeContainer)
|
||||
# The present run is outside of AzureML: If local_dataset is set, use that as the path to the data.
|
||||
# Otherwise, download the dataset specified by the azure_dataset_id
|
||||
local_dataset = self.model_config.local_dataset
|
||||
if (not azure_dataset_id) and (local_dataset is None):
|
||||
raise ValueError("The model must contain either local_dataset or azure_dataset_id.")
|
||||
if is_dataset_required:
|
||||
if (not azure_dataset_id) and (local_dataset is None):
|
||||
raise ValueError("The model must contain either local_dataset or azure_dataset_id.")
|
||||
if local_dataset:
|
||||
expected_dir = Path(local_dataset)
|
||||
if not expected_dir.is_dir():
|
||||
raise FileNotFoundError(f"The model uses a dataset in {expected_dir}, but that does not exist.")
|
||||
logging.info(f"Model training will use the local dataset provided in {expected_dir}")
|
||||
return expected_dir
|
||||
return download_dataset(azure_dataset_id=azure_dataset_id,
|
||||
target_folder=self.project_root / fixed_paths.DATASETS_DIR_NAME,
|
||||
dataset_csv=self.model_config.dataset_csv,
|
||||
azure_config=self.azure_config)
|
||||
if azure_dataset_id:
|
||||
dataset_csv = ""
|
||||
if isinstance(self.model_config, DeepLearningConfig):
|
||||
dataset_csv = self.model_config.dataset_csv
|
||||
return download_dataset(azure_dataset_id=azure_dataset_id,
|
||||
target_folder=self.project_root / fixed_paths.DATASETS_DIR_NAME,
|
||||
dataset_csv=dataset_csv, azure_config=self.azure_config)
|
||||
return None
|
||||
|
||||
# Inside of AzureML, datasets can be either mounted or downloaded.
|
||||
if not azure_dataset_id:
|
||||
raise ValueError("The model must contain azure_dataset_id for running on AML")
|
||||
mounted = try_to_mount_input_dataset()
|
||||
if not mounted:
|
||||
raise ValueError("Unable to mount or download input dataset.")
|
||||
return mounted
|
||||
|
||||
def save_build_info_for_dotnet_consumers(self) -> None:
|
||||
results_container = get_results_blob_path(RUN_CONTEXT.id)
|
||||
result_location = ExperimentResultLocation(
|
||||
azure_job_name=RUN_CONTEXT.id,
|
||||
dataset_folder=self.model_config.azure_dataset_id,
|
||||
results_container_name=results_container,
|
||||
commandline_overrides=str(self.model_config.overrides),
|
||||
dataset_uri=self.model_config.azure_dataset_id,
|
||||
results_uri="",
|
||||
)
|
||||
# Fill in the missing information in the build config (everything that is not available at the time
|
||||
# of invoking the runner), and then save in the format needed for the .NET consumers
|
||||
build_information_to_dot_net_json_file(
|
||||
self.azure_config, result_location, folder=self.model_config.outputs_folder)
|
||||
if azure_dataset_id:
|
||||
mounted = try_to_mount_input_dataset()
|
||||
if not mounted:
|
||||
raise ValueError("Unable to mount or download input dataset.")
|
||||
return mounted
|
||||
return None
|
||||
|
||||
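The net effect of the resolution above: offline runs use local_dataset if set, otherwise download via azure_dataset_id (which is mandatory only for built-in models), while runs inside AzureML mount the dataset. A hedged configuration sketch with placeholder values; both fields appear on the container in this diff.

# Placeholder values; local_dataset / azure_dataset_id are the container fields read above.
from pathlib import Path

container = TinyContainer()                              # hypothetical container from the earlier sketch
container.local_dataset = Path("/data/my_local_copy")    # offline run: use this folder directly
# or:
# container.azure_dataset_id = "my_azure_dataset"        # offline: download; inside AzureML: mount
# or leave both unset - allowed for containers; the method then returns None.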
def set_multiprocessing_start_method(self) -> None:
|
||||
"""
|
||||
Set the (PyTorch) multiprocessing start method.
|
||||
"""
|
||||
method = self.model_config.multiprocessing_start_method
|
||||
method = self.container.multiprocessing_start_method
|
||||
if is_windows():
|
||||
if method != MultiprocessingStartMethod.spawn:
|
||||
logging.warning(f"Cannot set multiprocessing start method to '{method.name}' "
|
||||
|
@ -402,7 +523,7 @@ class MLRunner:
|
|||
logging.warning("Abandoning model registration - no valid checkpoint paths found")
|
||||
return
|
||||
|
||||
if not self.model_config.is_offline_run:
|
||||
if not self.is_offline_run:
|
||||
split_index = RUN_CONTEXT.get_tags().get(CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, None)
|
||||
if split_index == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX:
|
||||
RUN_CONTEXT.tag(IS_ENSEMBLE_KEY_NAME, str(model_proc == ModelProcessing.ENSEMBLE_CREATION))
|
||||
|
@ -447,7 +568,7 @@ class MLRunner:
|
|||
model_subfolder = FINAL_MODEL_FOLDER if model_proc == ModelProcessing.DEFAULT else FINAL_ENSEMBLE_MODEL_FOLDER
|
||||
# This is the path under which AzureML will know the files: Either "final_model" or "final_ensemble_model"
|
||||
artifacts_path = model_subfolder
|
||||
final_model_folder = self.model_config.file_system_config.run_folder / model_subfolder
|
||||
final_model_folder = self.innereye_config.file_system_config.run_folder / model_subfolder
|
||||
# Copy all code from project and InnerEye into the model folder, and copy over checkpoints.
|
||||
# This increases the size of the data stored for the run. The other option would be to store all checkpoints
|
||||
# right in the final model folder - however, then that would also contain any other checkpoints that the model
|
||||
|
@ -467,7 +588,7 @@ class MLRunner:
|
|||
# When registering the model on the run, we need to provide a relative path inside of the run's output
|
||||
# folder in `model_path`
|
||||
model = run_to_register_on.register_model(
|
||||
model_name=self.model_config.model_name,
|
||||
model_name=self.innereye_config.model_name,
|
||||
model_path=artifacts_path,
|
||||
tags=RUN_CONTEXT.get_tags(),
|
||||
description=model_description
|
||||
|
@ -487,9 +608,9 @@ class MLRunner:
|
|||
logging.info(f"Registered {model_proc.value} model: {model.name}, with Id: {model.id}")
|
||||
# create a version of the model for deployment if the hook is provided
|
||||
if self.model_deployment_hook is not None:
|
||||
assert isinstance(self.model_config, SegmentationModelBase)
|
||||
assert isinstance(self.innereye_config, SegmentationModelBase)
|
||||
deployment_result = self.model_deployment_hook(
|
||||
self.model_config, self.azure_config, model, model_proc)
|
||||
self.innereye_config, self.azure_config, model, model_proc)
|
||||
return model, deployment_result
|
||||
|
||||
@staticmethod
|
||||
|
@ -539,17 +660,17 @@ class MLRunner:
|
|||
try:
|
||||
# Checkpoints live in a folder structure in the checkpoint folder. There can be multiple of
|
||||
# them, with identical names, coming from an ensemble run. Hence, preserve their folder structure.
|
||||
checkpoint_relative = checkpoint.relative_to(self.model_config.checkpoint_folder)
|
||||
checkpoint_relative = checkpoint.relative_to(self.innereye_config.checkpoint_folder)
|
||||
except ValueError:
|
||||
raise ValueError(f"Checkpoint file {checkpoint} was expected to be in a subfolder of "
|
||||
f"{self.model_config.checkpoint_folder}")
|
||||
f"{self.innereye_config.checkpoint_folder}")
|
||||
# Checkpoints go into a newly created folder "checkpoints" inside of the model folder
|
||||
relative_checkpoint_paths.append(str(Path(CHECKPOINT_FOLDER) / checkpoint_relative))
|
||||
else:
|
||||
raise ValueError(f"Expected an absolute path to a checkpoint file, but got: {checkpoint}")
|
||||
model_folder.mkdir(parents=True, exist_ok=True)
|
||||
model_inference_config = ModelInferenceConfig(model_name=self.model_config.model_name,
|
||||
model_configs_namespace=self.model_config.__class__.__module__,
|
||||
model_inference_config = ModelInferenceConfig(model_name=self.innereye_config.model_name,
|
||||
model_configs_namespace=self.innereye_config.__class__.__module__,
|
||||
checkpoint_paths=relative_checkpoint_paths)
|
||||
# Inference configuration must live in the root folder of the registered model
|
||||
full_path_to_config = model_folder / fixed_paths.MODEL_INFERENCE_JSON_FILE_NAME
|
||||
|
@ -591,11 +712,12 @@ class MLRunner:
|
|||
val_metrics = None
|
||||
test_metrics = None
|
||||
|
||||
config = self.model_config
|
||||
config = self.innereye_config
|
||||
|
||||
def run_model_test(data_split: ModelExecutionMode) -> Optional[InferenceMetrics]:
|
||||
return model_test(config, data_split=data_split, checkpoint_handler=checkpoint_handler,
|
||||
return model_test(config, data_split=data_split, checkpoint_handler=checkpoint_handler, # type: ignore
|
||||
model_proc=model_proc)
|
||||
|
||||
if config.perform_validation_and_test_set_inference:
|
||||
# perform inference on test set
|
||||
test_metrics = run_model_test(ModelExecutionMode.TEST)
|
||||
|
@ -610,7 +732,7 @@ class MLRunner:
|
|||
|
||||
# log the metrics to AzureML experiment if possible. When doing ensemble runs, log to the Hyperdrive parent run,
|
||||
# so that we get the metrics of child run 0 and the ensemble separated.
|
||||
if config.is_segmentation_model and not config.is_offline_run:
|
||||
if config.is_segmentation_model and not self.is_offline_run:
|
||||
run_for_logging = PARENT_RUN_CONTEXT if model_proc.ENSEMBLE_CREATION else RUN_CONTEXT
|
||||
log_metrics(val_metrics=val_metrics, test_metrics=test_metrics, # type: ignore
|
||||
train_metrics=train_metrics, run_context=run_for_logging) # type: ignore
|
||||
|
@ -633,9 +755,9 @@ class MLRunner:
|
|||
:return: True if all sibling runs of the current run have finished (they either completed successfully,
|
||||
or failed). False if any of them is still pending (running or queued).
|
||||
"""
|
||||
if (not self.model_config.is_offline_run) \
|
||||
if (not self.is_offline_run) \
|
||||
and (azure_util.is_cross_validation_child_run(RUN_CONTEXT)):
|
||||
n_splits = self.model_config.get_total_number_of_cross_validation_runs()
|
||||
n_splits = self.innereye_config.get_total_number_of_cross_validation_runs()
|
||||
child_runs = azure_util.fetch_child_runs(PARENT_RUN_CONTEXT,
|
||||
expected_number_cross_validation_splits=n_splits)
|
||||
pending_runs = [x.id for x in child_runs
|
||||
|
@ -648,14 +770,14 @@ class MLRunner:
|
|||
else:
|
||||
raise NotImplementedError("are_sibling_runs_finished only works for cross validation runs in AzureML.")
|
||||
|
||||
def create_ensemble_model(self) -> None:
|
||||
def create_ensemble_model_and_run_inference(self) -> None:
|
||||
"""
|
||||
Create an ensemble model from the results of the sibling runs of the present run. The present run here will
|
||||
be cross validation child run 0.
|
||||
"""
|
||||
assert PARENT_RUN_CONTEXT, "This function should only be called in a Hyperdrive run"
|
||||
with logging_section("Downloading checkpoints from sibling runs"):
|
||||
checkpoint_handler = CheckpointHandler(model_config=self.model_config,
|
||||
checkpoint_handler = CheckpointHandler(container=self.container,
|
||||
azure_config=self.azure_config,
|
||||
project_root=self.project_root,
|
||||
run_context=PARENT_RUN_CONTEXT)
|
||||
|
@ -665,13 +787,13 @@ class MLRunner:
|
|||
model_proc=ModelProcessing.ENSEMBLE_CREATION)
|
||||
|
||||
crossval_dir = self.plot_cross_validation_and_upload_results()
|
||||
if self.model_config.generate_report:
|
||||
if self.innereye_config.generate_report:
|
||||
self.generate_report(ModelProcessing.ENSEMBLE_CREATION)
|
||||
# CrossValResults should have been uploaded to the parent run, so we don't need it here.
|
||||
remove_file_or_directory(crossval_dir)
|
||||
# We can also remove OTHER_RUNS under the root, as it is no longer useful and only contains copies of files
|
||||
# available elsewhere. However, first we need to upload relevant parts of OTHER_RUNS/ENSEMBLE.
|
||||
other_runs_dir = self.model_config.outputs_folder / OTHER_RUNS_SUBDIR_NAME
|
||||
other_runs_dir = self.innereye_config.outputs_folder / OTHER_RUNS_SUBDIR_NAME
|
||||
other_runs_ensemble_dir = other_runs_dir / ENSEMBLE_SPLIT_NAME
|
||||
if PARENT_RUN_CONTEXT is not None:
|
||||
if other_runs_ensemble_dir.exists():
|
||||
|
@ -690,9 +812,9 @@ class MLRunner:
|
|||
from InnerEye.ML.visualizers.plot_cross_validation import crossval_config_from_model_config, \
|
||||
plot_cross_validation, unroll_aggregate_metrics
|
||||
# perform aggregation as cross val splits are now ready
|
||||
plot_crossval_config = crossval_config_from_model_config(self.model_config)
|
||||
plot_crossval_config = crossval_config_from_model_config(self.innereye_config)
|
||||
plot_crossval_config.run_recovery_id = PARENT_RUN_CONTEXT.tags[RUN_RECOVERY_ID_KEY_NAME]
|
||||
plot_crossval_config.outputs_directory = self.model_config.outputs_folder
|
||||
plot_crossval_config.outputs_directory = self.innereye_config.outputs_folder
|
||||
plot_crossval_config.azure_config = self.azure_config
|
||||
cross_val_results_root = plot_cross_validation(plot_crossval_config)
|
||||
if isinstance(self.model_config, ScalarModelBase) and not isinstance(self.model_config, SequenceModelBase):
|
||||
|
@ -701,10 +823,10 @@ class MLRunner:
|
|||
full_metrics_csv = cross_val_results_root / FULL_METRICS_DATAFRAME_FILE
|
||||
generate_classification_crossval_notebook(notebook_path, self.model_config, full_metrics_csv)
|
||||
if self.post_cross_validation_hook:
|
||||
self.post_cross_validation_hook(self.model_config, cross_val_results_root)
|
||||
self.post_cross_validation_hook(self.innereye_config, cross_val_results_root)
|
||||
# upload results to the parent run's outputs so that the files are visible inside the AzureML UI.
|
||||
PARENT_RUN_CONTEXT.upload_folder(name=CROSSVAL_RESULTS_FOLDER, path=str(cross_val_results_root))
|
||||
if self.model_config.is_scalar_model:
|
||||
if self.innereye_config.is_scalar_model:
|
||||
try:
|
||||
aggregates = pd.read_csv(cross_val_results_root / METRICS_AGGREGATES_FILE)
|
||||
unrolled_aggregate_metrics = unroll_aggregate_metrics(aggregates)
|
||||
|
@ -715,7 +837,7 @@ class MLRunner:
|
|||
return cross_val_results_root
|
||||
|
||||
def generate_report(self, model_proc: ModelProcessing) -> None:
|
||||
config = self.model_config
|
||||
config = self.innereye_config
|
||||
if config.model_category not in [ModelCategory.Segmentation, ModelCategory.Classification]:
|
||||
logging.info(f"No reporting available for a model with category {config.model_category}")
|
||||
return
|
||||
|
@ -753,7 +875,8 @@ class MLRunner:
|
|||
|
||||
if len(config.class_names) > 1:
|
||||
generate_classification_multilabel_notebook(
|
||||
result_notebook=reports_dir / get_ipynb_report_name(f"{config.model_category.value}_multilabel"),
|
||||
result_notebook=reports_dir / get_ipynb_report_name(
|
||||
f"{config.model_category.value}_multilabel"),
|
||||
config=config,
|
||||
train_metrics=path_to_best_epoch_train,
|
||||
val_metrics=path_to_best_epoch_val,
|
||||
|
|
|
@ -4,50 +4,62 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
import os
|
||||
import sys
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
|
||||
# Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress
|
||||
# individual warnings only.
|
||||
# flake8: noqa
|
||||
|
||||
# Workaround for an issue with how AzureML and Pytorch Lightning interact: When spawning additional processes for DDP,
|
||||
# the working directory is not correctly picked up in sys.path
|
||||
print("Starting InnerEye runner.")
|
||||
|
||||
print(f"Starting InnerEye runner at {sys.argv[0]}")
|
||||
innereye_root = Path(__file__).absolute().parent.parent.parent
|
||||
if (innereye_root / "InnerEye").is_dir():
|
||||
innereye_root_str = str(innereye_root)
|
||||
if innereye_root_str not in sys.path:
|
||||
print(f"Adding to sys.path: {innereye_root_str}")
|
||||
print(f"Adding InnerEye folder to sys.path: {innereye_root_str}")
|
||||
sys.path.insert(0, innereye_root_str)
|
||||
# We change the current working directory before starting the actual training. However, this throws off starting
|
||||
# the child training threads because sys.argv[0] is a relative path when running in AzureML. Turn that into an absolute
|
||||
# path.
|
||||
runner_path = Path(sys.argv[0])
|
||||
if not runner_path.is_absolute():
|
||||
sys.argv[0] = str(runner_path.absolute())
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, List, Optional, Tuple
|
||||
from typing import Any, Optional, Tuple
|
||||
|
||||
from azureml._base_sdk_common import user_agent
|
||||
from azureml.core import Model, Run
|
||||
from azureml.core import Run
|
||||
|
||||
from InnerEye.Azure import azure_util
|
||||
from InnerEye.Azure.azure_config import AzureConfig, ParserResult, SourceConfig
|
||||
from InnerEye.Azure.azure_runner import create_runner_parser, parse_args_and_add_yaml_variables, \
|
||||
parse_arguments, set_environment_variables_for_multi_node, submit_to_azureml
|
||||
from InnerEye.Azure.azure_util import is_run_and_child_runs_completed
|
||||
from InnerEye.Azure.azure_util import get_all_environment_files, is_run_and_child_runs_completed
|
||||
from InnerEye.Azure.run_pytest import download_pytest_result, run_pytest
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.Common.common_util import FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, \
|
||||
ModelProcessing, disable_logging_to_file, is_linux, logging_to_file, logging_to_stdout, print_exception
|
||||
disable_logging_to_file, is_linux, logging_to_stdout
|
||||
from InnerEye.Common.generic_parsing import GenericConfig
|
||||
from InnerEye.ML.common import DATASET_CSV_FILE_NAME
|
||||
from InnerEye.ML.config import SegmentationModelBase
|
||||
from InnerEye.ML.config import ModelDeploymentHookSignature, PostCrossValidationHookSignature
|
||||
from InnerEye.ML.deep_learning_config import DeepLearningConfig
|
||||
from InnerEye.ML.model_config_base import ModelConfigBase
|
||||
from InnerEye.ML.utils.config_util import ModelConfigLoader
|
||||
from InnerEye.ML.utils.config_loader import ModelConfigLoader
|
||||
|
||||
LOG_FILE_NAME = "stdout.txt"
|
||||
try:
|
||||
# This import can fail when the code runs inside the azure_runner.yml Conda environment, that we use
|
||||
# for the PR builds
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
|
||||
PostCrossValidationHookSignature = Callable[[ModelConfigBase, Path], None]
|
||||
ModelDeploymentHookSignature = Callable[[SegmentationModelBase, AzureConfig, Model, ModelProcessing], Any]
|
||||
has_torch = True
|
||||
except ModuleNotFoundError as ex:
|
||||
has_torch = False
|
||||
|
||||
|
||||
def may_initialize_rpdb() -> None:
|
||||
def initialize_rpdb() -> None:
|
||||
"""
|
||||
On Linux only, import and initialize rpdb, to enable remote debugging if necessary.
|
||||
"""
|
||||
|
@ -85,23 +97,10 @@ def suppress_logging_noise() -> None:
|
|||
os.environ['MKL_THREADING_LAYER'] = 'GNU'
|
||||
|
||||
|
||||
def get_all_environment_files(project_root: Path) -> List[Path]:
|
||||
"""
|
||||
Returns a list of all Conda environment files that should be used. This is firstly the InnerEye conda file,
|
||||
and possibly a second environment.yml file that lives at the project root folder.
|
||||
:param project_root: The root folder of the code that starts the present training run.
|
||||
:return: A list with 1 or 2 entries that are conda environment files.
|
||||
"""
|
||||
innereye_yaml = fixed_paths.get_environment_yaml_file()
|
||||
project_yaml = project_root / fixed_paths.ENVIRONMENT_YAML_FILE_NAME
|
||||
files = [innereye_yaml]
|
||||
if innereye_yaml != project_yaml:
|
||||
files.append(project_yaml)
|
||||
return files
|
||||
|
||||
|
||||
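A small usage sketch of the helper above (the project layout is hypothetical): it returns one or two Conda environment files, which the Azure submission code can then merge into a single environment.

# Hypothetical project root; returns [InnerEye environment.yml] plus the project's own
# environment.yml when that is a different file.
from pathlib import Path

files = get_all_environment_files(Path("/home/me/my_innereye_project"))
for f in files:
    print(f)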
class Runner:
|
||||
"""
|
||||
This class contains the high-level logic to start a training run: choose a model configuration by name,
|
||||
submit to AzureML if needed, or otherwise start the actual training and test loop.
|
||||
:param project_root: The root folder that contains all of the source code that should be executed.
|
||||
:param yaml_config_file: The path to the YAML file that contains values to supply into sys.argv.
|
||||
:param post_cross_validation_hook: A function to call after waiting for completion of cross validation runs.
|
||||
|
@@ -116,17 +115,17 @@ class Runner:
                 project_root: Path,
                 yaml_config_file: Path,
                 post_cross_validation_hook: Optional[PostCrossValidationHookSignature] = None,
                 model_deployment_hook: Optional[ModelDeploymentHookSignature] = None,
                 command_line_args: Optional[List[str]] = None):
                 model_deployment_hook: Optional[ModelDeploymentHookSignature] = None):
        self.project_root = project_root
        self.yaml_config_file = yaml_config_file
        self.post_cross_validation_hook = post_cross_validation_hook
        self.model_deployment_hook = model_deployment_hook
        self.command_line_args = command_line_args
        # model_config and azure_config are placeholders for now, and are set properly when command line args are
        # parsed.
        self.model_config: ModelConfigBase = ModelConfigBase(azure_dataset_id="")
        self.model_config: Optional[DeepLearningConfig] = None
        self.azure_config: AzureConfig = AzureConfig()
        # This should be typed as LightningContainer, but we don't always have that imported
        self.lightning_container: Any = None

    def parse_and_load_model(self) -> ParserResult:
        """
@@ -138,47 +137,84 @@ class Runner:
        """
        # Create a parser that will understand only the args we need for an AzureConfig
        parser1 = create_runner_parser()
        parser1_result = parse_args_and_add_yaml_variables(parser1,
                                                           yaml_config_file=self.yaml_config_file,
                                                           project_root=self.project_root,
                                                           args=self.command_line_args,
                                                           fail_on_unknown_args=False)
        azure_config = AzureConfig(**parser1_result.args)
        parser_result = parse_args_and_add_yaml_variables(parser1,
                                                          yaml_config_file=self.yaml_config_file,
                                                          project_root=self.project_root,
                                                          fail_on_unknown_args=False)
        azure_config = AzureConfig(**parser_result.args)
        azure_config.project_root = self.project_root
        self.azure_config = azure_config
        self.model_config = None  # type: ignore
        self.model_config = None
        self.lightning_container = None
        if not azure_config.model:
            raise ValueError("Parameter 'model' needs to be set to tell InnerEye which model to run.")
        model_config_loader: ModelConfigLoader = ModelConfigLoader(**parser1_result.args)
        # Create the model as per the "model" commandline option
        model_config = model_config_loader.create_model_config_from_name(
            model_name=azure_config.model
        )
        # This model will be either a classification model or a segmentation model. Those have different
        # fields that could be overridden on the command line. Create a parser that understands the fields we need
        # for the actual model type. We feed this parser will the YAML settings and commandline arguments that the
        # first parser did not recognize.
        parser2 = type(model_config).create_argparser()
        parser2_result = parse_arguments(parser2,
                                         settings_from_yaml=parser1_result.unknown_settings_from_yaml,
                                         args=parser1_result.unknown,
                                         fail_on_unknown_args=True)
        # Apply the overrides and validate. Overrides can come from either YAML settings or the commandline.
        model_config.apply_overrides(parser1_result.unknown_settings_from_yaml)
        model_config.apply_overrides(parser2_result.overrides)
        model_config.validate()
        # Set the file system related configs, they might be affected by the overrides that were applied.
        logging.info("Creating the adjusted output folder structure.")
        model_config.create_filesystem(self.project_root)
        model_config_loader: ModelConfigLoader = ModelConfigLoader(**parser_result.args)
        # Create the model as per the "model" commandline option. This can return either a built-in config
        # of type DeepLearningConfig, or a LightningContainer.
        config_or_container = model_config_loader.create_model_config_from_name(model_name=azure_config.model)

        def parse_overrides_and_apply(c: object, previous_parser_result: ParserResult) -> ParserResult:
            assert isinstance(c, GenericConfig)
            parser = type(c).create_argparser()
            # For each parser, feed in the unknown settings from the previous parser. All commandline args should
            # be consumed by name, hence fail if there is something that is still unknown.
            parser_result = parse_arguments(parser,
                                            settings_from_yaml=previous_parser_result.unknown_settings_from_yaml,
                                            args=previous_parser_result.unknown,
                                            fail_on_unknown_args=True)
            # Apply the overrides and validate. Overrides can come from either YAML settings or the commandline.
            c.apply_overrides(parser_result.known_settings_from_yaml)
            c.apply_overrides(parser_result.overrides)
            c.validate()
            return parser_result

        # Now create a parser that understands overrides at model/container level.
        parser_result = parse_overrides_and_apply(config_or_container, parser_result)

        if has_torch and isinstance(config_or_container, LightningContainer):
            self.lightning_container = config_or_container
        elif isinstance(config_or_container, DeepLearningConfig):
            # Built-in InnerEye models: A fake container for these models will be created in MLRunner
            self.model_config = config_or_container
        else:
            raise ValueError(f"Don't know how to handle a loaded configuration of type {type(config_or_container)}")
        if azure_config.extra_code_directory:
            exist = "exists" if Path(azure_config.extra_code_directory).exists() else "does not exist"
            logging.info(f"extra_code_directory is {azure_config.extra_code_directory}, which {exist}")
        else:
            logging.info("extra_code_directory is unset")
        self.model_config = model_config
        return parser2_result
        return parser_result

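For illustration, a minimal bring-your-own-model configuration that create_model_config_from_name could return might look like the sketch below. The class, module and dataset names are made up; the attributes and hooks used (azure_dataset_id, num_epochs, local_dataset, create_model, get_data_module) are the ones exercised elsewhere in this change.

from pytorch_lightning import LightningDataModule, LightningModule

from InnerEye.ML.lightning_container import LightningContainer


class MyLightningModel(LightningContainer):
    def __init__(self) -> None:
        super().__init__()
        self.azure_dataset_id = "my_dataset"  # hypothetical AzureML dataset name
        self.num_epochs = 10

    def create_model(self) -> LightningModule:
        return MyPlainLightningModule()  # any standard LightningModule (hypothetical class)

    def get_data_module(self) -> LightningDataModule:
        return MyDataModule(root=self.local_dataset)  # standard LightningDataModule (hypothetical class)

Starting the runner with --model=MyLightningModel would then take the LightningContainer branch above, while the built-in InnerEye configs continue to take the DeepLearningConfig branch.
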
    def run(self) -> Tuple[ModelConfigBase, Optional[Run]]:
    def _get_property_from_config_or_container(self, name: str) -> Any:
        """
        Reads out a property or attribute from either the model configuration (if that is a built-in InnerEye
        model) or the lightning container.
        :param name: The name of the property to read.
        :return: The property value, coming from either the model config or the container.
        """
        if isinstance(self.model_config, DeepLearningConfig):
            return getattr(self.model_config, name)
        elif self.lightning_container is not None:
            return getattr(self.lightning_container, name)
        else:
            raise ValueError(f"Did not expect config of type {type(self.model_config)} and container of type "
                             f"{type(self.lightning_container)}")

    @property
    def perform_cross_validation(self) -> bool:
        """
        Returns True if cross validation will be performed as part of the training procedure.
        """
        return self._get_property_from_config_or_container("perform_cross_validation")

    @property
    def azure_dataset_id(self) -> str:
        """
        Returns the name of the Azure dataset that should be used.
        """
        return self._get_property_from_config_or_container("azure_dataset_id")

    def run(self) -> Tuple[Optional[DeepLearningConfig], Optional[Run]]:
        """
        The main entry point for training and testing models from the commandline. This chooses a model to train
        via a commandline argument, runs training or testing, and writes all required info to disk and logs.
@@ -188,10 +224,12 @@ class Runner:
        # Usually, when we set logging to DEBUG, we want diagnostics about the model
        # build itself, but not the tons of debug information that AzureML submissions create.
        logging_to_stdout(logging.INFO)
        may_initialize_rpdb()
        initialize_rpdb()
        user_agent.append(azure_util.INNEREYE_SDK_NAME, azure_util.INNEREYE_SDK_VERSION)
        self.parse_and_load_model()
        if self.model_config is not None and self.model_config.perform_cross_validation:
        if self.perform_cross_validation:
            if self.lightning_container is not None:
                raise NotImplementedError("Cross validation for LightningContainer models is not yet supported.")
            # force hyperdrive usage if performing cross validation
            self.azure_config.hyperdrive = True
        run_object: Optional[Run] = None
@ -208,23 +246,24 @@ class Runner:
|
|||
"""
|
||||
# The adal package creates a logging.info line each time it gets an authentication token, avoid that.
|
||||
logging.getLogger('adal-python').setLevel(logging.WARNING)
|
||||
if not self.model_config.azure_dataset_id:
|
||||
raise ValueError("When running on AzureML, the 'azure_dataset_id' property must be set.")
|
||||
model_config_overrides = str(self.model_config.overrides)
|
||||
# PyJWT prints out warnings that are beyond our control
|
||||
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||
if isinstance(self.model_config, DeepLearningConfig) and not self.azure_dataset_id:
|
||||
raise ValueError("When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
|
||||
"property must be set.")
|
||||
hyperdrive_func = lambda run_config: self.model_config.get_hyperdrive_config(run_config) # type: ignore
|
||||
source_config = SourceConfig(
|
||||
root_folder=self.project_root,
|
||||
entry_script=Path(sys.argv[0]).resolve(),
|
||||
conda_dependencies_files=get_all_environment_files(self.project_root),
|
||||
hyperdrive_config_func=lambda run_config: self.model_config.get_hyperdrive_config(run_config),
|
||||
# For large jobs, upload of results times out frequently because of large checkpoint files. Default is 600
|
||||
hyperdrive_config_func=hyperdrive_func,
|
||||
# For large jobs, upload of results can time out because of large checkpoint files. Default is 600
|
||||
upload_timeout_seconds=86400,
|
||||
)
|
||||
source_config.set_script_params_except_submit_flag()
|
||||
assert self.model_config.azure_dataset_id is not None # to stop mypy complaining about next line
|
||||
azure_run = submit_to_azureml(self.azure_config, source_config, model_config_overrides,
|
||||
self.model_config.azure_dataset_id)
|
||||
azure_run = submit_to_azureml(self.azure_config, source_config, self.azure_dataset_id)
|
||||
logging.info("Job submission to AzureML done.")
|
||||
if self.azure_config.pytest_mark:
|
||||
if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
|
||||
# The AzureML job can optionally run pytest. Attempt to download it to the current directory.
|
||||
# A build step will pick up that file and publish it to Azure DevOps.
|
||||
# If pytest_mark is set, this file must exist.
|
||||
|
@ -246,21 +285,17 @@ class Runner:
|
|||
# build itself, but not the tons of debug information that AzureML submissions create.
|
||||
logging_to_stdout(self.azure_config.log_level)
|
||||
suppress_logging_noise()
|
||||
error_messages = []
|
||||
# For the PR build in AzureML, we can either pytest, or the training of the simple PR model. Running both
|
||||
# only works when using DDP_spawn, but that has as a side-effect that it messes up memory consumption of the
|
||||
# large models.
|
||||
if self.azure_config.pytest_mark:
|
||||
try:
|
||||
outputs_folder = Path.cwd() / fixed_paths.DEFAULT_AML_UPLOAD_DIR
|
||||
pytest_passed, results_file_path = run_pytest(self.azure_config.pytest_mark, outputs_folder)
|
||||
if not pytest_passed:
|
||||
pytest_failures = f"Not all PyTest tests passed. See {results_file_path}"
|
||||
logging.error(pytest_failures)
|
||||
error_messages.append(pytest_failures)
|
||||
except Exception as ex:
|
||||
print_exception(ex, "Unable to run PyTest.")
|
||||
error_messages.append(f"Unable to run PyTest: {ex}")
|
||||
outputs_folder = Path.cwd() / fixed_paths.DEFAULT_AML_UPLOAD_DIR
|
||||
pytest_passed, results_file_path = run_pytest(self.azure_config.pytest_mark, outputs_folder)
|
||||
if not pytest_passed:
|
||||
# Terminate if pytest has failed. This makes the smoke test in
|
||||
# PR builds fail if pytest fails.
|
||||
pytest_failures = f"Not all PyTest tests passed. See {results_file_path}"
|
||||
raise ValueError(pytest_failures)
|
||||
else:
|
||||
# Set environment variables for multi-node training if needed.
|
||||
# In particular, the multi-node environment variables should NOT be set in single node
|
||||
|
@ -268,20 +303,14 @@ class Runner:
|
|||
# (https://github.com/microsoft/InnerEye-DeepLearning/issues/395)
|
||||
if self.azure_config.num_nodes > 1:
|
||||
set_environment_variables_for_multi_node()
|
||||
logging.info("Creating the output folder structure.")
|
||||
ml_runner = self.create_ml_runner()
|
||||
ml_runner.setup()
|
||||
ml_runner.start_logging_to_file()
|
||||
try:
|
||||
logging_to_file(self.model_config.logs_folder / LOG_FILE_NAME)
|
||||
try:
|
||||
self.create_ml_runner().run()
|
||||
except Exception as ex:
|
||||
print_exception(ex, "Model training/testing failed.")
|
||||
error_messages.append(f"Training failed: {ex}")
|
||||
ml_runner.run()
|
||||
finally:
|
||||
disable_logging_to_file()
|
||||
# Terminate if pytest or model training has failed. This makes the smoke test in
|
||||
# PR builds fail if pytest fails.
|
||||
if error_messages:
|
||||
raise ValueError(
|
||||
f"At least one component of the runner failed: {os.linesep} {os.linesep.join(error_messages)}")
|
||||
|
||||
def create_ml_runner(self) -> Any:
|
||||
"""
|
||||
|
@ -289,10 +318,11 @@ class Runner:
|
|||
"""
|
||||
# This import statement cannot be at the beginning of the file because it will cause import
|
||||
# of packages that are not available inside the azure_runner.yml environment, in particular pytorch.
|
||||
# That is also why we specify the return type as Any rather than MLRunner.
|
||||
# That is also why we specify the return type is Any rather than MLRunner.
|
||||
from InnerEye.ML.run_ml import MLRunner
|
||||
return MLRunner(
|
||||
model_config=self.model_config,
|
||||
container=self.lightning_container,
|
||||
azure_config=self.azure_config,
|
||||
project_root=self.project_root,
|
||||
post_cross_validation_hook=self.post_cross_validation_hook,
|
||||
|
@ -319,17 +349,15 @@ def default_post_cross_validation_hook(config: ModelConfigBase, root_folder: Pat
|
|||
def run(project_root: Path,
|
||||
yaml_config_file: Path,
|
||||
post_cross_validation_hook: Optional[PostCrossValidationHookSignature] = None,
|
||||
model_deployment_hook: Optional[ModelDeploymentHookSignature] = None,
|
||||
command_line_args: Optional[List[str]] = None) -> \
|
||||
Tuple[ModelConfigBase, Optional[Run]]:
|
||||
model_deployment_hook: Optional[ModelDeploymentHookSignature] = None) -> \
|
||||
Tuple[Optional[DeepLearningConfig], Optional[Run]]:
|
||||
"""
|
||||
The main entry point for training and testing models from the commandline. This chooses a model to train
|
||||
via a commandline argument, runs training or testing, and writes all required info to disk and logs.
|
||||
:return: If submitting to AzureML, returns the model configuration that was used for training,
|
||||
including commandline overrides applied (if any). For details on the arguments, see the constructor of Runner.
|
||||
"""
|
||||
runner = Runner(project_root, yaml_config_file, post_cross_validation_hook,
|
||||
model_deployment_hook, command_line_args)
|
||||
runner = Runner(project_root, yaml_config_file, post_cross_validation_hook, model_deployment_hook)
|
||||
return runner.run()
|
||||
|
||||
|
||||
|
|
|
@ -17,7 +17,7 @@ from InnerEye.ML.common import ModelExecutionMode
|
|||
from InnerEye.ML.config import SegmentationModelBase
|
||||
from InnerEye.ML.plotting import segmentation_and_groundtruth_plot, surface_distance_ground_truth_plot
|
||||
from InnerEye.ML.utils import surface_distance_utils as sd_util
|
||||
from InnerEye.ML.utils.config_util import ModelConfigLoader
|
||||
from InnerEye.ML.utils.config_loader import ModelConfigLoader
|
||||
from InnerEye.ML.utils.csv_util import get_worst_performing_outliers, load_csv
|
||||
from InnerEye.ML.utils.image_util import multi_label_array_to_binary
|
||||
from InnerEye.ML.utils.io_util import load_nifti_image
|
||||
|
@ -105,10 +105,8 @@ def main() -> None:
|
|||
if config_model is None:
|
||||
raise ValueError("The name of the model to train must be given in the --model argument.")
|
||||
|
||||
model_config = ModelConfigLoader[SegmentationModelBase]().create_model_config_from_name(
|
||||
config_model,
|
||||
overrides=parser_result.overrides
|
||||
)
|
||||
model_config = ModelConfigLoader().create_model_config_from_name(config_model)
|
||||
model_config.apply_overrides(parser_result.overrides, should_validate=True)
|
||||
execution_mode = surface_distance_config.execution_mode
|
||||
|
||||
run_mode = surface_distance_config.run_mode
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
import logging
|
||||
import os
|
||||
import uuid
|
||||
from builtins import property
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
@ -15,7 +16,8 @@ from azureml.core import Run
|
|||
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.ML.deep_learning_config import DeepLearningConfig, WEIGHTS_FILE
|
||||
from InnerEye.ML.deep_learning_config import OutputParams, WEIGHTS_FILE
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
from InnerEye.ML.utils.run_recovery import RunRecovery
|
||||
|
||||
|
||||
|
@ -25,22 +27,29 @@ class CheckpointHandler:
|
|||
azure config and model config.
|
||||
"""
|
||||
|
||||
def __init__(self, model_config: DeepLearningConfig, azure_config: AzureConfig,
|
||||
def __init__(self, container: LightningContainer, azure_config: AzureConfig,
|
||||
project_root: Path, run_context: Optional[Run] = None):
|
||||
self.azure_config = azure_config
|
||||
self.model_config = model_config
|
||||
self.container = container
|
||||
self.run_recovery: Optional[RunRecovery] = None
|
||||
self.project_root = project_root
|
||||
self.run_context = run_context
|
||||
self.local_weights_path: Optional[Path] = None
|
||||
self.has_continued_training = False
|
||||
|
||||
@property
|
||||
def output_params(self) -> OutputParams:
|
||||
"""
|
||||
Gets the part of the configuration that is responsible for output paths.
|
||||
"""
|
||||
return self.container
|
||||
|
||||
def download_checkpoints_from_hyperdrive_child_runs(self, hyperdrive_parent_run: Run) -> None:
|
||||
"""
|
||||
Downloads the best checkpoints from all child runs of a Hyperdrive parent runs. This is used to gather results
|
||||
for ensemble creation.
|
||||
"""
|
||||
self.run_recovery = RunRecovery.download_best_checkpoints_from_child_runs(self.model_config,
|
||||
self.run_recovery = RunRecovery.download_best_checkpoints_from_child_runs(self.output_params,
|
||||
hyperdrive_parent_run)
|
||||
# Check paths are good, just in case
|
||||
for path in self.run_recovery.checkpoints_roots:
|
||||
|
@ -55,11 +64,11 @@ class CheckpointHandler:
|
|||
"""
|
||||
if self.azure_config.run_recovery_id:
|
||||
run_to_recover = self.azure_config.fetch_run(self.azure_config.run_recovery_id.strip())
|
||||
self.run_recovery = RunRecovery.download_all_checkpoints_from_run(self.model_config, run_to_recover)
|
||||
self.run_recovery = RunRecovery.download_all_checkpoints_from_run(self.output_params, run_to_recover)
|
||||
else:
|
||||
self.run_recovery = None
|
||||
|
||||
if self.model_config.weights_url or self.model_config.local_weights_path:
|
||||
if self.container.weights_url or self.container.local_weights_path:
|
||||
self.local_weights_path = self.get_and_save_modified_weights()
|
||||
|
||||
def additional_training_done(self) -> None:
|
||||
|
@ -74,11 +83,11 @@ class CheckpointHandler:
|
|||
checkpoint from there, otherwise use the checkpoints from the current run.
|
||||
:return: Constructed checkpoint path to recover from.
|
||||
"""
|
||||
|
||||
if self.model_config.start_epoch > 0 and not self.run_recovery:
|
||||
start_epoch = self.container.start_epoch
|
||||
if start_epoch > 0 and not self.run_recovery:
|
||||
raise ValueError("Start epoch is > 0, but no run recovery object has been provided to resume training.")
|
||||
|
||||
if self.run_recovery and self.model_config.start_epoch == 0:
|
||||
if self.run_recovery and start_epoch == 0:
|
||||
raise ValueError("Run recovery set, but start epoch is 0. Please provide start epoch > 0 (for which a "
|
||||
"checkpoint was saved in the previous run) to resume training from that run.")
|
||||
|
||||
|
@ -88,7 +97,7 @@ class CheckpointHandler:
|
|||
checkpoints = self.run_recovery.get_recovery_checkpoint_paths()
|
||||
if len(checkpoints) > 1:
|
||||
raise ValueError(f"Recovering training of ensemble runs is not supported. Found more than one "
|
||||
f"checkpoint for epoch {self.model_config.start_epoch}")
|
||||
f"checkpoint for epoch {start_epoch}")
|
||||
return checkpoints[0]
|
||||
elif self.local_weights_path:
|
||||
return self.local_weights_path
|
||||
|
@ -129,7 +138,7 @@ class CheckpointHandler:
|
|||
if self.has_continued_training:
|
||||
# Checkpoint is from the current run, whether a new run or a run recovery which has been doing more
|
||||
# training, so we look for it there.
|
||||
checkpoint_from_current_run = self.model_config.get_path_to_best_checkpoint()
|
||||
checkpoint_from_current_run = self.output_params.get_path_to_best_checkpoint()
|
||||
if checkpoint_from_current_run.is_file():
|
||||
logging.info("Using checkpoints from current run.")
|
||||
checkpoint_paths = [checkpoint_from_current_run]
|
||||
|
@ -172,7 +181,7 @@ class CheckpointHandler:
|
|||
target_folder = self.project_root / fixed_paths.MODEL_WEIGHTS_DIR_NAME
|
||||
target_folder.mkdir(exist_ok=True)
|
||||
|
||||
url = self.model_config.weights_url
|
||||
url = self.container.weights_url
|
||||
|
||||
# assign the same filename as in the download url if possible, so that we can check for duplicates
|
||||
# If that fails, map to a random uuid
|
||||
|
@ -198,9 +207,9 @@ class CheckpointHandler:
|
|||
"""
|
||||
Get the path to the local weights to use or download them and set local_weights_path
|
||||
"""
|
||||
if self.model_config.local_weights_path:
|
||||
weights_path = self.model_config.local_weights_path
|
||||
elif self.model_config.weights_url:
|
||||
if self.container.local_weights_path:
|
||||
weights_path = self.container.local_weights_path
|
||||
elif self.container.weights_url:
|
||||
weights_path = self.download_weights()
|
||||
else:
|
||||
raise ValueError("Cannot download/modify weights - neither local_weights_path nor weights_url is set in"
|
||||
|
@ -219,8 +228,8 @@ class CheckpointHandler:
|
|||
if not weights_path or not weights_path.is_file():
|
||||
raise FileNotFoundError(f"Could not find the weights file at {weights_path}")
|
||||
|
||||
modified_weights = self.model_config.load_checkpoint_and_modify(weights_path)
|
||||
target_file = self.model_config.outputs_folder / WEIGHTS_FILE
|
||||
modified_weights = self.container.load_checkpoint_and_modify(weights_path)
|
||||
target_file = self.output_params.outputs_folder / WEIGHTS_FILE
|
||||
torch.save(modified_weights, target_file)
|
||||
return target_file
|
||||
|
||||
|
@ -228,4 +237,4 @@ class CheckpointHandler:
|
|||
"""
|
||||
Returns true if the optimizer should be loaded from checkpoint. Looks at the model config to determine this.
|
||||
"""
|
||||
return self.model_config.start_epoch > 0
|
||||
return self.container.start_epoch > 0
|
||||
|
|
|
@ -5,21 +5,19 @@
|
|||
import importlib
|
||||
import inspect
|
||||
import logging
|
||||
from importlib._bootstrap import ModuleSpec
|
||||
from importlib.util import find_spec
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Generic, List, Optional, TypeVar
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
import param
|
||||
from importlib._bootstrap import ModuleSpec
|
||||
|
||||
from InnerEye.Common.common_util import path_to_namespace
|
||||
from InnerEye.Common.generic_parsing import GenericConfig
|
||||
from InnerEye.ML.model_config_base import ModelConfigBase
|
||||
|
||||
C = TypeVar('C', bound=ModelConfigBase)
|
||||
from InnerEye.ML.deep_learning_config import DeepLearningConfig
|
||||
|
||||
|
||||
class ModelConfigLoader(GenericConfig, Generic[C]):
|
||||
class ModelConfigLoader(GenericConfig):
|
||||
"""
|
||||
Helper class to manage model config loading
|
||||
"""
|
||||
|
@ -43,20 +41,21 @@ class ModelConfigLoader(GenericConfig, Generic[C]):
|
|||
from InnerEye.ML import configs
|
||||
return configs.__name__
|
||||
|
||||
def create_model_config_from_name(self, model_name: str, overrides: Optional[Dict[str, Any]] = None) -> C:
|
||||
def create_model_config_from_name(self, model_name: str) -> DeepLearningConfig:
|
||||
"""
|
||||
Returns a segmentation or classification model configuration for a model of the given name.
|
||||
Returns a model configuration for a model of the given name. This can be either a segmentation or
|
||||
classification configuration for an InnerEye built-in model, or a LightningContainer.
|
||||
To avoid having to import torch here, there are no references to LightningContainer.
|
||||
Searching for a class member called <model_name> in the search modules provided recursively.
|
||||
|
||||
:param model_name: Name of the model for which to get the configs for.
|
||||
:param overrides: Model properties to override.
|
||||
"""
|
||||
if not model_name:
|
||||
raise ValueError("Unable to load a model configuration because the model name is missing.")
|
||||
|
||||
configs: Dict[str, C] = {}
|
||||
configs: Dict[str, DeepLearningConfig] = {}
|
||||
|
||||
def _get_model_config(module_spec: ModuleSpec) -> Optional[C]:
|
||||
def _get_model_config(module_spec: ModuleSpec) -> Optional[DeepLearningConfig]:
|
||||
"""
|
||||
Given a module specification check to see if it has a class property with
|
||||
the <model_name> provided, and instantiate that config class with the
|
||||
|
@ -66,6 +65,7 @@ class ModelConfigLoader(GenericConfig, Generic[C]):
|
|||
"""
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
logging.debug(f"Importing {module_spec.name}")
|
||||
target_module = importlib.import_module(module_spec.name)
|
||||
# The "if" clause checks that obj is a class, of the desired name, that is
|
||||
# defined in this module rather than being imported into it (and hence potentially
|
||||
|
@ -74,7 +74,7 @@ class ModelConfigLoader(GenericConfig, Generic[C]):
|
|||
if inspect.isclass(obj)
|
||||
and name == model_name
|
||||
and inspect.getmodule(obj) == target_module)
|
||||
logging.info(f"Found class {_class.name} in file {module_spec.origin}")
|
||||
logging.info(f"Found class {_class} in file {module_spec.origin}")
|
||||
# ignore the exception which will occur if the provided module cannot be loaded
|
||||
# or the loaded module does not have the required class as a member
|
||||
except Exception as e:
|
||||
|
@ -82,13 +82,7 @@ class ModelConfigLoader(GenericConfig, Generic[C]):
|
|||
if exception_text != "":
|
||||
logging.warning(f"(from attempt to import module {module_spec.name}): {exception_text}")
|
||||
return None
|
||||
model_config: ModelConfigBase = _class()
|
||||
|
||||
# apply the overrides to the model
|
||||
if overrides is not None:
|
||||
model_config.apply_overrides(overrides)
|
||||
# The parameters have presumably changed, so we need to re-validate.
|
||||
model_config.validate()
|
||||
model_config: DeepLearningConfig = _class()
|
||||
return model_config
|
||||
|
||||
def _search_recursively_and_store(module_search_spec: ModuleSpec) -> None:
|
|
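A short usage sketch of the loader (namespace and model name are hypothetical; model_configs_namespace is the same override that the tests in this change pass on the commandline):

from InnerEye.ML.utils.config_loader import ModelConfigLoader

# Search the default InnerEye.ML.configs package plus a custom namespace for a class whose
# name matches the requested model, and instantiate it with its default parameters.
loader = ModelConfigLoader(model_configs_namespace="MyProject.configs")
config_or_container = loader.create_model_config_from_name("MyModel")
# The result is either a built-in DeepLearningConfig subclass or a LightningContainer;
# commandline/YAML overrides are applied by the Runner afterwards, not by the loader.
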
@ -261,7 +261,7 @@ def load_dicom_image(path: PathOrString) -> np.ndarray:
|
|||
"""
|
||||
ds = dicom.dcmread(path)
|
||||
pixels = ds.pixel_array
|
||||
bits_stored = ds.BitsStored
|
||||
bits_stored = int(ds.BitsStored) # type: ignore
|
||||
if ds.PhotometricInterpretation == PhotometricInterpretation.MONOCHROME1.value:
|
||||
pixel_repr = ds.PixelRepresentation
|
||||
if pixel_repr == 0: # unsigned
|
||||
|
|
|
@ -9,7 +9,7 @@ from typing import Dict, List
|
|||
from torch.optim.lr_scheduler import CosineAnnealingLR, ExponentialLR, LambdaLR, MultiStepLR, StepLR, _LRScheduler
|
||||
from torch.optim.optimizer import Optimizer
|
||||
|
||||
from InnerEye.ML.deep_learning_config import DeepLearningConfig, LRSchedulerType, LRWarmUpType
|
||||
from InnerEye.ML.deep_learning_config import LRSchedulerType, LRWarmUpType, OptimizerParams
|
||||
|
||||
|
||||
def get_current_learning_rates(optimizer: Optimizer) -> List[float]:
|
||||
|
@ -23,6 +23,7 @@ class LinearWarmUp(_LRScheduler):
|
|||
"""
|
||||
Implements linear warmup up to a given initial learning rate.
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer: Optimizer, warmup_epochs: int, final_lr: float, last_epoch: int = -1):
|
||||
if warmup_epochs < 0:
|
||||
raise ValueError("The number of warmup epochs must be >= 0.")
|
||||
|
@ -60,9 +61,10 @@ class SchedulerWithWarmUp(_LRScheduler):
|
|||
of the normal schedulers.
|
||||
"""
|
||||
|
||||
def __init__(self, args: DeepLearningConfig, optimizer: Optimizer, last_epoch: int = -1):
|
||||
def __init__(self, args: OptimizerParams, optimizer: Optimizer, num_epochs: int, last_epoch: int = -1):
|
||||
self.optimizer = optimizer
|
||||
self.last_epoch = last_epoch
|
||||
self.num_epochs = num_epochs
|
||||
self.warmup_epochs = 0 if args.l_rate_warmup == LRWarmUpType.NoWarmUp else args.l_rate_warmup_epochs
|
||||
self._scheduler = self.get_scheduler(args)
|
||||
# This must be called after self.get_scheduler, because we want the optimizer to have the learning rate
|
||||
|
@ -75,12 +77,12 @@ class SchedulerWithWarmUp(_LRScheduler):
|
|||
self.min_l_rate = args.min_l_rate
|
||||
super().__init__(optimizer, last_epoch)
|
||||
|
||||
def get_scheduler(self, args: DeepLearningConfig) -> _LRScheduler:
|
||||
def get_scheduler(self, args: OptimizerParams) -> _LRScheduler:
|
||||
"""
|
||||
Create the LR scheduler that will be used after warmup, based on the config params.
|
||||
"""
|
||||
scheduler: _LRScheduler
|
||||
epochs_after_warmup = args.num_epochs - self.warmup_epochs
|
||||
epochs_after_warmup = self.num_epochs - self.warmup_epochs
|
||||
if args.l_rate_scheduler == LRSchedulerType.Exponential:
|
||||
scheduler = ExponentialLR(optimizer=self.optimizer,
|
||||
gamma=args.l_rate_exponential_gamma,
|
||||
|
|
|
@ -125,7 +125,7 @@ def get_number_of_voxels_per_class(labels: torch.Tensor) -> torch.Tensor:
|
|||
if len(labels.shape) == 4:
|
||||
labels = labels[None, ...]
|
||||
|
||||
return torch.tensor(np.count_nonzero(labels.cpu().numpy(), axis=(2, 3, 4)))
|
||||
return torch.count_nonzero(labels, dim=(2, 3, 4))
|
||||
|
||||
|
||||
def get_label_overlap_stats(labels: np.ndarray, label_names: List[str]) -> Dict[str, int]:
|
||||
|
|
|
@ -19,7 +19,7 @@ from InnerEye.ML.config import ModelArchitectureConfig, PaddingMode, Segmentatio
|
|||
basic_size_shrinkage
|
||||
from InnerEye.ML.dataset.scalar_sample import ScalarItem
|
||||
from InnerEye.ML.dataset.sequence_sample import ClassificationItemSequence
|
||||
from InnerEye.ML.deep_learning_config import DeepLearningConfig, OptimizerType
|
||||
from InnerEye.ML.deep_learning_config import OptimizerParams, OptimizerType
|
||||
from InnerEye.ML.model_config_base import ModelConfigBase
|
||||
from InnerEye.ML.models.architectures.base_model import BaseSegmentationModel, CropSizeConstraints
|
||||
from InnerEye.ML.models.architectures.complex import ComplexModel
|
||||
|
@ -38,7 +38,7 @@ from InnerEye.ML.utils.temperature_scaling import ModelWithTemperature
|
|||
from InnerEye.ML.visualizers.model_summary import ModelSummary
|
||||
|
||||
|
||||
def create_optimizer(config: DeepLearningConfig, parameters: Iterator[Parameter]) -> torch.optim.Optimizer:
|
||||
def create_optimizer(config: OptimizerParams, parameters: Iterator[Parameter]) -> torch.optim.Optimizer:
|
||||
# Select optimizer type
|
||||
if config.optimizer_type in [OptimizerType.Adam, OptimizerType.AMSGrad]:
|
||||
return torch.optim.Adam(parameters, config.l_rate,
|
||||
|
|
|
@ -15,7 +15,7 @@ from InnerEye.Azure.azure_util import RUN_CONTEXT, download_outputs_from_run, fe
|
|||
from InnerEye.Common.common_util import OTHER_RUNS_SUBDIR_NAME, check_properties_are_not_none
|
||||
from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, \
|
||||
create_recovery_checkpoint_path, get_best_checkpoint_path
|
||||
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, DeepLearningConfig
|
||||
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, OutputParams
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
|
@ -26,7 +26,7 @@ class RunRecovery:
|
|||
checkpoints_roots: List[Path]
|
||||
|
||||
@staticmethod
|
||||
def download_best_checkpoints_from_child_runs(config: DeepLearningConfig, run: Run) -> RunRecovery:
|
||||
def download_best_checkpoints_from_child_runs(config: OutputParams, run: Run) -> RunRecovery:
|
||||
"""
|
||||
Downloads the best checkpoints from all child runs of the provided Hyperdrive parent run.
|
||||
The checkpoints for the sibling runs will go into folder 'OTHER_RUNS/<cross_validation_split>'
|
||||
|
@ -61,7 +61,7 @@ class RunRecovery:
|
|||
return RunRecovery(checkpoints_roots=child_runs_checkpoints_roots)
|
||||
|
||||
@staticmethod
|
||||
def download_all_checkpoints_from_run(config: DeepLearningConfig, run: Run) -> RunRecovery:
|
||||
def download_all_checkpoints_from_run(config: OutputParams, run: Run) -> RunRecovery:
|
||||
"""
|
||||
Downloads all checkpoints of the provided run: The best checkpoint and the recovery checkpoint.
|
||||
A single folder inside the checkpoints folder will be created that contains the downloaded checkpoints.
|
||||
|
|
|
@ -1,51 +0,0 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, List
|
||||
|
||||
from InnerEye.Common.type_annotations import DictStrFloat
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ModelTrainingResults:
|
||||
"""
|
||||
Stores the results from training, with the results on training and validation data for each training epoch.
|
||||
"""
|
||||
train_results_per_epoch: List[DictStrFloat]
|
||||
val_results_per_epoch: List[DictStrFloat]
|
||||
train_diagnostics: Any
|
||||
val_diagnostics: Any
|
||||
optimal_temperature_scale_values_per_checkpoint_epoch: List[float] = field(default_factory=list)
|
||||
|
||||
def get_metric(self, is_training: bool, metric_type: str) -> List[float]:
|
||||
"""
|
||||
Gets a scalar metric out of either the list of training or the list of validation results. This returns
|
||||
that value that a specific metric attains in all of the epochs.
|
||||
:param is_training: If True, read metrics from the `train_results_per_epoch` field, if False read from the
|
||||
`val_results_per_epoch` field.
|
||||
:param metric_type: The metric to extract.
|
||||
:return: A list of floating point numbers, with one entry per entry in the the training or validation results.
|
||||
"""
|
||||
metrics = self.train_results_per_epoch if is_training else self.val_results_per_epoch
|
||||
return [m[metric_type] for m in metrics]
|
||||
|
||||
def get_training_metric(self, metric_type: str) -> List[float]:
|
||||
"""
|
||||
Gets a scalar metric from the list of training results. This returns
|
||||
the value that a specific metric attains in all of the epochs.
|
||||
:param metric_type: The metric to extract.
|
||||
:return: A list of floating point numbers, with one entry per entry in the the training results.
|
||||
"""
|
||||
return self.get_metric(is_training=True, metric_type=metric_type)
|
||||
|
||||
def get_validation_metric(self, metric_type: str) -> List[float]:
|
||||
"""
|
||||
Gets a scalar metric from the list of validation results. This returns
|
||||
the value that a specific metric attains in all of the epochs.
|
||||
:param metric_type: The metric to extract.
|
||||
:return: A list of floating point numbers, with one entry per entry in the the validation results.
|
||||
"""
|
||||
return self.get_metric(is_training=False, metric_type=metric_type)
|
|
@ -16,8 +16,7 @@ from InnerEye.ML.dataset.cropping_dataset import CroppingDataset
|
|||
from InnerEye.ML.dataset.full_image_dataset import FullImageDataset
|
||||
from InnerEye.ML.dataset.sample import Sample
|
||||
from InnerEye.ML.plotting import resize_and_save, scan_with_transparent_overlay
|
||||
from InnerEye.ML.utils import augmentation, io_util, ml_util
|
||||
from InnerEye.ML.utils.config_util import ModelConfigLoader
|
||||
from InnerEye.ML.utils import augmentation, io_util
|
||||
# The name of the folder inside the default outputs folder that will holds plots that show the effect of
|
||||
# sampling random patches
|
||||
from InnerEye.ML.utils.image_util import get_unit_image_header
|
||||
|
@ -113,7 +112,7 @@ def visualize_random_crops_for_dataset(config: SegmentationModelBase, output_fol
|
|||
for training. Visualizations are stored in both Nifti format, and as 3 PNG thumbnail files, in the output folder.
|
||||
:param config: The model configuration.
|
||||
:param output_folder: The folder in which the visualizations should be written. If not provided, use a subfolder
|
||||
"patch_sampling" in the models's default output folder
|
||||
"patch_sampling" in the model's default output folder
|
||||
"""
|
||||
dataset_splits = config.get_dataset_splits()
|
||||
# Load a sample using the full image data loader
|
||||
|
@ -123,24 +122,3 @@ def visualize_random_crops_for_dataset(config: SegmentationModelBase, output_fol
|
|||
for sample_index in range(count):
|
||||
sample = full_image_dataset.get_samples_at_index(index=sample_index)[0]
|
||||
visualize_random_crops(sample, config, output_folder=output_folder)
|
||||
|
||||
|
||||
def main(args: CheckPatchSamplingConfig) -> None:
|
||||
# Identify paths to inputs and outputs
|
||||
commandline_args = {
|
||||
"train_batch_size": 1,
|
||||
"local_dataset": Path(args.local_dataset)
|
||||
}
|
||||
output_folder = Path(args.output_folder)
|
||||
output_folder.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Create a config file
|
||||
config = ModelConfigLoader[SegmentationModelBase]().create_model_config_from_name(
|
||||
args.model_name, overrides=commandline_args)
|
||||
config.show_patch_sampling = args.number_samples
|
||||
ml_util.set_random_seed(config.random_seed)
|
||||
visualize_random_crops_for_dataset(config, output_folder=output_folder)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(CheckPatchSamplingConfig.parse_args())
|
||||
|
|
|
@ -11,6 +11,7 @@ On the modelling side, this toolbox supports
|
|||
- Segmentation models
|
||||
- Classification and regression models
|
||||
- Sequence models
|
||||
- Adding cloud support to any PyTorch Lightning model, via a [bring-your-own-model setup](docs/bring_your_own_model.md)
|
||||
|
||||
Classification, regression, and sequence models can be built with only images as inputs, or a combination of images
|
||||
and non-imaging data as input. This supports typical use cases on medical data where measurements, biomarkers,
|
||||
|
|
|
@ -41,7 +41,7 @@ from InnerEye.Scripts import submit_for_inference
|
|||
from Tests.ML.util import assert_nifti_content, get_default_azure_config, get_nifti_shape
|
||||
|
||||
FALLBACK_ENSEMBLE_RUN = "refs_pull_432_merge:HD_3af84e4a-0043-4260-8be2-04ce9ab09b1f"
|
||||
FALLBACK_SINGLE_RUN = "refs_pull_407_merge_1614271518_cdbeb28e"
|
||||
FALLBACK_SINGLE_RUN = "refs_pull_407_merge:refs_pull_407_merge_1614271518_cdbeb28e"
|
||||
FALLBACK_2NODE_RUN = "refs_pull_385_merge:refs_pull_385_merge_1612421371_ba12a007"
|
||||
FALLBACK_CV_GLAUCOMA = "refs_pull_432_merge_1618332810_b5d10d74"
|
||||
|
||||
|
|
|
@ -1,68 +0,0 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import pytest
|
||||
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY
|
||||
from InnerEye.Common.build_config import BUILDINFORMATION_JSON, ExperimentResultLocation, \
|
||||
build_information_to_dot_net_json, build_information_to_dot_net_json_file
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.ML.config import SegmentationModelBase
|
||||
from InnerEye.ML.scalar_config import ScalarModelBase
|
||||
|
||||
|
||||
def test_build_config(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Test that json with build information is created correctly.
|
||||
"""
|
||||
config = AzureConfig(
|
||||
build_number=42,
|
||||
build_user="user",
|
||||
build_branch="branch",
|
||||
build_source_id="00deadbeef",
|
||||
build_source_author="author",
|
||||
tag="tag",
|
||||
model="model")
|
||||
result_location = ExperimentResultLocation(azure_job_name="job")
|
||||
net_json = build_information_to_dot_net_json(config, result_location)
|
||||
expected = '{"BuildNumber": 42, "BuildRequestedFor": "user", "BuildSourceBranchName": "branch", ' \
|
||||
'"BuildSourceVersion": "00deadbeef", "BuildSourceAuthor": "author", "ModelName": "model", ' \
|
||||
'"ResultsContainerName": null, "ResultsUri": null, "DatasetFolder": null, "DatasetFolderUri": null, ' \
|
||||
'"AzureBatchJobName": "job"}'
|
||||
assert expected == net_json
|
||||
result_folder = test_output_dirs.root_dir / "buildinfo"
|
||||
build_information_to_dot_net_json_file(config, result_location, folder=result_folder)
|
||||
result_file = result_folder / BUILDINFORMATION_JSON
|
||||
assert result_file.exists()
|
||||
assert result_file.read_text() == expected
|
||||
|
||||
|
||||
def test_fields_are_set() -> None:
|
||||
"""
|
||||
Tests that expected fields are set when creating config classes.
|
||||
"""
|
||||
expected = [("hello", None), ("world", None)]
|
||||
config = SegmentationModelBase(
|
||||
should_validate=False,
|
||||
ground_truth_ids=[x[0] for x in expected],
|
||||
largest_connected_component_foreground_classes=expected
|
||||
)
|
||||
assert hasattr(config, CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY)
|
||||
assert config.largest_connected_component_foreground_classes == expected
|
||||
|
||||
|
||||
@pytest.mark.cpu_and_gpu
|
||||
def test_dataset_reader_workers() -> None:
|
||||
"""
|
||||
Test to make sure the number of dataset reader workers are set correctly
|
||||
"""
|
||||
config = ScalarModelBase(
|
||||
should_validate=False,
|
||||
num_dataset_reader_workers=-1
|
||||
)
|
||||
if config.is_offline_run:
|
||||
assert config.num_dataset_reader_workers == -1
|
||||
else:
|
||||
assert config.num_dataset_reader_workers == 0
|
|
@ -2,6 +2,7 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
|
@ -12,62 +13,71 @@ from InnerEye.Common.fixed_paths import DEFAULT_AML_UPLOAD_DIR, DEFAULT_LOGS_DIR
|
|||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.ML.config import PhotometricNormalizationMethod, SegmentationModelBase
|
||||
from InnerEye.ML.runner import Runner
|
||||
from Tests.ML.configs.DummyModel import DummyModel
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_default_namespace", [True, False])
|
||||
@pytest.mark.parametrize("is_container", [True, False])
|
||||
@pytest.mark.parametrize("is_offline_run", [True, False])
|
||||
@pytest.mark.parametrize("set_output_to", [True, False])
|
||||
def test_create_ml_runner_args(is_default_namespace: bool,
|
||||
def test_create_ml_runner_args(is_container: bool,
|
||||
test_output_dirs: OutputFolderForTests,
|
||||
is_offline_run: bool,
|
||||
set_output_to: bool) -> None:
|
||||
"""Test round trip parsing of commandline arguments:
|
||||
From arguments to the Azure runner to the arguments of the ML runner, checking that
|
||||
whatever is passed on can be correctly parsed."""
|
||||
whatever is passed on can be correctly parsed. It also checks that the output files go into the right place
|
||||
in local runs and in AzureML."""
|
||||
logging_to_stdout()
|
||||
model_name = "Lung"
|
||||
model_name = "DummyContainerWithPlainLightning" if is_container else "DummyModel"
|
||||
if is_container:
|
||||
dataset_folder = Path("download")
|
||||
else:
|
||||
local_dataset = DummyModel().local_dataset
|
||||
assert local_dataset is not None
|
||||
dataset_folder = local_dataset
|
||||
outputs_folder = test_output_dirs.root_dir
|
||||
project_root = fixed_paths.repository_root_directory()
|
||||
if is_default_namespace:
|
||||
model_configs_namespace = None
|
||||
else:
|
||||
model_configs_namespace = "Tests.ML.configs"
|
||||
model_name = "DummyModel"
|
||||
model_configs_namespace = "Tests.ML.configs"
|
||||
|
||||
args_list = [f"--model={model_name}", "--train=True", "--l_rate=100.0",
|
||||
"--norm_method=Simple Norm", "--subscription_id", "Test1", "--tenant_id=Test2",
|
||||
"--application_id", "Test3", "--azureml_datastore", "Test5",
|
||||
"--pytest_mark", "gpu"]
|
||||
"--subscription_id", "Test1", "--tenant_id=Test2",
|
||||
"--application_id", "Test3", "--azureml_datastore", "Test5"]
|
||||
|
||||
# toggle the output_to flag off only for online runs
|
||||
if set_output_to or is_offline_run:
|
||||
args_list.append(f"--output_to={outputs_folder}")
|
||||
if not is_container:
|
||||
args_list.append("--norm_method=Simple Norm")
|
||||
|
||||
if not is_default_namespace:
|
||||
args_list.append(f"--model_configs_namespace={model_configs_namespace}")
|
||||
args_list.append(f"--model_configs_namespace={model_configs_namespace}")
|
||||
|
||||
with mock.patch("sys.argv", [""] + args_list):
|
||||
with mock.patch("InnerEye.ML.deep_learning_config.is_offline_run_context", return_value=is_offline_run):
|
||||
runner = Runner(project_root=project_root, yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)
|
||||
runner.parse_and_load_model()
|
||||
azure_config = runner.azure_config
|
||||
model_config = runner.model_config
|
||||
with mock.patch("InnerEye.ML.run_ml.MLRunner.run", return_value=None):
|
||||
with mock.patch("InnerEye.ML.run_ml.MLRunner.mount_or_download_dataset", return_value=dataset_folder):
|
||||
runner = Runner(project_root=project_root, yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)
|
||||
runner.parse_and_load_model()
|
||||
# Only when calling config.create_filesystem we expect to see the correct paths, and this happens
|
||||
# inside run_in_situ
|
||||
runner.run_in_situ()
|
||||
azure_config = runner.azure_config
|
||||
container_or_legacy_config = runner.lightning_container if is_container else runner.model_config
|
||||
assert azure_config.model == model_name
|
||||
assert model_config.l_rate == 100.0
|
||||
assert model_config.norm_method == PhotometricNormalizationMethod.SimpleNorm
|
||||
if not is_container:
|
||||
assert container_or_legacy_config.norm_method == PhotometricNormalizationMethod.SimpleNorm
|
||||
if set_output_to or is_offline_run:
|
||||
# The actual output folder must be a subfolder of the folder given on the commandline. The folder will contain
|
||||
# a timestamp, that will start with the year number, hence will start with 20...
|
||||
assert str(model_config.outputs_folder).startswith(str(outputs_folder / "20"))
|
||||
assert model_config.logs_folder == (model_config.outputs_folder / DEFAULT_LOGS_DIR_NAME)
|
||||
assert str(container_or_legacy_config.outputs_folder).startswith(str(outputs_folder / "20"))
|
||||
assert container_or_legacy_config.logs_folder == \
|
||||
(container_or_legacy_config.outputs_folder / DEFAULT_LOGS_DIR_NAME)
|
||||
else:
|
||||
# For runs inside AzureML, the output folder is the project root (the root of the folders that are
|
||||
# included in the snapshot). The "outputs_to" argument will be ignored.
|
||||
assert model_config.outputs_folder == (project_root / DEFAULT_AML_UPLOAD_DIR)
|
||||
assert model_config.logs_folder == (project_root / DEFAULT_LOGS_DIR_NAME)
|
||||
assert container_or_legacy_config.outputs_folder == (project_root / DEFAULT_AML_UPLOAD_DIR)
|
||||
assert container_or_legacy_config.logs_folder == (project_root / DEFAULT_LOGS_DIR_NAME)
|
||||
|
||||
assert not hasattr(model_config, "azureml_datastore")
|
||||
assert azure_config.pytest_mark == "gpu"
|
||||
assert not hasattr(container_or_legacy_config, "azureml_datastore")
|
||||
|
||||
|
||||
def test_overridable_properties() -> None:
|
||||
|
@ -146,6 +156,7 @@ def test_parsing_with_custom_yaml(test_output_dirs: OutputFolderForTests) -> Non
|
|||
yaml_config_file=yaml_file)
|
||||
loader_result = runner.parse_and_load_model()
|
||||
assert runner.azure_config is not None
|
||||
assert runner.model_config is not None
|
||||
# This is only present in yaml
|
||||
# This is present in yaml and command line, and the latter should be used.
|
||||
assert runner.azure_config.tenant_id == "bar"
|
||||
|
|
|
@ -8,7 +8,7 @@ from typing import Any, List, Optional, Tuple
|
|||
import param
|
||||
import pytest
|
||||
|
||||
from InnerEye.Common.generic_parsing import GenericConfig, IntTuple
|
||||
from InnerEye.Common.generic_parsing import GenericConfig, IntTuple, create_from_matching_params
|
||||
|
||||
|
||||
class ParamEnum(Enum):
|
||||
|
@ -57,6 +57,7 @@ def test_create_parser() -> None:
|
|||
"""
|
||||
Check that parse_args works as expected, with both non default and default values.
|
||||
"""
|
||||
|
||||
def check(arg: List[str], expected_key: str, expected_value: Any) -> None:
|
||||
parsed = ParamClass.parse_args(arg)
|
||||
assert getattr(parsed, expected_key) == expected_value
|
||||
|
@ -127,3 +128,44 @@ def test_int_tuple_validation(value_idx_0: Any, value_idx_1: Any, value_idx_2: A
|
|||
m.int_tuple = (value_idx_0, value_idx_1, value_idx_2)
|
||||
else:
|
||||
m.int_tuple = (value_idx_0, value_idx_1, value_idx_2)
|
||||
|
||||
|
||||
class ClassFrom(param.Parameterized):
|
||||
foo = param.String("foo")
|
||||
bar = param.Integer(1)
|
||||
baz = param.String("baz")
|
||||
_private = param.String("private")
|
||||
constant = param.String("constant", constant=True)
|
||||
|
||||
|
||||
class ClassTo(param.Parameterized):
|
||||
foo = param.String("foo2")
|
||||
bar = param.Integer(2)
|
||||
_private = param.String("private2")
|
||||
constant = param.String("constant2", constant=True)
|
||||
|
||||
|
||||
class NotParameterized:
|
||||
foo = 1
|
||||
|
||||
|
||||
def test_create_from_matching_params() -> None:
|
||||
"""
|
||||
Test if Parameterized objects can be cloned by looking at matching fields.
|
||||
"""
|
||||
class_from = ClassFrom()
|
||||
class_to = create_from_matching_params(class_from, cls_=ClassTo)
|
||||
assert isinstance(class_to, ClassTo)
|
||||
assert class_to.foo == "foo"
|
||||
assert class_to.bar == 1
|
||||
# Constant fields should not be touched
|
||||
assert class_to.constant == "constant2"
|
||||
# Private fields must be copied over.
|
||||
assert class_to._private == "private"
|
||||
# Baz is only present in the "from" object, and should not be copied to the new object
|
||||
assert not hasattr(class_to, "baz")
|
||||
|
||||
with pytest.raises(ValueError) as ex:
|
||||
create_from_matching_params(class_from, NotParameterized)
|
||||
assert "subclass of param.Parameterized" in str(ex)
|
||||
assert "NotParameterized" in str(ex)
|
||||
|
|
|
@ -2,11 +2,14 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from InnerEye.Common import common_util
|
||||
from InnerEye.Common.common_util import check_is_any_of, is_private_field_name, namespace_to_path, \
|
||||
from InnerEye.Common.common_util import change_working_directory, check_is_any_of, is_private_field_name, \
|
||||
namespace_to_path, \
|
||||
path_to_namespace, print_exception
|
||||
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path, tests_root_directory
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
|
@ -106,3 +109,18 @@ def test_path_to_namespace(is_external: bool, test_output_dirs: OutputFolderForT
|
|||
path=full_ml_test_data_path(),
|
||||
root=tests_root_directory().parent
|
||||
) == test_data.__name__
|
||||
|
||||
|
||||
def test_change_dir(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Test the context manager for changing directories.
|
||||
"""
|
||||
os.chdir(test_output_dirs.root_dir)
|
||||
assert Path.cwd() == test_output_dirs.root_dir
|
||||
new_dir = test_output_dirs.root_dir / "foo"
|
||||
new_dir.mkdir()
|
||||
with change_working_directory(new_dir):
|
||||
assert Path.cwd() == new_dir
|
||||
Path("bar.txt").touch()
|
||||
assert Path.cwd() == test_output_dirs.root_dir
|
||||
assert (new_dir / "bar.txt").is_file()
|
||||
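The context manager exercised by this test is presumably implemented along the following lines (a sketch only, not the actual code in common_util):

import os
from contextlib import contextmanager
from pathlib import Path
from typing import Generator


@contextmanager
def change_working_directory(path: Path) -> Generator[None, None, None]:
    # Remember the current directory, switch to the target, and always switch back on exit.
    old_cwd = Path.cwd()
    os.chdir(str(path))
    try:
        yield
    finally:
        os.chdir(str(old_cwd))
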
|
|
|
@ -0,0 +1,78 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
|
||||
# Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress
|
||||
# individual warnings only.
|
||||
# flake8: noqa
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
from _pytest.monkeypatch import MonkeyPatch
|
||||
from pytorch_lightning import LightningDataModule, LightningModule
|
||||
|
||||
from InnerEye.Common.common_util import add_folder_to_sys_path_if_needed
|
||||
from InnerEye.ML.configs.other.fastmri_varnet import VarNetWithImageLogging
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
|
||||
add_folder_to_sys_path_if_needed("fastMRI")
|
||||
|
||||
from fastmri.data import SliceDataset
|
||||
from fastmri.data.subsample import create_mask_for_mask_type
|
||||
from fastmri.data.transforms import VarNetDataTransform
|
||||
from fastmri.pl_modules import FastMriDataModule
|
||||
# This import can fail if written as "from tests.create_temp_data, even though fastMRI is already in the path.
|
||||
from fastMRI.tests.create_temp_data import create_temp_data
|
||||
|
||||
|
||||
class FastMriRandomData(FastMriDataModule):
|
||||
def __init__(self) -> None:
|
||||
data_path = Path.cwd() / "data"
|
||||
if data_path.is_dir():
|
||||
shutil.rmtree(str(data_path))
|
||||
data_path.mkdir(exist_ok=False, parents=True)
|
||||
_, _, metadata = create_temp_data(data_path)
|
||||
|
||||
def retrieve_metadata_mock(a: Any, fname: Any) -> Any:
|
||||
return metadata[str(fname)]
|
||||
|
||||
# That's a bit flaky, we should be un-doing that after, but there's no obvious place of doing so.
|
||||
MonkeyPatch().setattr(SliceDataset, "_retrieve_metadata", retrieve_metadata_mock)
|
||||
|
||||
mask = create_mask_for_mask_type(mask_type_str="equispaced",
|
||||
center_fractions=[0.08],
|
||||
accelerations=[4])
|
||||
# use random masks for train transform, fixed masks for val transform
|
||||
train_transform = VarNetDataTransform(mask_func=mask, use_seed=False)
|
||||
val_transform = VarNetDataTransform(mask_func=mask)
|
||||
test_transform = VarNetDataTransform()
|
||||
|
||||
FastMriDataModule.__init__(self,
|
||||
data_path=data_path / "knee_data",
|
||||
challenge="multicoil",
|
||||
train_transform=train_transform,
|
||||
val_transform=val_transform,
|
||||
test_transform=test_transform)
|
||||
|
||||
def prepare_data(self, *args: Any, **kwargs: Any) -> None:
|
||||
print("FastMriRandomData.prepare_data")
|
||||
|
||||
def setup(self, stage: Optional[str] = None) -> None:
|
||||
print("FastMriRandomData.setup")
|
||||
|
||||
|
||||
class FastMriOnRandomData(LightningContainer):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.num_epochs = 1
|
||||
# Restrict to a single GPU, because we have code in dataset creation that could cause race conditions
|
||||
self.max_num_gpus = 1
|
||||
|
||||
def create_model(self) -> LightningModule:
|
||||
return VarNetWithImageLogging()
|
||||
|
||||
def get_data_module(self) -> LightningDataModule:
|
||||
# local_dataset is set via the commandline to a random folder for unit tests.
|
||||
return FastMriRandomData()
|
|
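# A minimal sketch of how a container like the one above is resolved by name, mirroring the
# config-loader tests elsewhere in this change. It assumes this file lives in the
# Tests.ML.configs namespace and that the fastMRI submodule is available.
if __name__ == "__main__":
    from InnerEye.ML.utils.config_loader import ModelConfigLoader

    loader = ModelConfigLoader(model_configs_namespace="Tests.ML.configs")
    container = loader.create_model_config_from_name("FastMriOnRandomData")
    assert isinstance(container, FastMriOnRandomData)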
@ -0,0 +1,230 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
import pandas as pd
|
||||
import param
|
||||
import torch
|
||||
from pytorch_lightning import LightningDataModule, LightningModule
|
||||
from pytorch_lightning.metrics import MeanSquaredError
|
||||
from torch import Tensor
|
||||
from torch.nn import Identity
|
||||
from torch.utils.data import DataLoader, Dataset
|
||||
|
||||
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
from InnerEye.ML.lightning_container import InnerEyeInference, LightningContainer, LightningModuleWithOptimizer
|
||||
|
||||
|
||||
class DummyContainerWithDatasets(LightningContainer):
|
||||
def __init__(self, has_local_dataset: bool = False, has_azure_dataset: bool = False):
|
||||
super().__init__()
|
||||
self.local_dataset = full_ml_test_data_path("lightning_module_data") if has_local_dataset else None
|
||||
self.azure_dataset_id = "azure_dataset" if has_azure_dataset else ""
|
||||
|
||||
def create_model(self) -> LightningModule:
|
||||
return LightningModuleWithOptimizer()
|
||||
|
||||
|
||||
class DummyContainerWithAzureDataset(DummyContainerWithDatasets):
|
||||
def __init__(self) -> None:
|
||||
super().__init__(has_azure_dataset=True)
|
||||
|
||||
|
||||
class DummyContainerWithoutDataset(DummyContainerWithDatasets):
|
||||
pass
|
||||
|
||||
|
||||
class DummyContainerWithLocalDataset(DummyContainerWithDatasets):
|
||||
def __init__(self) -> None:
|
||||
super().__init__(has_local_dataset=True)
|
||||
|
||||
|
||||
class DummyContainerWithAzureAndLocalDataset(DummyContainerWithDatasets):
|
||||
def __init__(self) -> None:
|
||||
super().__init__(has_local_dataset=True, has_azure_dataset=True)
|
||||
|
||||
|
||||
class InferenceWithParameters(LightningModule):
|
||||
model_param = param.String(default="bar")
|
||||
|
||||
def __init__(self, container_param: str):
|
||||
super().__init__()
|
||||
|
||||
|
||||
class DummyContainerWithParameters(LightningContainer):
|
||||
container_param = param.String(default="foo")
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
def create_model(self) -> LightningModule:
|
||||
return InferenceWithParameters(self.container_param)
|
||||
|
||||
|
||||
class DummyRegressionPlainLightning(LightningModuleWithOptimizer):
|
||||
"""
|
||||
A class that only implements plain Lightning training and test. Ideally, we want to support importing any plain
|
||||
Lightning module without further methods added. This class inherits LightningModuleWithOptimizer, but does not
|
||||
implement the inference_step method
|
||||
"""
|
||||
|
||||
def __init__(self, in_features: int = 1, *args: Any, **kwargs: Any):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.l_rate = 1e-1
|
||||
activation = Identity()
|
||||
layers = [
|
||||
torch.nn.Linear(in_features=in_features, out_features=1, bias=True),
|
||||
activation
|
||||
]
|
||||
self.model = torch.nn.Sequential(*layers) # type: ignore
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor: # type: ignore
|
||||
return self.model(x)
|
||||
|
||||
def training_step(self, batch: Any, *args: Any, **kwargs: Any) -> torch.Tensor: # type: ignore
|
||||
input, target = batch
|
||||
prediction = self.forward(input)
|
||||
loss = torch.nn.functional.mse_loss(prediction, target)
|
||||
self.log("loss", loss, on_epoch=True, on_step=True)
|
||||
return loss
|
||||
|
||||
def test_step(self, batch, batch_idx) -> torch.Tensor: # type: ignore
|
||||
Path("test_step.txt").touch()
|
||||
input, target = batch
|
||||
prediction = self.forward(input)
|
||||
loss = torch.nn.functional.mse_loss(prediction, target)
|
||||
self.log("test_loss", loss, on_epoch=True, on_step=True)
|
||||
return loss
|
||||
|
||||
def on_test_epoch_end(self) -> None:
|
||||
Path("on_test_epoch_end.txt").touch()
|
||||
pass
|
||||
|
||||
|
||||
class DummyRegression(DummyRegressionPlainLightning, InnerEyeInference):
|
||||
def __init__(self, in_features: int = 1, *args, **kwargs) -> None: # type: ignore
|
||||
super().__init__(in_features=in_features, *args, **kwargs) # type: ignore
|
||||
self.l_rate = 1e-1
|
||||
self.dataset_split = ModelExecutionMode.TRAIN
|
||||
activation = Identity()
|
||||
layers = [
|
||||
torch.nn.Linear(in_features=in_features, out_features=1, bias=True),
|
||||
activation
|
||||
]
|
||||
self.model = torch.nn.Sequential(*layers) # type: ignore
|
||||
|
||||
def forward(self, x: Tensor) -> Tensor: # type: ignore
|
||||
return self.model(x)
|
||||
|
||||
def training_step(self, batch, *args, **kwargs) -> torch.Tensor: # type: ignore
|
||||
input, target = batch
|
||||
prediction = self.forward(input)
|
||||
loss = torch.nn.functional.mse_loss(prediction, target)
|
||||
self.log("loss", loss, on_epoch=True, on_step=True)
|
||||
return loss
|
||||
|
||||
def on_inference_start(self) -> None:
|
||||
Path("on_inference_start.txt").touch()
|
||||
self.inference_mse: Dict[ModelExecutionMode, float] = {}
|
||||
|
||||
def on_inference_epoch_start(self, dataset_split: ModelExecutionMode, is_ensemble_model: bool) -> None:
|
||||
self.dataset_split = dataset_split
|
||||
Path(f"on_inference_start_{self.dataset_split.value}.txt").touch()
|
||||
self.mse = MeanSquaredError()
|
||||
|
||||
def inference_step(self, item: Tuple[Tensor, Tensor], batch_idx: int, model_output: torch.Tensor) -> None:
|
||||
input, target = item
|
||||
prediction = self.forward(input)
|
||||
self.mse(prediction, target)
|
||||
with Path(f"inference_step_{self.dataset_split.value}.txt").open(mode="a") as f:
|
||||
f.write(f"{prediction.item()},{target.item()}\n")
|
||||
|
||||
def on_inference_epoch_end(self) -> None:
|
||||
Path(f"on_inference_end_{self.dataset_split.value}.txt").touch()
|
||||
self.inference_mse[self.dataset_split] = self.mse.compute().item()
|
||||
self.mse.reset()
|
||||
|
||||
def on_inference_end(self) -> None:
|
||||
Path("on_inference_end.txt").touch()
|
||||
df = pd.DataFrame(columns=["Split", "MSE"],
|
||||
data=[[split.value, mse] for split, mse in self.inference_mse.items()])
|
||||
df.to_csv("metrics_per_split.csv", index=False)
|
||||
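# A rough sketch of the order in which the InnerEye runner is expected to call the
# InnerEyeInference hooks implemented above. This is illustrative only, not the toolbox's
# actual runner code; FixedRegressionData is defined further down in this file.
def _sketch_of_inference_hook_order() -> None:
    model = DummyRegression()
    dm = FixedRegressionData()
    model.on_inference_start()
    for split, loader in [(ModelExecutionMode.TRAIN, dm.train_dataloader()),
                          (ModelExecutionMode.VAL, dm.val_dataloader()),
                          (ModelExecutionMode.TEST, dm.test_dataloader())]:
        model.on_inference_epoch_start(dataset_split=split, is_ensemble_model=False)
        for batch_idx, item in enumerate(loader):
            # The runner computes the model output and hands it to the inference step.
            model.inference_step(item, batch_idx, model_output=model.forward(item[0]))
        model.on_inference_epoch_end()
    model.on_inference_end()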
|
||||
|
||||
class FixedDataset(Dataset):
|
||||
def __init__(self, inputs_and_targets: List[Tuple[Any, Any]]):
|
||||
super().__init__()
|
||||
self.inputs_and_targets = inputs_and_targets
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.inputs_and_targets)
|
||||
|
||||
def __getitem__(self, item: int) -> Tuple[Tensor, Tensor]:
|
||||
input = torch.tensor([float(self.inputs_and_targets[item][0])])
|
||||
target = torch.tensor([float(self.inputs_and_targets[item][1])])
|
||||
return input, target
|
||||
|
||||
|
||||
class FixedRegressionData(LightningDataModule):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.train_data = [(i, i) for i in range(1, 20, 3)]
|
||||
self.val_data = [(i, i) for i in range(2, 20, 3)]
|
||||
self.test_data = [(i, i) for i in range(3, 20, 3)]
|
||||
|
||||
def train_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
|
||||
return DataLoader(FixedDataset(self.train_data)) # type: ignore
|
||||
|
||||
def val_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
|
||||
return DataLoader(FixedDataset(self.val_data)) # type: ignore
|
||||
|
||||
def test_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
|
||||
return DataLoader(FixedDataset(self.test_data)) # type: ignore
|
||||
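# For a quick sanity check outside the InnerEye runner, the classes above can be combined
# with a plain PyTorch Lightning Trainer. This is only a sketch under that assumption
# (the optimizer comes from LightningModuleWithOptimizer); the real tests drive training
# through model_train_unittest instead.
def _sketch_of_plain_lightning_training() -> None:
    from pytorch_lightning import Trainer

    trainer = Trainer(max_epochs=1, num_sanity_val_steps=0)
    trainer.fit(DummyRegressionPlainLightning(), datamodule=FixedRegressionData())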
|
||||
|
||||
class DummyContainerWithModel(LightningContainer):
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.perform_training_set_inference = True
|
||||
self.num_epochs = 50
|
||||
self.l_rate = 1e-1
|
||||
|
||||
def setup(self) -> None:
|
||||
assert self.local_dataset is not None
|
||||
(self.local_dataset / "setup.txt").touch()
|
||||
|
||||
def create_model(self) -> LightningModule:
|
||||
return DummyRegression()
|
||||
|
||||
def get_data_module(self) -> LightningDataModule:
|
||||
return FixedRegressionData() # type: ignore
|
||||
|
||||
def create_report(self) -> None:
|
||||
Path("create_report.txt").touch()
|
||||
|
||||
|
||||
class DummyContainerWithInvalidTrainerArguments(LightningContainer):
|
||||
|
||||
def create_model(self) -> LightningModule:
|
||||
return DummyRegression()
|
||||
|
||||
def get_trainer_arguments(self) -> Dict[str, Any]:
|
||||
return {"no_such_argument": 1}
|
||||
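# The container above deliberately returns an argument that the PyTorch Lightning Trainer
# does not accept, so that error handling can be tested. A valid override would return a
# real Trainer argument instead; the class below is only an illustrative counterpart and
# is not used by any test.
class DummyContainerWithValidTrainerArguments(LightningContainer):
    def create_model(self) -> LightningModule:
        return DummyRegression()

    def get_trainer_arguments(self) -> Dict[str, Any]:
        # gradient_clip_val is a standard PyTorch Lightning Trainer argument.
        return {"gradient_clip_val": 1.0}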
|
||||
|
||||
class DummyContainerWithPlainLightning(LightningContainer):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.num_epochs = 100
|
||||
self.l_rate = 1e-2
|
||||
|
||||
def create_model(self) -> LightningModule:
|
||||
return DummyRegressionPlainLightning()
|
||||
|
||||
def get_data_module(self) -> LightningDataModule:
|
||||
return FixedRegressionData() # type: ignore
|
|
@ -21,7 +21,6 @@ from InnerEye.ML.deep_learning_config import TemperatureScalingConfig
|
|||
from InnerEye.ML.lightning_models import transfer_batch_to_device
|
||||
from InnerEye.ML.model_config_base import ModelTransformsPerExecutionMode
|
||||
from InnerEye.ML.model_testing import create_metrics_dict_for_scalar_models
|
||||
from InnerEye.ML.model_training import model_train
|
||||
from InnerEye.ML.models.architectures.classification.image_encoder_with_mlp import ImageEncoder, ImagingFeatureType
|
||||
from InnerEye.ML.models.architectures.sequential.rnn_classifier import RNNClassifier, RNNClassifierWithEncoder
|
||||
from InnerEye.ML.run_ml import MLRunner
|
||||
|
@ -34,7 +33,7 @@ from InnerEye.ML.utils.io_util import ImageAndSegmentations
|
|||
from InnerEye.ML.utils.model_util import create_model_with_temperature_scaling, get_scalar_model_inputs_and_labels
|
||||
from InnerEye.ML.utils.split_dataset import DatasetSplits
|
||||
from InnerEye.ML.visualizers.grad_cam_hooks import VisualizationMaps
|
||||
from Tests.ML.util import get_default_azure_config, get_default_checkpoint_handler
|
||||
from Tests.ML.util import get_default_azure_config, model_train_unittest
|
||||
|
||||
SCAN_SIZE = (6, 64, 60)
|
||||
|
||||
|
@ -213,8 +212,7 @@ def test_rnn_classifier_via_config_1(use_combined_model: bool,
|
|||
image_and_seg = ImageAndSegmentations[np.ndarray](images=np.random.uniform(0, 1, SCAN_SIZE),
|
||||
segmentations=np.random.randint(0, 2, SCAN_SIZE))
|
||||
with mock.patch('InnerEye.ML.utils.io_util.load_image_in_known_formats', return_value=image_and_seg):
|
||||
model_train(config, get_default_checkpoint_handler(model_config=config,
|
||||
project_root=test_output_dirs.root_dir))
|
||||
model_train_unittest(config, dirs=test_output_dirs)
|
||||
|
||||
|
||||
@pytest.mark.skipif(common_util.is_windows(), reason="Has issues on windows build")
|
||||
|
@ -247,7 +245,7 @@ def test_run_ml_with_sequence_model(use_combined_model: bool,
|
|||
with mock.patch('InnerEye.ML.utils.io_util.load_image_in_known_formats', return_value=image_and_seg):
|
||||
azure_config = get_default_azure_config()
|
||||
azure_config.train = True
|
||||
MLRunner(config, azure_config).run()
|
||||
MLRunner(config, azure_config=azure_config).run()
|
||||
|
||||
|
||||
@pytest.mark.skipif(common_util.is_windows(), reason="Too slow on windows")
|
||||
|
@ -379,8 +377,7 @@ def test_rnn_classifier_via_config_2(test_output_dirs: OutputFolderForTests) ->
|
|||
config.num_epochs = 2
|
||||
config.set_output_to(test_output_dirs.root_dir)
|
||||
config.dataset_data_frame = _get_mock_sequence_dataset(dataset_contents)
|
||||
results = model_train(config, get_default_checkpoint_handler(model_config=config,
|
||||
project_root=test_output_dirs.root_dir))
|
||||
results, _ = model_train_unittest(config, dirs=test_output_dirs)
|
||||
|
||||
actual_train_loss = results.get_metric(is_training=True, metric_type=MetricType.LOSS.value)[-1]
|
||||
actual_val_loss = results.get_metric(is_training=False, metric_type=MetricType.LOSS.value)[-1]
|
||||
|
@ -455,7 +452,7 @@ def test_run_ml_with_multi_label_sequence_model(test_output_dirs: OutputFolderFo
|
|||
config.max_batch_grad_cam = 1
|
||||
azure_config = get_default_azure_config()
|
||||
azure_config.train = True
|
||||
MLRunner(config, azure_config).run()
|
||||
MLRunner(config, azure_config=azure_config).run()
|
||||
# The metrics file should have one entry per epoch per subject per prediction target,
|
||||
# for all the 3 prediction targets.
|
||||
metrics_file = config.outputs_folder / "Train" / SUBJECT_METRICS_FILE_NAME
|
||||
|
|
|
@ -20,7 +20,6 @@ from InnerEye.Common.type_annotations import TupleInt3
|
|||
from InnerEye.ML.dataset.scalar_dataset import ScalarDataset
|
||||
from InnerEye.ML.lightning_models import transfer_batch_to_device
|
||||
from InnerEye.ML.model_config_base import ModelTransformsPerExecutionMode
|
||||
from InnerEye.ML.model_training import model_train
|
||||
from InnerEye.ML.models.architectures.classification.image_encoder_with_mlp import ImageEncoderWithMlp, \
|
||||
ImagingFeatureType
|
||||
from InnerEye.ML.run_ml import MLRunner
|
||||
|
@ -34,7 +33,7 @@ from InnerEye.ML.utils.model_util import create_model_with_temperature_scaling,
|
|||
from InnerEye.ML.utils.split_dataset import DatasetSplits
|
||||
from InnerEye.ML.visualizers.grad_cam_hooks import VisualizationMaps
|
||||
from InnerEye.ML.visualizers.model_summary import ModelSummary
|
||||
from Tests.ML.util import get_default_azure_config, get_default_checkpoint_handler
|
||||
from Tests.ML.util import get_default_azure_config, model_train_unittest
|
||||
|
||||
|
||||
class ImageEncoder(ScalarModelBase):
|
||||
|
@ -222,8 +221,7 @@ S3,week1,scan3.npy,True,6,60,Male,Val2
|
|||
summarizer.generate_summary(input_sizes=input_size)
|
||||
config.local_dataset = dataset_folder
|
||||
config.validate()
|
||||
model_train(config, checkpoint_handler=get_default_checkpoint_handler(model_config=config,
|
||||
project_root=Path(test_output_dirs.root_dir)))
|
||||
model_train_unittest(config, dirs=test_output_dirs)
|
||||
# No further asserts here because the models are still in experimental state. Most errors would come
|
||||
# from having invalid model architectures, which would throw runtime errors during training.
|
||||
|
||||
|
@ -231,13 +229,13 @@ S3,week1,scan3.npy,True,6,60,Male,Val2
|
|||
@pytest.mark.skipif(common_util.is_windows(), reason="Too slow on windows")
|
||||
@pytest.mark.gpu
|
||||
@pytest.mark.parametrize(["encode_channels_jointly", "aggregation_type", "imaging_feature_type"],
|
||||
[(False, AggregationType.Average, ImagingFeatureType.Segmentation),
|
||||
(True, AggregationType.Average, ImagingFeatureType.Segmentation),
|
||||
(False, AggregationType.Average, ImagingFeatureType.ImageAndSegmentation),
|
||||
(True, AggregationType.Average, ImagingFeatureType.ImageAndSegmentation),
|
||||
(True, AggregationType.GatedPooling, ImagingFeatureType.ImageAndSegmentation),
|
||||
(True, AggregationType.MixPooling, ImagingFeatureType.ImageAndSegmentation),
|
||||
(True, AggregationType.ZAdaptive3dAvg, ImagingFeatureType.ImageAndSegmentation)])
|
||||
[(False, AggregationType.Average, ImagingFeatureType.Segmentation),
|
||||
(True, AggregationType.Average, ImagingFeatureType.Segmentation),
|
||||
(False, AggregationType.Average, ImagingFeatureType.ImageAndSegmentation),
|
||||
(True, AggregationType.Average, ImagingFeatureType.ImageAndSegmentation),
|
||||
(True, AggregationType.GatedPooling, ImagingFeatureType.ImageAndSegmentation),
|
||||
(True, AggregationType.MixPooling, ImagingFeatureType.ImageAndSegmentation),
|
||||
(True, AggregationType.ZAdaptive3dAvg, ImagingFeatureType.ImageAndSegmentation)])
|
||||
def test_image_encoder_with_segmentation(test_output_dirs: OutputFolderForTests,
|
||||
encode_channels_jointly: bool,
|
||||
aggregation_type: AggregationType,
|
||||
|
@ -274,7 +272,7 @@ def test_image_encoder_with_segmentation(test_output_dirs: OutputFolderForTests,
|
|||
with mock.patch('InnerEye.ML.utils.io_util.load_image_in_known_formats', return_value=image_and_seg):
|
||||
azure_config = get_default_azure_config()
|
||||
azure_config.train = True
|
||||
MLRunner(config, azure_config).run()
|
||||
MLRunner(config, azure_config=azure_config).run()
|
||||
# No further asserts here because the models are still in experimental state. Most errors would come
|
||||
# from having invalid model architectures, which would throw runtime errors during training.
|
||||
# Verified manually that the cross entropy on the Val set that appears during training, and the
|
||||
|
|
|
@ -11,13 +11,11 @@ import pytest
|
|||
from InnerEye.Common import common_util
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.ML.common import DATASET_CSV_FILE_NAME
|
||||
from InnerEye.ML.model_training import model_train
|
||||
from InnerEye.ML.models.architectures.classification.image_encoder_with_mlp import create_mlp
|
||||
from InnerEye.ML.run_ml import MLRunner
|
||||
from InnerEye.ML.scalar_config import ScalarLoss, ScalarModelBase
|
||||
from InnerEye.ML.utils.split_dataset import DatasetSplits
|
||||
|
||||
from Tests.ML.util import get_default_checkpoint_handler
|
||||
from Tests.ML.util import model_train_unittest
|
||||
|
||||
|
||||
class NonImageEncoder(ScalarModelBase):
|
||||
|
@ -73,11 +71,11 @@ def test_non_image_encoder(test_output_dirs: OutputFolderForTests,
|
|||
config.max_batch_grad_cam = 1
|
||||
config.validate()
|
||||
# run model training
|
||||
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
|
||||
project_root=Path(test_output_dirs.root_dir))
|
||||
model_train(config, checkpoint_handler=checkpoint_handler)
|
||||
_, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
|
||||
# run model inference
|
||||
MLRunner(config).model_inference_train_and_test(checkpoint_handler=checkpoint_handler)
|
||||
runner = MLRunner(config)
|
||||
runner.setup()
|
||||
runner.model_inference_train_and_test(checkpoint_handler=checkpoint_handler)
|
||||
assert config.get_total_number_of_non_imaging_features() == 18
|
||||
|
||||
|
||||
|
|
|
@ -4,18 +4,17 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from InnerEye.Common.common_util import logging_to_stdout
|
||||
from InnerEye.Common.metrics_constants import MetricType
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.ML import model_testing, model_training
|
||||
from InnerEye.ML import model_testing
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
from InnerEye.ML.metrics import InferenceMetricsForClassification
|
||||
from Tests.ML.configs.ClassificationModelForTesting2D import ClassificationModelForTesting2D
|
||||
from Tests.ML.util import get_default_checkpoint_handler
|
||||
from Tests.ML.util import model_train_unittest
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_mixed_precision", [False])
|
||||
|
@ -31,10 +30,7 @@ def test_train_2d_classification_model(test_output_dirs: OutputFolderForTests,
|
|||
# Train for 4 epochs, checkpoints at epochs 2 and 4
|
||||
config.num_epochs = 4
|
||||
config.use_mixed_precision = use_mixed_precision
|
||||
|
||||
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
|
||||
project_root=Path(test_output_dirs.root_dir))
|
||||
model_training_result = model_training.model_train(config, checkpoint_handler=checkpoint_handler)
|
||||
model_training_result, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
|
||||
assert model_training_result is not None
|
||||
expected_learning_rates = [0.0001, 9.99971e-05, 9.99930e-05, 9.99861e-05]
|
||||
|
||||
|
|
|
@ -4,16 +4,20 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
import logging
|
||||
from typing import List
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from azureml.core import Run
|
||||
|
||||
from InnerEye.Common.common_util import logging_to_stdout, namespace_to_path
|
||||
from InnerEye.ML.config import SegmentationModelBase
|
||||
from InnerEye.ML.model_training import generate_and_print_model_summary
|
||||
from InnerEye.ML.utils.config_util import ModelConfigLoader
|
||||
from InnerEye.ML.utils.model_util import create_model_with_temperature_scaling
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
from InnerEye.ML.utils.config_loader import ModelConfigLoader
|
||||
from InnerEye.ML.utils.model_util import create_model_with_temperature_scaling, generate_and_print_model_summary
|
||||
from Tests.ML.configs.DummyModel import DummyModel
|
||||
from Tests.ML.util import get_model_loader
|
||||
from Tests.ML.configs.lightning_test_containers import DummyContainerWithInvalidTrainerArguments, \
|
||||
DummyContainerWithParameters
|
||||
from Tests.ML.util import default_runner, get_model_loader, model_loader_including_tests, model_train_unittest
|
||||
|
||||
|
||||
def find_models() -> List[str]:
|
||||
|
@ -48,7 +52,7 @@ def test_load_all_configs(model_name: str) -> None:
|
|||
"""
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(logging.INFO)
|
||||
config = ModelConfigLoader[SegmentationModelBase]().create_model_config_from_name(model_name)
|
||||
config = ModelConfigLoader().create_model_config_from_name(model_name)
|
||||
assert config.model_name == model_name, "Mismatch between definition .py file and model name"
|
||||
if config.is_segmentation_model:
|
||||
# Reduce the feature channels to a minimum, to make tests run fast on CPU.
|
||||
|
@ -98,12 +102,90 @@ def test_config_loader_as_in_registration() -> None:
|
|||
During model registration, the model config namespace is read out from the present model. Ensure that we
|
||||
can create a config loader that has that value as an input.
|
||||
"""
|
||||
loader1 = ModelConfigLoader[SegmentationModelBase]()
|
||||
loader1 = ModelConfigLoader()
|
||||
model_name = "BasicModel2Epochs"
|
||||
model = loader1.create_model_config_from_name(model_name)
|
||||
assert model is not None
|
||||
namespace = model.__module__
|
||||
loader2 = ModelConfigLoader[SegmentationModelBase](model_configs_namespace=namespace)
|
||||
loader2 = ModelConfigLoader(model_configs_namespace=namespace)
|
||||
assert len(loader2.module_search_specs) == 2
|
||||
model2 = loader2.create_model_config_from_name(model_name)
|
||||
assert model2 is not None
|
||||
|
||||
|
||||
def test_config_loader_on_lightning_container() -> None:
|
||||
"""
|
||||
Test if the config loader can load a model that is neither a classification nor a segmentation model.
|
||||
"""
|
||||
# First test if the container can be instantiated at all (this is tricky to get right when the inheritance structure changes)
|
||||
DummyContainerWithParameters()
|
||||
logging_to_stdout(log_level=logging.DEBUG)
|
||||
model = model_loader_including_tests.create_model_config_from_name("DummyContainerWithParameters")
|
||||
assert model is not None
|
||||
|
||||
|
||||
@pytest.mark.parametrize("container_name", ["DummyContainerWithAzureDataset",
|
||||
"DummyContainerWithoutDataset",
|
||||
"DummyContainerWithLocalDataset",
|
||||
"DummyContainerWithAzureAndLocalDataset"])
|
||||
def test_submit_container_to_azureml(container_name: str) -> None:
|
||||
"""
|
||||
Test if we can get the config loader to load a Lightning container model, and get it through the AzureML
|
||||
submission process.
|
||||
"""
|
||||
runner = default_runner()
|
||||
mock_run = Run.get_context()
|
||||
args = ["", f"--model={container_name}", "--azureml=True", "--model_configs_namespace=Tests.ML.configs"]
|
||||
with mock.patch("sys.argv", args):
|
||||
with mock.patch("InnerEye.Azure.azure_runner.get_dataset_consumption", return_value=None):
|
||||
with mock.patch("azureml.core.Experiment.submit", return_value=mock_run):
|
||||
loaded_config, actual_run = runner.run()
|
||||
assert actual_run == mock_run
|
||||
assert loaded_config is None
|
||||
assert isinstance(runner.lightning_container, LightningContainer)
|
||||
|
||||
|
||||
def test_load_container_with_arguments() -> None:
|
||||
"""
|
||||
Test if we can load a container and override a value in it via the commandline. Parameters can only be set at
|
||||
container level, not at model level.
|
||||
"""
|
||||
DummyContainerWithParameters()
|
||||
runner = default_runner()
|
||||
args = ["", "--model=DummyContainerWithParameters", "--container_param=param1",
|
||||
"--model_configs_namespace=Tests.ML.configs"]
|
||||
with mock.patch("sys.argv", args):
|
||||
runner.parse_and_load_model()
|
||||
assert isinstance(runner.lightning_container, DummyContainerWithParameters)
|
||||
assert runner.lightning_container.container_param == "param1"
|
||||
# Overriding model parameters should not work
|
||||
args = ["", "--model=DummyContainerWithParameters", "--model_param=param2",
|
||||
"--model_configs_namespace=Tests.ML.configs"]
|
||||
with pytest.raises(ValueError) as ex:
|
||||
with mock.patch("sys.argv", args):
|
||||
runner.parse_and_load_model()
|
||||
assert "model_param" in str(ex)
|
||||
|
||||
|
||||
def test_load_invalid_container() -> None:
|
||||
"""
|
||||
Test that loading a container fails if one of the parameters is not valid.
|
||||
"""
|
||||
DummyContainerWithParameters()
|
||||
runner = default_runner()
|
||||
args = ["", "--model=DummyContainerWithParameters", "--number_of_cross_validation_splits=1",
|
||||
"--model_configs_namespace=Tests.ML.configs"]
|
||||
with pytest.raises(ValueError) as ex:
|
||||
with mock.patch("sys.argv", args):
|
||||
runner.parse_and_load_model()
|
||||
assert "At least two splits required to perform cross validation, but got 1" in str(ex)
|
||||
|
||||
|
||||
def test_run_model_with_invalid_trainer_arguments(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Test if the trainer_arguments in a LightningContainer are passed to the trainer.
|
||||
"""
|
||||
container = DummyContainerWithInvalidTrainerArguments()
|
||||
with pytest.raises(Exception) as ex:
|
||||
model_train_unittest(config=None, dirs=test_output_dirs, lightning_container=container)
|
||||
assert "no_such_argument" in str(ex)
|
||||
|
|
|
@ -16,10 +16,11 @@ import torch
|
|||
from InnerEye.Common import common_util, fixed_paths
|
||||
from InnerEye.Common.common_util import BEST_EPOCH_FOLDER_NAME, CROSSVAL_RESULTS_FOLDER, EPOCH_METRICS_FILE_NAME, \
|
||||
METRICS_AGGREGATES_FILE, SUBJECT_METRICS_FILE_NAME, get_best_epoch_results_path, logging_to_stdout
|
||||
from InnerEye.Common.fixed_paths import LOG_FILE_NAME
|
||||
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
|
||||
from InnerEye.Common.metrics_constants import LoggingColumns, MetricType
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.ML import model_testing, model_training, runner
|
||||
from InnerEye.ML import model_testing, runner
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
from InnerEye.ML.configs.classification.DummyMulticlassClassification import DummyMulticlassClassification
|
||||
from InnerEye.ML.dataset.scalar_dataset import ScalarDataset
|
||||
|
@ -30,12 +31,13 @@ from InnerEye.ML.reports.notebook_report import generate_classification_multilab
|
|||
generate_classification_notebook, get_html_report_name, get_ipynb_report_name
|
||||
from InnerEye.ML.run_ml import MLRunner
|
||||
from InnerEye.ML.scalar_config import ScalarLoss, ScalarModelBase
|
||||
from InnerEye.ML.utils.config_util import ModelConfigLoader
|
||||
from InnerEye.ML.utils.config_loader import ModelConfigLoader
|
||||
from InnerEye.ML.visualizers.plot_cross_validation import EpochMetricValues, get_config_and_results_for_offline_runs, \
|
||||
unroll_aggregate_metrics
|
||||
from Tests.ML.configs.ClassificationModelForTesting import ClassificationModelForTesting
|
||||
from Tests.ML.configs.DummyModel import DummyModel
|
||||
from Tests.ML.util import get_default_azure_config, get_default_checkpoint_handler, machine_has_gpu
|
||||
from Tests.ML.util import get_default_azure_config, machine_has_gpu, \
|
||||
model_train_unittest
|
||||
|
||||
|
||||
@pytest.mark.cpu_and_gpu
|
||||
|
@ -50,20 +52,20 @@ def test_train_classification_model(class_name: str, test_output_dirs: OutputFol
|
|||
config = ClassificationModelForTesting()
|
||||
config.class_names = [class_name]
|
||||
config.set_output_to(test_output_dirs.root_dir)
|
||||
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
|
||||
project_root=Path(test_output_dirs.root_dir))
|
||||
# Train for 4 epochs, checkpoints at epochs 2 and 4
|
||||
config.num_epochs = 4
|
||||
model_training_result = model_training.model_train(config, checkpoint_handler=checkpoint_handler)
|
||||
model_training_result, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
|
||||
assert model_training_result is not None
|
||||
expected_learning_rates = [0.0001, 9.99971e-05, 9.99930e-05, 9.99861e-05]
|
||||
expected_train_loss = [0.686614, 0.686465, 0.686316, 0.686167]
|
||||
expected_val_loss = [0.737061, 0.736691, 0.736321, 0.735952]
|
||||
# Ensure that all metrics are computed on both training and validation set
|
||||
assert len(model_training_result.train_results_per_epoch) == config.num_epochs
|
||||
assert len(model_training_result.val_results_per_epoch) == config.num_epochs
|
||||
assert len(model_training_result.train_results_per_epoch[0]) >= 11
|
||||
assert len(model_training_result.val_results_per_epoch[0]) >= 11
|
||||
train_results_per_epoch = model_training_result.train_results_per_epoch()
|
||||
val_results_per_epoch = model_training_result.val_results_per_epoch()
|
||||
assert len(train_results_per_epoch) == config.num_epochs
|
||||
assert len(val_results_per_epoch) == config.num_epochs
|
||||
assert len(train_results_per_epoch[0]) >= 11
|
||||
assert len(val_results_per_epoch[0]) >= 11
|
||||
|
||||
for metric in [MetricType.ACCURACY_AT_THRESHOLD_05,
|
||||
MetricType.ACCURACY_AT_OPTIMAL_THRESHOLD,
|
||||
|
@ -74,10 +76,8 @@ def test_train_classification_model(class_name: str, test_output_dirs: OutputFol
|
|||
MetricType.SECONDS_PER_BATCH,
|
||||
MetricType.SECONDS_PER_EPOCH,
|
||||
MetricType.SUBJECT_COUNT]:
|
||||
assert metric.value in model_training_result.train_results_per_epoch[0], \
|
||||
f"{metric.value} not in training"
|
||||
assert metric.value in model_training_result.val_results_per_epoch[0], \
|
||||
f"{metric.value} not in validation"
|
||||
assert metric.value in train_results_per_epoch[0], f"{metric.value} not in training"
|
||||
assert metric.value in val_results_per_epoch[0], f"{metric.value} not in validation"
|
||||
|
||||
actual_train_loss = model_training_result.get_metric(is_training=True, metric_type=MetricType.LOSS.value)
|
||||
actual_val_loss = model_training_result.get_metric(is_training=False, metric_type=MetricType.LOSS.value)
|
||||
|
@ -144,6 +144,7 @@ def test_train_classification_model(class_name: str, test_output_dirs: OutputFol
|
|||
"""
|
||||
check_log_file(inference_metrics_path, inference_metrics_expected, ignore_columns=[])
|
||||
|
||||
|
||||
@pytest.mark.skipif(common_util.is_windows(), reason="Has OOM issues on windows build")
|
||||
@pytest.mark.cpu_and_gpu
|
||||
def test_train_classification_multilabel_model(test_output_dirs: OutputFolderForTests) -> None:
|
||||
|
@ -155,35 +156,33 @@ def test_train_classification_multilabel_model(test_output_dirs: OutputFolderFor
|
|||
logging_to_stdout(logging.DEBUG)
|
||||
config = DummyMulticlassClassification()
|
||||
config.set_output_to(test_output_dirs.root_dir)
|
||||
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
|
||||
project_root=Path(test_output_dirs.root_dir))
|
||||
# Train for 4 epochs, checkpoints at epochs 2 and 4
|
||||
config.num_epochs = 4
|
||||
model_training_result = model_training.model_train(config, checkpoint_handler=checkpoint_handler)
|
||||
model_training_result, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
|
||||
assert model_training_result is not None
|
||||
expected_learning_rates = [0.0001, 9.99971e-05, 9.99930e-05, 9.99861e-05]
|
||||
expected_train_loss = [0.699870228767395, 0.6239662170410156, 0.551329493522644, 0.4825132489204407]
|
||||
expected_val_loss = [0.6299371719360352, 0.5546272993087769, 0.4843321740627289, 0.41909298300743103]
|
||||
# Ensure that all metrics are computed on both training and validation set
|
||||
assert len(model_training_result.train_results_per_epoch) == config.num_epochs
|
||||
assert len(model_training_result.val_results_per_epoch) == config.num_epochs
|
||||
assert len(model_training_result.train_results_per_epoch[0]) >= 11
|
||||
assert len(model_training_result.val_results_per_epoch[0]) >= 11
|
||||
train_results_per_epoch = model_training_result.train_results_per_epoch()
|
||||
val_results_per_epoch = model_training_result.val_results_per_epoch()
|
||||
assert len(train_results_per_epoch) == config.num_epochs
|
||||
assert len(val_results_per_epoch) == config.num_epochs
|
||||
assert len(train_results_per_epoch[0]) >= 11
|
||||
assert len(val_results_per_epoch[0]) >= 11
|
||||
for class_name in config.class_names:
|
||||
for metric in [MetricType.ACCURACY_AT_THRESHOLD_05,
|
||||
MetricType.ACCURACY_AT_OPTIMAL_THRESHOLD,
|
||||
MetricType.AREA_UNDER_PR_CURVE,
|
||||
MetricType.AREA_UNDER_ROC_CURVE,
|
||||
MetricType.CROSS_ENTROPY]:
|
||||
assert f'{metric.value}/{class_name}' in model_training_result.train_results_per_epoch[
|
||||
0], f"{metric.value} not in training"
|
||||
assert f'{metric.value}/{class_name}' in model_training_result.val_results_per_epoch[
|
||||
0], f"{metric.value} not in validation"
|
||||
assert f'{metric.value}/{class_name}' in train_results_per_epoch[0], f"{metric.value} not in training"
|
||||
assert f'{metric.value}/{class_name}' in val_results_per_epoch[0], f"{metric.value} not in validation"
|
||||
for metric in [MetricType.LOSS,
|
||||
MetricType.SECONDS_PER_EPOCH,
|
||||
MetricType.SUBJECT_COUNT]:
|
||||
assert metric.value in model_training_result.train_results_per_epoch[0], f"{metric.value} not in training"
|
||||
assert metric.value in model_training_result.val_results_per_epoch[0], f"{metric.value} not in validation"
|
||||
assert metric.value in train_results_per_epoch[0], f"{metric.value} not in training"
|
||||
assert metric.value in val_results_per_epoch[0], f"{metric.value} not in validation"
|
||||
|
||||
actual_train_loss = model_training_result.get_metric(is_training=True, metric_type=MetricType.LOSS.value)
|
||||
actual_val_loss = model_training_result.get_metric(is_training=False, metric_type=MetricType.LOSS.value)
|
||||
|
@ -265,13 +264,12 @@ def test_run_ml_with_classification_model(test_output_dirs: OutputFolderForTests
|
|||
logging_to_stdout()
|
||||
azure_config = get_default_azure_config()
|
||||
azure_config.train = True
|
||||
config: ScalarModelBase = ModelConfigLoader[ScalarModelBase]() \
|
||||
.create_model_config_from_name(model_name)
|
||||
config: ScalarModelBase = ModelConfigLoader().create_model_config_from_name(model_name)
|
||||
config.number_of_cross_validation_splits = number_of_offline_cross_validation_splits
|
||||
config.set_output_to(test_output_dirs.root_dir)
|
||||
# Trying to run DDP from the test suite hangs, hence restrict to single GPU.
|
||||
config.max_num_gpus = 1
|
||||
MLRunner(config, azure_config).run()
|
||||
MLRunner(config, azure_config=azure_config).run()
|
||||
_check_offline_cross_validation_output_files(config)
|
||||
|
||||
if config.perform_cross_validation:
|
||||
|
@ -306,7 +304,7 @@ def test_run_ml_with_segmentation_model(test_output_dirs: OutputFolderForTests)
|
|||
config.set_output_to(test_output_dirs.root_dir)
|
||||
azure_config = get_default_azure_config()
|
||||
azure_config.train = True
|
||||
MLRunner(config, azure_config).run()
|
||||
MLRunner(config, azure_config=azure_config).run()
|
||||
|
||||
|
||||
@pytest.mark.skipif(common_util.is_windows(), reason="Has OOM issues on windows build")
|
||||
|
@ -319,7 +317,7 @@ def test_runner1(test_output_dirs: OutputFolderForTests) -> None:
|
|||
set_from_commandline = 12345
|
||||
scalar1 = '["label"]'
|
||||
model_name = "DummyClassification"
|
||||
initial_config = ModelConfigLoader[ScalarModelBase]().create_model_config_from_name(model_name)
|
||||
initial_config = ModelConfigLoader().create_model_config_from_name(model_name)
|
||||
assert initial_config.non_image_feature_channels == []
|
||||
output_root = str(test_output_dirs.root_dir)
|
||||
args = ["",
|
||||
|
@ -338,7 +336,7 @@ def test_runner1(test_output_dirs: OutputFolderForTests) -> None:
|
|||
assert config.get_effective_random_seed() == set_from_commandline
|
||||
assert config.non_image_feature_channels == ["label"]
|
||||
assert str(config.outputs_folder).startswith(output_root)
|
||||
assert (config.logs_folder / runner.LOG_FILE_NAME).exists()
|
||||
assert (config.logs_folder / LOG_FILE_NAME).exists()
|
||||
|
||||
|
||||
@pytest.mark.skipif(common_util.is_windows(), reason="Has OOM issues on windows build")
|
||||
|
@ -456,7 +454,9 @@ def _compute_scalar_metrics(output_values_list: List[List[float]],
|
|||
def test_is_offline_cross_val_parent_run(offline_parent_cv_run: bool) -> None:
|
||||
train_config = DummyModel()
|
||||
train_config.number_of_cross_validation_splits = 2 if offline_parent_cv_run else 0
|
||||
assert MLRunner(train_config).is_offline_cross_val_parent_run() == offline_parent_cv_run
|
||||
runner = MLRunner(train_config)
|
||||
runner.setup()
|
||||
assert runner.is_offline_cross_val_parent_run() == offline_parent_cv_run
|
||||
|
||||
|
||||
def _check_offline_cross_validation_output_files(train_config: ScalarModelBase) -> None:
|
||||
|
@ -487,12 +487,15 @@ def _check_offline_cross_validation_output_files(train_config: ScalarModelBase)
|
|||
_dataset_splits.train[train_config.subject_column].unique())
|
||||
_test_dataset_split_count = len(_dataset_splits.test[train_config.subject_column].unique())
|
||||
_aggregates_csv = pd.read_csv(aggregate_metrics_path)
|
||||
_aggregates_csv_test = _aggregates_csv.loc[_aggregates_csv[LoggingColumns.DataSplit.value] == ModelExecutionMode.TEST.value]
|
||||
_aggregates_csv_train_val = _aggregates_csv.loc[_aggregates_csv[LoggingColumns.DataSplit.value] != ModelExecutionMode.TEST.value]
|
||||
_aggregates_csv_test = _aggregates_csv.loc[
|
||||
_aggregates_csv[LoggingColumns.DataSplit.value] == ModelExecutionMode.TEST.value]
|
||||
_aggregates_csv_train_val = _aggregates_csv.loc[
|
||||
_aggregates_csv[LoggingColumns.DataSplit.value] != ModelExecutionMode.TEST.value]
|
||||
_counts_for_splits_train_val = list(_aggregates_csv_train_val[LoggingColumns.SubjectCount.value])
|
||||
_counts_for_splits_test = list(_aggregates_csv_test[LoggingColumns.SubjectCount.value])
|
||||
assert all([x == _val_dataset_split_count for x in _counts_for_splits_train_val])
|
||||
assert all([x == _test_dataset_split_count * train_config.number_of_cross_validation_splits for x in _counts_for_splits_test])
|
||||
assert all([x == _test_dataset_split_count * train_config.number_of_cross_validation_splits for x in
|
||||
_counts_for_splits_test])
|
||||
_epochs = list(_aggregates_csv_train_val[LoggingColumns.Epoch.value].astype(int))
|
||||
# Each epoch is recorded twice, once for the training split and once for the validation
|
||||
# split
|
||||
|
|
|
@ -11,8 +11,8 @@ from InnerEye.Common.common_util import is_windows
|
|||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.Common.type_annotations import TupleInt3
|
||||
from InnerEye.ML.config import SegmentationModelBase
|
||||
from InnerEye.ML.lightning_helpers import create_lightning_model, load_from_checkpoint_and_adjust_for_inference
|
||||
from InnerEye.ML.lightning_models import SegmentationLightning
|
||||
from InnerEye.ML.lightning_helpers import load_from_checkpoint_and_adjust_for_inference
|
||||
from InnerEye.ML.lightning_models import SegmentationLightning, create_lightning_model
|
||||
from InnerEye.ML.pipelines.inference import InferencePipeline
|
||||
from InnerEye.ML.utils import image_util
|
||||
from Tests.ML.utils.test_model_util import create_model_and_store_checkpoint
|
||||
|
|
|
@ -12,8 +12,7 @@ from InnerEye.Common.output_directories import OutputFolderForTests
|
|||
from InnerEye.Common.type_annotations import TupleInt3
|
||||
from InnerEye.ML.dataset.sample import GeneralSampleMetadata
|
||||
from InnerEye.ML.dataset.scalar_sample import ScalarItem
|
||||
from InnerEye.ML.lightning_helpers import create_lightning_model
|
||||
from InnerEye.ML.lightning_models import ScalarLightning
|
||||
from InnerEye.ML.lightning_models import ScalarLightning, create_lightning_model
|
||||
from InnerEye.ML.models.architectures.base_model import DeviceAwareModule
|
||||
from InnerEye.ML.pipelines.scalar_inference import ScalarEnsemblePipeline, ScalarInferencePipeline, \
|
||||
ScalarInferencePipelineBase
|
||||
|
|
|
@ -1,30 +0,0 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import argparse
|
||||
|
||||
from InnerEye.ML.model_config_base import ModelConfigBase
|
||||
from Tests.ML.util import get_model_loader
|
||||
|
||||
MODEL_NAME = "DummyModelWithOverrideGroups"
|
||||
LOADER = get_model_loader("Tests.ML.configs")
|
||||
|
||||
|
||||
def test_script_params_override() -> None:
|
||||
# these are the parameters from the command line that should override
|
||||
# the initial parameters
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--l_rate",
|
||||
help="The name of the model to train/test.",
|
||||
type=float,
|
||||
default=1.0)
|
||||
args = parser.parse_args("")
|
||||
|
||||
try:
|
||||
config: ModelConfigBase = LOADER.create_model_config_from_name(model_name=MODEL_NAME, overrides=vars(args))
|
||||
# check that the values were changed
|
||||
assert config.l_rate == args.l_rate
|
||||
except ValueError:
|
||||
# (Temporarily) handle the case where there is no Lung config.
|
||||
pass
|
|
@ -7,10 +7,13 @@ from typing import List, Optional, Union
|
|||
|
||||
import pytest
|
||||
import torch
|
||||
from pandas import DataFrame
|
||||
|
||||
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
from InnerEye.ML.config import ModelArchitectureConfig, SegmentationModelBase, equally_weighted_classes
|
||||
from InnerEye.ML.deep_learning_config import DeepLearningConfig
|
||||
from InnerEye.ML.models.architectures.base_model import BaseSegmentationModel
|
||||
from InnerEye.ML.scalar_config import ScalarModelBase
|
||||
from InnerEye.ML.utils import ml_util
|
||||
|
@ -127,6 +130,35 @@ def test_equally_weighted_classes_fails(num_fg_clases: int, background_weight: O
|
|||
equally_weighted_classes(classes, background_weight)
|
||||
|
||||
|
||||
def test_fields_are_set() -> None:
|
||||
"""
|
||||
Tests that expected fields are set when creating config classes.
|
||||
"""
|
||||
expected = [("hello", None), ("world", None)]
|
||||
config = SegmentationModelBase(
|
||||
should_validate=False,
|
||||
ground_truth_ids=[x[0] for x in expected],
|
||||
largest_connected_component_foreground_classes=expected
|
||||
)
|
||||
assert hasattr(config, CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY)
|
||||
assert config.largest_connected_component_foreground_classes == expected
|
||||
|
||||
|
||||
@pytest.mark.cpu_and_gpu
|
||||
def test_dataset_reader_workers() -> None:
|
||||
"""
|
||||
Test to make sure that the number of dataset reader workers is set correctly.
|
||||
"""
|
||||
config = ScalarModelBase(
|
||||
should_validate=False,
|
||||
num_dataset_reader_workers=-1
|
||||
)
|
||||
if config.is_offline_run:
|
||||
assert config.num_dataset_reader_workers == -1
|
||||
else:
|
||||
assert config.num_dataset_reader_workers == 0
|
||||
|
||||
|
||||
def create_dataset_csv(test_output_dirs: OutputFolderForTests) -> Path:
|
||||
"""Create dummy dataset csv file for tests,
|
||||
deleting any pre-existing file."""
|
||||
|
@ -176,34 +208,46 @@ def test_dataset_csv_with_ScalarModelBase(
|
|||
assert model_config.dataset_data_frame is not None
|
||||
validate_dataset_paths(model_config)
|
||||
|
||||
|
||||
def test_unet3_num_downsampling_paths() -> None:
|
||||
for num_downsampling_paths in range(1, 5):
|
||||
j = int(2**num_downsampling_paths)
|
||||
j = int(2 ** num_downsampling_paths)
|
||||
|
||||
# Test that num_downsampling_paths for the built UNet3D
|
||||
# is set via model configuration
|
||||
crop_size = (j, j, j)
|
||||
config = SegmentationModelBase(
|
||||
architecture=ModelArchitectureConfig.UNet3D,
|
||||
image_channels=["ct"],
|
||||
feature_channels=[1],
|
||||
crop_size=crop_size,
|
||||
num_downsampling_paths=num_downsampling_paths,
|
||||
should_validate=False)
|
||||
architecture=ModelArchitectureConfig.UNet3D,
|
||||
image_channels=["ct"],
|
||||
feature_channels=[1],
|
||||
crop_size=crop_size,
|
||||
num_downsampling_paths=num_downsampling_paths,
|
||||
should_validate=False)
|
||||
network = build_net(config)
|
||||
assert network.num_downsampling_paths == num_downsampling_paths
|
||||
|
||||
# Test that an exception is raised if the crop size is smaller than is allowed
|
||||
# by num_downsampling_paths
|
||||
too_small_crop_size = (j//2, j//2, j//2)
|
||||
too_small_crop_size = (j // 2, j // 2, j // 2)
|
||||
ex_msg = f"Crop size is not valid. The required minimum is {crop_size}"
|
||||
config = SegmentationModelBase(
|
||||
architecture=ModelArchitectureConfig.UNet3D,
|
||||
image_channels=["ct"],
|
||||
feature_channels=[1],
|
||||
crop_size=too_small_crop_size,
|
||||
num_downsampling_paths=num_downsampling_paths,
|
||||
should_validate=False)
|
||||
architecture=ModelArchitectureConfig.UNet3D,
|
||||
image_channels=["ct"],
|
||||
feature_channels=[1],
|
||||
crop_size=too_small_crop_size,
|
||||
num_downsampling_paths=num_downsampling_paths,
|
||||
should_validate=False)
|
||||
with pytest.raises(ValueError) as ex:
|
||||
network = build_net(config)
|
||||
build_net(config)
|
||||
assert ex_msg in str(ex)
|
||||
|
||||
|
||||
def test_config_str() -> None:
|
||||
"""
|
||||
Check if dataframe fields are omitted from the string conversion of a config object.
|
||||
"""
|
||||
config = DeepLearningConfig()
|
||||
df = DataFrame(columns=["foobar"], data=[1.0, 2.0])
|
||||
config.dataset_data_frame = df
|
||||
s = str(config)
|
||||
assert "foobar" not in s, f"Incorrect output: {s}"
|
||||
|
|
|
@ -0,0 +1,100 @@
|
|||
5.749202728271484375e+00,1.045434713363647461e+00
|
||||
7.637505531311035156e+00,1.481036424636840820e+00
|
||||
9.329424858093261719e+00,1.917572140693664551e+00
|
||||
7.494448661804199219e+00,1.591190218925476074e+00
|
||||
9.689485549926757812e+00,1.971193432807922363e+00
|
||||
1.534464955329895020e+00,2.667523026466369629e-01
|
||||
9.561051368713378906e+00,1.685984492301940918e+00
|
||||
8.846873283386230469e+00,1.842781662940979004e+00
|
||||
5.290946006774902344e+00,1.096875548362731934e+00
|
||||
6.057560443878173828e+00,1.215266227722167969e+00
|
||||
9.478215217590332031e+00,2.150293827056884766e+00
|
||||
3.349403142929077148e+00,6.767664551734924316e-01
|
||||
4.251931190490722656e+00,8.223311901092529297e-01
|
||||
9.133169174194335938e+00,1.817616820335388184e+00
|
||||
5.868637561798095703e-01,7.238323986530303955e-02
|
||||
4.395127296447753906e-02,-1.203215569257736206e-01
|
||||
4.473074913024902344e+00,1.031657218933105469e+00
|
||||
9.513977050781250000e+00,1.858699083328247070e+00
|
||||
1.762510538101196289e+00,5.511313676834106445e-01
|
||||
3.059309720993041992e+00,7.563391923904418945e-01
|
||||
1.646029353141784668e+00,1.496532708406448364e-01
|
||||
4.897529602050781250e+00,1.009216904640197754e+00
|
||||
5.573034763336181641e+00,1.063154220581054688e+00
|
||||
6.369268894195556641e-01,1.111971661448478699e-01
|
||||
4.231463432312011719e+00,7.547589540481567383e-01
|
||||
2.101852893829345703e+00,4.300535917282104492e-01
|
||||
6.729665279388427734e+00,1.531016945838928223e+00
|
||||
7.961721420288085938e+00,1.478816986083984375e+00
|
||||
2.669513702392578125e+00,5.368775129318237305e-01
|
||||
4.574956893920898438e+00,8.576059341430664062e-01
|
||||
7.437694072723388672e-01,2.781907916069030762e-01
|
||||
5.983660221099853516e+00,1.187077164649963379e+00
|
||||
9.752596855163574219e+00,2.027858018875122070e+00
|
||||
9.338418960571289062e+00,2.025038957595825195e+00
|
||||
6.751420497894287109e+00,1.303008437156677246e+00
|
||||
9.554377555847167969e+00,1.930274367332458496e+00
|
||||
3.040063381195068359e-01,1.029240339994430542e-02
|
||||
2.818799018859863281e-01,2.614871561527252197e-01
|
||||
5.994813919067382812e+00,1.070150852203369141e+00
|
||||
5.795848369598388672e-01,-4.384742677211761475e-02
|
||||
3.211182355880737305e+00,5.655854344367980957e-01
|
||||
8.615511894226074219e+00,1.730698943138122559e+00
|
||||
4.950296401977539062e+00,9.849137663841247559e-01
|
||||
1.632133126258850098e+00,4.643072187900543213e-01
|
||||
5.252981662750244141e+00,1.020202517509460449e+00
|
||||
6.792118072509765625e+00,1.392252922058105469e+00
|
||||
2.290313720703125000e+00,2.924301028251647949e-01
|
||||
3.329365253448486328e+00,8.425519466400146484e-01
|
||||
3.469936370849609375e+00,6.026793718338012695e-01
|
||||
8.790910243988037109e-02,-2.861313149333000183e-02
|
||||
3.999347686767578125e+00,7.818984389305114746e-01
|
||||
2.891576290130615234e-01,1.004043519496917725e-01
|
||||
5.804258346557617188e+00,1.216606140136718750e+00
|
||||
3.836791992187500000e+00,9.209365844726562500e-01
|
||||
5.516016006469726562e+00,1.077136993408203125e+00
|
||||
7.987973213195800781e+00,1.436331987380981445e+00
|
||||
8.790102958679199219e+00,1.966201663017272949e+00
|
||||
7.359976291656494141e+00,1.414163231849670410e+00
|
||||
3.334070444107055664e+00,6.647097468376159668e-01
|
||||
6.998687744140625000e+00,1.236205577850341797e+00
|
||||
4.366195201873779297e+00,8.169019818305969238e-01
|
||||
5.620658397674560547e-02,1.258963048458099365e-01
|
||||
4.941163539886474609e+00,8.424454331398010254e-01
|
||||
5.723971366882324219e+00,1.067836642265319824e+00
|
||||
6.601081371307373047e+00,1.462573528289794922e+00
|
||||
2.285490512847900391e+00,3.970748484134674072e-01
|
||||
3.233198642730712891e+00,5.250911116600036621e-01
|
||||
8.005992889404296875e+00,1.783252477645874023e+00
|
||||
9.988401412963867188e+00,2.108428716659545898e+00
|
||||
3.176209926605224609e+00,6.435566544532775879e-01
|
||||
8.746008872985839844e+00,1.664346933364868164e+00
|
||||
1.034811139106750488e+00,7.043153047561645508e-02
|
||||
4.228623390197753906e+00,7.815699577331542969e-01
|
||||
8.936402320861816406e+00,1.719643831253051758e+00
|
||||
6.312811374664306641e+00,1.115693926811218262e+00
|
||||
2.998808622360229492e+00,6.670392155647277832e-01
|
||||
4.147662162780761719e+00,8.732877969741821289e-01
|
||||
7.289369106292724609e+00,1.516813516616821289e+00
|
||||
7.059363842010498047e+00,1.442374944686889648e+00
|
||||
8.922320365905761719e+00,1.986880540847778320e+00
|
||||
2.708734989166259766e+00,5.354607105255126953e-01
|
||||
7.997574329376220703e+00,1.465035080909729004e+00
|
||||
3.965347290039062500e+00,8.214159011840820312e-01
|
||||
6.266443729400634766e+00,1.197527050971984863e+00
|
||||
1.101133823394775391e+00,6.622195243835449219e-02
|
||||
5.336141586303710938e+00,9.389448165893554688e-01
|
||||
9.122401237487792969e+00,1.791326642036437988e+00
|
||||
5.871895790100097656e+00,1.136480450630187988e+00
|
||||
7.560163497924804688e+00,1.564274787902832031e+00
|
||||
9.699577093124389648e-01,3.698585033416748047e-01
|
||||
6.118352413177490234e+00,1.192414522171020508e+00
|
||||
2.855588197708129883e+00,7.185647487640380859e-01
|
||||
7.776415348052978516e-01,1.955290585756301880e-01
|
||||
5.909432888031005859e+00,1.285851240158081055e+00
|
||||
1.471748352050781250e+00,4.017572104930877686e-01
|
||||
7.064949989318847656e+00,1.341601967811584473e+00
|
||||
4.807097911834716797e+00,1.043601632118225098e+00
|
||||
6.570946693420410156e+00,1.190045952796936035e+00
|
||||
4.360112667083740234e+00,7.959681153297424316e-01
|
||||
9.832940101623535156e+00,1.984794259071350098e+00
|
|
|
@ -3,21 +3,27 @@
|
|||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import logging
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from typing import Any, List, Optional
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
|
||||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Azure.azure_util import get_results_blob_path
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.Common.common_util import OTHER_RUNS_SUBDIR_NAME, logging_section, logging_to_stdout
|
||||
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.ML.common import DATASET_CSV_FILE_NAME
|
||||
from InnerEye.ML.deep_learning_config import DeepLearningConfig
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
from InnerEye.ML.model_config_base import ModelConfigBase
|
||||
from InnerEye.ML.run_ml import MLRunner
|
||||
from InnerEye.ML.utils.run_recovery import RunRecovery
|
||||
from Tests.AfterTraining.test_after_training import FALLBACK_ENSEMBLE_RUN, FALLBACK_SINGLE_RUN, get_most_recent_run
|
||||
from Tests.ML.configs.DummyModel import DummyModel
|
||||
from Tests.ML.configs.lightning_test_containers import DummyContainerWithDatasets
|
||||
from Tests.ML.util import get_default_azure_config
|
||||
|
||||
logging_to_stdout(logging.DEBUG)
|
||||
|
@ -35,10 +41,6 @@ def runner_config() -> AzureConfig:
|
|||
return config
|
||||
|
||||
|
||||
def test_get_results_blob_path() -> None:
|
||||
assert get_results_blob_path("some_run_id") == "azureml/ExperimentRun/dcid.some_run_id"
|
||||
|
||||
|
||||
def check_single_checkpoint(downloaded_checkpoints: List[Path]) -> None:
|
||||
assert len(downloaded_checkpoints) == 1
|
||||
assert downloaded_checkpoints[0].is_file()
|
||||
|
@ -77,18 +79,22 @@ def test_download_best_checkpoints_ensemble_run(test_output_dirs: OutputFolderFo
|
|||
|
||||
def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> None:
|
||||
dataset_name = "test-dataset"
|
||||
config = ModelConfigBase(should_validate=False)
|
||||
config = DummyModel()
|
||||
config.local_dataset = None
|
||||
config.azure_dataset_id = ""
|
||||
azure_config = get_default_azure_config()
|
||||
runner = MLRunner(config, azure_config)
|
||||
runner.project_root = test_output_dirs.root_dir
|
||||
|
||||
runner = MLRunner(config, azure_config=azure_config)
|
||||
# If the model has neither local_dataset nor azure_dataset_id, mount_or_download_dataset should fail.
|
||||
with pytest.raises(ValueError):
|
||||
runner.mount_or_download_dataset()
|
||||
# This mounting call must happen before any other operations on the container, because even the model
|
||||
# creation may need access to the dataset.
|
||||
with pytest.raises(ValueError) as ex:
|
||||
runner.setup()
|
||||
assert ex.value.args[0] == "The model must contain either local_dataset or azure_dataset_id."
|
||||
runner.project_root = test_output_dirs.root_dir
|
||||
|
||||
# Pointing the model to a dataset folder that does not exist should raise an Exception
|
||||
fake_folder = runner.project_root / "foo"
|
||||
runner.model_config.local_dataset = fake_folder
|
||||
runner.container.local_dataset = fake_folder
|
||||
with pytest.raises(FileNotFoundError):
|
||||
runner.mount_or_download_dataset()
|
||||
|
||||
|
@ -98,8 +104,8 @@ def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> Non
|
|||
assert local_dataset == fake_folder
|
||||
|
||||
# Pointing the model to a dataset in Azure should trigger a download
|
||||
runner.model_config.local_dataset = None
|
||||
runner.model_config.azure_dataset_id = dataset_name
|
||||
runner.container.local_dataset = None
|
||||
runner.container.azure_dataset_id = dataset_name
|
||||
with logging_section("Starting download"):
|
||||
result_path = runner.mount_or_download_dataset()
|
||||
# Download goes into <project_root> / "datasets" / "test_dataset"
|
||||
|
@ -115,3 +121,127 @@ def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> Non
|
|||
for file in ["ct", "esophagus", "heart", "lung_l", "lung_r", "spinalcord"]:
|
||||
f = (sub_folder / file).with_suffix(".nii.gz")
|
||||
assert f.is_file()
|
||||
|
||||
|
||||
def _test_mount_for_lightning_container(test_output_dirs: OutputFolderForTests,
|
||||
is_offline_run: bool,
|
||||
local_dataset: Optional[Path],
|
||||
azure_dataset: str,
|
||||
is_lightning_model: bool) -> LightningContainer:
|
||||
config: Optional[DeepLearningConfig] = None
|
||||
container: Optional[LightningContainer] = None
|
||||
if is_lightning_model:
|
||||
container = DummyContainerWithDatasets()
|
||||
container.azure_dataset_id = azure_dataset
|
||||
container.local_dataset = local_dataset
|
||||
else:
|
||||
config = DummyModel()
|
||||
config.azure_dataset_id = azure_dataset
|
||||
config.local_dataset = local_dataset
|
||||
# The legacy InnerEye models require an existing dataset_csv file present in the dataset folder. Create that.
|
||||
download_path = test_output_dirs.root_dir / "downloaded"
|
||||
mount_path = test_output_dirs.root_dir / "mounted"
|
||||
if not is_lightning_model:
|
||||
for path in [download_path, mount_path]:
|
||||
path.mkdir(exist_ok=True)
|
||||
shutil.copy(full_ml_test_data_path(DATASET_CSV_FILE_NAME), path / DATASET_CSV_FILE_NAME)
|
||||
with mock.patch("InnerEye.ML.run_ml.MLRunner.is_offline_run", is_offline_run):
|
||||
with mock.patch("InnerEye.ML.run_ml.download_dataset", return_value=download_path):
|
||||
with mock.patch("InnerEye.ML.run_ml.try_to_mount_input_dataset", return_value=mount_path):
|
||||
runner = MLRunner(config, container=container,
|
||||
azure_config=None, project_root=test_output_dirs.root_dir)
|
||||
runner.setup()
|
||||
return runner.container
|
||||
|
||||
|
||||
@pytest.mark.parametrize(("is_lightning_model", "expected_error"),
|
||||
[
|
||||
# A built-in InnerEye model must have either local dataset or azure dataset provided.
|
||||
(False, "The model must contain either local_dataset or azure_dataset_id"),
|
||||
# ... but this is OK for Lightning container models. A Lightning container could simply
|
||||
# download its data from the web before training.
|
||||
(True, "")
|
||||
])
|
||||
def test_mount_failing_offline_runs(test_output_dirs: OutputFolderForTests,
|
||||
is_lightning_model: bool,
|
||||
expected_error: str) -> None:
|
||||
"""
|
||||
Test cases when MLRunner.mount_or_download_dataset raises an exception, when running outside AzureML.
|
||||
"""
|
||||
|
||||
def run() -> Any:
|
||||
return _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
|
||||
is_offline_run=True,
|
||||
local_dataset=None,
|
||||
azure_dataset="",
|
||||
is_lightning_model=is_lightning_model)
|
||||
|
||||
if expected_error:
|
||||
with pytest.raises(ValueError) as ex:
|
||||
run()
|
||||
assert expected_error in str(ex)
|
||||
else:
|
||||
assert run().local_dataset is None
|
||||
|
||||
|
||||
def test_mount_in_azureml1(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Test cases when MLRunner.mount_or_download_dataset runs inside AzureML.
|
||||
"""
|
||||
container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
|
||||
is_offline_run=False,
|
||||
local_dataset=None,
|
||||
azure_dataset="foo",
|
||||
is_lightning_model=False)
|
||||
assert "mounted" in str(container.local_dataset)
|
||||
|
||||
|
||||
def test_mount_in_azureml2(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Test cases when MLRunner.mount_or_download_dataset runs inside AzureML.
|
||||
"""
|
||||
container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
|
||||
is_offline_run=False,
|
||||
local_dataset=None,
|
||||
azure_dataset="",
|
||||
is_lightning_model=True)
|
||||
assert container.local_dataset is None
|
||||
|
||||
|
||||
def test_mount_or_download(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Tests the different combinations of local and Azure datasets, with InnerEye built-in and container models.
|
||||
"""
|
||||
root = test_output_dirs.root_dir
|
||||
for is_lightning_model in [True, False]:
|
||||
# With runs outside of AzureML, an AML dataset should get downloaded.
|
||||
container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
|
||||
is_offline_run=True,
|
||||
local_dataset=None,
|
||||
azure_dataset="foo",
|
||||
is_lightning_model=is_lightning_model)
|
||||
assert "downloaded" in str(container.local_dataset)
|
||||
# For all InnerEye built-in models, the paths from container level need to be copied down to legacy config
|
||||
# level.
|
||||
if not is_lightning_model:
|
||||
assert container.config.local_dataset == container.local_dataset
|
||||
# With runs in AzureML, an AML dataset should get mounted.
|
||||
container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
|
||||
is_offline_run=False,
|
||||
local_dataset=None,
|
||||
azure_dataset="foo",
|
||||
is_lightning_model=is_lightning_model)
|
||||
assert "mounted" in str(container.local_dataset)
|
||||
if not is_lightning_model:
|
||||
assert container.config.local_dataset == container.local_dataset
|
||||
|
||||
# With runs outside of AzureML, a local dataset should be used as-is. Azure dataset ID is ignored here.
|
||||
shutil.copy(full_ml_test_data_path(DATASET_CSV_FILE_NAME), root / DATASET_CSV_FILE_NAME)
|
||||
container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
|
||||
is_offline_run=True,
|
||||
local_dataset=root,
|
||||
azure_dataset="",
|
||||
is_lightning_model=is_lightning_model)
|
||||
assert container.local_dataset == root
|
||||
if not is_lightning_model:
|
||||
assert container.config.local_dataset == container.local_dataset
|
||||
|
|
|
@ -0,0 +1,229 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
from io import StringIO
|
||||
from unittest import mock
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from pytorch_lightning import LightningModule
|
||||
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.ML.common import ModelExecutionMode
|
||||
from InnerEye.ML.deep_learning_config import ARGS_TXT, DatasetParams, WorkflowParams
|
||||
from InnerEye.ML.lightning_base import InnerEyeContainer
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
from InnerEye.ML.model_config_base import ModelConfigBase
|
||||
from InnerEye.ML.run_ml import MLRunner
|
||||
from Tests.ML.configs.DummyModel import DummyModel
|
||||
from Tests.ML.configs.lightning_test_containers import DummyContainerWithModel, DummyContainerWithPlainLightning
|
||||
from Tests.ML.util import default_runner
|
||||
|
||||
|
||||
def test_run_container_in_situ(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Test if we can get the config loader to load a Lightning container model, and then train locally.
|
||||
"""
|
||||
runner = default_runner()
|
||||
local_dataset = test_output_dirs.root_dir / "dataset"
|
||||
local_dataset.mkdir()
|
||||
args = ["", "--model=DummyContainerWithModel", "--model_configs_namespace=Tests.ML.configs",
|
||||
f"--output_to={test_output_dirs.root_dir}", f"--local_dataset={local_dataset}"]
|
||||
with mock.patch("sys.argv", args):
|
||||
loaded_config, actual_run = runner.run()
|
||||
assert actual_run is None
|
||||
assert isinstance(runner.lightning_container, DummyContainerWithModel)
|
||||
# Test if the outputs folder is relative to the folder that we specified via the commandline
|
||||
runner.lightning_container.outputs_folder.relative_to(test_output_dirs.root_dir)
|
||||
results = runner.lightning_container.outputs_folder
|
||||
# Test that the setup method has been called
|
||||
assert runner.lightning_container.local_dataset is not None
|
||||
assert (runner.lightning_container.local_dataset / "setup.txt").is_file()
|
||||
# Test if all the files that are written during inference exist. Data for all 3 splits must be processed
|
||||
assert (results / "on_inference_start.txt").is_file()
|
||||
assert (results / "on_inference_end.txt").is_file()
|
||||
for mode in ModelExecutionMode:
|
||||
assert (results / f"on_inference_start_{mode.value}.txt").is_file()
|
||||
assert (results / f"on_inference_end_{mode.value}.txt").is_file()
|
||||
step_results = results / f"inference_step_{mode.value}.txt"
|
||||
assert step_results.is_file()
|
||||
# We should have one line per data item, and there are around 6 of them
|
||||
result_lines = [line for line in step_results.read_text().splitlines() if line.strip()]
|
||||
assert len(result_lines) >= 5
|
||||
metrics_per_split = pd.read_csv(results / "metrics_per_split.csv")
|
||||
# Training should have reduced the MSE to pretty much zero.
|
||||
expected = pd.read_csv(StringIO("""Split,MSE
|
||||
Test,1e-7
|
||||
Val,1e-7
|
||||
Train,1e-7"""))
|
||||
pd.testing.assert_frame_equal(metrics_per_split, expected, check_less_precise=True)
|
||||
# Test if we have an args file that lists all parameters
|
||||
args_file = (results / ARGS_TXT).read_text()
|
||||
assert "Container:" in args_file
|
||||
assert "adam_betas" in args_file
|
||||
# Report generation must run
|
||||
assert (results / "create_report.txt").is_file()
|
||||
|
||||
|
||||
def test_run_container_with_plain_lightning_in_situ(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Test if we can train a plain Lightning model, without any additional methods defined, end-to-end.
|
||||
"""
|
||||
runner = default_runner()
|
||||
local_dataset = test_output_dirs.root_dir / "dataset"
|
||||
local_dataset.mkdir()
|
||||
args = ["", "--model=DummyContainerWithPlainLightning", "--model_configs_namespace=Tests.ML.configs",
|
||||
f"--output_to={test_output_dirs.root_dir}", f"--local_dataset={local_dataset}"]
|
||||
with mock.patch("sys.argv", args):
|
||||
loaded_config, actual_run = runner.run()
|
||||
assert actual_run is None
|
||||
assert isinstance(runner.lightning_container, DummyContainerWithPlainLightning)
|
||||
# Test if the outputs folder is relative to the folder that we specified via the commandline
|
||||
runner.lightning_container.outputs_folder.relative_to(test_output_dirs.root_dir)
|
||||
results = runner.lightning_container.outputs_folder
|
||||
# Test if all the files that are written during inference exist.
|
||||
assert not (results / "on_inference_start.txt").is_file()
|
||||
assert (results / "test_step.txt").is_file()
|
||||
|
||||
|
||||
def test_innereye_container_init() -> None:
|
||||
"""
|
||||
Test if the constructor of the InnerEye container copies attributes as expected.
|
||||
"""
|
||||
# The constructor should copy all fields that belong to either WorkflowParams or DatasetParams from the
|
||||
# config object to the container.
|
||||
for (attrib, type_) in [("weights_url", WorkflowParams), ("azure_dataset_id", DatasetParams)]:
|
||||
config = ModelConfigBase()
|
||||
assert hasattr(type_, attrib)
|
||||
assert hasattr(config, attrib)
|
||||
setattr(config, attrib, "foo")
|
||||
container = InnerEyeContainer(config)
|
||||
assert getattr(container, attrib) == "foo"
|
||||
|
||||
|
||||
def test_create_fastmri_container() -> None:
|
||||
"""
|
||||
Test if we can create a model that uses the fastMRI submodule. This is effectively just testing module imports,
|
||||
and whether the submodule has been set up correctly.
|
||||
"""
|
||||
from InnerEye.ML.configs.other.fastmri_varnet import VarNetWithImageLogging
|
||||
from Tests.ML.configs.fastmri_random import FastMriOnRandomData
|
||||
FastMriOnRandomData()
|
||||
VarNetWithImageLogging()
|
||||
|
||||
|
||||
@pytest.mark.gpu
|
||||
def test_run_fastmri_container(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Test if we can run the fastMRI model end-to-end. This takes about 2min on a CPU machine, hence it is only run
|
||||
in AzureML.
|
||||
"""
|
||||
runner = default_runner()
|
||||
dataset_dir = test_output_dirs.root_dir / "dataset"
|
||||
dataset_dir.mkdir(parents=True)
|
||||
args = ["", "--model=FastMriOnRandomData",
|
||||
f"--output_to={test_output_dirs.root_dir}",
|
||||
"--model_configs_namespace=Tests.ML.configs"]
|
||||
with mock.patch("sys.argv", args):
|
||||
loaded_config, actual_run = runner.run()
|
||||
assert actual_run is None
|
||||
from Tests.ML.configs.fastmri_random import FastMriOnRandomData
|
||||
assert isinstance(runner.lightning_container, FastMriOnRandomData)
|
||||
|
||||
|
||||
def test_model_name_is_set(test_output_dirs: OutputFolderForTests) -> None:
|
||||
container = DummyContainerWithModel()
|
||||
container.local_dataset = test_output_dirs.root_dir
|
||||
runner = MLRunner(model_config=None, container=container)
|
||||
runner.setup()
|
||||
expected_name = "DummyContainerWithModel"
|
||||
assert runner.container._model_name == expected_name
|
||||
assert expected_name in str(runner.container.outputs_folder)
|
||||
|
||||
|
||||
def test_model_name_for_innereye_container() -> None:
|
||||
"""
|
||||
Test if the InnerEye container picks up the name of the model correctly. The name will impact the output folder
|
||||
structure that is created.
|
||||
"""
|
||||
expected_name = "DummyModel"
|
||||
model = DummyModel()
|
||||
assert model.model_name == expected_name
|
||||
container = InnerEyeContainer(model)
|
||||
assert container.model_name == expected_name
|
||||
|
||||
|
||||
class DummyContainerWithFields(LightningContainer):
|
||||
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
self.perform_training_set_inference = True
|
||||
self.num_epochs = 123456
|
||||
self.l_rate = 1e-2
|
||||
|
||||
def create_model(self) -> LightningModule:
|
||||
return LightningModule()
|
||||
|
||||
|
||||
def test_container_to_str() -> None:
|
||||
"""
|
||||
Test what the string representation of a container looks like.
|
||||
"""
|
||||
c = DummyContainerWithFields()
|
||||
# Set an additional field that is not declared via the params library
|
||||
c.foo = "bar"
|
||||
s = str(c)
|
||||
print(s)
|
||||
assert "foo" in s
|
||||
assert "bar" in s
|
||||
assert "param" not in s
|
||||
assert "initialized" not in s
|
||||
assert "123456" in s
|
||||
|
||||
|
||||
def test_file_system_with_subfolders(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Test if a subfolder can be created within the output folder structure, for use with cross validation.
|
||||
"""
|
||||
model = DummyModel()
|
||||
model.set_output_to(test_output_dirs.root_dir)
|
||||
container = InnerEyeContainer(model)
|
||||
# File system should be copied from model config to container
|
||||
assert container.file_system_config == model.file_system_config
|
||||
runner = MLRunner(model_config=model)
|
||||
runner.setup()
|
||||
assert str(runner.container.outputs_folder).endswith(model.model_name)
|
||||
output_subfolder = "foo"
|
||||
expected_folder = runner.container.outputs_folder / output_subfolder
|
||||
runner = MLRunner(model_config=model, output_subfolder=output_subfolder)
|
||||
runner.setup()
|
||||
assert runner.container.outputs_folder == expected_folder
|
||||
|
||||
|
||||
def test_optim_params1(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Test if the optimizer parameters are read correctly for InnerEye configs.
|
||||
"""
|
||||
model = DummyModel()
|
||||
model.set_output_to(test_output_dirs.root_dir)
|
||||
runner = MLRunner(model_config=model)
|
||||
runner.setup()
|
||||
lightning_model = runner.container.model
|
||||
optim, _ = lightning_model.configure_optimizers()
|
||||
assert optim[0].param_groups[0]["lr"] == 1e-3
|
||||
|
||||
|
||||
def test_optim_params2(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
Test if the optimizer parameters are read correctly for containers.
|
||||
"""
|
||||
container = DummyContainerWithModel()
|
||||
container.local_dataset = test_output_dirs.root_dir
|
||||
runner = MLRunner(model_config=None, container=container)
|
||||
runner.setup()
|
||||
lightning_model = runner.container.model
|
||||
optim, _ = lightning_model.configure_optimizers()
|
||||
expected_lr = 1e-1
|
||||
assert container.l_rate == expected_lr
|
||||
assert optim[0].param_groups[0]["lr"] == expected_lr
|
|
@ -14,9 +14,8 @@ from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, ModelExecu
|
|||
from InnerEye.ML.configs.classification.DummyClassification import DummyClassification
|
||||
from InnerEye.ML.metrics import InferenceMetricsForClassification
|
||||
from InnerEye.ML.model_testing import model_test
|
||||
from InnerEye.ML.model_training import model_train
|
||||
from InnerEye.ML.utils.run_recovery import RunRecovery
|
||||
from Tests.ML.util import get_default_checkpoint_handler
|
||||
from Tests.ML.util import get_default_checkpoint_handler, model_train_unittest
|
||||
|
||||
|
||||
# @pytest.mark.parametrize("mean_teacher_model", [True, False])
|
||||
|
@ -35,10 +34,8 @@ def test_recover_testing_from_run_recovery(mean_teacher_model: bool,
|
|||
os.makedirs(str(config.outputs_folder))
|
||||
config.recovery_checkpoint_save_interval = 2
|
||||
|
||||
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
|
||||
project_root=test_output_dirs.root_dir)
|
||||
train_results = model_train(config, checkpoint_handler=checkpoint_handler)
|
||||
assert len(train_results.train_results_per_epoch) == config.num_epochs
|
||||
train_results, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
|
||||
assert len(train_results.train_results_per_epoch()) == config.num_epochs
|
||||
|
||||
# Run inference on this
|
||||
test_results = model_test(config=config, data_split=ModelExecutionMode.TEST, checkpoint_handler=checkpoint_handler)
|
||||
|
|
|
@ -19,7 +19,6 @@ from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME, is_windows, l
|
|||
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
|
||||
from InnerEye.Common.metrics_constants import MetricType, TrackedMetrics, VALIDATION_PREFIX
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.ML import model_training
|
||||
from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, DATASET_CSV_FILE_NAME, ModelExecutionMode, \
|
||||
RECOVERY_CHECKPOINT_FILE_NAME_WITH_SUFFIX, \
|
||||
STORED_CSV_FILE_NAMES
|
||||
|
@ -27,15 +26,15 @@ from InnerEye.ML.config import MixtureLossComponent, SegmentationLoss
|
|||
from InnerEye.ML.configs.classification.DummyClassification import DummyClassification
|
||||
from InnerEye.ML.dataset.sample import CroppedSample
|
||||
from InnerEye.ML.deep_learning_config import DeepLearningConfig
|
||||
from InnerEye.ML.model_training import aggregate_and_create_subject_metrics_file, model_train
|
||||
from InnerEye.ML.lightning_loggers import StoringLogger
|
||||
from InnerEye.ML.model_training import aggregate_and_create_subject_metrics_file
|
||||
from InnerEye.ML.models.losses.mixture import MixtureLoss
|
||||
from InnerEye.ML.utils.io_util import load_nifti_image
|
||||
from InnerEye.ML.utils.model_util import create_segmentation_loss_function
|
||||
from InnerEye.ML.utils.run_recovery import RunRecovery
|
||||
from InnerEye.ML.utils.training_util import ModelTrainingResults
|
||||
from InnerEye.ML.visualizers.patch_sampling import PATCH_SAMPLING_FOLDER
|
||||
from Tests.ML.configs.DummyModel import DummyModel
|
||||
from Tests.ML.util import get_default_checkpoint_handler, machine_has_gpu
|
||||
from Tests.ML.util import get_default_checkpoint_handler, machine_has_gpu, model_train_unittest
|
||||
|
||||
config_path = full_ml_test_data_path()
|
||||
base_path = full_ml_test_data_path()
|
||||
|
@ -102,22 +101,24 @@ def _test_model_train(output_dirs: OutputFolderForTests,
|
|||
train_config.recovery_checkpoint_save_interval = 1
|
||||
|
||||
if machine_has_gpu:
|
||||
expected_train_losses = [0.4553468, 0.454904]
|
||||
expected_val_losses = [0.4553881, 0.4553041]
|
||||
expected_train_losses = [0.4552919, 0.4548529]
|
||||
expected_val_losses = [0.455389, 0.455306]
|
||||
else:
|
||||
expected_train_losses = [0.4553469, 0.4548947]
|
||||
expected_val_losses = [0.4553880, 0.4553041]
|
||||
expected_train_losses = [0.4552919, 0.4548538]
|
||||
expected_val_losses = [0.4553891, 0.4553060]
|
||||
loss_absolute_tolerance = 1e-6
|
||||
expected_learning_rates = [train_config.l_rate, 5.3589e-4]
|
||||
|
||||
checkpoint_handler = get_default_checkpoint_handler(model_config=train_config,
|
||||
project_root=Path(output_dirs.root_dir))
|
||||
model_training_result = model_training.model_train(train_config,
|
||||
checkpoint_handler=checkpoint_handler)
|
||||
assert isinstance(model_training_result, ModelTrainingResults)
|
||||
model_training_result, _ = model_train_unittest(train_config, dirs=output_dirs)
|
||||
assert isinstance(model_training_result, StoringLogger)
|
||||
|
||||
actual_train_losses = model_training_result.get_train_metric(MetricType.LOSS.value)
|
||||
actual_val_losses = model_training_result.get_val_metric(MetricType.LOSS.value)
|
||||
print("actual_train_losses = {}".format(actual_train_losses))
|
||||
print("actual_val_losses = {}".format(actual_val_losses))
|
||||
|
||||
def assert_all_close(metric: str, expected: List[float], **kwargs: Any) -> None:
|
||||
actual = model_training_result.get_training_metric(metric)
|
||||
actual = model_training_result.get_train_metric(metric)
|
||||
assert np.allclose(actual, expected, **kwargs), f"Mismatch for {metric}: Got {actual}, expected {expected}"
|
||||
|
||||
# check to make sure training batches are NOT all the same across epochs
|
||||
|
@ -135,28 +136,24 @@ def _test_model_train(output_dirs: OutputFolderForTests,
|
|||
# and be the same across 'region' and 'region_1' because they derive from the same Nifti files.
|
||||
# The following values are read off directly from the results of compute_dice_across_patches in the training loop
|
||||
# This checks that averages are computed correctly, and that metric computers are reset after each epoch.
|
||||
train_voxels = [[83092.0, 83212.0, 82946.0], [83000.0, 82881.0, 83309.0]]
|
||||
train_voxels = [[82860.0, 83212.0, 83087.0], [82831.0, 82900.0, 83212.0]]
|
||||
val_voxels = [[82765.0, 83212.0], [82765.0, 83212.0]]
|
||||
_check_voxel_count(model_training_result.train_results_per_epoch, _mean_list(train_voxels), "Train")
|
||||
_check_voxel_count(model_training_result.val_results_per_epoch, _mean_list(val_voxels), "Val")
|
||||
_check_voxel_count(model_training_result.train_results_per_epoch(), _mean_list(train_voxels), "Train")
|
||||
_check_voxel_count(model_training_result.val_results_per_epoch(), _mean_list(val_voxels), "Val")
|
||||
|
||||
actual_train_losses = model_training_result.get_training_metric(MetricType.LOSS.value)
|
||||
actual_val_losses = model_training_result.get_validation_metric(MetricType.LOSS.value)
|
||||
print("actual_train_losses = {}".format(actual_train_losses))
|
||||
print("actual_val_losses = {}".format(actual_val_losses))
|
||||
assert np.allclose(actual_train_losses, expected_train_losses, atol=loss_absolute_tolerance), "Train losses"
|
||||
assert np.allclose(actual_val_losses, expected_val_losses, atol=loss_absolute_tolerance), "Val losses"
|
||||
# Check that the metric we track for Hyperdrive runs is actually written.
|
||||
assert TrackedMetrics.Val_Loss.value.startswith(VALIDATION_PREFIX)
|
||||
tracked_metric = TrackedMetrics.Val_Loss.value[len(VALIDATION_PREFIX):]
|
||||
for val_result in model_training_result.val_results_per_epoch:
|
||||
for val_result in model_training_result.val_results_per_epoch():
|
||||
assert tracked_metric in val_result
|
||||
|
||||
# The following values are read off directly from the results of compute_dice_across_patches in the
|
||||
# training loop. Results are slightly different for CPU, hence use a larger tolerance there.
|
||||
dice_tolerance = 1e-4 if machine_has_gpu else 4.5e-4
|
||||
train_dice_region = [[0.0, 0.0, 4.0282e-04], [0.0309, 0.0334, 0.0961]]
|
||||
train_dice_region1 = [[0.4806, 0.4800, 0.4832], [0.4812, 0.4842, 0.4663]]
|
||||
# training loop. Results are slightly different for GPU, hence use a larger tolerance there.
|
||||
dice_tolerance = 1e-3 if machine_has_gpu else 4.5e-4
|
||||
train_dice_region = [[0.0, 0.0, 4.0282e-04], [0.0372, 0.0388, 0.1091]]
|
||||
train_dice_region1 = [[0.4785, 0.4807, 0.4834], [0.4832, 0.4800, 0.4628]]
|
||||
# There appears to be some amount of non-determinism here: When using a tolerance of 1e-4, we get occasional
|
||||
# test failures on Linux in the cloud (not on Windows, not on AzureML). Unclear where it comes from. Even when
|
||||
# failing here, the losses match up to the expected tolerance.
|
||||
|
@ -192,10 +189,10 @@ def _test_model_train(output_dirs: OutputFolderForTests,
|
|||
assert len(list(sampling_folder.rglob("*.png"))) == 3 * train_config.show_patch_sampling
|
||||
|
||||
# Time per epoch: Test that we have all these times logged.
|
||||
model_training_result.get_training_metric(MetricType.SECONDS_PER_EPOCH.value)
|
||||
model_training_result.get_validation_metric(MetricType.SECONDS_PER_EPOCH.value)
|
||||
model_training_result.get_validation_metric(MetricType.SECONDS_PER_BATCH.value)
|
||||
model_training_result.get_training_metric(MetricType.SECONDS_PER_BATCH.value)
|
||||
model_training_result.get_train_metric(MetricType.SECONDS_PER_EPOCH.value)
|
||||
model_training_result.get_val_metric(MetricType.SECONDS_PER_EPOCH.value)
|
||||
model_training_result.get_val_metric(MetricType.SECONDS_PER_BATCH.value)
|
||||
model_training_result.get_train_metric(MetricType.SECONDS_PER_BATCH.value)
|
||||
|
||||
# Issue #372
|
||||
# # Test for saving of example images
|
||||
|
@ -323,9 +320,7 @@ def test_recover_training_mean_teacher_model(test_output_dirs: OutputFolderForTe
|
|||
|
||||
# First round of training
|
||||
config.num_epochs = 2
|
||||
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
|
||||
project_root=test_output_dirs.root_dir)
|
||||
model_train(config, checkpoint_handler=checkpoint_handler)
|
||||
model_train_unittest(config, dirs=test_output_dirs)
|
||||
assert len(list(config.checkpoint_folder.glob("*.*"))) == 2
|
||||
|
||||
# Restart training from previous run
|
||||
|
@ -336,9 +331,13 @@ def test_recover_training_mean_teacher_model(test_output_dirs: OutputFolderForTe
|
|||
# make it seem like run recovery objects have been downloaded
|
||||
checkpoint_root = config.checkpoint_folder / "old_run"
|
||||
shutil.copytree(str(original_checkpoint_folder), str(checkpoint_root))
|
||||
|
||||
# Create a new checkpoint handler and set run_recovery to the copied checkpoints
|
||||
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
|
||||
project_root=test_output_dirs.root_dir)
|
||||
checkpoint_handler.run_recovery = RunRecovery([checkpoint_root])
|
||||
|
||||
model_train(config, checkpoint_handler=checkpoint_handler)
|
||||
model_train_unittest(config, dirs=test_output_dirs, checkpoint_handler=checkpoint_handler)
|
||||
# remove recovery checkpoints
|
||||
shutil.rmtree(checkpoint_root)
|
||||
assert len(list(config.checkpoint_folder.glob("*.*"))) == 2
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
# ------------------------------------------------------------------------------------------
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Optional, Union
|
||||
from typing import Any, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
@ -15,15 +15,21 @@ from azureml.core import Workspace
|
|||
from InnerEye.Azure.azure_config import AzureConfig
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.Common.type_annotations import PathOrString, TupleInt3
|
||||
from InnerEye.ML.config import SegmentationModelBase
|
||||
from InnerEye.ML.dataset.full_image_dataset import PatientDatasetSource
|
||||
from InnerEye.ML.dataset.sample import PatientMetadata, Sample
|
||||
from InnerEye.ML.deep_learning_config import DeepLearningConfig
|
||||
from InnerEye.ML.lightning_base import InnerEyeContainer
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
from InnerEye.ML.lightning_loggers import StoringLogger
|
||||
from InnerEye.ML.model_training import model_train
|
||||
from InnerEye.ML.photometric_normalization import PhotometricNormalization
|
||||
from InnerEye.ML.run_ml import MLRunner
|
||||
from InnerEye.ML.runner import Runner
|
||||
from InnerEye.ML.utils import io_util
|
||||
from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
|
||||
from InnerEye.ML.utils.config_util import ModelConfigLoader
|
||||
from InnerEye.ML.utils.config_loader import ModelConfigLoader
|
||||
from InnerEye.ML.utils.io_util import ImageHeader, ImageWithHeader
|
||||
from InnerEye.ML.utils.ml_util import is_gpu_available
|
||||
|
||||
|
@ -185,12 +191,12 @@ def assert_binary_files_match(actual_file: Path, expected_file: Path) -> None:
|
|||
DummyPatientMetadata = PatientMetadata(patient_id='42')
|
||||
|
||||
|
||||
def get_model_loader(namespace: Optional[str] = None) -> ModelConfigLoader[SegmentationModelBase]:
|
||||
def get_model_loader(namespace: Optional[str] = None) -> ModelConfigLoader:
|
||||
"""
|
||||
Returns a ModelConfigLoader for segmentation models, with the given non-default namespace (if not None)
|
||||
to search under.
|
||||
"""
|
||||
return ModelConfigLoader[SegmentationModelBase](model_configs_namespace=namespace)
|
||||
return ModelConfigLoader(model_configs_namespace=namespace)
|
||||
|
||||
|
||||
def get_default_azure_config() -> AzureConfig:
|
||||
|
@ -206,7 +212,9 @@ def get_default_checkpoint_handler(model_config: DeepLearningConfig, project_roo
|
|||
Gets a checkpoint handler, using the given model config and the default azure configuration.
|
||||
"""
|
||||
azure_config = get_default_azure_config()
|
||||
return CheckpointHandler(azure_config=azure_config, model_config=model_config,
|
||||
lightning_container = InnerEyeContainer(model_config)
|
||||
return CheckpointHandler(azure_config=azure_config,
|
||||
container=lightning_container,
|
||||
project_root=project_root)
|
||||
|
||||
|
||||
|
@ -216,3 +224,46 @@ def get_default_workspace() -> Workspace:
|
|||
:return:
|
||||
"""
|
||||
return get_default_azure_config().get_workspace()
|
||||
|
||||
|
||||
def model_train_unittest(config: Optional[DeepLearningConfig],
|
||||
dirs: OutputFolderForTests,
|
||||
checkpoint_handler: Optional[CheckpointHandler] = None,
|
||||
lightning_container: Optional[LightningContainer] = None) -> \
|
||||
Tuple[StoringLogger, CheckpointHandler]:
|
||||
"""
|
||||
A shortcut for running model training in the unit test suite. It runs training for the given config, with the
|
||||
default checkpoint handler initialized to point to the test output folder specified in dirs.
|
||||
:param config: The configuration of the model to train.
|
||||
:param dirs: The test fixture that provides an output folder for the test.
|
||||
:param lightning_container: An optional LightningContainer object that will be passed through to the training routine.
|
||||
:param checkpoint_handler: The checkpoint handler that should be used for training. If not provided, it will be
|
||||
created via get_default_checkpoint_handler.
|
||||
:return: A tuple of the StoringLogger that holds the training metrics, and the checkpoint handler that was used.
|
||||
"""
|
||||
runner = MLRunner(model_config=config, container=lightning_container)
|
||||
# Setup will set random seeds before model creation, and set the model in the container.
|
||||
# The container that has been set up here is then used for training below.
|
||||
# For all tests running in AzureML, we need to skip the downloading of datasets that would otherwise happen,
|
||||
# because all unit test configs come with their own local dataset already.
|
||||
runner.setup(use_mount_or_download_dataset=False)
|
||||
if checkpoint_handler is None:
|
||||
azure_config = get_default_azure_config()
|
||||
checkpoint_handler = CheckpointHandler(azure_config=azure_config,
|
||||
container=runner.container,
|
||||
project_root=dirs.root_dir)
|
||||
_, storing_logger = model_train(checkpoint_handler=checkpoint_handler,
|
||||
container=runner.container)
|
||||
return storing_logger, checkpoint_handler # type: ignore
|
||||
|
||||
|
||||
def default_runner() -> Runner:
|
||||
"""
|
||||
Create an InnerEye Runner object with the default settings, pointing to the repository root and
|
||||
default settings files.
|
||||
"""
|
||||
return Runner(project_root=fixed_paths.repository_root_directory(),
|
||||
yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)
|
||||
|
||||
|
||||
model_loader_including_tests = get_model_loader(namespace="Tests.ML.configs")
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import pytest
|
||||
|
@ -49,20 +50,20 @@ def test_use_local_weights_file(test_output_dirs: OutputFolderForTests) -> None:
|
|||
assert not checkpoint_handler.local_weights_path
|
||||
|
||||
# weights from local_weights_path and weights_url will be modified if needed and stored at this location
|
||||
expected_path = checkpoint_handler.model_config.outputs_folder / WEIGHTS_FILE
|
||||
expected_path = checkpoint_handler.output_params.outputs_folder / WEIGHTS_FILE
|
||||
|
||||
# Set a weights_path
|
||||
checkpoint_handler.azure_config.run_recovery_id = ""
|
||||
config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
|
||||
checkpoint_handler.container.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
|
||||
checkpoint_handler.download_recovery_checkpoints_or_weights()
|
||||
assert checkpoint_handler.local_weights_path == expected_path
|
||||
assert checkpoint_handler.local_weights_path.is_file()
|
||||
|
||||
# set a local_weights_path
|
||||
config.weights_url = ""
|
||||
checkpoint_handler.container.weights_url = ""
|
||||
local_weights_path = test_output_dirs.root_dir / "exist.pth"
|
||||
create_checkpoint_file(local_weights_path)
|
||||
config.local_weights_path = local_weights_path
|
||||
checkpoint_handler.container.local_weights_path = local_weights_path
|
||||
checkpoint_handler.download_recovery_checkpoints_or_weights()
|
||||
assert checkpoint_handler.local_weights_path == expected_path
|
||||
|
||||
|
@ -113,35 +114,35 @@ def test_get_recovery_path_train(test_output_dirs: OutputFolderForTests) -> None
|
|||
assert checkpoint_handler.get_recovery_path_train() is None
|
||||
|
||||
# weights from local_weights_path and weights_url will be modified if needed and stored at this location
|
||||
expected_path = checkpoint_handler.model_config.outputs_folder / WEIGHTS_FILE
|
||||
expected_path = checkpoint_handler.output_params.outputs_folder / WEIGHTS_FILE
|
||||
|
||||
# Set a weights_url to get checkpoint from
|
||||
checkpoint_handler.azure_config.run_recovery_id = ""
|
||||
config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
|
||||
checkpoint_handler.container.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
|
||||
checkpoint_handler.download_recovery_checkpoints_or_weights()
|
||||
assert checkpoint_handler.local_weights_path == expected_path
|
||||
config.start_epoch = 0
|
||||
checkpoint_handler.container.start_epoch = 0
|
||||
assert checkpoint_handler.get_recovery_path_train() == expected_path
|
||||
# Can't resume training from an external checkpoint
|
||||
config.start_epoch = 20
|
||||
checkpoint_handler.container.start_epoch = 20
|
||||
with pytest.raises(ValueError) as ex:
|
||||
checkpoint_handler.get_recovery_path_train()
|
||||
assert ex.value.args == "Start epoch is > 0, but no run recovery object has been provided to resume training."
|
||||
assert ex.value.args[0] == "Start epoch is > 0, but no run recovery object has been provided to resume training."
|
||||
|
||||
# Set a local_weights_path to get checkpoint from
|
||||
config.weights_url = ""
|
||||
checkpoint_handler.container.weights_url = ""
|
||||
local_weights_path = test_output_dirs.root_dir / "exist.pth"
|
||||
create_checkpoint_file(local_weights_path)
|
||||
config.local_weights_path = local_weights_path
|
||||
checkpoint_handler.container.local_weights_path = local_weights_path
|
||||
checkpoint_handler.download_recovery_checkpoints_or_weights()
|
||||
assert checkpoint_handler.local_weights_path == expected_path
|
||||
config.start_epoch = 0
|
||||
checkpoint_handler.container.start_epoch = 0
|
||||
assert checkpoint_handler.get_recovery_path_train() == expected_path
|
||||
# Can't resume training from an external checkpoint
|
||||
config.start_epoch = 20
|
||||
checkpoint_handler.container.start_epoch = 20
|
||||
with pytest.raises(ValueError) as ex:
|
||||
checkpoint_handler.get_recovery_path_train()
|
||||
assert ex.value.args == "Start epoch is > 0, but no run recovery object has been provided to resume training."
|
||||
assert ex.value.args[0] == "Start epoch is > 0, but no run recovery object has been provided to resume training."
|
||||
|
||||
|
||||
@pytest.mark.after_training_single_run
|
||||
|
@ -162,7 +163,7 @@ def test_get_recovery_path_train_single_run(test_output_dirs: OutputFolderForTes
|
|||
assert "Run recovery set, but start epoch is 0" in ex.value.args[0]
|
||||
|
||||
# Run recovery with start epoch provided should succeed
|
||||
config.start_epoch = 20
|
||||
checkpoint_handler.container.start_epoch = 20
|
||||
expected_path = create_recovery_checkpoint_path(path=config.checkpoint_folder / run_recovery_id.split(":")[1])
|
||||
assert checkpoint_handler.get_recovery_path_train() == expected_path
|
||||
|
||||
|
@ -200,7 +201,7 @@ def test_get_best_checkpoint_single_run(test_output_dirs: OutputFolderForTests)
|
|||
checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
|
||||
checkpoint_handler.download_recovery_checkpoints_or_weights()
|
||||
|
||||
config.start_epoch = 1
|
||||
checkpoint_handler.container.start_epoch = 1
|
||||
# There is no checkpoint in the current run - use the one from run_recovery
|
||||
checkpoint_paths = checkpoint_handler.get_best_checkpoint()
|
||||
expected_checkpoint = config.checkpoint_folder / run_recovery_id.split(":")[1] \
|
||||
|
@ -246,16 +247,16 @@ def test_get_checkpoints_to_test(test_output_dirs: OutputFolderForTests) -> None
|
|||
# so the local weights should be used ignoring any epochs to test
|
||||
local_weights_path = test_output_dirs.root_dir / "exist.pth"
|
||||
create_checkpoint_file(local_weights_path)
|
||||
config.local_weights_path = local_weights_path
|
||||
manage_recovery.container.local_weights_path = local_weights_path
|
||||
manage_recovery.download_recovery_checkpoints_or_weights()
|
||||
checkpoint_and_paths = manage_recovery.get_checkpoints_to_test()
|
||||
assert checkpoint_and_paths
|
||||
assert len(checkpoint_and_paths) == 1
|
||||
assert checkpoint_and_paths[0] == manage_recovery.model_config.outputs_folder / WEIGHTS_FILE
|
||||
assert checkpoint_and_paths[0] == manage_recovery.output_params.outputs_folder / WEIGHTS_FILE
|
||||
|
||||
config.start_epoch = 1
|
||||
manage_recovery.container.start_epoch = 1
|
||||
manage_recovery.additional_training_done()
|
||||
config.checkpoint_folder.mkdir()
|
||||
manage_recovery.container.checkpoint_folder.mkdir()
|
||||
|
||||
# Copy checkpoint to make it seem like training has happened
|
||||
expected_checkpoint = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
|
||||
|
@ -325,13 +326,13 @@ def test_get_local_weights_path_or_download(test_output_dirs: OutputFolderForTes
|
|||
# If local_weights_path folder exists, get_local_weights_path_or_download should not do anything.
|
||||
local_weights_path = manage_recovery.project_root / "exist.pth"
|
||||
create_checkpoint_file(local_weights_path)
|
||||
manage_recovery.model_config.local_weights_path = local_weights_path
|
||||
manage_recovery.container.local_weights_path = local_weights_path
|
||||
returned_weights_path = manage_recovery.get_local_weights_path_or_download()
|
||||
assert local_weights_path == returned_weights_path
|
||||
|
||||
# Pointing the model to a URL should trigger a download
|
||||
config.local_weights_path = None
|
||||
config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
|
||||
manage_recovery.container.local_weights_path = None
|
||||
manage_recovery.container.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
|
||||
downloaded_weights = manage_recovery.get_local_weights_path_or_download()
|
||||
# Download goes into <project_root> / "modelweights" / "resnet18-5c106cde.pth"
|
||||
expected_path = manage_recovery.project_root / MODEL_WEIGHTS_DIR_NAME / \
|
||||
|
@ -361,7 +362,7 @@ def test_get_and_modify_local_weights(test_output_dirs: OutputFolderForTests) ->
|
|||
assert "neither local_weights_path nor weights_url is set in the model config" in ex.value.args[0]
|
||||
|
||||
# Pointing the model to a local_weights_path that does not exist will raise an error.
|
||||
config.local_weights_path = manage_recovery.project_root / "non_exist"
|
||||
manage_recovery.container.local_weights_path = manage_recovery.project_root / "non_exist"
|
||||
with pytest.raises(FileNotFoundError) as file_ex:
|
||||
manage_recovery.get_and_save_modified_weights()
|
||||
assert "Could not find the weights file" in file_ex.value.args[0]
|
||||
|
@ -369,39 +370,42 @@ def test_get_and_modify_local_weights(test_output_dirs: OutputFolderForTests) ->
|
|||
# Test that weights are properly modified when a local_weights_path is set
|
||||
|
||||
# set a method to modify weights:
|
||||
ModelConfigBase.load_checkpoint_and_modify = lambda self, path_to_checkpoint: {"modified": "local", # type: ignore
|
||||
"path": path_to_checkpoint}
|
||||
# Set the local_weights_path to an empty file, which will be passed to modify_checkpoint
|
||||
local_weights_path = manage_recovery.project_root / "exist.pth"
|
||||
create_checkpoint_file(local_weights_path)
|
||||
config.local_weights_path = local_weights_path
|
||||
weights_path = manage_recovery.get_and_save_modified_weights()
|
||||
expected_path = manage_recovery.model_config.outputs_folder / WEIGHTS_FILE
|
||||
# read from weights_path and check that the dict has been written
|
||||
assert weights_path.is_file()
|
||||
assert expected_path == weights_path
|
||||
read = torch.load(str(weights_path))
|
||||
assert read.keys() == {"modified", "path"}
|
||||
assert read["modified"] == "local"
|
||||
assert read["path"] == local_weights_path
|
||||
# clean up
|
||||
weights_path.unlink()
|
||||
with mock.patch.object(ModelConfigBase,
|
||||
'load_checkpoint_and_modify',
|
||||
lambda self, path_to_checkpoint: {"modified": "local", # type: ignore
|
||||
"path": path_to_checkpoint}):
|
||||
# Set the local_weights_path to an empty file, which will be passed to modify_checkpoint
|
||||
local_weights_path = manage_recovery.project_root / "exist.pth"
|
||||
create_checkpoint_file(local_weights_path)
|
||||
manage_recovery.container.local_weights_path = local_weights_path
|
||||
weights_path = manage_recovery.get_and_save_modified_weights()
|
||||
expected_path = manage_recovery.output_params.outputs_folder / WEIGHTS_FILE
|
||||
# read from weights_path and check that the dict has been written
|
||||
assert weights_path.is_file()
|
||||
assert expected_path == weights_path
|
||||
read = torch.load(str(weights_path))
|
||||
assert read.keys() == {"modified", "path"}
|
||||
assert read["modified"] == "local"
|
||||
assert read["path"] == local_weights_path
|
||||
# clean up
|
||||
weights_path.unlink()
|
||||
|
||||
# Test that weights are properly modified when weights_url is set
|
||||
|
||||
# set a different method to modify weights, to avoid using old files from other tests:
|
||||
ModelConfigBase.load_checkpoint_and_modify = lambda self, path_to_checkpoint: {"modified": "url", # type: ignore
|
||||
"path": path_to_checkpoint}
|
||||
# Set the weights_url to the sample pytorch URL, which will be passed to modify_checkpoint
|
||||
config.local_weights_path = None
|
||||
config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
|
||||
weights_path = manage_recovery.get_and_save_modified_weights()
|
||||
expected_path = manage_recovery.model_config.outputs_folder / WEIGHTS_FILE
|
||||
# read from weights_path and check that the dict has been written
|
||||
assert weights_path.is_file()
|
||||
assert expected_path == weights_path
|
||||
read = torch.load(str(weights_path))
|
||||
assert read.keys() == {"modified", "path"}
|
||||
assert read["modified"] == "url"
|
||||
assert read["path"] == manage_recovery.project_root / MODEL_WEIGHTS_DIR_NAME / \
|
||||
os.path.basename(urlparse(EXTERNAL_WEIGHTS_URL_EXAMPLE).path)
|
||||
with mock.patch.object(ModelConfigBase,
|
||||
'load_checkpoint_and_modify',
|
||||
lambda self, path_to_checkpoint: {"modified": "url", "path": path_to_checkpoint}):
|
||||
# Set the weights_url to the sample pytorch URL, which will be passed to modify_checkpoint
|
||||
manage_recovery.container.local_weights_path = None
|
||||
manage_recovery.container.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
|
||||
weights_path = manage_recovery.get_and_save_modified_weights()
|
||||
expected_path = manage_recovery.output_params.outputs_folder / WEIGHTS_FILE
|
||||
# read from weights_path and check that the dict has been written
|
||||
assert weights_path.is_file()
|
||||
assert expected_path == weights_path
|
||||
read = torch.load(str(weights_path))
|
||||
assert read.keys() == {"modified", "path"}
|
||||
assert read["modified"] == "url"
|
||||
assert read["path"] == manage_recovery.project_root / MODEL_WEIGHTS_DIR_NAME / \
|
||||
os.path.basename(urlparse(EXTERNAL_WEIGHTS_URL_EXAMPLE).path)
|
||||
|
|
|
@ -161,7 +161,7 @@ def _create_lr_scheduler_and_optimizer(config: SegmentationModelBase, optimizer:
|
|||
if optimizer is None:
|
||||
optimizer = _create_dummy_optimizer(config)
|
||||
# create lr scheduler
|
||||
lr_scheduler = SchedulerWithWarmUp(config, optimizer)
|
||||
lr_scheduler = SchedulerWithWarmUp(config, optimizer, num_epochs=config.num_epochs)
|
||||
return lr_scheduler, optimizer
|
||||
|
||||
|
||||
|
@ -215,7 +215,7 @@ def test_lr_scheduler_with_warmup(warmup_epochs: int, expected_values: List[floa
|
|||
l_rate_warmup_epochs=warmup_epochs,
|
||||
l_rate_warmup=LRWarmUpType.Linear,
|
||||
should_validate=False)
|
||||
scheduler = SchedulerWithWarmUp(config, optimizer)
|
||||
scheduler = SchedulerWithWarmUp(config, optimizer, num_epochs=config.num_epochs)
|
||||
lrs = enumerate_scheduler(scheduler, 4)
|
||||
assert lrs == expected_values
|
||||
|
||||
|
|
|
@ -15,7 +15,9 @@ from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, LAST_CHECK
|
|||
LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, RECOVERY_CHECKPOINT_FILE_NAME, RECOVERY_CHECKPOINT_FILE_NAME_WITH_SUFFIX, \
|
||||
cleanup_checkpoint_folder, keep_best_checkpoint, keep_latest
|
||||
from InnerEye.ML.config import SegmentationModelBase
|
||||
from InnerEye.ML.lightning_helpers import create_lightning_model, load_from_checkpoint_and_adjust_for_inference
|
||||
from InnerEye.ML.lightning_base import InnerEyeContainer
|
||||
from InnerEye.ML.lightning_helpers import load_from_checkpoint_and_adjust_for_inference
|
||||
from InnerEye.ML.lightning_models import create_lightning_model
|
||||
from InnerEye.ML.model_config_base import ModelConfigBase
|
||||
from InnerEye.ML.model_training import create_lightning_trainer
|
||||
from Tests.ML.configs.ClassificationModelForTesting import ClassificationModelForTesting
|
||||
|
@ -34,7 +36,8 @@ def create_model_and_store_checkpoint(config: ModelConfigBase, checkpoint_path:
|
|||
:param config: The model configuration.
|
||||
:param checkpoint_path: The path and filename of the checkpoint file.
|
||||
"""
|
||||
trainer, _ = create_lightning_trainer(config)
|
||||
container = InnerEyeContainer(config)
|
||||
trainer, _ = create_lightning_trainer(container)
|
||||
model = create_lightning_model(config)
|
||||
if machine_has_gpu:
|
||||
model = model.cuda() # type: ignore
|
||||
|
|
|
@ -397,7 +397,7 @@ def test_run_ml_with_multi_label_sequence_in_crossval(test_output_dirs: OutputFo
|
|||
config.number_of_cross_validation_splits = 2
|
||||
azure_config = get_default_azure_config()
|
||||
azure_config.train = True
|
||||
MLRunner(config, azure_config).run()
|
||||
MLRunner(config, azure_config=azure_config).run()
|
||||
|
||||
|
||||
def test_load_files_with_prediction_target() -> None:
|
||||
|
|
|
@ -148,3 +148,18 @@ jobs:
|
|||
parameters:
|
||||
pytest_mark: after_training_glaucoma_cv_run
|
||||
test_run_title: tests_after_training_glaucoma_cv_run
|
||||
|
||||
- job: TrainHelloWorld
|
||||
variables:
|
||||
- name: model
|
||||
value: 'HelloWorld'
|
||||
- name: tag
|
||||
value: 'HelloWorldPR'
|
||||
pool:
|
||||
vmImage: 'ubuntu-18.04'
|
||||
steps:
|
||||
- template: train_template.yml
|
||||
parameters:
|
||||
wait_for_completion: 'True'
|
||||
pytest_mark: ''
|
||||
max_run_duration: '1h'
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
steps:
|
||||
- checkout: self
|
||||
lfs: true
|
||||
submodules: true
|
||||
|
||||
- bash: |
|
||||
if [ $(Agent.OS) = 'Windows_NT' ]
|
||||
|
|
|
@ -0,0 +1,229 @@
|
|||
# Bring Your Own PyTorch Lightning Model
|
||||
|
||||
The InnerEye toolbox is capable of training any PyTorch Lightning (PL) model inside of AzureML, making
|
||||
use of all the usual InnerEye toolbox features:
|
||||
- Working with different models in the same codebase, and selecting one by name
|
||||
- Distributed training in AzureML
|
||||
- Logging via AzureML's native capabilities
|
||||
- Training on a local GPU machine or inside of AzureML without code changes
|
||||
- Supplying commandline overrides for model configuration elements, to quickly queue many jobs
|
||||
|
||||
This can be used by
|
||||
- Defining a special container class that encapsulates the PyTorch Lightning model to train, and the data that should
|
||||
be used for training and testing.
|
||||
- Adding essential trainer parameters like number of epochs to that container.
|
||||
- Invoking the InnerEye runner and providing the name of the container class, like this:
|
||||
`python InnerEye/ML/runner.py --model=MyContainer`. To train in AzureML, just add a `--azureml=True` flag.
|
||||
|
||||
There is a fully working example, [HelloContainer](../InnerEye/ML/configs/other/HelloContainer.py), that implements
|
||||
a simple 1-dimensional regression model from data stored in a CSV file. You can run that
|
||||
from the command line with `python InnerEye/ML/runner.py --model=HelloContainer`.
|
||||
|
||||
## Setup
|
||||
|
||||
In order to use these capabilities, you need to implement a class deriving from `LightningContainer`. This class
|
||||
encapsulates everything that is needed for training with PyTorch Lightning:
|
||||
- The `create_model` method needs to return a subclass of `LightningModule` that has
|
||||
all the usual PyTorch Lightning methods required for training, like the `training_step` and `forward` methods. This
|
||||
object needs to adhere to additional constraints, see below.
|
||||
- The `get_data_module` method of the container needs to return a `LightningDataModule` that has the data loaders for
|
||||
training and validation data.
|
||||
- The optional `get_inference_data_module` returns a `LightningDataModule` that is used to read the data for inference
|
||||
(that is, evaluating the trained model). By default, this returns the same data as `get_data_module`, but you
|
||||
can override this for special models like segmentation models that are trained on equal sized image patches, but
|
||||
evaluated on full images of varying size.
|
||||
|
||||
Your class needs to be defined in a Python file in the `InnerEye/ML/configs` folder, otherwise it won't be picked up
|
||||
correctly. If you'd like to have your model defined in a different folder, please specify the Python namespace via
|
||||
the `--model_configs_namespace` argument. For example, use `--model_configs_namespace=My.Own.configs` if your
|
||||
model configuration classes reside in folder `My/Own/configs` from the repository root.
|
||||
|
||||
*Example*:
|
||||
```python
|
||||
from pathlib import Path
|
||||
from torch.utils.data import DataLoader
|
||||
from pytorch_lightning import LightningModule, LightningDataModule
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
|
||||
class MyLightningModel(LightningModule):
|
||||
def __init__(self):
|
||||
        super().__init__()
        self.layer = ...
|
||||
def training_step(self, *args, **kwargs):
|
||||
...
|
||||
def forward(self, *args, **kwargs):
|
||||
...
|
||||
def configure_optimizers(self):
|
||||
...
|
||||
def test_step(self, *args, **kwargs):
|
||||
...
|
||||
|
||||
class MyDataModule(LightningDataModule):
|
||||
def __init__(self, root_path: Path):
|
||||
        super().__init__()
        # All data should be read from the folder given in self.root_path
|
||||
self.root_path = root_path
|
||||
def train_dataloader(self, *args, **kwargs) -> DataLoader:
|
||||
...
|
||||
def val_dataloader(self, *args, **kwargs) -> DataLoader:
|
||||
# The data should be read off self.root_path
|
||||
...
|
||||
def test_dataloader(self, *args, **kwargs) -> DataLoader:
|
||||
# The data should be read off self.root_path
|
||||
...
|
||||
|
||||
class MyContainer(LightningContainer):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.azure_dataset_id = "folder_name_in_azure_blob_storage"
|
||||
        self.local_dataset = Path("/some/local/path")
|
||||
self.num_epochs = 42
|
||||
|
||||
def create_model(self) -> LightningModule:
|
||||
return MyLightningModel()
|
||||
|
||||
def get_data_module(self) -> LightningDataModule:
|
||||
return MyDataModule(root_path=self.local_dataset)
|
||||
```
|
||||
|
||||
Where does the data for training come from?
|
||||
- When training a model on a local box or VM, the data is read from the `local_dataset` folder that you define in the
|
||||
container.
|
||||
- When training a model in AzureML, the code searches for a folder called `folder_name_in_azure_blob_storage` in
|
||||
Azure blob storage. That folder is then downloaded or mounted. The resulting local path is then copied into the `local_dataset`
|
||||
field in the container, and hence you can always read data from `self.local_dataset`.
|
||||
- Alternatively, you can use the `prepare_data` method of a `LightningDataModule` to download data from the web,
|
||||
for example. In this case, you don't need to define any of the `local_dataset` or `azure_dataset_id` fields.
|
||||
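
For the last of these options, a minimal sketch of a data module that provides its own data could look like this
(the synthetic data here stands in for whatever download you would perform in `prepare_data`; only standard
`LightningDataModule` hooks are used, and the folder name is made up):
```python
from pathlib import Path

import torch
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning import LightningDataModule


class MySelfDownloadingDataModule(LightningDataModule):
    def __init__(self, download_dir: Path = Path("downloaded_data")) -> None:
        super().__init__()
        self.download_dir = download_dir

    def prepare_data(self, *args, **kwargs) -> None:
        # Called once per node: fetch the raw data if it is not yet present, e.g. via an HTTP download.
        self.download_dir.mkdir(parents=True, exist_ok=True)

    def setup(self, stage=None) -> None:
        # Build the datasets from the downloaded files. Synthetic data keeps this sketch self-contained.
        x = torch.rand(100, 1)
        self.train_set = TensorDataset(x, 2.0 * x)

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        return DataLoader(self.train_set, batch_size=10)
```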
|
||||
In the above example, training is done for 42 epochs. After the model is trained, it will be evaluated on the test set,
|
||||
via PyTorch Lightning's [built-in test functionality](https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html?highlight=trainer.test#test).
|
||||
See below for an alternative way of running the evaluation on the test set.
|
||||
|
||||
### Outputting files during training
|
||||
|
||||
The Lightning model returned by `create_model` needs to write its output files to the current working directory.
|
||||
When running the InnerEye toolbox outside of AzureML, the toolbox will change the current working directory to a
|
||||
newly created output folder, with a name that contains the timestamp and the model name.
|
||||
When running the InnerEye toolbox in AzureML, the folder structure will be set up such that all files written
|
||||
to the current working directory are later uploaded to Azure blob storage at the end of the AzureML job. The files
|
||||
will also be later available via the AzureML UI.
|
||||
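
As an illustration, a model could write a small summary file per epoch into that folder like this (a sketch only;
the file name is made up, and `Path.cwd()` is the output folder described above):
```python
from pathlib import Path
from pytorch_lightning import LightningModule


class ModelThatWritesOutputs(LightningModule):
    def on_train_epoch_end(self, *args, **kwargs) -> None:
        # The current working directory is the run's output folder, both locally and in AzureML.
        with (Path.cwd() / "epoch_summary.txt").open("a") as out_file:
            out_file.write(f"Finished epoch {self.current_epoch}\n")
```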
|
||||
### Trainer arguments

All arguments that control the PyTorch Lightning `Trainer` object are defined in the class `TrainerParams`. A
`LightningContainer` object inherits from this class. The most essential field is `num_epochs`, which controls
the `max_epochs` argument of the `Trainer`.

Usage example:
```python
from pytorch_lightning import LightningModule, LightningDataModule
from InnerEye.ML.lightning_container import LightningContainer


class MyContainer(LightningContainer):
    def __init__(self):
        super().__init__()
        self.num_epochs = 42

    def create_model(self) -> LightningModule:
        return MyLightningModel()

    def get_data_module(self) -> LightningDataModule:
        return MyDataModule(root_path=self.local_dataset)
```

For further details on how the `TrainerParams` are used, refer to the `create_lightning_trainer` method in
[InnerEye/ML/model_training.py](../InnerEye/ML/model_training.py). A simplified sketch of that wiring follows.

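The snippet below is only a rough sketch of how these settings reach the `Trainer`; the real implementation lives in
`create_lightning_trainer`, and the helper function name here is hypothetical:

```python
from pytorch_lightning import Trainer

from InnerEye.ML.lightning_container import LightningContainer


def create_trainer_sketch(container: LightningContainer) -> Trainer:
    # num_epochs from TrainerParams becomes the Trainer's max_epochs; any additional
    # arguments supplied via get_trainer_arguments() are passed straight through.
    return Trainer(max_epochs=container.num_epochs, **container.get_trainer_arguments())
```
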
### Optimizer and LR scheduler arguments

There are two possible ways of choosing the optimizer and LR scheduler:

- The Lightning model returned by `create_model` can define its own `configure_optimizers` method, with the same
  signature as `LightningModule.configure_optimizers`. This is the typical way of configuring it for Lightning models.
  A minimal sketch of this route is shown after this list.
- Alternatively, the model can inherit from `LightningModuleWithOptimizer`. This class implements a
  `configure_optimizers` method that uses settings defined in the `OptimizerParams` class. These settings are all
  available from the command line, and you can, for example, start a new run with a different learning rate by
  supplying the additional commandline flag `--l_rate=1e-2`.

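The sketch below illustrates the first route; the choice of Adam with a StepLR scheduler is only an example, not
something the toolbox prescribes:

```python
import torch
from pytorch_lightning import LightningModule


class ModelWithOwnOptimizer(LightningModule):
    def configure_optimizers(self):
        # Return any optimizer/scheduler combination that Lightning accepts.
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
        return [optimizer], [scheduler]
```
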
### Evaluating the trained model

The InnerEye toolbox provides two possible routes for implementing this evaluation:

You can either use PyTorch Lightning's built-in capabilities, via the `test_step` method. If the model that is
returned by `create_model` implements the `test_step` method, the InnerEye toolbox will use the `trainer.test` method
(see [docs](https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html?highlight=trainer.test#test)).
In this case, the best checkpoint from training will be used. The test data is read via the data loader created
by the `test_dataloader` of the `LightningDataModule` that is used for training/validation.

Alternatively, the model can implement the methods defined in `InnerEyeInference`. In this case, the methods will be
called in this order:
```
model.on_inference_start()
for dataset_split in [Train, Val, Test]:
    model.on_inference_epoch_start(dataset_split, is_ensemble_model=False)
    for batch_idx, item in enumerate(dataloader[dataset_split]):
        model_outputs = model.forward(item)
        model.inference_step(item, batch_idx, model_outputs)
    model.on_inference_epoch_end()
model.on_inference_end()
```

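Purely as an illustration, a model implementing these hooks might collect results per split as sketched below. The
argument names and type annotations are inferred from the call sequence above and may differ in detail from the actual
`InnerEyeInference` interface:

```python
from typing import Any, Dict, List

from pytorch_lightning import LightningModule


class ModelWithInferenceHooks(LightningModule):
    def on_inference_start(self) -> None:
        # Called once before inference on any of the data splits.
        self.inference_results: Dict[Any, List[Any]] = {}

    def on_inference_epoch_start(self, dataset_split: Any, is_ensemble_model: bool) -> None:
        # Called once per split (Train, Val, Test).
        self.current_split = dataset_split
        self.inference_results[dataset_split] = []

    def inference_step(self, item: Any, batch_idx: int, model_outputs: Any) -> None:
        # Called per batch, with the outputs of model.forward(item). Store whatever you need.
        self.inference_results[self.current_split].append(model_outputs)

    def on_inference_epoch_end(self) -> None:
        # Aggregate or write out the results for the split that has just finished.
        ...

    def on_inference_end(self) -> None:
        # Final reporting after all splits have been processed.
        ...
```
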
## Overriding properties on the commandline

You can define hyperparameters that affect data and/or model, as in the following code snippet:
```python
import param
from pytorch_lightning import LightningModule
from InnerEye.ML.lightning_container import LightningContainer


class DummyContainerWithParameters(LightningContainer):
    num_layers = param.Integer(default=4)

    def create_model(self) -> LightningModule:
        return MyLightningModel(self.num_layers)
    ...
```
All parameters added in this form are automatically accessible from the commandline; there is no need to define
a separate argument parser. When starting training, you can add a flag like `--num_layers=7`.

## Examples

### Setting only the required fields

```python
from pytorch_lightning import LightningModule, LightningDataModule
from InnerEye.ML.lightning_container import LightningContainer


class Container1(LightningContainer):
    def __init__(self):
        super().__init__()
        self.azure_dataset_id = "some_folder_in_azure"
        self.num_epochs = 20

    def create_model(self) -> LightningModule:
        return MyLightningModel()

    def get_data_module(self) -> LightningDataModule:
        # This should read data from self.local_dataset. Before training, the data folder "some_folder_in_azure"
        # (given by self.azure_dataset_id) will be downloaded or mounted, and its local path set in
        # self.local_dataset
        return MyDataModule(root_path=self.local_dataset)
```

### Adding additional arguments for the PyTorch Lightning trainer

```python
from typing import Dict, Any

from pytorch_lightning import LightningModule, LightningDataModule
from InnerEye.ML.lightning_container import LightningContainer


class Container2(LightningContainer):
    def __init__(self):
        super().__init__()
        self.azure_dataset_id = "some_folder_in_azure"
        self.num_epochs = 20

    def create_model(self) -> LightningModule:
        return MyLightningModel()

    def get_data_module(self) -> LightningDataModule:
        # This should read data from self.local_dataset. Before training, the data folder "some_folder_in_azure"
        # (given by self.azure_dataset_id) will be downloaded or mounted, and its local path set in
        # self.local_dataset
        return MyDataModule(root_path=self.local_dataset)

    def get_trainer_arguments(self) -> Dict[str, Any]:
        # These arguments will be passed through to the Lightning trainer.
        return {"gradient_clip_val": 1, "limit_train_batches": 10}
```

@ -27,7 +27,7 @@ dependencies:
- lightning-bolts==0.3.1
- matplotlib==3.3.0
- mlflow==1.12.1
- mypy==0.770
- mypy==0.812
- mypy-extensions==0.4.3
- numba==0.51.2
- numpy==1.19.1

@ -47,6 +47,7 @@ dependencies:
- pytorch-lightning==1.2.8
- rich==5.1.1
- rpdb==0.1.6
- runstats==1.8.0
- scikit-image==0.17.2
- scikit-learn==0.23.2
- scipy==1.5.2

@ -0,0 +1 @@
Subproject commit f2070aeb7a5e7d1b0e45c6aad247d18d074705a8

@ -5,10 +5,10 @@
import os
import subprocess
import sys
from pathlib import Path
from typing import List
from shutil import which
from argparse import ArgumentParser
from pathlib import Path
from shutil import which
from typing import List


def run_mypy(files: List[str], mypy_executable_path: str) -> int:

@ -23,51 +23,24 @@ def run_mypy(files: List[str], mypy_executable_path: str) -> int:
:return: maximum return code from any of the mypy runs
"""
return_code = 0
iteration = 1
while files:
dirs = sorted(set(os.path.dirname(file) or "." for file in files))
print(f"Iteration {iteration}: running mypy on {len(files)} files in {len(dirs)} directories")
# Set of files we are hoping to see mentioned in the mypy log.
files_to_do = set(files)
for index, dir in enumerate(dirs, 1):
# Adding "--no-site-packages" might be necessary if there are errors in site packages,
# but it may stop inconsistencies with site packages being spotted.
command = [mypy_executable_path, "--config=mypy.ini", "--verbose", dir]
print(f"Processing directory {index:2d} of {len(dirs)}: {Path(dir).absolute()}")
print(f"Running mypy on {len(files)} files")
for index, file in enumerate(files):
print(f"Processing {(index+1):2d} of {len(files)}: {file}")
file_path = Path(file)
mypy_args = []
if file_path.is_file():
mypy_args = [file]
elif file_path.is_dir():
# There is a bug in recent mypy versions, complaining about duplicate files when telling
# mypy to scan a directory. Telling it to scan a namespace avoids this bug.
mypy_args = ["-p", file.replace(os.path.sep, ".")]
else:
print("Skipping.")
if mypy_args:
command = [mypy_executable_path, "--config=mypy.ini", *mypy_args]
# We pipe stdout and then print it, otherwise lines can appear in the wrong order in builds.
process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
process = subprocess.run(command)
return_code = max(return_code, process.returncode)
for line in process.stdout.split("\n"):
if line and not line.startswith("Success: "):
tokens = line.split(":")
if line.startswith("Found") or len(tokens) < 2:
print(line)
else:
print(f"{Path.cwd() / tokens[0]}:{':'.join(tokens[1:])}")

# Remove from files_to_do every Python file that's reported as processed in the log.
for line in process.stderr.split("\n"):
tokens = line.split()
if len(tokens) == 4 and tokens[0] == "LOG:" and tokens[1] == "Parsing":
name = tokens[2]
elif len(tokens) == 7 and tokens[:4] == ["LOG:", "Metadata", "fresh", "for"]:
name = tokens[-1]
else:
continue
if name.endswith(".py"):
if name.startswith("./") or name.startswith(".\\"):
name = name[2:]
files_to_do.discard(name)
# If we didn't manage to discard any files, there's no point continuing. This should not occur, but if
# it does, we don't want to continue indefinitely.
if len(files_to_do) == len(files):
print("No further files appear to have been checked! Unchecked files are:")
for file in sorted(files_to_do):
print(f"  {file}")
return_code = max(return_code, 1)
break
files = sorted(files_to_do)
iteration += 1
return return_code


@ -83,17 +56,11 @@ def main() -> int:
args = parser.parse_args()
current_dir = Path(".")
if args.files:
file_list = [Path(arg) for arg in args.files if arg.endswith(".py")]
file_list = args.files
else:
# We don't want to check the files in the submodule if any, partly because they should already have
# been checked in the original repo, and partly because we don't want the module name clashes mypy would
# otherwise report.
submodule_name = "innereye-deeplearning"
files = set(current_dir.glob('*.py'))
for path in current_dir.glob('*'):
if path.name != submodule_name:
files.update(path.rglob('*.py'))
file_list = list(files)
file_list = list(str(f) for f in current_dir.glob('*.py'))
for dir in ["InnerEye", "Tests", "TestsOutsidePackage", "TestSubmodule"]:
file_list.append(dir)

mypy = args.mypy or which("mypy")
if not mypy:

5
score.py

@ -26,7 +26,7 @@ from InnerEye.ML.model_testing import DEFAULT_RESULT_IMAGE_NAME
from InnerEye.ML.photometric_normalization import PhotometricNormalization
from InnerEye.ML.pipelines.ensemble import EnsemblePipeline
from InnerEye.ML.pipelines.inference import FullImageInferencePipelineBase, InferencePipeline
from InnerEye.ML.utils.config_util import ModelConfigLoader
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.utils.io_util import ImageWithHeader, load_nifti_image, reverse_tuple_float3, store_as_ubyte_nifti, \
load_dicom_series_and_save

@ -67,8 +67,7 @@ def init_from_model_inference_json(model_folder: Path, use_gpu: bool = True) ->
logging.info(f'model_inference_config: {model_inference_config}')
full_path_to_checkpoints = [model_folder / x for x in model_inference_config.checkpoint_paths]
logging.info(f'full_path_to_checkpoints: {full_path_to_checkpoints}')
loader = ModelConfigLoader[SegmentationModelBase](
model_configs_namespace=model_inference_config.model_configs_namespace)
loader = ModelConfigLoader(model_configs_namespace=model_inference_config.model_configs_namespace)
model_config = loader.create_model_config_from_name(model_name=model_inference_config.model_name)
return create_inference_pipeline(model_config, full_path_to_checkpoints, use_gpu)