Enable Bring-your-own-Lightning-model (#417)

- Enable bringing arbitrary PyTorch Lightning models to the InnerEye toolbox (a minimal sketch of the new workflow follows below)
- Upgrade mypy and simplify the way we invoke it
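
As a rough sketch of what "bring your own model" looks like after this change (the container, model and data module names below are hypothetical; the real entry point is the LightningContainer class added in this PR and demonstrated by the HelloContainer example further down):

from pathlib import Path
from pytorch_lightning import LightningDataModule, LightningModule
from InnerEye.ML.lightning_container import LightningContainer

class MyContainer(LightningContainer):
    # Hypothetical container: wraps an arbitrary LightningModule so the InnerEye runner can train it.
    def __init__(self) -> None:
        super().__init__()
        self.local_dataset = Path("datasets/my_dataset")  # hypothetical dataset folder for local runs
        self.num_epochs = 10

    def create_model(self) -> LightningModule:
        return MyLightningModel()  # any plain LightningModule; hypothetical class

    def get_data_module(self) -> LightningDataModule:
        return MyDataModule(root_folder=self.local_dataset)  # hypothetical LightningDataModule

A container like this would then be trained via `python InnerEye/ML/runner.py --model=MyContainer`, or with `--azureml=True` appended to submit the run to AzureML, mirroring the HelloContainer example below.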
This commit is contained in:
Anton Schwaighofer 2021-04-19 16:28:41 +01:00 committed by GitHub
Parent 780e420973
Commit 0d479ba3d8
No key matching this signature was found
GPG key ID: 4AEE18F83AFDEB23
72 changed files: 3176 additions and 1301 deletions

View file

@ -2,3 +2,4 @@
ignore = E226,E302,E41,W391, E701, W291, E722, W503, E128, E126, E127, E731, E401
max-line-length = 160
max-complexity = 25
exclude = fastMRI/

View file

@ -45,6 +45,13 @@ jobs:
PYTHONPATH: ${{ github.workspace }}
if: always()
- name: Run HelloContainer model
run: |
$CONDA/envs/InnerEye/bin/python ./InnerEye/ML/runner.py --model=HelloContainer
env:
PYTHONPATH: ${{ github.workspace }}
if: always()
windows:
runs-on: windows-latest
steps:

3
.gitmodules vendored Normal file
View file

@ -0,0 +1,3 @@
[submodule "fastMRI"]
path = fastMRI
url = https://github.com/facebookresearch/fastMRI

View file

@ -13,6 +13,9 @@ created.
### Added
- ([#417](https://github.com/microsoft/InnerEye-DeepLearning/pull/417)) Added a generic way of adding PyTorch Lightning
models to the toolbox. It is now possible to train almost any Lightning model with the InnerEye toolbox in AzureML,
with only minimum code changes required. See [the MD documentation](docs/bring_your_own_model.md) for details.
- ([#430](https://github.com/microsoft/InnerEye-DeepLearning/pull/430)) Update conversion to 1.0.1 InnerEye-DICOM-RT to
add: manufacturer, SoftwareVersions, Interpreter and ROIInterpretedTypes.
- ([#385](https://github.com/microsoft/InnerEye-DeepLearning/pull/385)) Add the ability to train a model on multiple
@ -70,6 +73,7 @@ created.
- ([#437](https://github.com/microsoft/InnerEye-DeepLearning/pull/437)) Fixed multi-node DDP bug in PL v1.2.8. Re-add
end-to-end test for multi-node.
### Removed
- ([#417](https://github.com/microsoft/InnerEye-DeepLearning/pull/417)) Removed an output file that only contains metadata for a legacy consumer
### Deprecated

View file

@ -275,19 +275,6 @@ class SourceConfig:
self.script_params = retained_args
@dataclass
class ExperimentResultLocation:
"""
Information that is needed to recover where the results of an experiment reside.
"""
results_container_name: Optional[str] = None
results_uri: Optional[str] = None
dataset_folder: Optional[str] = None
dataset_uri: Optional[str] = None
azure_job_name: Optional[str] = None
commandline_overrides: Optional[str] = None
@dataclass
class ParserResult:
"""

View file

@ -19,17 +19,18 @@ from azureml.core.datastore import Datastore
from azureml.core.runconfig import MpiConfiguration, RunConfiguration
from azureml.core.workspace import WORKSPACE_DEFAULT_BLOB_STORE_NAME
from azureml.data import FileDataset
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from InnerEye.Azure import azure_util
from InnerEye.Azure.azure_config import AzureConfig, ParserResult, SourceConfig
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, RUN_RECOVERY_FROM_ID_KEY_NAME, \
RUN_RECOVERY_ID_KEY_NAME, \
merge_conda_dependencies
is_offline_run_context, merge_conda_dependencies
from InnerEye.Azure.secrets_handling import read_all_settings
from InnerEye.Azure.tensorboard_monitor import AMLTensorBoardMonitorConfig, monitor
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.utils.config_util import ModelConfigLoader
from InnerEye.ML.utils.config_loader import ModelConfigLoader
SLEEP_TIME_SECONDS = 30
INPUT_DATA_KEY = "input_data"
@ -42,15 +43,12 @@ ENVIRONMENT_VERSION = "1"
def submit_to_azureml(azure_config: AzureConfig,
source_config: SourceConfig,
model_config_overrides: str,
azure_dataset_id: str) -> Run:
"""
The main entry point. It creates an AzureML workspace if needed, submits an experiment using the code
as specified in source_config, and waits for completion if needed.
:param azure_config: azure related configurations to setup valid workspace
:param source_config: The information about which code should be submitted, and which arguments should be used.
:param model_config_overrides: A string that describes which model parameters were overwritten by commandline
arguments in the present run. This is only used for diagnostic purposes (it is set as a Tag on the run).
:param azure_dataset_id: The name of the dataset on blob storage to be used for this run.
"""
azure_run: Optional[Run] = None
@ -68,8 +66,7 @@ def submit_to_azureml(azure_config: AzureConfig,
for s in [signal.SIGINT, signal.SIGTERM]:
signal.signal(s, interrupt_handler)
# create train/test experiment
azure_run = create_and_submit_experiment(azure_config, source_config, model_config_overrides,
azure_dataset_id)
azure_run = create_and_submit_experiment(azure_config, source_config, azure_dataset_id)
if azure_config.wait_for_completion:
# We want the job output to be visible on the console, but the program should not exit if the
@ -79,13 +76,12 @@ def submit_to_azureml(azure_config: AzureConfig,
return azure_run
def set_run_tags(run: Run, azure_config: AzureConfig, model_config_overrides: str) -> None:
def set_run_tags(run: Run, azure_config: AzureConfig, commandline_args: str) -> None:
"""
Set metadata for the run
:param run: Run to set metadata for.
:param azure_config: The configurations for the present AzureML job
:param model_config_overrides: A string that describes which model parameters were overwritten by commandline
arguments in the present run.
:param commandline_args: A string that holds all commandline arguments that were used for the present run.
"""
git_information = azure_config.get_git_information()
run.set_tags({
@ -103,7 +99,7 @@ def set_run_tags(run: Run, azure_config: AzureConfig, model_config_overrides: st
"source_message": git_information.commit_message,
"source_author": git_information.commit_author,
"source_dirty": str(git_information.is_dirty),
"overrides": model_config_overrides,
"commandline_args": commandline_args,
CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY: -1,
})
@ -125,14 +121,11 @@ def create_experiment_name(azure_config: AzureConfig) -> str:
def create_and_submit_experiment(
azure_config: AzureConfig,
source_config: SourceConfig,
model_config_overrides: str,
azure_dataset_id: str) -> Run:
"""
Creates an AzureML experiment in the workspace and submits it for execution.
:param azure_config: azure related configurations to setup valid workspace
:param source_config: The information about which code should be submitted, and which arguments should be used.
:param model_config_overrides: A string that describes which model parameters were overwritten by commandline
arguments in the present run. This is only used for diagnostic purposes (it is set as a Tag on the run).
:param azure_dataset_id: The name of the dataset in blob storage to be used for this run.
:returns: Run object for the submitted AzureML run
"""
@ -144,8 +137,12 @@ def create_and_submit_experiment(
# submit a training/testing run associated with the experiment
run: Run = exp.submit(script_run_config)
# set metadata for the run
set_run_tags(run, azure_config, model_config_overrides)
if is_offline_run_context(run):
# This codepath will only be executed in unit tests, when exp.submit is mocked.
return run
# Set metadata for the run.
set_run_tags(run, azure_config, commandline_args=(" ".join(source_config.script_params)))
print("\n==============================================================================")
print(f"Successfully queued new run {run.id} in experiment: {exp.name}")
@ -276,6 +273,21 @@ def get_or_create_python_environment(azure_config: AzureConfig,
return env
def get_dataset_consumption(azure_config: AzureConfig, azure_dataset_id: str) -> DatasetConsumptionConfig:
"""
Creates a configuration for using an AzureML dataset inside of an AzureML run. This will make the AzureML
dataset with given name available as a named input, using INPUT_DATA_KEY as the key.
:param azure_config: azure related configurations to use for model scale-out behaviour
:param azure_dataset_id: The name of the dataset in blob storage to be used for this run. This can be an empty
string to not use any datasets.
"""
azureml_dataset = get_or_create_dataset(azure_config, azure_dataset_id=azure_dataset_id)
if not azureml_dataset:
raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
named_input = azureml_dataset.as_named_input(INPUT_DATA_KEY)
return named_input.as_mount() if azure_config.use_dataset_mount else named_input.as_download()
def create_run_config(azure_config: AzureConfig,
source_config: SourceConfig,
azure_dataset_id: str = "",
@ -292,11 +304,7 @@ def create_run_config(azure_config: AzureConfig,
:return: The configured script run.
"""
if azure_dataset_id:
azureml_dataset = get_or_create_dataset(azure_config, azure_dataset_id=azure_dataset_id)
if not azureml_dataset:
raise ValueError(f"AzureML dataset {azure_dataset_id} could not be found or created.")
named_input = azureml_dataset.as_named_input(INPUT_DATA_KEY)
dataset_consumption = named_input.as_mount() if azure_config.use_dataset_mount else named_input.as_download()
dataset_consumption = get_dataset_consumption(azure_config, azure_dataset_id)
else:
dataset_consumption = None
# AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
@ -354,8 +362,7 @@ def create_runner_parser(model_config_class: type = None) -> argparse.ArgumentPa
def parse_args_and_add_yaml_variables(parser: ArgumentParser,
yaml_config_file: Optional[Path] = None,
project_root: Optional[Path] = None,
fail_on_unknown_args: bool = False,
args: List[str] = None) -> ParserResult:
fail_on_unknown_args: bool = False) -> ParserResult:
"""
Reads arguments from sys.argv, modifies them with secrets from local YAML files,
and parses them using the given argument parser.
@ -364,14 +371,12 @@ def parse_args_and_add_yaml_variables(parser: ArgumentParser,
:param yaml_config_file: The path to the YAML file that contains values to supply into sys.argv.
:param fail_on_unknown_args: If True, raise an exception if the parser encounters an argument that it does not
recognize. If False, unrecognized arguments will be ignored, and added to the "unknown" field of the parser result.
:param args: arguments to parse
:return: The parsed arguments, and overrides
"""
settings_from_yaml = read_all_settings(yaml_config_file, project_root=project_root)
return parse_arguments(parser,
settings_from_yaml=settings_from_yaml,
fail_on_unknown_args=fail_on_unknown_args,
args=args)
fail_on_unknown_args=fail_on_unknown_args)
def _create_default_namespace(parser: ArgumentParser) -> Namespace:
@ -471,7 +476,7 @@ def run_duration_string_to_seconds(s: str) -> Optional[int]:
elif suffix == "d":
multiplier = 24 * 60 * 60
else:
raise ArgumentError("s", f"Invalid suffix: Must be one of 's', 'm', 'h', 'd', but got: {s}")
raise ArgumentError("s", f"Invalid suffix: Must be one of 's', 'm', 'h', 'd', but got: {s}") # type: ignore
return int(float(s[:-1]) * multiplier)

View file

@ -45,15 +45,6 @@ INNEREYE_SDK_NAME = "innereye"
INNEREYE_SDK_VERSION = "1.0"
def get_results_blob_path(run_id: str) -> str:
"""
Creates the name of the top level folder that contains the results for a given AzureML run.
:param run_id: The AzureML run ID for which the folder should be created.
:return: A full Azure blob storage path, starting with the container name.
"""
return AZUREML_RUN_FOLDER + run_id
def create_run_recovery_id(run: Run) -> str:
"""
Creates a recovery id for a run so that its checkpoints can be recovered for training/testing
@ -293,6 +284,21 @@ def merge_conda_files(files: List[Path], result_file: Path) -> None:
ruamel.yaml.dump(unified_definition, f, indent=2, default_flow_style=False)
def get_all_environment_files(project_root: Path) -> List[Path]:
"""
Returns a list of all Conda environment files that should be used. This is firstly the InnerEye conda file,
and possibly a second environment.yml file that lives at the project root folder.
:param project_root: The root folder of the code that starts the present training run.
:return: A list with 1 or 2 entries that are conda environment files.
"""
innereye_yaml = fixed_paths.get_environment_yaml_file()
project_yaml = project_root / fixed_paths.ENVIRONMENT_YAML_FILE_NAME
files = [innereye_yaml]
if innereye_yaml != project_yaml:
files.append(project_yaml)
return files
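
For orientation, here is a plausible way the new helper combines with merge_conda_dependencies below (an illustrative sketch, not code taken from the repository; using the current directory as project root is an assumption):

from pathlib import Path
from InnerEye.Azure.azure_util import get_all_environment_files, merge_conda_dependencies

# Hypothetical usage: collect the InnerEye Conda file plus an optional project-level environment.yml,
# then merge them into a single CondaDependencies object for the AzureML run.
project_root = Path.cwd()  # stand-in for the repository root that the toolbox normally uses
env_files = get_all_environment_files(project_root)
conda_dependencies, merged_yaml = merge_conda_dependencies(env_files)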
def merge_conda_dependencies(files: List[Path]) -> Tuple[CondaDependencies, str]:
"""
Creates a CondaDependencies object from the Conda environments specified in one or more files.

View file

@ -1,52 +0,0 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import json
from pathlib import Path
from typing import Optional
from InnerEye.Azure.azure_config import AzureConfig, ExperimentResultLocation
BUILDINFORMATION_JSON = "buildinformation.json"
def build_information_to_dot_net_json(azure_config: AzureConfig, result_location: ExperimentResultLocation) -> str:
"""
Converts the build metadata to a JSON string.
:param azure_config: Azure configuration file with build information.
:param result_location: ExperimentResultLocation object with result locations.
"""
git_information = azure_config.get_git_information()
return json.dumps({
"BuildNumber": azure_config.build_number,
"BuildRequestedFor": azure_config.build_user,
"BuildSourceBranchName": git_information.branch,
"BuildSourceVersion": git_information.commit_id,
"BuildSourceAuthor": git_information.commit_author,
"ModelName": azure_config.model,
"ResultsContainerName": result_location.results_container_name,
"ResultsUri": result_location.results_uri,
"DatasetFolder": result_location.dataset_folder,
"DatasetFolderUri": result_location.dataset_uri,
"AzureBatchJobName": result_location.azure_job_name})
def build_information_to_dot_net_json_file(azure_config: AzureConfig,
result_location: ExperimentResultLocation,
folder: Optional[Path] = None) -> None:
"""
Writes the build metadata to a file called buildinformation.json in the given folder.
:param azure_config: Azure configuration file
:param result_location: ExperimentResultLocation object with result locations.
:param folder: Results are written to this folder, if not None. Else, results are written in the root folder.
"""
filename = Path(BUILDINFORMATION_JSON)
if folder is not None:
if not folder.exists():
folder.mkdir(parents=True)
full_file = filename if folder is None else folder / filename
with full_file.open("w") as f:
f.write(build_information_to_dot_net_json(azure_config, result_location))

View file

@ -389,3 +389,29 @@ def remove_file_or_directory(pth: Path) -> None:
pth.rmdir()
elif pth.exists():
pth.unlink()
def add_folder_to_sys_path_if_needed(folder_under_repo_root: str) -> None:
"""
Checks if the Python paths in sys.path already contain the given folder, which is expected to be relative
to the repository root. If that folder is not yet in sys.path, add it.
"""
full_folder = repository_root_directory() / folder_under_repo_root
for path_str in sys.path:
path = Path(path_str)
if path == full_folder:
return
print(f"Adding {full_folder} to sys.path")
sys.path.append(str(full_folder))
@contextmanager
def change_working_directory(path_or_str: PathOrString) -> Generator:
"""
Context manager for changing the current working directory
"""
new_path = Path(path_or_str).expanduser()
old_path = Path.cwd()
os.chdir(new_path)
yield
os.chdir(old_path)
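
A small usage sketch for the context manager above (the folder name is made up):

from pathlib import Path
from InnerEye.Common.common_util import change_working_directory

# Hypothetical usage: write a file relative to a different working directory, then return
# to the previous working directory when the 'with' block exits.
out_dir = Path("outputs")
out_dir.mkdir(exist_ok=True)
with change_working_directory(out_dir):
    Path("report.txt").write_text("created inside the outputs folder")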

View file

@ -34,6 +34,8 @@ DEFAULT_RESULT_ZIP_DICOM_NAME = "segmentation.dcm.zip"
DEFAULT_AML_LOGS_DIR = "azureml-logs"
DEFAULT_LOGS_DIR_NAME = "logs"
LOG_FILE_NAME = "stdout.txt"
DEFAULT_MODEL_SUMMARIES_DIR_PATH = Path(DEFAULT_LOGS_DIR_NAME) / "model_summaries"
# The folder at the project root directory that holds datasets for local execution.
DATASETS_DIR_NAME = "datasets"

View file

@ -32,17 +32,6 @@ def full_ml_test_data_path(path: str = "") -> Path:
return _full_test_data_path("ML", path)
def full_azure_test_data_path(path: str = "") -> Path:
"""
Takes a relative path inside of the Azure/tests/test_data folder, and returns its
full absolute path.
:param path: A path relative to the Tests/Azure/test_data
:return: The full absolute path of the argument.
"""
return _full_test_data_path("Azure", path)
def _full_test_data_path(prefix: str, suffix: str) -> Path:
root = tests_root_directory()
return root / prefix / "test_data" / suffix

View file

@ -279,3 +279,21 @@ class GenericConfig(param.Parameterized):
reason = f"parameter is {reason}"
# We could raise an error here instead - to be discussed.
logging.warning(f"Override {key}={desired} failed: {reason} in class {self.__class__.name}")
def create_from_matching_params(from_object: param.Parameterized, cls_: Type[T]) -> T:
"""
Creates an object of the given target class, and then copies all attributes from the `from_object` to
the newly created object, if there is a matching attribute. The target class must be a subclass of
param.Parameterized.
:param from_object: The object to read attributes from.
:param cls_: The name of the class for the newly created object.
:return: An instance of cls_
"""
c = cls_()
if not isinstance(c, param.Parameterized):
raise ValueError(f"The created object must be a subclass of param.Parameterized, but got {type(c)}")
for param_name, p in c.params().items():
if not p.constant and not p.readonly:
setattr(c, param_name, getattr(from_object, param_name))
return c
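
To make the copy semantics concrete, here is an illustrative sketch with two made-up Parameterized classes (not part of the codebase):

import param
from InnerEye.Common.generic_parsing import create_from_matching_params

class SourceParams(param.Parameterized):
    # Hypothetical source object with two parameters.
    learning_rate = param.Number(1e-3)
    batch_size = param.Integer(8)

class TargetParams(param.Parameterized):
    # Hypothetical target class that shares only one parameter name with SourceParams.
    batch_size = param.Integer(1)

source = SourceParams(learning_rate=1e-2, batch_size=16)
target = create_from_matching_params(source, TargetParams)
assert target.batch_size == 16  # copied because the attribute name matches; learning_rate is ignored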

View file

@ -9,16 +9,17 @@ from dataclasses import dataclass
from enum import Enum, unique
from math import isclose
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
import numpy as np
import pandas as pd
import param
from azureml.core import ScriptRunConfig
from azureml.core import Model, ScriptRunConfig
from azureml.train.hyperdrive import HyperDriveConfig
from pandas import DataFrame
from InnerEye.Common.common_util import any_pairwise_larger, any_smaller_or_equal_than, check_is_any_of
from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.Common.common_util import ModelProcessing, any_pairwise_larger, any_smaller_or_equal_than, check_is_any_of
from InnerEye.Common.generic_parsing import IntTuple
from InnerEye.Common.type_annotations import TupleFloat2, TupleFloat3, TupleInt3, TupleStringOptionalFloat
from InnerEye.ML.common import ModelExecutionMode
@ -264,7 +265,9 @@ class SegmentationModelBase(ModelConfigBase):
#: The number of image levels used in Unet (in encoding and decoding paths).
num_downsampling_paths: int = param.Integer(4, bounds=(1, None),
instantiate=False, doc="The number of levels used in a UNet architecture in encoding and decoding paths.")
instantiate=False,
doc="The number of levels used in a UNet architecture in encoding and "
"decoding paths.")
#: The size of the random crops that will be drawn from the input images during training. This is also the
#: input size of the model.
@ -666,7 +669,7 @@ class SegmentationModelBase(ModelConfigBase):
"""
Loads a dataset from the dataset_csv file, and stores it in the present object.
"""
assert self.local_dataset is not None # for mypy
assert self.local_dataset is not None, "The dataset must be provided in self.local_dataset"
self.dataset_data_frame = pd.read_csv(self.local_dataset / self.dataset_csv,
dtype=str,
converters=self.col_type_converters,
@ -793,3 +796,7 @@ class SegmentationModelBase(ModelConfigBase):
By default no transformation is performed.
"""
return ModelTransformsPerExecutionMode()
PostCrossValidationHookSignature = Callable[[ModelConfigBase, Path], None]
ModelDeploymentHookSignature = Callable[[SegmentationModelBase, AzureConfig, Model, ModelProcessing], Any]
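
The two aliases above describe optional hook callables. A hypothetical hook matching PostCrossValidationHookSignature could look like this (interpreting the Path argument as the folder holding the cross validation outputs is an assumption, not stated in this diff):

from pathlib import Path
from InnerEye.ML.model_config_base import ModelConfigBase

def example_post_cross_validation_hook(config: ModelConfigBase, results_folder: Path) -> None:
    # Hypothetical hook: report where the cross validation outputs ended up.
    print(f"Cross validation finished for {config.model_name}; results are in {results_folder}")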

View file

@ -0,0 +1,140 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from pathlib import Path
from typing import Any, Dict, List, Tuple
import numpy as np
import torch
from pytorch_lightning import LightningDataModule, LightningModule
from torch.optim import Adam, Optimizer
from torch.optim.lr_scheduler import StepLR, _LRScheduler
from torch.utils.data import DataLoader, Dataset
from InnerEye.Common import fixed_paths_for_tests
from InnerEye.ML.lightning_container import LightningContainer
class HelloDataset(Dataset):
"""
A simple 1dim regression task, read from a data file stored in the test data folder.
"""
# Creating the data file:
# import numpy as np
# import torch
#
# N = 100
# x = torch.rand((N, 1)) * 10
# y = 0.2 * x + 0.1 * torch.randn(x.size())
# xy = torch.cat((x, y), dim=1)
# np.savetxt("Tests/ML/test_data/hellocontainer.csv", xy.numpy(), delimiter=",")
def __init__(self, root_folder: Path, start_index: int, end_index: int) -> None:
"""
Creates the 1-dim regression dataset.
:param root_folder: The folder in which the data file lives ("hellocontainer.csv")
:param start_index: The first row to read.
:param end_index: The last row to read (exclusive)
"""
super().__init__()
raw_data = np.loadtxt(root_folder / "hellocontainer.csv", delimiter=",")[start_index:end_index]
self.data = torch.tensor(raw_data, dtype=torch.float)
def __len__(self) -> int:
return self.data.shape[0]
def __getitem__(self, item: int) -> Dict[str, torch.Tensor]:
return {'x': self.data[item][0:1], 'y': self.data[item][1:2]}
class HelloDataModule(LightningDataModule):
"""
A data module that gives the training, validation and test data for a simple 1-dim regression task.
"""
def __init__(self, root_folder: Path) -> None:
super().__init__()
self.train = HelloDataset(root_folder, start_index=0, end_index=50)
self.val = HelloDataset(root_folder, start_index=50, end_index=70)
self.test = HelloDataset(root_folder, start_index=70, end_index=100)
def train_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
return DataLoader(self.train, batch_size=5)
def val_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
return DataLoader(self.val, batch_size=5)
def test_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
return DataLoader(self.test, batch_size=5)
class HelloRegression(LightningModule):
"""
A simple 1-dim regression model.
"""
def __init__(self) -> None:
super().__init__()
self.model = torch.nn.Linear(in_features=1, out_features=1, bias=True)
self.test_mse: List[torch.Tensor] = []
def forward(self, x: torch.Tensor) -> torch.Tensor: # type: ignore
return self.model(x)
def training_step(self, batch: Dict[str, torch.Tensor], *args: Any, **kwargs: Any) -> torch.Tensor: # type: ignore
input = batch["x"]
target = batch["y"]
prediction = self.forward(input)
loss = torch.nn.functional.mse_loss(prediction, target)
self.log("loss", loss, on_epoch=True, on_step=False)
return loss
def configure_optimizers(self) -> Tuple[List[Optimizer], List[_LRScheduler]]:
optimizer = Adam(self.parameters(), lr=1e-1)
scheduler = StepLR(optimizer, step_size=20, gamma=0.5)
return [optimizer], [scheduler]
def on_test_epoch_start(self) -> None:
self.test_mse = []
def test_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor: # type: ignore
input = batch["x"]
target = batch["y"]
prediction = self.forward(input)
loss = torch.nn.functional.mse_loss(prediction, target)
self.test_mse.append(loss)
return loss
def on_test_epoch_end(self) -> None:
average_mse = torch.mean(torch.stack(self.test_mse))
Path("test_mse.txt").write_text(str(average_mse.item()))
class HelloContainer(LightningContainer):
"""
An example for using the InnerEye functionality to "bring your own lightning model". This container has methods
to generate the actual Lightning model, and read out the datamodule that will be used for training.
The number of training epochs is controlled at container level.
You can train this model by running `python InnerEye/ML/runner.py --model=HelloContainer` on the local box,
or via `python InnerEye/ML/runner.py --model=HelloContainer --azureml=True` in AzureML
"""
def __init__(self) -> None:
super().__init__()
self.local_dataset = fixed_paths_for_tests.full_ml_test_data_path()
self.num_epochs = 20
# This method must be overridden by any subclass of LightningContainer
def create_model(self) -> LightningModule:
return HelloRegression()
# This method must be overridden by any subclass of LightningContainer
def get_data_module(self) -> LightningDataModule:
assert self.local_dataset is not None
return HelloDataModule(root_folder=self.local_dataset) # type: ignore
# This is an optional override: This report creation method can read out any files that were written during
# training, and cook them into a nice looking report. Here, the report is a simple text file.
def create_report(self) -> None:
# This just prints out the test MSE, but you could also generate a Jupyter notebook here, for example.
test_mse = float(Path("test_mse.txt").read_text())
report = f"Performance on test set: MSE = {test_mse}"
print(report)
Path("report.txt").write_text(report)

View file

@ -0,0 +1,68 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
# Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress
# individual warnings only.
# flake8: noqa
from typing import Optional
import param
import torch
from pytorch_lightning import LightningDataModule, LightningModule
from torch.utils.tensorboard import SummaryWriter
from InnerEye.Common.common_util import add_folder_to_sys_path_if_needed
from InnerEye.ML.lightning_container import LightningContainer
add_folder_to_sys_path_if_needed("fastMRI")
from fastmri.data.subsample import create_mask_for_mask_type
from fastmri.data.transforms import VarNetDataTransform
from fastmri.pl_modules import FastMriDataModule, VarNetModule
class VarNetWithImageLogging(VarNetModule):
"""
A clone of the VarNet model that logs images to only the Tensorboard loggers. The original VarNet hardcodes
a single logger that must be Tensorboard.
"""
def log_image(self, name: str, image: torch.Tensor) -> None:
experiments = self.logger.experiment if isinstance(self.logger.experiment, list) \
else [self.logger.experiment]
for experiment in experiments:
if isinstance(experiment, SummaryWriter):
experiment.add_image(name, image, global_step=self.global_step)
class FastMri(LightningContainer):
# All fields that are declared here will be automatically available as commandline arguments.
challenge: str = param.String(default="multicoil", doc="Chooses between the singlecoil or multicoil"
"acquisition setup.")
sample_rate: Optional[float] = param.Number(default=None, doc="Fraction of slices of the training data split to "
"use. Default: 1.0")
def __init__(self) -> None:
super().__init__()
self.azure_dataset_id = "fastmrimini_brain"
def create_model(self) -> LightningModule:
return VarNetWithImageLogging()
def get_data_module(self) -> LightningDataModule:
mask = create_mask_for_mask_type(mask_type_str="equispaced",
center_fractions=[0.08],
accelerations=[4])
# use random masks for train transform, fixed masks for val transform
train_transform = VarNetDataTransform(mask_func=mask, use_seed=False)
val_transform = VarNetDataTransform(mask_func=mask)
test_transform = VarNetDataTransform()
return FastMriDataModule(data_path=self.local_dataset,
challenge=self.challenge,
sample_rate=self.sample_rate,
train_transform=train_transform,
val_transform=val_transform,
test_transform=test_transform)

View file

@ -7,7 +7,7 @@ from typing import Any
from azureml.core import ScriptRunConfig
from azureml.train.hyperdrive import BanditPolicy, HyperDriveConfig, PrimaryMetricGoal, RandomParameterSampling, uniform
from networkx.tests.test_convert_pandas import pd
import pandas as pd
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.metrics_constants import TrackedMetrics

View file

@ -183,39 +183,186 @@ class DeepLearningFileSystemConfig(Parameterized):
logs_folder=logs_folder,
project_root=self.project_root
)
raise ValueError("This method should only be called for offline runs, when the logs folder is inside the "
"outputs folder.")
raise ValueError("This method should only be called for runs outside AzureML, when the logs folder is "
"inside the outputs folder.")
class DeepLearningConfig(GenericConfig, CudaAwareConfig):
class WorkflowParams(param.Parameterized):
"""
A class that holds all settings that are shared across segmentation models and regression/classification models.
This class contains all parameters that affect how the whole training and testing workflow is executed.
"""
_model_category: ModelCategory = param.ClassSelector(class_=ModelCategory,
doc="The high-level model category described by this config.")
_model_name: str = param.String(None, doc="The human readable name of the model (for example, Liver). This is "
"usually set from the class name.")
random_seed: int = param.Integer(42, doc="The seed to use for all random number generators.")
azure_dataset_id: str = param.String(doc="If provided, the ID of the dataset to use. This dataset must exist as a "
"folder of the same name in the 'datasets' "
"container in the datasets storage account.")
local_dataset: Optional[Path] = param.ClassSelector(class_=Path,
default=None,
allow_None=True,
doc="The path of the dataset to use, when training is running "
"outside Azure.")
num_dataload_workers: int = param.Integer(8, bounds=(0, None),
doc="The number of data loading workers (processes). When set to 0,"
"data loading is running in the same process (no process startup "
"cost, hence good for use in unit testing. However, it "
"does not give the same result as running with 1 worker process)")
shuffle: bool = param.Boolean(True, doc="If true, the dataset will be shuffled randomly during training.")
num_epochs: int = param.Integer(100, bounds=(1, None), doc="Number of epochs to train.")
start_epoch: int = param.Integer(0, bounds=(0, None), doc="The first epoch to train. Set to 0 to start a new "
"training. Set to a value larger than zero for starting"
" from a checkpoint.")
number_of_cross_validation_splits: int = param.Integer(0, bounds=(0, None),
doc="Number of cross validation splits for k-fold cross "
"validation")
cross_validation_split_index: int = param.Integer(DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, bounds=(-1, None),
doc="The index of the cross validation fold this model is "
"associated with when performing k-fold cross validation")
perform_training_set_inference: bool = \
param.Boolean(False,
doc="If True, run full image inference on the training set at the end of training. If False and "
"perform_validation_and_test_set_inference is True (default), only run inference on "
"validation and test set. If both flags are False do not run inference.")
perform_validation_and_test_set_inference: bool = \
param.Boolean(True,
doc="If True (default), run full image inference on validation and test set after training.")
weights_url: str = param.String(doc="If provided, a url from which weights will be downloaded and used for model "
"initialization.")
local_weights_path: Optional[Path] = param.ClassSelector(class_=Path,
default=None,
allow_None=True,
doc="The path to the weights to use for model "
"initialization, when training outside AzureML.")
generate_report: bool = param.Boolean(default=True,
doc="If True (default), write a modelling report in HTML format. If False,"
"do not write that report.")
# The default multiprocessing start_method in both PyTorch and the Python standard library is "fork" for Linux and
# "spawn" (the only available method) for Windows. There is some evidence that using "forkserver" on Linux
# can reduce the chance of stuck jobs.
multiprocessing_start_method: MultiprocessingStartMethod = \
param.ClassSelector(class_=MultiprocessingStartMethod,
default=(MultiprocessingStartMethod.spawn if is_windows()
else MultiprocessingStartMethod.fork),
doc="Method to be used to start child processes in pytorch. Should be one of forkserver, "
"fork or spawn. If not specified, fork is used on Linux and spawn on Windows. "
"Set to forkserver as a possible remedy for stuck jobs.")
monitoring_interval_seconds: int = param.Integer(0, doc="Seconds delay between logging GPU/CPU resource "
"statistics. If 0 or less, do not log any resource "
"statistics.")
def validate(self) -> None:
if self.weights_url and self.local_weights_path:
raise ValueError("Cannot specify both local_weights_path and weights_url.")
if self.number_of_cross_validation_splits == 1:
raise ValueError("At least two splits required to perform cross validation, but got "
f"{self.number_of_cross_validation_splits}. To train without cross validation, set "
"number_of_cross_validation_splits=0.")
if 0 < self.number_of_cross_validation_splits <= self.cross_validation_split_index:
raise ValueError(f"Cross validation split index is out of bounds: {self.cross_validation_split_index}, "
f"which is invalid for CV with {self.number_of_cross_validation_splits} splits.")
elif self.number_of_cross_validation_splits == 0 and self.cross_validation_split_index != -1:
raise ValueError(f"Cross validation split index must be -1 for a non cross validation run, "
f"found number_of_cross_validation_splits = {self.number_of_cross_validation_splits} "
f"and cross_validation_split_index={self.cross_validation_split_index}")
@property
def is_offline_run(self) -> bool:
"""
Returns True if the run is executing outside AzureML, or False if inside AzureML.
"""
return is_offline_run_context(RUN_CONTEXT)
@property
def perform_cross_validation(self) -> bool:
"""
True if cross validation will be performed as part of the training procedure.
:return:
"""
return self.number_of_cross_validation_splits > 1
def get_effective_random_seed(self) -> int:
"""
Returns the random seed set as part of this configuration. If the configuration corresponds
to a cross validation split, then the cross validation fold index will be added to the
set random seed in order to return the effective random seed.
:return:
"""
seed = self.random_seed
if self.perform_cross_validation:
# offset the random seed based on the cross validation split index so each
# fold has a different initial random state.
seed += self.cross_validation_split_index
return seed
class DatasetParams(param.Parameterized):
azure_dataset_id: str = param.String(doc="If provided, the ID of the dataset to use when running in AzureML. "
"This dataset must exist as a folder of the same name in the 'datasets' "
"container in the datasets storage account. This dataset will be mounted "
"and made available at the 'local_dataset' path when running in AzureML.")
local_dataset: Optional[Path] = \
param.ClassSelector(class_=Path, default=None, allow_None=True,
doc="The path of the dataset to use, when training is running outside Azure.")
class OutputParams(param.Parameterized):
output_to: str = param.String(default="",
doc="If provided, the run outputs will be written to the given folder. If not "
"provided, outputs will go into a subfolder of the project root folder.")
file_system_config: DeepLearningFileSystemConfig = param.ClassSelector(default=DeepLearningFileSystemConfig(),
class_=DeepLearningFileSystemConfig,
instantiate=False,
doc="File system related configs")
_model_name: str = param.String("", doc="The human readable name of the model (for example, Liver). This is "
"usually set from the class name.")
@property
def model_name(self) -> str:
"""
Gets the human readable name of the model (e.g., Liver). This is usually set from the class name.
:return: A model name as a string.
"""
return self._model_name
def set_output_to(self, output_to: PathOrString) -> None:
"""
Adjusts the file system settings in the present object such that all outputs are written to the given folder.
:param output_to: The absolute path to a folder that should contain the outputs.
"""
if isinstance(output_to, Path):
output_to = str(output_to)
self.output_to = output_to
self.create_filesystem()
def create_filesystem(self, project_root: Path = fixed_paths.repository_root_directory()) -> None:
"""
Creates new file system settings (outputs folder, logs folder) based on the information stored in the
present object. If any of the folders do not yet exist, they are created.
:param project_root: The root folder for the codebase that triggers the training run.
"""
self.file_system_config = DeepLearningFileSystemConfig.create(
project_root=project_root,
model_name=self.model_name,
is_offline_run=is_offline_run_context(RUN_CONTEXT),
output_to=self.output_to
)
@property
def outputs_folder(self) -> Path:
"""Gets the full path in which the model outputs should be stored."""
return self.file_system_config.outputs_folder
@property
def logs_folder(self) -> Path:
"""Gets the full path in which the model logs should be stored."""
return self.file_system_config.logs_folder
@property
def checkpoint_folder(self) -> Path:
"""Gets the full path in which the model checkpoints should be stored during training."""
return self.outputs_folder / CHECKPOINT_FOLDER
@property
def visualization_folder(self) -> Path:
"""Gets the full path in which the visualizations notebooks should be saved during training."""
return self.outputs_folder / VISUALIZATION_FOLDER
def get_path_to_checkpoint(self) -> Path:
"""
Returns the full path to a recovery checkpoint.
"""
return create_recovery_checkpoint_path(self.checkpoint_folder)
def get_path_to_best_checkpoint(self) -> Path:
"""
Returns the full path to a checkpoint file that was found to be best during training, whatever criterion
was applied there.
"""
return get_best_checkpoint_path(self.checkpoint_folder)
class OptimizerParams(param.Parameterized):
l_rate: float = param.Number(1e-4, doc="The initial learning rate", bounds=(0, None))
_min_l_rate: float = param.Number(0.0, doc="The minimum learning rate for the Polynomial and Cosine schedulers.",
bounds=(0.0, None))
@ -256,35 +403,87 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
doc="The betas parameter of Adam, default is (0.9, 0.999)")
momentum: float = param.Number(0.6, doc="The momentum parameter of the optimizers")
weight_decay: float = param.Number(1e-4, doc="The weight decay used to control L2 regularization")
def validate(self) -> None:
if len(self.adam_betas) < 2:
raise ValueError(
"The adam_betas parameter should be the coefficients used for computing running averages of "
"gradient and its square")
if self.l_rate_scheduler == LRSchedulerType.MultiStep:
if not self.l_rate_multi_step_milestones:
raise ValueError("Must specify l_rate_multi_step_milestones to use LR scheduler MultiStep")
if sorted(set(self.l_rate_multi_step_milestones)) != self.l_rate_multi_step_milestones:
raise ValueError("l_rate_multi_step_milestones must be a strictly increasing list")
if self.l_rate_multi_step_milestones[0] <= 0:
raise ValueError("l_rate_multi_step_milestones cannot be negative or 0.")
@property
def min_l_rate(self) -> float:
return self._min_l_rate
@min_l_rate.setter
def min_l_rate(self, value: float) -> None:
if value > self.l_rate:
raise ValueError("l_rate must be >= min_l_rate, found: {}, {}".format(self.l_rate, value))
self._min_l_rate = value
class TrainerParams(CudaAwareConfig):
num_epochs: int = param.Integer(100, bounds=(1, None), doc="Number of epochs to train.")
recovery_checkpoint_save_interval: int = param.Integer(10, bounds=(0, None),
doc="Save epoch checkpoints when epoch number is a multiple "
"of recovery_checkpoint_save_interval. The intended use "
"is to allow restore training from failed runs.")
train_batch_size: int = param.Integer(4, bounds=(0, None),
doc="The number of crops that make up one minibatch during training.")
detect_anomaly: bool = param.Boolean(False, doc="If true, test gradients for anomalies (NaN or Inf) during "
"training.")
use_mixed_precision: bool = param.Boolean(False, doc="If true, mixed precision training is activated during "
"training.")
max_num_gpus: int = param.Integer(default=-1, doc="The maximum number of GPUS to use. If set to a value < 0, use"
"all available GPUs.")
pl_progress_bar_refresh_rate: Optional[int] = \
param.Integer(default=None,
doc="PyTorch Lightning trainer flag 'progress_bar_refresh_rate': How often to refresh progress "
"bar (in steps). Value 0 disables progress bar. Value None chooses automatically.")
pl_num_sanity_val_steps: int = \
param.Integer(default=0,
doc="PyTorch Lightning trainer flag 'num_sanity_val_steps': Number of validation "
"steps to run before training, to identify possible problems")
pl_deterministic: bool = \
param.Integer(default=True,
doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If "
"'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, but "
"you may see training speed increases.")
start_epoch: int = param.Integer(0, bounds=(0, None), doc="The first epoch to train. Set to 0 to start a new "
"training. Set to a value larger than zero for starting"
" from a checkpoint.")
class DeepLearningConfig(WorkflowParams,
DatasetParams,
OutputParams,
OptimizerParams,
TrainerParams,
CudaAwareConfig,
GenericConfig):
"""
A class that holds all settings that are shared across segmentation models and regression/classification models.
"""
_model_category: ModelCategory = param.ClassSelector(class_=ModelCategory,
doc="The high-level model category described by this config.")
num_dataload_workers: int = param.Integer(8, bounds=(0, None),
doc="The number of data loading workers (processes). When set to 0,"
"data loading is running in the same process (no process startup "
"cost, hence good for use in unit testing. However, it "
"does not give the same result as running with 1 worker process)")
shuffle: bool = param.Boolean(True, doc="If true, the dataset will be shuffled randomly during training.")
train_batch_size: int = param.Integer(4, bounds=(0, None),
doc="The number of crops that make up one minibatch during training.")
use_model_parallel: bool = param.Boolean(False, doc="If true, neural network model is partitioned across all "
"available GPUs to fit in a large model. It shall not be used "
"together with data parallel.")
monitoring_interval_seconds: int = param.Integer(0, doc="Seconds delay between logging GPU/CPU resource "
"statistics. If 0 or less, do not log any resource "
"statistics.")
number_of_cross_validation_splits: int = param.Integer(0, bounds=(0, None),
doc="Number of cross validation splits for k-fold cross "
"validation")
cross_validation_split_index: int = param.Integer(DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, bounds=(-1, None),
doc="The index of the cross validation fold this model is "
"associated with when performing k-fold cross validation")
file_system_config: DeepLearningFileSystemConfig = param.ClassSelector(default=DeepLearningFileSystemConfig(),
class_=DeepLearningFileSystemConfig,
instantiate=False,
doc="File system related configs")
pin_memory: bool = param.Boolean(True, doc="Value of pin_memory argument to DataLoader")
_overrides: Dict[str, Any] = param.Dict(instantiate=True,
doc="Model config properties that were overridden from the commandline")
restrict_subjects: Optional[str] = \
param.String(doc="Use at most this number of subjects for train, val, or test set (must be > 0 or None). "
"If None, do not modify the train, val, or test sets. If a string of the form 'i,j,k' where "
@ -294,14 +493,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
"limit test set to 5. If any of i,j,k is '+', discarded members of the other sets are added "
"to that set.",
allow_None=True)
perform_training_set_inference: bool = \
param.Boolean(False,
doc="If True, run full image inference on the training set at the end of training. If False and "
"perform_validation_and_test_set_inference is True (default), only run inference on "
"validation and test set. If both flags are False do not run inference.")
perform_validation_and_test_set_inference: bool = \
param.Boolean(True,
doc="If True (default), run full image inference on validation and test set after training.")
_dataset_data_frame: Optional[DataFrame] = \
param.DataFrame(default=None,
doc="The dataframe that contains the dataset for the model. This is usually read from disk "
@ -315,19 +506,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
"on Linux, inference is currently disabled as the data loaders hang. "
"If False, use the default data loader logic that starts new processes for "
"each epoch.")
# The default multiprocessing start_method in both PyTorch and the Python standard library is "fork" for Linux and
# "spawn" (the only available method) for Windows. There is some evidence that using "forkserver" on Linux
# can reduce the chance of stuck jobs.
multiprocessing_start_method: MultiprocessingStartMethod = \
param.ClassSelector(class_=MultiprocessingStartMethod,
default=(MultiprocessingStartMethod.spawn if is_windows()
else MultiprocessingStartMethod.fork),
doc="Method to be used to start child processes in pytorch. Should be one of forkserver, "
"fork or spawn. If not specified, fork is used on Linux and spawn on Windows. "
"Set to forkserver as a possible remedy for stuck jobs.")
output_to: str = param.String(default="",
doc="If provided, the run outputs will be written to the given folder. If not "
"provided, outputs will go into a subfolder of the project root folder.")
max_batch_grad_cam: int = param.Integer(default=0, doc="Max number of validation batches for which "
"to save gradCam images. By default "
"visualizations are saved for all images "
@ -336,7 +514,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
doc="Target smoothing value for label smoothing")
log_to_parent_run: bool = param.Boolean(default=False, doc="If true, hyperdrive child runs will log their metrics"
"to their parent run.")
use_imbalanced_sampler_for_training: bool = param.Boolean(default=False,
doc="If True, use an imbalanced sampler during training.")
drop_last_batch_in_training: bool = param.Boolean(default=False,
@ -358,28 +535,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
"weights are updated using mean_teacher_"
"weight = alpha * (mean_teacher_weight) "
" + (1-alpha) * (current_student_weights). ")
weights_url: str = param.String(doc="If provided, a url from which weights will be downloaded and used for model "
"initialization.")
local_weights_path: Optional[Path] = param.ClassSelector(class_=Path,
default=None,
allow_None=True,
doc="The path to the weights to use for model "
"initialization, "
"when training is running outside Azure.")
max_num_gpus: int = param.Integer(default=-1, doc="The maximum number of GPUS to use. If set to a value < 0, use"
"all available GPUs.")
generate_report: bool = param.Boolean(default=True,
doc="If True (default), write a modelling report in HTML format. If False,"
"do not write that report.")
pl_num_sanity_val_steps: int = \
param.Integer(default=0, doc="PyTorch Lightning trainer flag 'num_sanity_val_steps': Number of validation "
"steps to run before training, to identify possible problems")
pl_deterministic: bool = \
param.Integer(default=True,
doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If "
"'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, but "
"you may see training speed increases.")
#: Name of the csv file providing information on the dataset to be used.
dataset_csv: str = param.String(
DATASET_CSV_FILE_NAME,
@ -394,49 +549,19 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
super().__init__(throw_if_unknown_param=True, **params)
logging.info("Creating the default output folder structure.")
self.create_filesystem(fixed_paths.repository_root_directory())
# Disable the PL progress bar because all InnerEye models have their own console output
self.pl_progress_bar_refresh_rate = 0
def validate(self) -> None:
"""
Validates the parameters stored in the present object.
"""
if len(self.adam_betas) < 2:
raise ValueError(
"The adam_betas parameter should be the coefficients used for computing running averages of "
"gradient and its square")
WorkflowParams.validate(self)
OptimizerParams.validate(self)
if self.azure_dataset_id is None and self.local_dataset is None:
raise ValueError("Either of local_dataset or azure_dataset_id must be set.")
if self.weights_url and self.local_weights_path:
raise ValueError("Cannot specify both local_weights_path and weights_url.")
if self.number_of_cross_validation_splits == 1:
raise ValueError(f"At least two splits required to perform cross validation found "
f"number_of_cross_validation_splits={self.number_of_cross_validation_splits}")
if 0 < self.number_of_cross_validation_splits <= self.cross_validation_split_index:
raise ValueError(f"Cross validation split index is out of bounds: {self.cross_validation_split_index}, "
f"which is invalid for CV with {self.number_of_cross_validation_splits} splits.")
elif self.number_of_cross_validation_splits == 0 and self.cross_validation_split_index != -1:
raise ValueError(f"Cross validation split index must be -1 for a non cross validation run, "
f"found number_of_cross_validation_splits = {self.number_of_cross_validation_splits} "
f"and cross_validation_split_index={self.cross_validation_split_index}")
if self.l_rate_scheduler == LRSchedulerType.MultiStep:
if not self.l_rate_multi_step_milestones:
raise ValueError("Must specify l_rate_multi_step_milestones to use LR scheduler MultiStep")
if sorted(set(self.l_rate_multi_step_milestones)) != self.l_rate_multi_step_milestones:
raise ValueError("l_rate_multi_step_milestones must be a strictly increasing list")
if self.l_rate_multi_step_milestones[0] <= 0:
raise ValueError("l_rate_multi_step_milestones cannot be negative or 0.")
@property
def model_name(self) -> str:
"""
Gets the human readable name of the model (e.g., Liver). This is usually set from the class name.
:return: A model name as a string.
"""
return self._model_name
@property
def model_category(self) -> ModelCategory:
"""
@ -463,48 +588,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
def compute_grad_cam(self) -> bool:
return self.max_batch_grad_cam > 0
@property
def min_l_rate(self) -> float:
return self._min_l_rate
@min_l_rate.setter
def min_l_rate(self, value: float) -> None:
if value > self.l_rate:
raise ValueError("l_rate must be >= min_l_rate, found: {}, {}".format(self.l_rate, value))
self._min_l_rate = value
@property
def outputs_folder(self) -> Path:
"""Gets the full path in which the model outputs should be stored."""
return self.file_system_config.outputs_folder
@property
def logs_folder(self) -> Path:
"""Gets the full path in which the model logs should be stored."""
return self.file_system_config.logs_folder
@property
def checkpoint_folder(self) -> Path:
"""Gets the full path in which the model checkpoints should be stored during training."""
return self.outputs_folder / CHECKPOINT_FOLDER
@property
def visualization_folder(self) -> Path:
"""Gets the full path in which the visualizations notebooks should be saved during training."""
return self.outputs_folder / VISUALIZATION_FOLDER
@property
def perform_cross_validation(self) -> bool:
"""
True if cross validation will be performed as part of the training procedure.
:return:
"""
return self.number_of_cross_validation_splits > 1
@property
def overrides(self) -> Optional[Dict[str, Any]]:
return self._overrides
@property
def dataset_data_frame(self) -> Optional[DataFrame]:
"""
@ -521,29 +604,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
"""
self._dataset_data_frame = data_frame
def set_output_to(self, output_to: PathOrString) -> None:
"""
Adjusts the file system settings in the present object such that all outputs are written to the given folder.
:param output_to: The absolute path to a folder that should contain the outputs.
"""
if isinstance(output_to, Path):
output_to = str(output_to)
self.output_to = output_to
self.create_filesystem()
def create_filesystem(self, project_root: Path = fixed_paths.repository_root_directory()) -> None:
"""
Creates new file system settings (outputs folder, logs folder) based on the information stored in the
present object. If any of the folders do not yet exist, they are created.
:param project_root: The root folder for the codebase that triggers the training run.
"""
self.file_system_config = DeepLearningFileSystemConfig.create(
project_root=project_root,
model_name=self.model_name,
is_offline_run=self.is_offline_run,
output_to=self.output_to
)
def get_train_epochs(self) -> List[int]:
"""
Returns the epochs for which training will be performed.
@ -565,34 +625,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
"""
return self.get_total_number_of_training_epochs()
def get_path_to_checkpoint(self) -> Path:
"""
Returns full path to a recovery checkpoint.
:return: path to a checkpoint given an epoch
"""
return create_recovery_checkpoint_path(self.checkpoint_folder)
def get_path_to_best_checkpoint(self) -> Path:
"""
Returns full path to a checkpoint given an epoch
:return: path to a checkpoint given an epoch
"""
return get_best_checkpoint_path(self.checkpoint_folder)
def get_effective_random_seed(self) -> int:
"""
Returns the random seed set as part of this configuration. If the configuration corresponds
to a cross validation split, then the cross validation fold index will be added to the
set random seed in order to return the effective random seed.
:return:
"""
seed = self.random_seed
if self.perform_cross_validation:
# offset the random seed based on the cross validation split index so each
# fold has a different initial random state.
seed += self.cross_validation_split_index
return seed
@property # type: ignore
def use_gpu(self) -> bool: # type: ignore
"""
@ -617,30 +649,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
raise ValueError("Can't set use_gpu to True if there is not CUDA capable GPU present.")
self._use_gpu = value
def write_args_file(self) -> None:
"""
Writes the current config to disk in the default output folder.
"""
self.outputs_folder.mkdir(exist_ok=True, parents=True)
dst = self.outputs_folder / ARGS_TXT
dst.write_text(data=str(self))
def should_wait_for_other_cross_val_child_runs(self) -> bool:
"""
Returns True if the current run is an online run and is the 0th cross validation split.
In this case, this will be the run that will wait for all other child runs to finish in order
to aggregate their results.
:return:
"""
return (not self.is_offline_run) and self.cross_validation_split_index == 0
@property
def is_offline_run(self) -> bool:
"""
Returns True if the run is executing outside AzureML, or False if inside AzureML.
"""
return is_offline_run_context(RUN_CONTEXT)
@property
def compute_mean_teacher_model(self) -> bool:
"""
@ -654,7 +662,7 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
# Avoid callable params, the bindings that are printed out can be humongous.
# Avoid dataframes
skip_params = {name for name, value in self.param.params().items()
if isinstance(value, (param.Callable, DataFrame))}
if isinstance(value, (param.Callable, param.DataFrame))}
for key, value in self.param.get_param_values():
if key not in skip_params:
arguments_str += f"\t{key:40}: {value}\n"
@ -677,7 +685,6 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
See https://pytorch.org/tutorials/beginner/saving_loading_models.html#warmstarting-model-using-parameters
-from-a-different-model
for an explanation on why strict=False is useful when loading parameters from other models.
:param path_to_checkpoint: Path to the checkpoint file.
:return: Dictionary with model and optimizer state dicts. The dict should have at least the following keys:
1. Key ModelAndInfo.MODEL_STATE_DICT_KEY and value set to the model state dict.
@ -685,7 +692,15 @@ class DeepLearningConfig(GenericConfig, CudaAwareConfig):
Other (optional) entries corresponding to keys ModelAndInfo.OPTIMIZER_STATE_DICT_KEY and
ModelAndInfo.MEAN_TEACHER_STATE_DICT_KEY are also supported.
"""
import torch
map_location = None if self.use_gpu else 'cpu'
checkpoint = torch.load(str(path_to_checkpoint), map_location=map_location)
return checkpoint
return load_checkpoint(path_to_checkpoint=path_to_checkpoint, use_gpu=self.use_gpu)
def load_checkpoint(path_to_checkpoint: Path, use_gpu: bool = True) -> Dict[str, Any]:
"""
Loads a Torch checkpoint from the given file. If use_gpu==False, map all parameters to the CPU, otherwise
leave the device of all parameters unchanged.
"""
import torch
map_location = None if use_gpu else 'cpu'
checkpoint = torch.load(str(path_to_checkpoint), map_location=map_location)
return checkpoint
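# Illustrative usage sketch, not part of this change: load a recovery checkpoint onto the CPU.
# The checkpoint path below is hypothetical; for checkpoints written by PyTorch Lightning, the model
# weights are usually stored under the "state_dict" key, hence the hedged lookup.
def _example_load_recovery_checkpoint_on_cpu() -> Dict[str, Any]:
    checkpoint = load_checkpoint(Path("outputs/checkpoints/recovery.ckpt"), use_gpu=False)
    return checkpoint.get("state_dict", checkpoint)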

Просмотреть файл

@ -4,20 +4,25 @@
# ------------------------------------------------------------------------------------------
import logging
import numbers
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import param
import torch
from pytorch_lightning import LightningDataModule, LightningModule
from pytorch_lightning.utilities import rank_zero_only
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import DataLoader
from torch.utils.data import DataLoader, Dataset
from InnerEye.Common.common_util import EPOCH_METRICS_FILE_NAME
from InnerEye.Common.common_util import EPOCH_METRICS_FILE_NAME, logging_section
from InnerEye.Common.metrics_constants import LoggingColumns, MetricType, TRAIN_PREFIX, VALIDATION_PREFIX
from InnerEye.Common.type_annotations import DictStrFloat
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.deep_learning_config import DeepLearningConfig
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.deep_learning_config import DatasetParams, DeepLearningConfig, WorkflowParams, OutputParams, \
TrainerParams
from InnerEye.ML.lightning_container import LightningContainer
from InnerEye.ML.lightning_loggers import StoringLogger
from InnerEye.ML.metrics import EpochTimers, MAX_ITEM_LOAD_TIME_SEC, store_epoch_metrics
from InnerEye.ML.metrics_dict import DataframeLogger
@ -25,20 +30,43 @@ from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.utils import model_util
from InnerEye.ML.utils.device_aware_module import DeviceAwareModule
from InnerEye.ML.utils.lr_scheduler import SchedulerWithWarmUp
from InnerEye.ML.utils.ml_util import RandomStateSnapshot, set_random_seed
from InnerEye.ML.utils.ml_util import RandomStateSnapshot, set_random_seed, validate_dataset_paths
from InnerEye.ML.utils.model_util import generate_and_print_model_summary
from InnerEye.ML.visualizers.patch_sampling import visualize_random_crops_for_dataset
class TrainingAndValidationDataLightning(LightningDataModule):
class TrainAndValDataLightning(LightningDataModule):
"""
A class that wraps training and validation data from an InnerEye model configuration to a Lightning data module.
When doing inference on the trained models, we use InferenceDataLightning. This is particularly important for
segmentation models, where training and validation happens on equal sized patches, but inference is running on
images of arbitrary size.
"""
def _init__(self, config: ModelConfigBase) -> None:
def __init__(self, config: ModelConfigBase) -> None:
super().__init__()
self.config = config
self.data_loaders: Dict[ModelExecutionMode, DataLoader] = {}
def prepare_data(self, *args: Any, **kwargs: Any) -> None:
"""
Writes the dataset files for later use in cross validation analysis. This is only executed once per
distributed training run.
"""
# Save the dataset files for later use in cross validation analysis
self.config.write_dataset_files()
def setup(self, stage: Optional[str] = None) -> None:
"""
Checks if the dataset folder is present, and the dataset file exists. This is executed on each node in
distributed training.
"""
# Check for existing dataset.csv file in the correct locations. Skip that if a dataset has already been
# loaded (typically only during tests)
if self.config.dataset_data_frame is None:
assert self.config.local_dataset is not None
validate_dataset_paths(self.config.local_dataset, self.config.dataset_csv)
self.config.read_dataset_if_needed()
self.data_loaders = self.config.create_data_loaders()
def train_dataloader(self) -> DataLoader: # type: ignore
@ -48,7 +76,91 @@ class TrainingAndValidationDataLightning(LightningDataModule):
return self.data_loaders[ModelExecutionMode.VAL]
def test_dataloader(self) -> DataLoader: # type: ignore
raise NotImplementedError("For segmentation models, the test dataset should not be evaluated patch-wise.")
raise NotImplementedError("There is no test dataset stored here, because this object is only meant to be "
"used for training and validation.")
class InferenceDataLightning(LightningDataModule):
"""
A class that wraps data for running model inference on InnerEye models, as a Lightning data module.
Note that training and validation data is handled by TrainAndValDataLightning.
"""
def __init__(self, config: ModelConfigBase) -> None:
super().__init__()
self.config = config
self.train_data: Dataset = Dataset()
self.val_data: Dataset = Dataset()
self.test_data: Dataset = Dataset()
def setup(self, stage: Optional[str] = None) -> None:
"""
Initializes the datasets stored in the present object, by calling the config object to
prepare the torch Dataset objects for train/val/test.
"""
self.train_data = self.config.get_torch_dataset_for_inference(ModelExecutionMode.TRAIN)
self.val_data = self.config.get_torch_dataset_for_inference(ModelExecutionMode.VAL)
self.test_data = self.config.get_torch_dataset_for_inference(ModelExecutionMode.TEST)
def train_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
return DataLoader(self.train_data)
def val_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
return DataLoader(self.val_data)
def test_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
return DataLoader(self.test_data)
def prepare_data(self, *args: Any, **kwargs: Any) -> None:
pass
class InnerEyeContainer(LightningContainer):
"""
A container that wraps the creation of Lightning datasets for the built-in InnerEye models.
"""
def __init__(self, config: ModelConfigBase):
super().__init__()
self.config = config
self._model_name = config.model_name
# Fields like cross validation index are defined at container level, but the InnerEye models define them
# at model level. Copy everything over.
for type_to_copy in [WorkflowParams, DatasetParams, TrainerParams, OutputParams]:
assert issubclass(type_to_copy, param.Parameterized)
self.apply_overrides({p: getattr(config, p) for p in type_to_copy.params()}, # type: ignore
should_validate=False)
def setup(self) -> None:
"""
This hook reads the dataset file, and possibly sets required pre-processing objects, like one-hot encoder
for categorical features, that need to be available before creating the model.
"""
self.config.read_dataset_if_needed()
def create_model(self) -> LightningModule: # type: ignore
from InnerEye.ML.lightning_models import create_lightning_model
return create_lightning_model(self.config)
def get_data_module(self) -> LightningDataModule:
return TrainAndValDataLightning(self.config) # type: ignore
def get_inference_data_module(self) -> LightningDataModule:
return InferenceDataLightning(self.config) # type: ignore
def before_training_on_rank_zero(self) -> None:
# Save the dataset files for later use in cross validation analysis
self.config.write_dataset_files()
if isinstance(self.config, SegmentationModelBase):
with logging_section("Visualizing the effect of sampling random crops for training"):
visualize_random_crops_for_dataset(self.config)
# Print out a detailed breakdown of layers, memory consumption and time.
assert isinstance(self.model, InnerEyeLightning)
generate_and_print_model_summary(self.config, self.model.model)
def load_checkpoint_and_modify(self, path_to_checkpoint: Path) -> Dict[str, Any]:
return self.config.load_checkpoint_and_modify(path_to_checkpoint=path_to_checkpoint)
class InnerEyeLightning(LightningModule):
@ -61,6 +173,7 @@ class InnerEyeLightning(LightningModule):
def __init__(self, config: DeepLearningConfig, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self.outputs_folder = config.outputs_folder
self.checkpoint_folder = config.checkpoint_folder
self.model: DeviceAwareModule = DeviceAwareModule()
# These two will be set later in set_optimizer_and_scheduler.
# The ddp_spawn accelerator only works if the model configuration object is
@ -85,20 +198,17 @@ class InnerEyeLightning(LightningModule):
fixed_columns=fixed_logger_columns)
self.val_epoch_metrics_logger = DataframeLogger(self.val_metrics_folder / EPOCH_METRICS_FILE_NAME,
fixed_columns=fixed_logger_columns)
# Fields to store diagnostics for unit testing
self.train_diagnostics: List[Any] = []
self.val_diagnostics: List[Any] = []
# Stores information about the checkpoint that created this model, if any.
self.checkpoint_loading_message = ""
def set_optimizer_and_scheduler(self, config: DeepLearningConfig) -> None:
self.optimizer = model_util.create_optimizer(config, self.model.parameters())
self.l_rate_scheduler = SchedulerWithWarmUp(config, self.optimizer)
self.l_rate_scheduler = SchedulerWithWarmUp(config, self.optimizer, num_epochs=config.num_epochs)
def configure_optimizers(self) -> Tuple[List[Optimizer], List[_LRScheduler]]:
return [self.optimizer], [self.l_rate_scheduler] # type: ignore
def close_all_loggers(self) -> None:
def on_fit_end(self) -> None:
"""
Flushes all logger objects that the present object holds.
"""

Просмотреть файл

@ -0,0 +1,292 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import abc
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional, Tuple
import param
import torch
from pytorch_lightning import LightningDataModule, LightningModule
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from InnerEye.Common.generic_parsing import GenericConfig, create_from_matching_params
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.deep_learning_config import DatasetParams, OptimizerParams, OutputParams, TrainerParams, \
WorkflowParams, load_checkpoint
from InnerEye.ML.utils import model_util
from InnerEye.ML.utils.lr_scheduler import SchedulerWithWarmUp
class InnerEyeInference(abc.ABC):
"""
A base class that defines the methods that need to be present for doing inference on a trained model. This
form of inference is slightly different from what PyTorch Lightning does in its `Trainer.test` method. In
particular, this inference can be executed on any of the training, validation, or test set.
The inference code calls the methods in this order:
model.on_inference_start()
for dataset_split in [Train, Val, Test]
model.on_inference_epoch_start(dataset_split, is_ensemble_model=False)
for batch_idx, item in enumerate(dataloader[dataset_split])):
model_outputs = model.forward(item)
model.inference_step(item, batch_idx, model_outputs)
model.on_inference_epoch_end()
model.on_inference_end()
"""
def on_inference_start(self) -> None:
"""
Runs initialization for everything that inference might require. This can initialize
output files, set up metric computation, etc. This is run only once.
"""
pass
def on_inference_epoch_start(self, dataset_split: ModelExecutionMode, is_ensemble_model: bool) -> None:
"""
Runs initialization for inference, when starting inference on a new dataset split (train/val/test).
Depending on the settings, this can be called anywhere between 0 (no inference at all) and 3 times (inference
on all of the train/val/test splits).
:param dataset_split: Indicates whether the item comes from the training, validation or test set.
:param is_ensemble_model: If False, the model_outputs come from an individual model. If True, the model
outputs come from multiple models.
"""
pass
def inference_step(self, batch: Any, batch_idx: int, model_output: torch.Tensor) -> None:
"""
This hook is called when the model has finished making a prediction. It can write the results to a file,
or compute metrics and store them.
:param batch: The batch of data for which the model made a prediction.
:param batch_idx: The index of the current batch.
:param model_output: The model outputs. This would usually be a torch.Tensor, but can be any datatype.
"""
# Deliberately not an abstract method: keeping it concrete makes it possible to instantiate the class in
# unit tests, and the method should remain optional (it should be possible to also use Lightning's native
# test_step method).
raise NotImplementedError("Method inference_step must be overwritten in a derived class.")
def on_inference_epoch_end(self) -> None:
"""
Called when the inference on one of the dataset splits (train/val/test) has finished.
Depending on the settings, this can be called anywhere between 0 (no inference at all) and 3 times (inference
on all of the train/val/test splits).
"""
pass
def on_inference_end(self) -> None:
"""
Called when all inference epochs have finished. This can write all metrics to disk, for example. This method
is called exactly once.
"""
pass
def aggregate_ensemble_model_outputs(self, model_outputs: Iterator[torch.Tensor]) -> torch.Tensor:
"""
Aggregates the outputs of multiple models when using an ensemble model. In the default implementation,
this averages the tensors coming from all the models.
:param model_outputs: An iterator over the model outputs for all ensemble members.
:return: The aggregate model outputs.
"""
aggregate_output: Optional[torch.Tensor] = None
count = 0
for m in model_outputs:
count += 1
if aggregate_output is None:
aggregate_output = m
else:
aggregate_output += m
if count == 0 or aggregate_output is None:
raise ValueError("There were no results to aggregate.")
aggregate_output /= count
return aggregate_output
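# Illustrative sketch, not part of this change: a Lightning model that also implements the
# InnerEyeInference hooks above. All concrete names (MeanAbsoluteErrorModel, its single linear layer,
# and the (input, target) batch layout) are hypothetical assumptions.
class MeanAbsoluteErrorModel(LightningModule, InnerEyeInference):
    def __init__(self) -> None:
        super().__init__()
        self.model = torch.nn.Linear(1, 1)
        self.absolute_errors: List[float] = []

    def forward(self, item: torch.Tensor) -> torch.Tensor:  # type: ignore
        return self.model(item)

    def on_inference_start(self) -> None:
        # Reset the metric accumulator; a real model could also open per-subject output files here.
        self.absolute_errors = []

    def inference_step(self, batch: Any, batch_idx: int, model_output: torch.Tensor) -> None:
        # Assumes that each batch is an (input, target) tuple.
        _, target = batch
        self.absolute_errors.append(torch.abs(model_output - target).mean().item())

    def on_inference_end(self) -> None:
        # Aggregate and report; a real model would typically write this to a file in its outputs folder.
        print(f"Mean absolute error: {sum(self.absolute_errors) / len(self.absolute_errors):.4f}")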
class LightningModuleWithOptimizer(LightningModule):
"""
A base class that supplies a method to configure optimizers and LR schedulers. To use this in your model,
inherit from this class instead of from LightningModule.
If this class is used, all configuration options for the optimizers and LR schedulers will also be available as
commandline arguments (for example, you can supply the InnerEye runner with "--l_rate=1e-2" to change the learning
rate).
"""
# These fields will be set by the LightningContainer when the model is created.
_optimizer_params = OptimizerParams()
_trainer_params = TrainerParams()
def configure_optimizers(self) -> Tuple[List[Optimizer], List[_LRScheduler]]:
"""
This is the default implementation of the method that provides the optimizer and LR scheduler for
PyTorch Lightning. It reads out the optimizer and scheduler settings from the model fields,
and creates the two objects.
Override this method for full flexibility to define any optimizer and scheduler.
:return: A tuple of (list of optimizers, list of LR schedulers)
"""
optimizer = model_util.create_optimizer(self._optimizer_params, self.parameters())
l_rate_scheduler = SchedulerWithWarmUp(self._optimizer_params, optimizer,
num_epochs=self._trainer_params.num_epochs)
return [optimizer], [l_rate_scheduler]
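# Illustrative sketch, not part of this change: a user-defined model only needs to inherit from
# LightningModuleWithOptimizer to pick up the configure_optimizers implementation above; passing
# "--l_rate=1e-2" to the InnerEye runner then changes the learning rate used here. The model itself
# (a tiny linear regression) is hypothetical.
class TinyRegressionModel(LightningModuleWithOptimizer):
    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(1, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore
        return self.layer(x)

    def training_step(self, batch: Any, batch_idx: int) -> torch.Tensor:  # type: ignore
        x, y = batch
        loss = torch.nn.functional.mse_loss(self.forward(x), y)
        self.log("train/loss", loss)
        return loss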
class LightningContainer(GenericConfig,
WorkflowParams,
DatasetParams,
OutputParams,
TrainerParams,
OptimizerParams):
"""
A LightningContainer contains all information to train a user-specified PyTorch Lightning model. The model that
should be trained is returned by the `create_model` method. The training data must be returned in the form of
a LightningDataModule, by the `get_data_module` method.
"""
def __init__(self) -> None:
super().__init__()
self._model: Optional[LightningModule] = None
self._model_name = type(self).__name__
def validate(self) -> None:
WorkflowParams.validate(self)
OptimizerParams.validate(self)
def setup(self) -> None:
"""
This method is called as one of the first operations of the training/testing workflow, before any other
operations on the present object. At the point when called, the dataset is already available in
the location given by self.local_dataset. Use this method to prepare datasets or data loaders, for example.
"""
pass
def create_model(self) -> LightningModule:
"""
This method must create the actual Lightning model that will be trained. It can read out parameters from the
container and pass them into the model, for example.
"""
pass
def get_data_module(self) -> LightningDataModule:
"""
Gets the data that is used for the training, validation, and test steps.
This should read a dataset from the self.local_dataset folder or download from a web location.
The format of the data is not specified any further.
The method must take cross validation into account, and ensure that the logic for creating training and validation
sets for a given number of cross validation splits is handled correctly.
:return: A LightningDataModule
"""
return None # type: ignore
def get_inference_data_module(self) -> LightningDataModule:
"""
Gets the data that is used to evaluate the trained model. By default, this returns the value
of get_data_module(), but you can override this to get for example full image datasets for
segmentation models.
This should read a dataset from the self.local_dataset folder or download from a web location.
The format of the data is not specified any further.
The method must take cross validation into account, and ensure that the logic for creating training and validation
sets for a given number of cross validation splits is handled correctly.
:return: A LightningDataModule
"""
# You can override this if inference uses different data, for example segmentation models use
# full images rather than equal sized crops.
return self.get_data_module()
def get_trainer_arguments(self) -> Dict[str, Any]:
"""
Gets additional parameters that will be passed on to the PyTorch Lightning trainer.
"""
return dict()
def create_report(self) -> None:
"""
This method is called after training and testing has been completed. It can aggregate all files that were
written during training and testing, and compile them into some helpful overarching output.
The report should be written to the output folder (self.outputs_folder).
"""
pass
def before_training_on_rank_zero(self) -> None:
"""
A hook that will be called before starting model training, before creating the Lightning Trainer object.
In distributed training, this is only run on rank zero. It is executed after the before_training_on_all_ranks
hook.
"""
pass
def before_training_on_all_ranks(self) -> None:
"""
A hook that will be called before starting model training.
In distributed training, this hook will be called on all ranks. It is executed before the
before_training_on_rank_zero hook.
"""
pass
def load_checkpoint_and_modify(self, path_to_checkpoint: Path) -> Dict[str, Any]:
"""
This method is called when a file with weights for network initialization is supplied at container level,
in the self.weights_url or self.local_weights_path fields. It can load that file as a Torch checkpoint,
and rename parameters.
By default, uses torch.load to read and return the state dict from the checkpoint file, and does no modification
of the checkpoint file.
Overloading this function:
When weights_url or local_weights_path is set, the file downloaded may not be in the exact
format expected by the model's load_state_dict() - for example, pretrained Imagenet weights for networks
may have mismatched layer names in different implementations.
In such cases, you can overload this function to extract the state dict from the checkpoint.
NOTE: The model checkpoint will be loaded using the torch function load_state_dict() with argument strict=False,
so extra care needs to be taken to check that the state dict is valid.
Check the logs for warnings related to missing and unexpected keys.
See https://pytorch.org/tutorials/beginner/saving_loading_models.html#warmstarting-model-using-parameters
-from-a-different-model
for an explanation on why strict=False is useful when loading parameters from other models.
:param path_to_checkpoint: Path to the checkpoint file.
:return: Dictionary with model and optimizer state dicts. The dict should have at least the following keys:
1. Key ModelAndInfo.MODEL_STATE_DICT_KEY and value set to the model state dict.
2. Key ModelAndInfo.EPOCH_KEY and value set to the checkpoint epoch.
Other (optional) entries corresponding to keys ModelAndInfo.OPTIMIZER_STATE_DICT_KEY and
ModelAndInfo.MEAN_TEACHER_STATE_DICT_KEY are also supported.
"""
return load_checkpoint(path_to_checkpoint=path_to_checkpoint, use_gpu=self.use_gpu)
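# Illustrative sketch, not part of this change: an override that renames parameters before they are
# loaded with strict=False. It assumes a Lightning-style checkpoint that keeps the model weights under
# the "state_dict" key; the "encoder." prefix rename is purely hypothetical.
#
#    def load_checkpoint_and_modify(self, path_to_checkpoint: Path) -> Dict[str, Any]:
#        checkpoint = load_checkpoint(path_to_checkpoint=path_to_checkpoint, use_gpu=self.use_gpu)
#        checkpoint["state_dict"] = {key.replace("encoder.", "model.encoder."): value
#                                    for key, value in checkpoint["state_dict"].items()}
#        return checkpoint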
# The code from here on does not need to be modified.
@property
def model(self) -> LightningModule:
"""
Returns the PyTorch Lightning module that the present container object manages.
:return: A PyTorch Lightning module
"""
if self._model is None:
raise ValueError("No Lightning module has been set yet.")
return self._model
def create_lightning_module_and_store(self) -> None:
"""
Creates the Lightning model by calling `create_model` and stores it in the `model`
property.
"""
self._model = self.create_model()
if isinstance(self._model, LightningModuleWithOptimizer):
self._model._optimizer_params = create_from_matching_params(self, OptimizerParams)
self._model._trainer_params = create_from_matching_params(self, TrainerParams)
def __str__(self) -> str:
"""Returns a string describing the present object, as a list of key: value strings."""
arguments_str = "\nContainer:\n"
# Avoid callable params, the bindings that are printed out can be humongous.
# Avoid dataframes
skip_params = {name for name, value in self.param.params().items()
if isinstance(value, (param.Callable, param.DataFrame))}
for key, value in self.param.get_param_values():
if key not in skip_params:
arguments_str += f"\t{key:40}: {value}\n"
# Print out all other separate vars that are not under the guidance of the params library,
# skipping the two that are introduced by params
skip_vars = {"param", "initialized"}
for key, value in vars(self).items():
if key not in skip_vars and key[0] != "_":
arguments_str += f"\t{key:40}: {value}\n"
return arguments_str
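# ------------------------------------------------------------------------------------------
# Illustrative sketch, not part of this change: a complete bring-your-own-model container that
# reuses the TinyRegressionModel sketched above. The synthetic in-memory dataset, the batch size
# and the container name are hypothetical; a real container would usually read its data from
# self.local_dataset inside get_data_module.
# ------------------------------------------------------------------------------------------
class TinyRegressionContainer(LightningContainer):
    def create_model(self) -> LightningModule:
        return TinyRegressionModel()

    def get_data_module(self) -> LightningDataModule:
        from torch.utils.data import DataLoader, TensorDataset
        x = torch.rand((100, 1)) * 10.0
        y = 2.0 * x + torch.randn_like(x)
        train = TensorDataset(x[:80], y[:80])
        val = TensorDataset(x[80:], y[80:])

        class FixedDataModule(LightningDataModule):
            def train_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
                return DataLoader(train, batch_size=10)

            def val_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
                return DataLoader(val, batch_size=10)

            def test_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
                return DataLoader(val, batch_size=10)

        return FixedDataModule()

    def get_trainer_arguments(self) -> Dict[str, Any]:
        # Any genuine PyTorch Lightning Trainer argument can be passed through here.
        return {"gradient_clip_val": 1.0}

# A container like this is then selected via the runner's "--model=TinyRegressionContainer" argument,
# assuming the class lives in a place where the runner's model config loader can find it.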

Просмотреть файл

@ -8,31 +8,11 @@ from pathlib import Path
import torch
from InnerEye.ML.lightning_base import InnerEyeLightning
from InnerEye.ML.lightning_models import ScalarLightning, SegmentationLightning
from InnerEye.ML.lightning_models import create_lightning_model
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.models.architectures.base_model import BaseSegmentationModel
def create_lightning_model(config: ModelConfigBase, set_optimizer_and_scheduler: bool = True) -> InnerEyeLightning:
"""
Creates a PyTorch Lightning model that matches the provided InnerEye model configuration object.
The `optimizer` and `l_rate_scheduler` object of the Lightning model will also be populated.
:param set_optimizer_and_scheduler: If True (default), initialize the optimizer and LR scheduler of the model.
If False, skip that step (this is only meant to be used for unit tests.)
:param config: An InnerEye model configuration object
:return: A PyTorch Lightning model object.
"""
if config.is_segmentation_model:
model: InnerEyeLightning = SegmentationLightning(config)
elif config.is_scalar_model:
model = ScalarLightning(config)
else:
raise NotImplementedError(f"Don't know how to handle config of type {type(config)}")
if set_optimizer_and_scheduler:
model.set_optimizer_and_scheduler(config)
return model
def load_from_lightning_checkpoint(config: ModelConfigBase, checkpoint_path: Path) -> InnerEyeLightning:
"""
Reads a PyTorch model from a checkpoint. First, a PyTorch Lightning model is created matching the InnerEye

Просмотреть файл

@ -2,12 +2,13 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from typing import Any, Dict, Iterable, Optional
from typing import Any, Dict, Iterable, List, Optional
from pytorch_lightning.loggers import LightningLoggerBase
from pytorch_lightning.utilities import rank_zero_only
from InnerEye.Azure.azure_util import RUN_CONTEXT, is_offline_run_context
from InnerEye.Common.metrics_constants import TRAIN_PREFIX, VALIDATION_PREFIX
from InnerEye.Common.type_annotations import DictStrFloat
@ -21,6 +22,9 @@ class StoringLogger(LightningLoggerBase):
super().__init__()
self.results: Dict[int, DictStrFloat] = {}
self.hyperparams: Any = None
# Fields to store diagnostics for unit testing
self.train_diagnostics: List[Any] = []
self.val_diagnostics: List[Any] = []
@rank_zero_only
def log_metrics(self, metrics: DictStrFloat, step: Optional[int] = None) -> None:
@ -44,7 +48,7 @@ class StoringLogger(LightningLoggerBase):
self.hyperparams = params
def experiment(self) -> Any:
return ""
return None
def name(self) -> Any:
return ""
@ -93,6 +97,48 @@ class StoringLogger(LightningLoggerBase):
"""
return {epoch: self.extract_by_prefix(epoch, prefix_filter) for epoch in self.epochs}
def get_metric(self, is_training: bool, metric_type: str) -> List[float]:
"""
Gets a scalar metric out of either the list of training or the list of validation results. This returns
the value that a specific metric attains in all of the epochs.
:param is_training: If True, read metrics that have a "train/" prefix, otherwise those that have a "val/"
prefix.
:param metric_type: The metric to extract.
:return: A list of floating point numbers, with one entry per epoch in the training or validation results.
"""
full_metric_name = (TRAIN_PREFIX if is_training else VALIDATION_PREFIX) + metric_type
return [self.results[epoch][full_metric_name] for epoch in self.epochs]
def get_train_metric(self, metric_type: str) -> List[float]:
"""
Gets a scalar metric from the list of training results. This returns
the value that a specific metric attains in all of the epochs.
:param metric_type: The metric to extract.
:return: A list of floating point numbers, with one entry per epoch in the training results.
"""
return self.get_metric(is_training=True, metric_type=metric_type)
def get_val_metric(self, metric_type: str) -> List[float]:
"""
Gets a scalar metric from the list of validation results. This returns
the value that a specific metric attains in all of the epochs.
:param metric_type: The metric to extract.
:return: A list of floating point numbers, with one entry per epoch in the validation results.
"""
return self.get_metric(is_training=False, metric_type=metric_type)
def train_results_per_epoch(self) -> List[DictStrFloat]:
"""
Gets the full set of training metrics that the logger stores, as a list of dictionaries per epoch.
"""
return list(self.to_metrics_dicts(prefix_filter=TRAIN_PREFIX).values())
def val_results_per_epoch(self) -> List[DictStrFloat]:
"""
Gets the full set of validation metrics that the logger stores, as a list of dictionaries per epoch.
"""
return list(self.to_metrics_dicts(prefix_filter=VALIDATION_PREFIX).values())
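# Illustrative usage sketch, not part of this change: the StoringLogger that model_train returns for
# built-in InnerEye models can be queried per epoch, for example in a test. The metric name "Loss"
# and the container/checkpoint_handler variables are hypothetical.
#
#    trainer, storing_logger = model_train(checkpoint_handler, container=container)
#    assert storing_logger is not None
#    train_loss_per_epoch = storing_logger.get_train_metric("Loss")
#    assert len(train_loss_per_epoch) == container.num_epochs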
class AzureMLLogger(LightningLoggerBase):
"""
@ -115,7 +161,7 @@ class AzureMLLogger(LightningLoggerBase):
pass
def experiment(self) -> Any:
return ""
return None
def name(self) -> Any:
return ""

Просмотреть файл

@ -21,6 +21,7 @@ from InnerEye.ML.lightning_metrics import Accuracy05, AccuracyAtOptimalThreshold
OptimalThreshold, ScalarMetricsBase
from InnerEye.ML.metrics import compute_dice_across_patches
from InnerEye.ML.metrics_dict import DataframeLogger, MetricsDict, SequenceMetricsDict
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.scalar_config import ScalarModelBase
from InnerEye.ML.sequence_config import SequenceModelBase
from InnerEye.ML.utils import image_util, metrics_util, model_util
@ -129,9 +130,9 @@ class SegmentationLightning(InnerEyeLightning):
if isinstance(center_indices, torch.Tensor):
center_indices = center_indices.cpu().numpy()
if is_training:
self.train_diagnostics.append(center_indices)
self.storing_logger.train_diagnostics.append(center_indices)
else:
self.val_diagnostics.append(center_indices)
self.storing_logger.val_diagnostics.append(center_indices)
# if self.train_val_params.in_training_mode:
# # store the sample train patch from this epoch for visualization
# if batch_index == self.example_to_save and self.config.store_dataset_sample:
@ -380,3 +381,23 @@ def transfer_batch_to_device(batch: Any, device: torch.device) -> Any:
return batch
else:
return move_data_to_device(batch, device)
def create_lightning_model(config: ModelConfigBase, set_optimizer_and_scheduler: bool = True) -> InnerEyeLightning:
"""
Creates a PyTorch Lightning model that matches the provided InnerEye model configuration object.
The `optimizer` and `l_rate_scheduler` object of the Lightning model will also be populated.
:param set_optimizer_and_scheduler: If True (default), initialize the optimizer and LR scheduler of the model.
If False, skip that step (this is only meant to be used for unit tests.)
:param config: An InnerEye model configuration object
:return: A PyTorch Lightning model object.
"""
if config.is_segmentation_model:
model: InnerEyeLightning = SegmentationLightning(config)
elif config.is_scalar_model:
model = ScalarLightning(config)
else:
raise NotImplementedError(f"Don't know how to handle config of type {type(config)}")
if set_optimizer_and_scheduler:
model.set_optimizer_and_scheduler(config)
return model

Просмотреть файл

@ -8,7 +8,7 @@ import subprocess
import sys
from pathlib import Path
from time import sleep
from typing import Optional, Tuple, TypeVar
from typing import Any, Dict, Optional, Tuple, TypeVar
import numpy as np
import torch
@ -18,27 +18,18 @@ from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.plugins import DDPPlugin
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from InnerEye.Azure.azure_util import RUN_CONTEXT
from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME, logging_section
from InnerEye.Common.metrics_constants import TRAIN_PREFIX, VALIDATION_PREFIX
from InnerEye.Azure.azure_util import RUN_CONTEXT, is_offline_run_context
from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME, change_working_directory
from InnerEye.Common.resource_monitor import ResourceMonitor
from InnerEye.ML.common import ModelExecutionMode, RECOVERY_CHECKPOINT_FILE_NAME, cleanup_checkpoint_folder
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.deep_learning_config import VISUALIZATION_FOLDER
from InnerEye.ML.lightning_base import TrainingAndValidationDataLightning
from InnerEye.ML.lightning_helpers import create_lightning_model
from InnerEye.ML.deep_learning_config import ARGS_TXT, VISUALIZATION_FOLDER
from InnerEye.ML.lightning_base import InnerEyeContainer, InnerEyeLightning
from InnerEye.ML.lightning_container import LightningContainer
from InnerEye.ML.lightning_loggers import AzureMLLogger, StoringLogger
from InnerEye.ML.lightning_models import SUBJECT_OUTPUT_PER_RANK_PREFIX, ScalarLightning, \
get_subject_output_file_per_rank
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.utils import ml_util
from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
from InnerEye.ML.utils.model_util import generate_and_print_model_summary
from InnerEye.ML.utils.training_util import ModelTrainingResults
from InnerEye.ML.visualizers.patch_sampling import visualize_random_crops_for_dataset
MAX_ITEM_LOAD_TIME_SEC = 0.5
MAX_LOAD_TIME_WARNINGS = 3
TEMP_PREFIX = "temp/"
T = TypeVar('T')
@ -68,23 +59,38 @@ def upload_output_file_as_temp(file_path: Path, outputs_folder: Path) -> None:
upload_name = TEMP_PREFIX + str(file_path.relative_to(outputs_folder))
RUN_CONTEXT.upload_file(upload_name, path_or_stream=str(file_path))
def create_lightning_trainer(config: ModelConfigBase,
def write_args_file(config: Any, outputs_folder: Path) -> None:
"""
Writes the given config to disk in plain text in the default output folder.
"""
output = str(config)
outputs_folder.mkdir(exist_ok=True, parents=True)
dst = outputs_folder / ARGS_TXT
dst.write_text(output)
logging.info(output)
def create_lightning_trainer(container: LightningContainer,
resume_from_checkpoint: Optional[Path] = None,
num_nodes: int = 1) -> Tuple[Trainer, StoringLogger]:
num_nodes: int = 1,
**kwargs: Dict[str, Any]) -> \
Tuple[Trainer, Optional[StoringLogger]]:
"""
Creates a PyTorch Lightning Trainer object for the given model container. It creates checkpoint handlers
and loggers. That includes a diagnostic logger for use in unit tests, which is also returned as the second
return value; it is None for containers that are not built-in InnerEye models.
:param config: The model configuration.
:param container: The container with model and data.
:param resume_from_checkpoint: If provided, training resumes from this checkpoint point.
:param num_nodes: The number of nodes to use in distributed training.
:param kwargs: Any additional keyword arguments will be passed to the constructor of Trainer.
:return: A tuple [Trainer object, diagnostic logger]
"""
# For now, stick with the legacy behaviour of always saving only the last epoch checkpoint. For large segmentation
# models, this still appears to be the best way of choosing them because validation loss on the relatively small
# training patches is not stable enough. Going by the validation loss somehow works for the Prostate model, but
# not for the HeadAndNeck model.
best_checkpoint_callback = ModelCheckpoint(dirpath=str(config.checkpoint_folder),
best_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
# filename=BEST_CHECKPOINT_FILE_NAME,
# monitor=f"{VALIDATION_PREFIX}{MetricType.LOSS.value}",
# save_top_k=1,
@ -93,144 +99,141 @@ def create_lightning_trainer(config: ModelConfigBase,
# Store 1 recovery checkpoint every recovery_checkpoint_save_interval epochs. Due to a bug in Lightning, this
# will still write alternate files recovery.ckpt and recovery-v0.ckpt, which are cleaned up later in
# cleanup_checkpoint_folder
recovery_checkpoint_callback = ModelCheckpoint(dirpath=str(config.checkpoint_folder),
recovery_checkpoint_callback = ModelCheckpoint(dirpath=str(container.checkpoint_folder),
filename=RECOVERY_CHECKPOINT_FILE_NAME,
period=config.recovery_checkpoint_save_interval
period=container.recovery_checkpoint_save_interval
)
num_gpus = torch.cuda.device_count() if config.use_gpu else 0
num_gpus = torch.cuda.device_count() if container.use_gpu else 0
logging.info(f"Number of available GPUs: {num_gpus}")
if config.max_num_gpus >= 0 and config.max_num_gpus < num_gpus:
num_gpus = config.max_num_gpus
if 0 <= container.max_num_gpus < num_gpus:
num_gpus = container.max_num_gpus
logging.info(f"Restricting the number of GPUs to {num_gpus}")
# Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of GPU memory).
# For unit tests, only "ddp_spawn" works
accelerator = "ddp" if num_gpus * num_nodes > 1 else None
plugins = [InnerEyeDDPPlugin(num_nodes=num_nodes, sync_batchnorm=True)] if num_gpus * num_nodes > 1 else None
logging.info(f"Using {num_gpus} GPUs with accelerator '{accelerator}'")
storing_logger = StoringLogger()
tensorboard_logger = TensorBoardLogger(save_dir=str(config.logs_folder), name="Lightning", version="")
loggers = [storing_logger, tensorboard_logger, AzureMLLogger()]
# This leads to problems with run termination.
# if not is_offline_run_context(RUN_CONTEXT):
# mlflow_logger = MLFlowLogger(experiment_name=RUN_CONTEXT.experiment.name,
# tracking_uri=RUN_CONTEXT.experiment.workspace.get_mlflow_tracking_uri())
# # The MLFlow logger needs to get its ID from the AzureML run context, otherwise there will be two sets of
# # results for each run, one from native AzureML and one from the MLFlow logger.
# mlflow_logger._run_id = RUN_CONTEXT.id
# loggers.append(mlflow_logger)
tensorboard_logger = TensorBoardLogger(save_dir=str(container.logs_folder), name="Lightning", version="")
loggers = [tensorboard_logger, AzureMLLogger()]
storing_logger: Optional[StoringLogger]
if isinstance(container, InnerEyeContainer):
storing_logger = StoringLogger()
loggers.append(storing_logger)
else:
storing_logger = None
# Use 32bit precision when running on CPU. Otherwise, make it depend on use_mixed_precision flag.
precision = 32 if num_gpus == 0 else 16 if config.use_mixed_precision else 32
precision = 32 if num_gpus == 0 else 16 if container.use_mixed_precision else 32
# The next two flags control the settings in torch.backends.cudnn.deterministic and torch.backends.cudnn.benchmark
# https://pytorch.org/docs/stable/notes/randomness.html
# For the classification models, we observed only a small performance deterioration (increase in 10sec on total
# training time of 22min) when switching to deterministic.
if config.pl_deterministic:
if container.pl_deterministic:
deterministic = True
benchmark = False
else:
deterministic = False
benchmark = True
trainer = Trainer(default_root_dir=str(config.outputs_folder),
# Read out additional model-specific args here.
# We probably want to keep essential ones like numgpu and logging.
trainer = Trainer(default_root_dir=str(container.outputs_folder),
deterministic=deterministic,
benchmark=benchmark,
accelerator=accelerator,
max_epochs=config.num_epochs,
num_sanity_val_steps=config.pl_num_sanity_val_steps,
max_epochs=container.num_epochs,
num_sanity_val_steps=container.pl_num_sanity_val_steps,
callbacks=[best_checkpoint_callback, recovery_checkpoint_callback],
logger=loggers,
progress_bar_refresh_rate=0, # Disable the progress bar completely
progress_bar_refresh_rate=container.pl_progress_bar_refresh_rate,
num_nodes=num_nodes,
gpus=num_gpus,
precision=precision,
sync_batchnorm=True,
terminate_on_nan=config.detect_anomaly,
terminate_on_nan=container.detect_anomaly,
resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None,
plugins=plugins
)
plugins=plugins,
**kwargs)
return trainer, storing_logger
def model_train(config: ModelConfigBase,
checkpoint_handler: CheckpointHandler,
num_nodes: int = 1) -> ModelTrainingResults:
def start_resource_monitor(config: LightningContainer) -> ResourceMonitor:
# initialize and start GPU monitoring
gpu_tensorboard = config.logs_folder / "gpu_utilization"
# Result file in CSV format should NOT live in the logs folder, the streaming upload that is
# used for this folder might corrupt the file.
gpu_csv = config.outputs_folder / "gpu_utilization"
gpu_csv.mkdir(parents=True, exist_ok=True)
logging.info(f"Starting resource monitor. GPU utilization will be written to Tensorboard in "
f"{gpu_tensorboard}, aggregate metrics to {gpu_csv}")
resource_monitor = ResourceMonitor(interval_seconds=config.monitoring_interval_seconds,
tensorboard_folder=gpu_tensorboard,
csv_results_folder=gpu_csv)
resource_monitor.start()
return resource_monitor
def model_train(checkpoint_handler: CheckpointHandler,
container: LightningContainer,
num_nodes: int = 1) -> Tuple[Trainer, Optional[StoringLogger]]:
"""
The main training loop. It creates the Pytorch model based on the configuration options passed in,
creates a Pytorch Lightning trainer, and trains the model.
If a checkpoint was specified, then it loads the checkpoint before resuming training.
:param config: The arguments which specify all required information.
:param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization
:param num_nodes: The number of nodes to use in distributed training.
:param container: A container object that holds the training data in PyTorch Lightning format
and the model to train.
:return: A tuple of [Trainer, StoringLogger]. Trainer is the Lightning Trainer object that was used for fitting
the model. The StoringLogger object is returned when training an InnerEye built-in model; it is None when
fitting other models.
"""
# Get the path to the checkpoint to recover from
checkpoint_path = checkpoint_handler.get_recovery_path_train()
# This reads the dataset file, and possibly sets required pre-processing objects, like one-hot encoder
# for categorical features, that need to be available before creating the model.
config.read_dataset_if_needed()
lightning_model = container.model
container.before_training_on_all_ranks()
resource_monitor: Optional[ResourceMonitor] = None
# Execute some bookkeeping tasks only once if running distributed:
if is_rank_zero():
logging.info(f"Model checkpoints are saved at {container.checkpoint_folder}")
container.before_training_on_rank_zero()
write_args_file(container.config if isinstance(container, InnerEyeContainer) else container,
outputs_folder=container.outputs_folder)
if container.monitoring_interval_seconds > 0:
resource_monitor = start_resource_monitor(container)
# Create the trainer object. Backup the environment variables before doing that, in case we need to run a second
# training in the unit tests.
old_environ = dict(os.environ)
seed_everything(config.get_effective_random_seed())
trainer, storing_logger = create_lightning_trainer(config, checkpoint_path, num_nodes=num_nodes)
# Set random seeds just before training. For segmentation models, we have
# something that changes the random seed in the before_training_on_rank_zero hook.
seed_everything(container.get_effective_random_seed())
trainer, storing_logger = create_lightning_trainer(container,
checkpoint_path,
num_nodes=num_nodes,
**container.get_trainer_arguments())
logging.info(f"GLOBAL_RANK: {os.getenv('GLOBAL_RANK')}, LOCAL_RANK {os.getenv('LOCAL_RANK')}. "
f"trainer.global_rank: {trainer.global_rank}")
logging.debug("Creating the PyTorch model.")
lightning_model = create_lightning_model(config)
lightning_model.storing_logger = storing_logger
# InnerEye models use this logger for diagnostics
if isinstance(lightning_model, InnerEyeLightning):
if storing_logger is None:
raise ValueError("InnerEye models require the storing_logger for diagnostics")
lightning_model.storing_logger = storing_logger
resource_monitor = None
# Execute some bookkeeping tasks only once if running distributed:
if is_rank_zero():
config.write_args_file()
logging.info(str(config))
# Save the dataset files for later use in cross validation analysis
config.write_dataset_files()
logging.info(f"Model checkpoints are saved at {config.checkpoint_folder}")
# set the random seed for all libraries
ml_util.set_random_seed(config.get_effective_random_seed(), "Patch visualization")
# Visualize how patches are sampled for segmentation models. This changes the random generator, but we don't
# want training to depend on how many patients we visualized, and hence set the random seed again right after.
if isinstance(config, SegmentationModelBase):
with logging_section("Visualizing the effect of sampling random crops for training"):
visualize_random_crops_for_dataset(config)
# Print out a detailed breakdown of layers, memory consumption and time.
generate_and_print_model_summary(config, lightning_model.model)
if config.monitoring_interval_seconds > 0:
# initialize and start GPU monitoring
gpu_tensorboard = config.logs_folder / "gpu_utilization"
# Result file in CSV format should NOT live in the logs folder, the streaming upload that is
# used for this folder might corrupt the file.
gpu_csv = config.outputs_folder / "gpu_utilization"
gpu_csv.mkdir(parents=True, exist_ok=True)
logging.info(f"Starting resource monitor. GPU utilization will be written to Tensorboard in "
f"{gpu_tensorboard}, aggregate metrics to {gpu_csv}")
resource_monitor = ResourceMonitor(interval_seconds=config.monitoring_interval_seconds,
tensorboard_folder=gpu_tensorboard,
csv_results_folder=gpu_csv)
resource_monitor.start()
# Training loop
logging.info("Starting training")
lightning_data = TrainingAndValidationDataLightning(config) # type: ignore
# When trying to store the config object in the constructor, it does not appear to get stored at all, later
# reference of the object simply fail. Hence, have to set explicitly here.
lightning_data.config = config
trainer.fit(lightning_model, datamodule=lightning_data)
trainer.logger.close() # type: ignore
lightning_model.close_all_loggers()
# When training models that are not built-in InnerEye models, we have no guarantee that they write
# files to the right folder. Best guess is to change the current working directory to where files should go.
data_module = container.get_data_module()
with change_working_directory(container.outputs_folder):
trainer.fit(lightning_model, datamodule=data_module)
trainer.logger.close() # type: ignore
world_size = getattr(trainer, "world_size", 0)
is_azureml_run = not config.is_offline_run
is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
# Per-subject model outputs for regression models are written per rank, and need to be aggregated here.
# Each thread per rank will come here, and upload its files to the run outputs. Rank 0 will later download them.
if is_azureml_run and world_size > 1 and isinstance(lightning_model, ScalarLightning):
upload_output_file_as_temp(lightning_model.train_subject_outputs_logger.csv_path, config.outputs_folder)
upload_output_file_as_temp(lightning_model.val_subject_outputs_logger.csv_path, config.outputs_folder)
upload_output_file_as_temp(lightning_model.train_subject_outputs_logger.csv_path, container.outputs_folder)
upload_output_file_as_temp(lightning_model.val_subject_outputs_logger.csv_path, container.outputs_folder)
# DDP will start multiple instances of the runner, one for each GPU. Those should terminate here after training.
# We can now use the global_rank of the Lightining model, rather than environment variables, because DDP has set
# all necessary properties.
@ -239,7 +242,7 @@ def model_train(config: ModelConfigBase,
sys.exit()
logging.info("Choosing the best checkpoint and removing redundant files.")
cleanup_checkpoint_folder(config.checkpoint_folder)
cleanup_checkpoint_folder(container.checkpoint_folder)
# Lightning modifies a ton of environment variables. If we first run training and then the test suite,
# those environment variables will mislead the training runs in the test suite, and make them crash.
# Hence, restore the original environment after training.
@ -254,17 +257,9 @@ def model_train(config: ModelConfigBase,
for rank in range(world_size):
for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]:
file = mode.value + "/" + get_subject_output_file_per_rank(rank)
RUN_CONTEXT.download_file(name=TEMP_PREFIX + file, output_file_path=config.outputs_folder / file)
RUN_CONTEXT.download_file(name=TEMP_PREFIX + file, output_file_path=container.outputs_folder / file)
# Concatenate all temporary file per execution mode
aggregate_and_create_subject_metrics_file(config.outputs_folder)
model_training_results = ModelTrainingResults(
train_results_per_epoch=list(storing_logger.to_metrics_dicts(prefix_filter=TRAIN_PREFIX).values()),
val_results_per_epoch=list(storing_logger.to_metrics_dicts(prefix_filter=VALIDATION_PREFIX).values()),
train_diagnostics=lightning_model.train_diagnostics,
val_diagnostics=lightning_model.val_diagnostics,
optimal_temperature_scale_values_per_checkpoint_epoch=[]
)
aggregate_and_create_subject_metrics_file(container.outputs_folder)
logging.info("Finished training")
@ -272,20 +267,20 @@ def model_train(config: ModelConfigBase,
# checkpoints correctly.
checkpoint_handler.additional_training_done()
# Upload visualization directory to AML run context to be able to see it
# in the Azure UI.
if config.max_batch_grad_cam > 0 and config.visualization_folder.exists():
RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER, path=str(config.visualization_folder))
# Upload visualization directory to AML run context to be able to see it in the Azure UI.
if isinstance(container, InnerEyeContainer):
if container.config.max_batch_grad_cam > 0 and container.visualization_folder.exists():
RUN_CONTEXT.upload_folder(name=VISUALIZATION_FOLDER, path=str(container.visualization_folder))
if resource_monitor:
logging.info("Shutting down the resource monitor process.")
if not config.is_offline_run:
if is_azureml_run:
for gpu_name, metrics_per_gpu in resource_monitor.read_aggregate_metrics().items():
# Log as a table, with GPU being the first column
RUN_CONTEXT.log_row("GPU utilization", GPU=gpu_name, **metrics_per_gpu)
resource_monitor.kill()
return model_training_results
return trainer, storing_logger
def aggregate_and_create_subject_metrics_file(outputs_folder: Path) -> None:
@ -298,16 +293,15 @@ def aggregate_and_create_subject_metrics_file(outputs_folder: Path) -> None:
for mode in [ModelExecutionMode.TRAIN, ModelExecutionMode.VAL]:
temp_files = (outputs_folder / mode.value).rglob(SUBJECT_OUTPUT_PER_RANK_PREFIX + "*")
result_file = outputs_folder / mode.value / SUBJECT_METRICS_FILE_NAME
result_file = result_file.open("a")
for i, file in enumerate(temp_files):
temp_file_contents = file.read_text()
if i == 0:
# Copy the first file as-is, including the first line with the column headers
result_file.write(temp_file_contents)
else:
# For all files but the first one, cut off the header line.
result_file.write(os.linesep + os.linesep.join(temp_file_contents.splitlines()[1:]))
result_file.close()
with result_file.open("a") as f:
for i, file in enumerate(temp_files):
temp_file_contents = file.read_text()
if i == 0:
# Copy the first file as-is, including the first line with the column headers
f.write(temp_file_contents)
else:
# For all files but the first one, cut off the header line.
f.write(os.linesep + os.linesep.join(temp_file_contents.splitlines()[1:]))
class InnerEyeDDPPlugin(DDPPlugin):

Просмотреть файл

@ -21,7 +21,7 @@ from InnerEye.ML.dataset.full_image_dataset import load_dataset_sources
from InnerEye.ML.deep_learning_config import ARGS_TXT
from InnerEye.ML.photometric_normalization import PhotometricNormalization
from InnerEye.ML.run_ml import MLRunner
from InnerEye.ML.utils.config_util import ModelConfigLoader
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.utils.io_util import load_images_from_dataset_source
@ -73,7 +73,7 @@ def main(yaml_file_path: Path) -> None:
In addition, the arguments '--image_channel' and '--gt_channel' must be specified (see below).
"""
config, runner_config, args = get_configs(SegmentationModelBase(should_validate=False), yaml_file_path)
local_dataset = MLRunner(config, runner_config).mount_or_download_dataset()
local_dataset = MLRunner(config, azure_config=runner_config).mount_or_download_dataset()
assert local_dataset is not None
dataframe = pd.read_csv(local_dataset / DATASET_CSV_FILE_NAME)
normalizer_config = NormalizeAndVisualizeConfig(**args)

Просмотреть файл

@ -449,7 +449,7 @@ class InferenceBatch(CTImagesMaskedBatch):
@inbatch_parallel(init='indices', post='_post_custom_components', target='threads')
def set_component(self, batch_idx: int, component: InferenceBatch.Components, data: np.ndarray) \
-> Dict[InferenceBatch.Components, Any]:
-> Dict[str, Any]:
logging.debug("Updated data in pipeline component: {}, for batch: {}.".format(component.value, batch_idx))
return {
component.value: {'type': component.value, 'data': data}

Просмотреть файл

@ -16,6 +16,9 @@ from azureml._restclient.constants import RunStatus
from azureml.core import Environment, Run
from azureml.core.model import Model
from azureml.data import FileDataset
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.utilities.cloud_io import load as pl_load
from torch.utils.data import DataLoader
from InnerEye.Azure import azure_util
from InnerEye.Azure.azure_config import AzureConfig
@ -23,30 +26,33 @@ from InnerEye.Azure.azure_runner import ENVIRONMENT_VERSION, INPUT_DATA_KEY, get
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, \
DEFAULT_CROSS_VALIDATION_SPLIT_INDEX, EFFECTIVE_RANDOM_SEED_KEY_NAME, IS_ENSEMBLE_KEY_NAME, \
MODEL_ID_KEY_NAME, PARENT_RUN_CONTEXT, PARENT_RUN_ID_KEY_NAME, RUN_CONTEXT, RUN_RECOVERY_FROM_ID_KEY_NAME, \
RUN_RECOVERY_ID_KEY_NAME, create_run_recovery_id, get_results_blob_path, merge_conda_files
RUN_RECOVERY_ID_KEY_NAME, create_run_recovery_id, is_offline_run_context, \
merge_conda_files
from InnerEye.Common import fixed_paths
from InnerEye.Common.build_config import ExperimentResultLocation, build_information_to_dot_net_json_file
from InnerEye.Common.common_util import BASELINE_COMPARISONS_FOLDER, BASELINE_WILCOXON_RESULTS_FILE, \
CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME, FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, ModelProcessing, \
CROSSVAL_RESULTS_FOLDER, ENSEMBLE_SPLIT_NAME, FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, \
ModelProcessing, \
OTHER_RUNS_SUBDIR_NAME, SCATTERPLOTS_SUBDIR_NAME, SUBJECT_METRICS_FILE_NAME, \
get_best_epoch_results_path, is_windows, logging_section, print_exception, remove_file_or_directory
from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME, PYTHON_ENVIRONMENT_NAME
change_working_directory, get_best_epoch_results_path, is_windows, logging_section, logging_to_file, \
print_exception, remove_file_or_directory
from InnerEye.Common.fixed_paths import INNEREYE_PACKAGE_NAME, LOG_FILE_NAME, PYTHON_ENVIRONMENT_NAME
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, FINAL_ENSEMBLE_MODEL_FOLDER, FINAL_MODEL_FOLDER, \
ModelCategory, MultiprocessingStartMethod
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, DeepLearningConfig, FINAL_ENSEMBLE_MODEL_FOLDER, \
FINAL_MODEL_FOLDER, ModelCategory, MultiprocessingStartMethod
from InnerEye.ML.lightning_base import InnerEyeContainer
from InnerEye.ML.lightning_container import InnerEyeInference, LightningContainer
from InnerEye.ML.metrics import InferenceMetrics, InferenceMetricsForSegmentation
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.model_inference_config import ModelInferenceConfig
from InnerEye.ML.model_testing import model_test
from InnerEye.ML.model_training import model_train
from InnerEye.ML.reports.notebook_report import get_ipynb_report_name, generate_classification_crossval_notebook, \
from InnerEye.ML.model_training import create_lightning_trainer, model_train
from InnerEye.ML.reports.notebook_report import generate_classification_crossval_notebook, \
generate_classification_multilabel_notebook, generate_classification_notebook, generate_segmentation_notebook, \
reports_folder
get_ipynb_report_name, reports_folder
from InnerEye.ML.runner import ModelDeploymentHookSignature, PostCrossValidationHookSignature, get_all_environment_files
from InnerEye.ML.scalar_config import ScalarModelBase
from InnerEye.ML.sequence_config import SequenceModelBase
from InnerEye.ML.utils import ml_util
from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
from InnerEye.ML.visualizers import activation_maps
from InnerEye.ML.visualizers.plot_cross_validation import \
@ -78,7 +84,8 @@ def download_dataset(azure_dataset_id: str,
contains a dataset csv file, no download is started.
:param azure_dataset_id: The name of a dataset that is registered in the AzureML workspace.
:param target_folder: The folder in which to download the dataset from Azure.
:param dataset_csv: Name of the csv file describing the dataset.
:param dataset_csv: Name of the csv file describing the dataset. This is only used to check if the dataset has been
downloaded already.
:param azure_config: All Azure-related configuration options.
:return: A path on the local machine that contains the dataset.
"""
@ -88,11 +95,18 @@ def download_dataset(azure_dataset_id: str,
raise ValueError(f"Expected to get a FileDataset, but got {type(azure_dataset)}")
# The downloaded dataset may already exist from a previous run.
expected_dataset_path = target_folder / azure_dataset_id
expected_dataset_file = expected_dataset_path / dataset_csv
logging.info(f"Model training will use dataset '{azure_dataset_id}' in Azure.")
if expected_dataset_path.is_dir() and expected_dataset_file.is_file():
logging.info(f"The dataset appears to be downloaded already in {expected_dataset_path}. Skipping.")
return expected_dataset_path
if expected_dataset_path.is_dir():
if dataset_csv:
if (expected_dataset_path / dataset_csv).is_file():
logging.info(f"The file {dataset_csv} is already downloaded in {expected_dataset_path}. Skipping.")
return expected_dataset_path
else:
existing_files = sum(1 for _ in expected_dataset_path.rglob("*"))
if existing_files > 1:
logging.info(f"There are already {existing_files} files in {expected_dataset_path}. Skipping.")
return expected_dataset_path
logging.info("Starting to download the dataset - WARNING, this could take very long!")
with logging_section("Downloading dataset"):
t0 = time.perf_counter()
@ -121,15 +135,20 @@ def log_metrics(val_metrics: Optional[InferenceMetricsForSegmentation],
class MLRunner:
def __init__(self,
model_config: ModelConfigBase,
model_config: Optional[DeepLearningConfig] = None,
container: Optional[LightningContainer] = None,
azure_config: Optional[AzureConfig] = None,
project_root: Optional[Path] = None,
post_cross_validation_hook: Optional[PostCrossValidationHookSignature] = None,
model_deployment_hook: Optional[ModelDeploymentHookSignature] = None) -> None:
model_deployment_hook: Optional[ModelDeploymentHookSignature] = None,
output_subfolder: str = "") -> None:
"""
Driver class to run an ML experiment. Note that the project root argument MUST be supplied when using InnerEye
as a package!
:param model_config: Model related configurations
:param model_config: If None, run the training as per the `container` argument (bring-your-own-model). If not
None, this is the model configuration for a built-in InnerEye model.
:param container: The LightningContainer object to use for training. If None, assume that the training is
for a built-in InnerEye model.
:param azure_config: Azure related configurations
:param project_root: Project root. This should only be omitted if calling run_ml from the test suite. Supplying
it is crucial when using InnerEye as a package or submodule!
@ -138,50 +157,112 @@ class MLRunner:
:param model_deployment_hook: an optional function for deploying a model in an application-specific way.
If present, it should take a model config (SegmentationModelBase), an AzureConfig, and an AzureML
Model as arguments, and return an optional Path and a further object of any type.
:param output_subfolder: If provided, the output folder structure will have an additional subfolder,
when running outside AzureML.
"""
if model_config is not None and container is not None:
raise ValueError("Only one of the two arguments 'model_config', 'container' must be provided.")
self.model_config = model_config
if container is None:
assert isinstance(model_config, ModelConfigBase), \
"When using a built-in InnerEye model, the configuration should be an instance of ModelConfigBase"
container = InnerEyeContainer(model_config)
self.container = container
self.azure_config: AzureConfig = azure_config or AzureConfig()
self.project_root: Path = project_root or fixed_paths.repository_root_directory()
self.post_cross_validation_hook = post_cross_validation_hook
self.model_deployment_hook = model_deployment_hook
self.output_subfolder = output_subfolder
self._has_setup_run = False
def setup(self, use_mount_or_download_dataset: bool = True) -> None:
"""
If the present object is using one of the InnerEye built-in models, create a (fake) container for it
and call the setup method. It sets the random seeds, and then creates the actual Lightning modules.
:param use_mount_or_download_dataset: If True, try to download or mount the dataset that is used by the model.
If False, assume that the dataset is already available (this should only be used for unit tests).
"""
if self._has_setup_run:
return
if (not self.azure_config.only_register_model) and use_mount_or_download_dataset:
# Set local_dataset to the mounted path specified in azure_runner.py, if any, or download it if that fails
# and config.local_dataset was not already set.
# This must happen before container setup because that could already read datasets.
self.container.local_dataset = self.mount_or_download_dataset()
# Ensure that we use fixed seeds before initializing the PyTorch models
seed_everything(self.container.get_effective_random_seed())
# Creating the folder structure must happen before the LightningModule is created, because the output
# parameters of the container will be copied into the module.
if self.output_subfolder:
# This codepath is only executed for cross validation runs outside AzureML: The folder structure
# uses an existing folder structure set by the caller, and just a subfolder is added.
self.container.file_system_config = self.container.file_system_config.add_subfolder(self.output_subfolder)
else:
self.container.create_filesystem(self.project_root)
# A lot of the code for the built-in InnerEye models expects the output paths directly in the config files.
if isinstance(self.container, InnerEyeContainer):
self.container.config.local_dataset = self.container.local_dataset
self.container.config.file_system_config = self.container.file_system_config
self.container.setup()
self.container.create_lightning_module_and_store()
self._has_setup_run = True
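A hedged usage sketch of the constructor and setup() method above, driving MLRunner directly with a bring-your-own Lightning container. MyContainer and its import path stand in for a user-defined LightningContainer subclass and are purely hypothetical.

from pathlib import Path

from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.ML.run_ml import MLRunner
from my_project.configs import MyContainer  # hypothetical user-defined LightningContainer

runner = MLRunner(model_config=None,
                  container=MyContainer(),
                  azure_config=AzureConfig(),
                  project_root=Path.cwd())
runner.setup()  # mounts/downloads the dataset, fixes random seeds, creates the Lightning module
runner.run()    # training, inference and report generation as described below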
@property
def is_offline_run(self) -> bool:
"""
Returns True if the present run is outside of AzureML, and False if it is inside of AzureML.
:return:
"""
return is_offline_run_context(RUN_CONTEXT)
@property
def innereye_config(self) -> DeepLearningConfig:
"""
Gets the model configuration object for all built-in InnerEye models. Raises an exception if the present
object trains a LightningContainer that is not a built-in InnerEye model.
"""
if self.model_config is None or not isinstance(self.model_config, DeepLearningConfig):
raise ValueError("This property should only be used with built-in InnerEye models, but model "
f"configuration is of type {type(self.model_config)}")
return self.model_config
def start_logging_to_file(self) -> None:
if self.container is None:
self.setup()
logging_to_file(self.container.logs_folder / LOG_FILE_NAME)
def is_offline_cross_val_parent_run(self) -> bool:
"""
Returns true if the current run is an offline run with cross validation splits > 0
and cross_validation_split_index == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX (ie: a parent)
"""
return self.model_config.cross_validation_split_index == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX and \
self.model_config.perform_cross_validation and self.model_config.is_offline_run
return self.container.cross_validation_split_index == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX and \
self.container.perform_cross_validation and self.is_offline_run
def spawn_offline_cross_val_classification_child_runs(self) -> None:
"""
Trains and Tests k models based on their respective data splits sequentially.
Stores the results on the Validation set to the outputs directory of the parent run.
"""
_config = self.model_config
assert isinstance(_config, ScalarModelBase)
parent_run_file_system = _config.file_system_config
assert isinstance(self.innereye_config, ScalarModelBase)
def _spawn_run(cross_val_split_index: int) -> None:
split_model_config = copy.deepcopy(_config)
assert isinstance(split_model_config, ScalarModelBase)
split_model_config.cross_validation_split_index = cross_val_split_index
_local_split_folder_name = str(cross_val_split_index)
split_model_config.file_system_config = parent_run_file_system.add_subfolder(_local_split_folder_name)
split_config = copy.deepcopy(self.innereye_config)
split_config.cross_validation_split_index = cross_val_split_index
logging.info(f"Running model train and test on cross validation split: {cross_val_split_index}")
split_ml_runner = MLRunner(model_config=split_model_config,
split_ml_runner = MLRunner(model_config=split_config,
container=None,
azure_config=self.azure_config,
project_root=self.project_root,
post_cross_validation_hook=self.post_cross_validation_hook,
model_deployment_hook=self.model_deployment_hook)
model_deployment_hook=self.model_deployment_hook,
output_subfolder=str(cross_val_split_index))
split_ml_runner.run()
for i in range(_config.number_of_cross_validation_splits):
for i in range(self.innereye_config.number_of_cross_validation_splits):
_spawn_run(i)
config_and_files = get_config_and_results_for_offline_runs(self.model_config)
config_and_files = get_config_and_results_for_offline_runs(self.innereye_config)
plot_cross_validation_from_files(config_and_files, Path(config_and_files.config.outputs_directory))
def set_run_tags_from_parent(self) -> None:
@ -208,8 +289,8 @@ class MLRunner:
]
new_tags = {tag: run_tags_parent.get(tag, "") for tag in tags_to_copy}
new_tags[RUN_RECOVERY_ID_KEY_NAME] = create_run_recovery_id(run=RUN_CONTEXT)
new_tags[CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY] = str(self.model_config.cross_validation_split_index)
new_tags[EFFECTIVE_RANDOM_SEED_KEY_NAME] = str(self.model_config.get_effective_random_seed())
new_tags[CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY] = str(self.container.cross_validation_split_index)
new_tags[EFFECTIVE_RANDOM_SEED_KEY_NAME] = str(self.container.get_effective_random_seed())
RUN_CONTEXT.set_tags(new_tags)
def run(self) -> None:
@ -217,66 +298,113 @@ class MLRunner:
Driver function to run an ML experiment. If an offline cross validation run is requested, then
this function is recursively called for each cross validation split.
"""
self.setup()
if self.is_offline_cross_val_parent_run():
if self.model_config.is_segmentation_model:
if self.innereye_config.is_segmentation_model:
raise NotImplementedError("Offline cross validation is only supported for classification models.")
self.spawn_offline_cross_val_classification_child_runs()
return
# Get the AzureML context in which the script is running
if not self.model_config.is_offline_run and PARENT_RUN_CONTEXT is not None:
if not self.is_offline_run and PARENT_RUN_CONTEXT is not None:
logging.info("Setting tags from parent run.")
self.set_run_tags_from_parent()
self.save_build_info_for_dotnet_consumers()
# Set data loader start method
self.set_multiprocessing_start_method()
# configure recovery container if provided
checkpoint_handler = CheckpointHandler(model_config=self.model_config,
checkpoint_handler = CheckpointHandler(container=self.container,
azure_config=self.azure_config,
project_root=self.project_root,
run_context=RUN_CONTEXT)
checkpoint_handler.download_recovery_checkpoints_or_weights()
trainer: Optional[Trainer] = None
# do training and inference, unless the "only register" switch is set (which requires a run_recovery
# to be valid).
if not self.azure_config.only_register_model:
# Set local_dataset to the mounted path specified in azure_runner.py, if any, or download it if that fails
# and config.local_dataset was not already set.
self.model_config.local_dataset = self.mount_or_download_dataset()
# Check for existing dataset.csv file in the correct locations. Skip that if a dataset has already been
# loaded (typically only during tests)
if self.model_config.dataset_data_frame is None:
assert self.model_config.local_dataset is not None
ml_util.validate_dataset_paths(
self.model_config.local_dataset,
self.model_config.dataset_csv)
# train a new model if required
if self.azure_config.train:
with logging_section("Model training"):
model_train(self.model_config, checkpoint_handler, num_nodes=self.azure_config.num_nodes)
else:
self.model_config.write_dataset_files()
trainer, _ = model_train(checkpoint_handler,
container=self.container,
num_nodes=self.azure_config.num_nodes)
# log the number of epochs used for model training
RUN_CONTEXT.log(name="Train epochs", value=self.container.num_epochs)
elif isinstance(self.container, InnerEyeContainer):
self.innereye_config.write_dataset_files()
self.create_activation_maps()
# log the number of epochs used for model training
RUN_CONTEXT.log(name="Train epochs", value=self.model_config.num_epochs)
if isinstance(self.container, InnerEyeContainer):
# Inference for the InnerEye built-in models
# We specify the ModelProcessing as DEFAULT here even if the run_recovery points to an ensemble run, because
# the current run is a single one. See the documentation of ModelProcessing for more details.
self.run_inference_and_register_model(checkpoint_handler, ModelProcessing.DEFAULT)
# We specify the ModelProcessing as DEFAULT here even if the run_recovery points to an ensemble run, because
# the current run is a single one. See the documentation of ModelProcessing for more details.
self.run_inference_and_register_model(checkpoint_handler, ModelProcessing.DEFAULT)
if self.container.generate_report:
self.generate_report(ModelProcessing.DEFAULT)
if self.model_config.generate_report:
self.generate_report(ModelProcessing.DEFAULT)
# If this is a cross validation run, and the present run is child run 0, then wait for the sibling runs,
# build the ensemble model, and write a report for that.
if self.container.number_of_cross_validation_splits > 0:
should_wait_for_other_child_runs = (not self.is_offline_run) and \
self.container.cross_validation_split_index == 0
if should_wait_for_other_child_runs:
self.wait_for_runs_to_finish()
self.create_ensemble_model_and_run_inference()
else:
# Inference for all models that are specified via LightningContainers.
self.run_inference_for_lightning_models(checkpoint_handler.get_checkpoints_to_test(), trainer)
# We can't enforce that files are written to the output folder, hence change the working directory manually
with change_working_directory(self.container.outputs_folder):
self.container.create_report()
# If this is a cross validation run, and the present run is child run 0, then wait for the sibling runs,
# build the ensemble model, and write a report for that.
if self.model_config.number_of_cross_validation_splits > 0:
if self.model_config.should_wait_for_other_cross_val_child_runs():
self.wait_for_runs_to_finish()
self.create_ensemble_model()
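The branches above wrap report creation and inference in change_working_directory so that models with no notion of an output folder still write their files into outputs/. The helper itself is not part of this diff; a minimal context manager with the behaviour relied upon here could look as follows.

import os
from contextlib import contextmanager
from pathlib import Path
from typing import Generator

@contextmanager
def change_working_directory(path: Path) -> Generator[None, None, None]:
    """Temporarily change the working directory, restoring it even if the body raises."""
    old_cwd = os.getcwd()
    os.chdir(str(path))
    try:
        yield
    finally:
        os.chdir(old_cwd)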
def run_inference_for_lightning_models(self, checkpoint_paths: List[Path], trainer: Optional[Trainer]) -> None:
"""
Run inference on the test set for all models that are specified via a LightningContainer.
"""
if len(checkpoint_paths) != 1:
raise ValueError(f"This method expects exactly 1 checkpoint for inference, but got {len(checkpoint_paths)}")
lightning_model = self.container.model
# Run the customized inference code only if the "inference" step has been overridden
if isinstance(lightning_model, InnerEyeInference) and \
type(lightning_model).inference_step != InnerEyeInference.inference_step:
logging.info("Running inference via the InnerEyeInference.inference_step method")
# Read the data modules before changing the working directory, in case the code relies on relative paths
data = self.container.get_inference_data_module()
dataloaders: List[Tuple[DataLoader, ModelExecutionMode]] = []
if self.container.perform_validation_and_test_set_inference:
dataloaders.append((data.test_dataloader(), ModelExecutionMode.TEST)) # type: ignore
dataloaders.append((data.val_dataloader(), ModelExecutionMode.VAL)) # type: ignore
if self.container.perform_training_set_inference:
dataloaders.append((data.train_dataloader(), ModelExecutionMode.TRAIN)) # type: ignore
map_location = "gpu" if self.container.use_gpu else "cpu"
checkpoint = pl_load(checkpoint_paths[0], map_location=map_location)
lightning_model.load_state_dict(checkpoint['state_dict'])
lightning_model.eval()
with change_working_directory(self.container.outputs_folder):
lightning_model.on_inference_start()
for loader, split in dataloaders:
logging.info(f"Starting inference on {split.value} set")
lightning_model.on_inference_epoch_start(dataset_split=split, is_ensemble_model=False)
for batch_idx, item in enumerate(loader):
model_output = lightning_model.forward(item[0])
lightning_model.inference_step(item, batch_idx, model_output=model_output)
lightning_model.on_inference_epoch_end()
lightning_model.on_inference_end()
elif type(lightning_model).test_step != LightningModule.test_step:
# Run Lightning's built-in test procedure if the `test_step` method has been overridden
logging.info("Running inference via the LightningModule.test_step method")
trainer = trainer or create_lightning_trainer(self.container)[0]
# When training models that are not built-in InnerEye models, we have no guarantee that they write
# files to the right folder. Best guess is to change the current working directory to where files should go.
with change_working_directory(self.container.outputs_folder):
trainer.test(self.container.model,
test_dataloaders=self.container.get_data_module().test_dataloader(),
ckpt_path=str(checkpoint_paths[0]))
logging.info("Finished inference.")
else:
logging.warning("None of the suitable test methods is overridden. Skipping inference completely.")
def run_inference_and_register_model(self, checkpoint_handler: CheckpointHandler,
model_proc: ModelProcessing) -> None:
@ -311,17 +439,17 @@ class MLRunner:
model (from the run we recovered) should already have been registered, so we should only
do so if this run is specifically for that purpose.
"""
if self.model_config.is_offline_run:
if self.is_offline_run:
return False
return self.azure_config.train or self.azure_config.only_register_model
def create_activation_maps(self) -> None:
if self.model_config.is_segmentation_model and self.model_config.activation_map_layers is not None:
if self.innereye_config.is_segmentation_model and self.innereye_config.activation_map_layers is not None:
logging.info("Extracting activation maps for layer")
activation_maps.extract_activation_maps(self.model_config)
activation_maps.extract_activation_maps(self.innereye_config) # type: ignore
logging.info("Successfully extracted and saved activation maps")
def mount_or_download_dataset(self) -> Path:
def mount_or_download_dataset(self) -> Optional[Path]:
"""
Makes the dataset that the model uses available on the executing machine. If the present training run is outside
of AzureML, it expects that either the model has a `local_dataset` field set, in which case no action will be
@ -331,53 +459,46 @@ class MLRunner:
mounted or downloaded.
Returns the path of the dataset on the executing machine.
"""
azure_dataset_id = self.model_config.azure_dataset_id
if self.model_config.is_offline_run:
azure_dataset_id = self.container.azure_dataset_id
local_dataset = self.container.local_dataset
if self.is_offline_run:
# A dataset, either local or in Azure, is required for the built-in InnerEye models. When models are
# specified via a LightningContainer, these dataset fields are optional, because the container datasets
# could be downloaded even from the web.
is_dataset_required = isinstance(self.container, InnerEyeContainer)
# The present run is outside of AzureML: If local_dataset is set, use that as the path to the data.
# Otherwise, download the dataset specified by the azure_dataset_id
local_dataset = self.model_config.local_dataset
if (not azure_dataset_id) and (local_dataset is None):
raise ValueError("The model must contain either local_dataset or azure_dataset_id.")
if is_dataset_required:
if (not azure_dataset_id) and (local_dataset is None):
raise ValueError("The model must contain either local_dataset or azure_dataset_id.")
if local_dataset:
expected_dir = Path(local_dataset)
if not expected_dir.is_dir():
raise FileNotFoundError(f"The model uses a dataset in {expected_dir}, but that does not exist.")
logging.info(f"Model training will use the local dataset provided in {expected_dir}")
return expected_dir
return download_dataset(azure_dataset_id=azure_dataset_id,
target_folder=self.project_root / fixed_paths.DATASETS_DIR_NAME,
dataset_csv=self.model_config.dataset_csv,
azure_config=self.azure_config)
if azure_dataset_id:
dataset_csv = ""
if isinstance(self.model_config, DeepLearningConfig):
dataset_csv = self.model_config.dataset_csv
return download_dataset(azure_dataset_id=azure_dataset_id,
target_folder=self.project_root / fixed_paths.DATASETS_DIR_NAME,
dataset_csv=dataset_csv, azure_config=self.azure_config)
return None
# Inside of AzureML, datasets can be either mounted or downloaded.
if not azure_dataset_id:
raise ValueError("The model must contain azure_dataset_id for running on AML")
mounted = try_to_mount_input_dataset()
if not mounted:
raise ValueError("Unable to mount or download input dataset.")
return mounted
def save_build_info_for_dotnet_consumers(self) -> None:
results_container = get_results_blob_path(RUN_CONTEXT.id)
result_location = ExperimentResultLocation(
azure_job_name=RUN_CONTEXT.id,
dataset_folder=self.model_config.azure_dataset_id,
results_container_name=results_container,
commandline_overrides=str(self.model_config.overrides),
dataset_uri=self.model_config.azure_dataset_id,
results_uri="",
)
# Fill in the missing information in the build config (everything that is not available at the time
# of evoking the runner), and then save in the format needed for the .NET consumers
build_information_to_dot_net_json_file(
self.azure_config, result_location, folder=self.model_config.outputs_folder)
if azure_dataset_id:
mounted = try_to_mount_input_dataset()
if not mounted:
raise ValueError("Unable to mount or download input dataset.")
return mounted
return None
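The comment above notes that LightningContainer models may not need local_dataset or azure_dataset_id at all, because their data module can fetch data itself. A sketch of such a data module follows; the URL and file name are placeholders and not part of the toolbox.

import urllib.request
from pathlib import Path

import torch
from pytorch_lightning import LightningDataModule
from torch.utils.data import DataLoader, TensorDataset


class SelfDownloadingDataModule(LightningDataModule):
    def __init__(self, root: Path = Path("downloaded_data")) -> None:
        super().__init__()
        self.root = root

    def prepare_data(self) -> None:
        # Runs once per node: fetch the raw data from a public location.
        self.root.mkdir(parents=True, exist_ok=True)
        target = self.root / "data.csv"
        if not target.exists():
            urllib.request.urlretrieve("https://example.com/data.csv", target)  # placeholder URL

    def train_dataloader(self) -> DataLoader:
        # Parsing of data.csv is omitted; synthetic tensors keep the sketch short.
        features, labels = torch.rand(32, 10), torch.rand(32, 1)
        return DataLoader(TensorDataset(features, labels), batch_size=8)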
def set_multiprocessing_start_method(self) -> None:
"""
Set the (PyTorch) multiprocessing start method.
"""
method = self.model_config.multiprocessing_start_method
method = self.container.multiprocessing_start_method
if is_windows():
if method != MultiprocessingStartMethod.spawn:
logging.warning(f"Cannot set multiprocessing start method to '{method.name}' "
@ -402,7 +523,7 @@ class MLRunner:
logging.warning("Abandoning model registration - no valid checkpoint paths found")
return
if not self.model_config.is_offline_run:
if not self.is_offline_run:
split_index = RUN_CONTEXT.get_tags().get(CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY, None)
if split_index == DEFAULT_CROSS_VALIDATION_SPLIT_INDEX:
RUN_CONTEXT.tag(IS_ENSEMBLE_KEY_NAME, str(model_proc == ModelProcessing.ENSEMBLE_CREATION))
@ -447,7 +568,7 @@ class MLRunner:
model_subfolder = FINAL_MODEL_FOLDER if model_proc == ModelProcessing.DEFAULT else FINAL_ENSEMBLE_MODEL_FOLDER
# This is the path under which AzureML will know the files: Either "final_model" or "final_ensemble_model"
artifacts_path = model_subfolder
final_model_folder = self.model_config.file_system_config.run_folder / model_subfolder
final_model_folder = self.innereye_config.file_system_config.run_folder / model_subfolder
# Copy all code from project and InnerEye into the model folder, and copy over checkpoints.
# This increases the size of the data stored for the run. The other option would be to store all checkpoints
# right in the final model folder - however, then that would also contain any other checkpoints that the model
@ -467,7 +588,7 @@ class MLRunner:
# When registering the model on the run, we need to provide a relative path inside of the run's output
# folder in `model_path`
model = run_to_register_on.register_model(
model_name=self.model_config.model_name,
model_name=self.innereye_config.model_name,
model_path=artifacts_path,
tags=RUN_CONTEXT.get_tags(),
description=model_description
@ -487,9 +608,9 @@ class MLRunner:
logging.info(f"Registered {model_proc.value} model: {model.name}, with Id: {model.id}")
# create a version of the model for deployment if the hook is provided
if self.model_deployment_hook is not None:
assert isinstance(self.model_config, SegmentationModelBase)
assert isinstance(self.innereye_config, SegmentationModelBase)
deployment_result = self.model_deployment_hook(
self.model_config, self.azure_config, model, model_proc)
self.innereye_config, self.azure_config, model, model_proc)
return model, deployment_result
@staticmethod
@ -539,17 +660,17 @@ class MLRunner:
try:
# Checkpoints live in a folder structure in the checkpoint folder. There can be multiple of
# them, with identical names, coming from an ensemble run. Hence, preserve their folder structure.
checkpoint_relative = checkpoint.relative_to(self.model_config.checkpoint_folder)
checkpoint_relative = checkpoint.relative_to(self.innereye_config.checkpoint_folder)
except ValueError:
raise ValueError(f"Checkpoint file {checkpoint} was expected to be in a subfolder of "
f"{self.model_config.checkpoint_folder}")
f"{self.innereye_config.checkpoint_folder}")
# Checkpoints go into a newly created folder "checkpoints" inside of the model folder
relative_checkpoint_paths.append(str(Path(CHECKPOINT_FOLDER) / checkpoint_relative))
else:
raise ValueError(f"Expected an absolute path to a checkpoint file, but got: {checkpoint}")
model_folder.mkdir(parents=True, exist_ok=True)
model_inference_config = ModelInferenceConfig(model_name=self.model_config.model_name,
model_configs_namespace=self.model_config.__class__.__module__,
model_inference_config = ModelInferenceConfig(model_name=self.innereye_config.model_name,
model_configs_namespace=self.innereye_config.__class__.__module__,
checkpoint_paths=relative_checkpoint_paths)
# Inference configuration must live in the root folder of the registered model
full_path_to_config = model_folder / fixed_paths.MODEL_INFERENCE_JSON_FILE_NAME
@ -591,11 +712,12 @@ class MLRunner:
val_metrics = None
test_metrics = None
config = self.model_config
config = self.innereye_config
def run_model_test(data_split: ModelExecutionMode) -> Optional[InferenceMetrics]:
return model_test(config, data_split=data_split, checkpoint_handler=checkpoint_handler,
return model_test(config, data_split=data_split, checkpoint_handler=checkpoint_handler, # type: ignore
model_proc=model_proc)
if config.perform_validation_and_test_set_inference:
# perform inference on test set
test_metrics = run_model_test(ModelExecutionMode.TEST)
@ -610,7 +732,7 @@ class MLRunner:
# log the metrics to AzureML experiment if possible. When doing ensemble runs, log to the Hyperdrive parent run,
# so that we get the metrics of child run 0 and the ensemble separated.
if config.is_segmentation_model and not config.is_offline_run:
if config.is_segmentation_model and not self.is_offline_run:
run_for_logging = PARENT_RUN_CONTEXT if model_proc.ENSEMBLE_CREATION else RUN_CONTEXT
log_metrics(val_metrics=val_metrics, test_metrics=test_metrics, # type: ignore
train_metrics=train_metrics, run_context=run_for_logging) # type: ignore
@ -633,9 +755,9 @@ class MLRunner:
:return: True if all sibling runs of the current run have finished (they either completed successfully,
or failed). False if any of them is still pending (running or queued).
"""
if (not self.model_config.is_offline_run) \
if (not self.is_offline_run) \
and (azure_util.is_cross_validation_child_run(RUN_CONTEXT)):
n_splits = self.model_config.get_total_number_of_cross_validation_runs()
n_splits = self.innereye_config.get_total_number_of_cross_validation_runs()
child_runs = azure_util.fetch_child_runs(PARENT_RUN_CONTEXT,
expected_number_cross_validation_splits=n_splits)
pending_runs = [x.id for x in child_runs
@ -648,14 +770,14 @@ class MLRunner:
else:
raise NotImplementedError("are_sibling_runs_finished only works for cross validation runs in AzureML.")
def create_ensemble_model(self) -> None:
def create_ensemble_model_and_run_inference(self) -> None:
"""
Create an ensemble model from the results of the sibling runs of the present run. The present run here will
be cross validation child run 0.
"""
assert PARENT_RUN_CONTEXT, "This function should only be called in a Hyperdrive run"
with logging_section("Downloading checkpoints from sibling runs"):
checkpoint_handler = CheckpointHandler(model_config=self.model_config,
checkpoint_handler = CheckpointHandler(container=self.container,
azure_config=self.azure_config,
project_root=self.project_root,
run_context=PARENT_RUN_CONTEXT)
@ -665,13 +787,13 @@ class MLRunner:
model_proc=ModelProcessing.ENSEMBLE_CREATION)
crossval_dir = self.plot_cross_validation_and_upload_results()
if self.model_config.generate_report:
if self.innereye_config.generate_report:
self.generate_report(ModelProcessing.ENSEMBLE_CREATION)
# CrossValResults should have been uploaded to the parent run, so we don't need it here.
remove_file_or_directory(crossval_dir)
# We can also remove OTHER_RUNS under the root, as it is no longer useful and only contains copies of files
# available elsewhere. However, first we need to upload relevant parts of OTHER_RUNS/ENSEMBLE.
other_runs_dir = self.model_config.outputs_folder / OTHER_RUNS_SUBDIR_NAME
other_runs_dir = self.innereye_config.outputs_folder / OTHER_RUNS_SUBDIR_NAME
other_runs_ensemble_dir = other_runs_dir / ENSEMBLE_SPLIT_NAME
if PARENT_RUN_CONTEXT is not None:
if other_runs_ensemble_dir.exists():
@ -690,9 +812,9 @@ class MLRunner:
from InnerEye.ML.visualizers.plot_cross_validation import crossval_config_from_model_config, \
plot_cross_validation, unroll_aggregate_metrics
# perform aggregation as cross val splits are now ready
plot_crossval_config = crossval_config_from_model_config(self.model_config)
plot_crossval_config = crossval_config_from_model_config(self.innereye_config)
plot_crossval_config.run_recovery_id = PARENT_RUN_CONTEXT.tags[RUN_RECOVERY_ID_KEY_NAME]
plot_crossval_config.outputs_directory = self.model_config.outputs_folder
plot_crossval_config.outputs_directory = self.innereye_config.outputs_folder
plot_crossval_config.azure_config = self.azure_config
cross_val_results_root = plot_cross_validation(plot_crossval_config)
if isinstance(self.model_config, ScalarModelBase) and not isinstance(self.model_config, SequenceModelBase):
@ -701,10 +823,10 @@ class MLRunner:
full_metrics_csv = cross_val_results_root / FULL_METRICS_DATAFRAME_FILE
generate_classification_crossval_notebook(notebook_path, self.model_config, full_metrics_csv)
if self.post_cross_validation_hook:
self.post_cross_validation_hook(self.model_config, cross_val_results_root)
self.post_cross_validation_hook(self.innereye_config, cross_val_results_root)
# upload results to the parent run's outputs so that the files are visible inside the AzureML UI.
PARENT_RUN_CONTEXT.upload_folder(name=CROSSVAL_RESULTS_FOLDER, path=str(cross_val_results_root))
if self.model_config.is_scalar_model:
if self.innereye_config.is_scalar_model:
try:
aggregates = pd.read_csv(cross_val_results_root / METRICS_AGGREGATES_FILE)
unrolled_aggregate_metrics = unroll_aggregate_metrics(aggregates)
@ -715,7 +837,7 @@ class MLRunner:
return cross_val_results_root
def generate_report(self, model_proc: ModelProcessing) -> None:
config = self.model_config
config = self.innereye_config
if config.model_category not in [ModelCategory.Segmentation, ModelCategory.Classification]:
logging.info(f"No reporting available for a model with category {config.model_category}")
return
@ -753,7 +875,8 @@ class MLRunner:
if len(config.class_names) > 1:
generate_classification_multilabel_notebook(
result_notebook=reports_dir / get_ipynb_report_name(f"{config.model_category.value}_multilabel"),
result_notebook=reports_dir / get_ipynb_report_name(
f"{config.model_category.value}_multilabel"),
config=config,
train_metrics=path_to_best_epoch_train,
val_metrics=path_to_best_epoch_val,

View file

@ -4,50 +4,62 @@
# ------------------------------------------------------------------------------------------
import os
import sys
import warnings
from pathlib import Path
# Suppress all errors here because the imports placed after code cause loads of warnings. We can't suppress
# just those individual warnings.
# flake8: noqa
# Workaround for an issue with how AzureML and Pytorch Lightning interact: When spawning additional processes for DDP,
# the working directory is not correctly picked up in sys.path
print("Starting InnerEye runner.")
print(f"Starting InnerEye runner at {sys.argv[0]}")
innereye_root = Path(__file__).absolute().parent.parent.parent
if (innereye_root / "InnerEye").is_dir():
innereye_root_str = str(innereye_root)
if innereye_root_str not in sys.path:
print(f"Adding to sys.path: {innereye_root_str}")
print(f"Adding InnerEye folder to sys.path: {innereye_root_str}")
sys.path.insert(0, innereye_root_str)
# We change the current working directory before starting the actual training. However, this throws off starting
# the child training threads because sys.argv[0] is a relative path when running in AzureML. Turn that into an absolute
# path.
runner_path = Path(sys.argv[0])
if not runner_path.is_absolute():
sys.argv[0] = str(runner_path.absolute())
import logging
from pathlib import Path
from typing import Any, Callable, List, Optional, Tuple
from typing import Any, Optional, Tuple
from azureml._base_sdk_common import user_agent
from azureml.core import Model, Run
from azureml.core import Run
from InnerEye.Azure import azure_util
from InnerEye.Azure.azure_config import AzureConfig, ParserResult, SourceConfig
from InnerEye.Azure.azure_runner import create_runner_parser, parse_args_and_add_yaml_variables, \
parse_arguments, set_environment_variables_for_multi_node, submit_to_azureml
from InnerEye.Azure.azure_util import is_run_and_child_runs_completed
from InnerEye.Azure.azure_util import get_all_environment_files, is_run_and_child_runs_completed
from InnerEye.Azure.run_pytest import download_pytest_result, run_pytest
from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import FULL_METRICS_DATAFRAME_FILE, METRICS_AGGREGATES_FILE, \
ModelProcessing, disable_logging_to_file, is_linux, logging_to_file, logging_to_stdout, print_exception
disable_logging_to_file, is_linux, logging_to_stdout
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.ML.common import DATASET_CSV_FILE_NAME
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.config import ModelDeploymentHookSignature, PostCrossValidationHookSignature
from InnerEye.ML.deep_learning_config import DeepLearningConfig
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.utils.config_util import ModelConfigLoader
from InnerEye.ML.utils.config_loader import ModelConfigLoader
LOG_FILE_NAME = "stdout.txt"
try:
# This import can fail when the code runs inside the azure_runner.yml Conda environment, that we use
# for the PR builds
from InnerEye.ML.lightning_container import LightningContainer
PostCrossValidationHookSignature = Callable[[ModelConfigBase, Path], None]
ModelDeploymentHookSignature = Callable[[SegmentationModelBase, AzureConfig, Model, ModelProcessing], Any]
has_torch = True
except ModuleNotFoundError as ex:
has_torch = False
def may_initialize_rpdb() -> None:
def initialize_rpdb() -> None:
"""
On Linux only, import and initialize rpdb, to enable remote debugging if necessary.
"""
@ -85,23 +97,10 @@ def suppress_logging_noise() -> None:
os.environ['MKL_THREADING_LAYER'] = 'GNU'
def get_all_environment_files(project_root: Path) -> List[Path]:
"""
Returns a list of all Conda environment files that should be used. This is firstly the InnerEye conda file,
and possibly a second environment.yml file that lives at the project root folder.
:param project_root: The root folder of the code that starts the present training run.
:return: A list with 1 or 2 entries that are conda environment files.
"""
innereye_yaml = fixed_paths.get_environment_yaml_file()
project_yaml = project_root / fixed_paths.ENVIRONMENT_YAML_FILE_NAME
files = [innereye_yaml]
if innereye_yaml != project_yaml:
files.append(project_yaml)
return files
class Runner:
"""
This class contains the high-level logic to start a training run: choose a model configuration by name,
submit to AzureML if needed, or otherwise start the actual training and test loop.
:param project_root: The root folder that contains all of the source code that should be executed.
:param yaml_config_file: The path to the YAML file that contains values to supply into sys.argv.
:param post_cross_validation_hook: A function to call after waiting for completion of cross validation runs.
@ -116,17 +115,17 @@ class Runner:
project_root: Path,
yaml_config_file: Path,
post_cross_validation_hook: Optional[PostCrossValidationHookSignature] = None,
model_deployment_hook: Optional[ModelDeploymentHookSignature] = None,
command_line_args: Optional[List[str]] = None):
model_deployment_hook: Optional[ModelDeploymentHookSignature] = None):
self.project_root = project_root
self.yaml_config_file = yaml_config_file
self.post_cross_validation_hook = post_cross_validation_hook
self.model_deployment_hook = model_deployment_hook
self.command_line_args = command_line_args
# model_config and azure_config are placeholders for now, and are set properly when command line args are
# parsed.
self.model_config: ModelConfigBase = ModelConfigBase(azure_dataset_id="")
self.model_config: Optional[DeepLearningConfig] = None
self.azure_config: AzureConfig = AzureConfig()
# This should be typed as LightningContainer, but we don't always have that imported
self.lightning_container: Any = None
def parse_and_load_model(self) -> ParserResult:
"""
@ -138,47 +137,84 @@ class Runner:
"""
# Create a parser that will understand only the args we need for an AzureConfig
parser1 = create_runner_parser()
parser1_result = parse_args_and_add_yaml_variables(parser1,
yaml_config_file=self.yaml_config_file,
project_root=self.project_root,
args=self.command_line_args,
fail_on_unknown_args=False)
azure_config = AzureConfig(**parser1_result.args)
parser_result = parse_args_and_add_yaml_variables(parser1,
yaml_config_file=self.yaml_config_file,
project_root=self.project_root,
fail_on_unknown_args=False)
azure_config = AzureConfig(**parser_result.args)
azure_config.project_root = self.project_root
self.azure_config = azure_config
self.model_config = None # type: ignore
self.model_config = None
self.lightning_container = None
if not azure_config.model:
raise ValueError("Parameter 'model' needs to be set to tell InnerEye which model to run.")
model_config_loader: ModelConfigLoader = ModelConfigLoader(**parser1_result.args)
# Create the model as per the "model" commandline option
model_config = model_config_loader.create_model_config_from_name(
model_name=azure_config.model
)
# This model will be either a classification model or a segmentation model. Those have different
# fields that could be overridden on the command line. Create a parser that understands the fields we need
# for the actual model type. We feed this parser with the YAML settings and commandline arguments that the
# first parser did not recognize.
parser2 = type(model_config).create_argparser()
parser2_result = parse_arguments(parser2,
settings_from_yaml=parser1_result.unknown_settings_from_yaml,
args=parser1_result.unknown,
fail_on_unknown_args=True)
# Apply the overrides and validate. Overrides can come from either YAML settings or the commandline.
model_config.apply_overrides(parser1_result.unknown_settings_from_yaml)
model_config.apply_overrides(parser2_result.overrides)
model_config.validate()
# Set the file system related configs, they might be affected by the overrides that were applied.
logging.info("Creating the adjusted output folder structure.")
model_config.create_filesystem(self.project_root)
model_config_loader: ModelConfigLoader = ModelConfigLoader(**parser_result.args)
# Create the model as per the "model" commandline option. This can return either a built-in config
# of type DeepLearningConfig, or a LightningContainer.
config_or_container = model_config_loader.create_model_config_from_name(model_name=azure_config.model)
def parse_overrides_and_apply(c: object, previous_parser_result: ParserResult) -> ParserResult:
assert isinstance(c, GenericConfig)
parser = type(c).create_argparser()
# For each parser, feed in the unknown settings from the previous parser. All commandline args should
# be consumed by name, hence fail if there is something that is still unknown.
parser_result = parse_arguments(parser,
settings_from_yaml=previous_parser_result.unknown_settings_from_yaml,
args=previous_parser_result.unknown,
fail_on_unknown_args=True)
# Apply the overrides and validate. Overrides can come from either YAML settings or the commandline.
c.apply_overrides(parser_result.known_settings_from_yaml)
c.apply_overrides(parser_result.overrides)
c.validate()
return parser_result
# Now create a parser that understands overrides at model/container level.
parser_result = parse_overrides_and_apply(config_or_container, parser_result)
if has_torch and isinstance(config_or_container, LightningContainer):
self.lightning_container = config_or_container
elif isinstance(config_or_container, DeepLearningConfig):
# Built-in InnerEye models: A fake container for these models will be created in MLRunner
self.model_config = config_or_container
else:
raise ValueError(f"Don't know how to handle a loaded configuration of type {type(config_or_container)}")
if azure_config.extra_code_directory:
exist = "exists" if Path(azure_config.extra_code_directory).exists() else "does not exist"
logging.info(f"extra_code_directory is {azure_config.extra_code_directory}, which {exist}")
else:
logging.info("extra_code_directory is unset")
self.model_config = model_config
return parser2_result
return parser_result
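A standalone illustration of the two-stage parsing that parse_and_load_model performs: the first parser consumes Azure-level arguments, whatever it does not recognize is handed to a model/container-level parser, and anything still left over is an error. The argument names below are examples only.

import argparse

# Stage 1: Azure-level arguments; everything unknown is kept for the next stage.
azure_parser = argparse.ArgumentParser()
azure_parser.add_argument("--model", type=str)
azure_args, leftover = azure_parser.parse_known_args(["--model", "HelloContainer", "--num_epochs", "2"])

# Stage 2: model/container-level arguments must consume all remaining args.
model_parser = argparse.ArgumentParser()
model_parser.add_argument("--num_epochs", type=int)
model_args, unknown = model_parser.parse_known_args(leftover)
if unknown:
    # Mirrors fail_on_unknown_args=True above.
    raise ValueError(f"Unknown arguments: {unknown}")
print(azure_args.model, model_args.num_epochs)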
def run(self) -> Tuple[ModelConfigBase, Optional[Run]]:
def _get_property_from_config_or_container(self, name: str) -> Any:
"""
Reads out a property or attribute from either the model configuration (if that is a built-in InnerEye
model) or the lightning container.
:param name: The name of the property to read.
:return: The property value, coming from either the model config or the container.
"""
if isinstance(self.model_config, DeepLearningConfig):
return getattr(self.model_config, name)
elif self.lightning_container is not None:
return getattr(self.lightning_container, name)
else:
raise ValueError(f"Did not expect config of type {type(self.model_config)} and container of type "
f"{type(self.lightning_container)}")
@property
def perform_cross_validation(self) -> bool:
"""
Returns True if cross validation will be performed as part of the training procedure.
"""
return self._get_property_from_config_or_container("perform_cross_validation")
@property
def azure_dataset_id(self) -> str:
"""
Returns the name of the Azure dataset that should be used.
"""
return self._get_property_from_config_or_container("azure_dataset_id")
def run(self) -> Tuple[Optional[DeepLearningConfig], Optional[Run]]:
"""
The main entry point for training and testing models from the commandline. This chooses a model to train
via a commandline argument, runs training or testing, and writes all required info to disk and logs.
@ -188,10 +224,12 @@ class Runner:
# Usually, when we set logging to DEBUG, we want diagnostics about the model
# build itself, but not the tons of debug information that AzureML submissions create.
logging_to_stdout(logging.INFO)
may_initialize_rpdb()
initialize_rpdb()
user_agent.append(azure_util.INNEREYE_SDK_NAME, azure_util.INNEREYE_SDK_VERSION)
self.parse_and_load_model()
if self.model_config is not None and self.model_config.perform_cross_validation:
if self.perform_cross_validation:
if self.lightning_container is not None:
raise NotImplementedError("Cross validation for LightningContainer models is not yet supported.")
# force hyperdrive usage if performing cross validation
self.azure_config.hyperdrive = True
run_object: Optional[Run] = None
@ -208,23 +246,24 @@ class Runner:
"""
# The adal package creates a logging.info line each time it gets an authentication token, avoid that.
logging.getLogger('adal-python').setLevel(logging.WARNING)
if not self.model_config.azure_dataset_id:
raise ValueError("When running on AzureML, the 'azure_dataset_id' property must be set.")
model_config_overrides = str(self.model_config.overrides)
# PyJWT prints out warnings that are beyond our control
warnings.filterwarnings("ignore", category=DeprecationWarning)
if isinstance(self.model_config, DeepLearningConfig) and not self.azure_dataset_id:
raise ValueError("When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
"property must be set.")
hyperdrive_func = lambda run_config: self.model_config.get_hyperdrive_config(run_config) # type: ignore
source_config = SourceConfig(
root_folder=self.project_root,
entry_script=Path(sys.argv[0]).resolve(),
conda_dependencies_files=get_all_environment_files(self.project_root),
hyperdrive_config_func=lambda run_config: self.model_config.get_hyperdrive_config(run_config),
# For large jobs, upload of results times out frequently because of large checkpoint files. Default is 600
hyperdrive_config_func=hyperdrive_func,
# For large jobs, upload of results can time out because of large checkpoint files. Default is 600
upload_timeout_seconds=86400,
)
source_config.set_script_params_except_submit_flag()
assert self.model_config.azure_dataset_id is not None # to stop mypy complaining about next line
azure_run = submit_to_azureml(self.azure_config, source_config, model_config_overrides,
self.model_config.azure_dataset_id)
azure_run = submit_to_azureml(self.azure_config, source_config, self.azure_dataset_id)
logging.info("Job submission to AzureML done.")
if self.azure_config.pytest_mark:
if self.azure_config.pytest_mark and self.azure_config.wait_for_completion:
# The AzureML job can optionally run pytest. Attempt to download it to the current directory.
# A build step will pick up that file and publish it to Azure DevOps.
# If pytest_mark is set, this file must exist.
@ -246,21 +285,17 @@ class Runner:
# build itself, but not the tons of debug information that AzureML submissions create.
logging_to_stdout(self.azure_config.log_level)
suppress_logging_noise()
error_messages = []
# For the PR build in AzureML, we can either run pytest, or train the simple PR model. Running both
# only works when using DDP_spawn, but that has the side effect of messing up memory consumption of the
# large models.
if self.azure_config.pytest_mark:
try:
outputs_folder = Path.cwd() / fixed_paths.DEFAULT_AML_UPLOAD_DIR
pytest_passed, results_file_path = run_pytest(self.azure_config.pytest_mark, outputs_folder)
if not pytest_passed:
pytest_failures = f"Not all PyTest tests passed. See {results_file_path}"
logging.error(pytest_failures)
error_messages.append(pytest_failures)
except Exception as ex:
print_exception(ex, "Unable to run PyTest.")
error_messages.append(f"Unable to run PyTest: {ex}")
outputs_folder = Path.cwd() / fixed_paths.DEFAULT_AML_UPLOAD_DIR
pytest_passed, results_file_path = run_pytest(self.azure_config.pytest_mark, outputs_folder)
if not pytest_passed:
# Terminate if pytest has failed. This makes the smoke test in
# PR builds fail if pytest fails.
pytest_failures = f"Not all PyTest tests passed. See {results_file_path}"
raise ValueError(pytest_failures)
else:
# Set environment variables for multi-node training if needed.
# In particular, the multi-node environment variables should NOT be set in single node
@ -268,20 +303,14 @@ class Runner:
# (https://github.com/microsoft/InnerEye-DeepLearning/issues/395)
if self.azure_config.num_nodes > 1:
set_environment_variables_for_multi_node()
logging.info("Creating the output folder structure.")
ml_runner = self.create_ml_runner()
ml_runner.setup()
ml_runner.start_logging_to_file()
try:
logging_to_file(self.model_config.logs_folder / LOG_FILE_NAME)
try:
self.create_ml_runner().run()
except Exception as ex:
print_exception(ex, "Model training/testing failed.")
error_messages.append(f"Training failed: {ex}")
ml_runner.run()
finally:
disable_logging_to_file()
# Terminate if pytest or model training has failed. This makes the smoke test in
# PR builds fail if pytest fails.
if error_messages:
raise ValueError(
f"At least one component of the runner failed: {os.linesep} {os.linesep.join(error_messages)}")
def create_ml_runner(self) -> Any:
"""
@ -289,10 +318,11 @@ class Runner:
"""
# This import statement cannot be at the beginning of the file because it will cause import
# of packages that are not available inside the azure_runner.yml environment, in particular pytorch.
# That is also why we specify the return type as Any rather than MLRunner.
# That is also why we specify the return type is Any rather than MLRunner.
from InnerEye.ML.run_ml import MLRunner
return MLRunner(
model_config=self.model_config,
container=self.lightning_container,
azure_config=self.azure_config,
project_root=self.project_root,
post_cross_validation_hook=self.post_cross_validation_hook,
@ -319,17 +349,15 @@ def default_post_cross_validation_hook(config: ModelConfigBase, root_folder: Pat
def run(project_root: Path,
yaml_config_file: Path,
post_cross_validation_hook: Optional[PostCrossValidationHookSignature] = None,
model_deployment_hook: Optional[ModelDeploymentHookSignature] = None,
command_line_args: Optional[List[str]] = None) -> \
Tuple[ModelConfigBase, Optional[Run]]:
model_deployment_hook: Optional[ModelDeploymentHookSignature] = None) -> \
Tuple[Optional[DeepLearningConfig], Optional[Run]]:
"""
The main entry point for training and testing models from the commandline. This chooses a model to train
via a commandline argument, runs training or testing, and writes all required info to disk and logs.
:return: If submitting to AzureML, returns the model configuration that was used for training,
including commandline overrides applied (if any). For details on the arguments, see the constructor of Runner.
"""
runner = Runner(project_root, yaml_config_file, post_cross_validation_hook,
model_deployment_hook, command_line_args)
runner = Runner(project_root, yaml_config_file, post_cross_validation_hook, model_deployment_hook)
return runner.run()

View file

@ -17,7 +17,7 @@ from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.plotting import segmentation_and_groundtruth_plot, surface_distance_ground_truth_plot
from InnerEye.ML.utils import surface_distance_utils as sd_util
from InnerEye.ML.utils.config_util import ModelConfigLoader
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.utils.csv_util import get_worst_performing_outliers, load_csv
from InnerEye.ML.utils.image_util import multi_label_array_to_binary
from InnerEye.ML.utils.io_util import load_nifti_image
@ -105,10 +105,8 @@ def main() -> None:
if config_model is None:
raise ValueError("The name of the model to train must be given in the --model argument.")
model_config = ModelConfigLoader[SegmentationModelBase]().create_model_config_from_name(
config_model,
overrides=parser_result.overrides
)
model_config = ModelConfigLoader().create_model_config_from_name(config_model)
model_config.apply_overrides(parser_result.overrides, should_validate=True)
execution_mode = surface_distance_config.execution_mode
run_mode = surface_distance_config.run_mode

View file

@ -5,6 +5,7 @@
import logging
import os
import uuid
from builtins import property
from pathlib import Path
from typing import List, Optional
from urllib.parse import urlparse
@ -15,7 +16,8 @@ from azureml.core import Run
from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.Common import fixed_paths
from InnerEye.ML.deep_learning_config import DeepLearningConfig, WEIGHTS_FILE
from InnerEye.ML.deep_learning_config import OutputParams, WEIGHTS_FILE
from InnerEye.ML.lightning_container import LightningContainer
from InnerEye.ML.utils.run_recovery import RunRecovery
@ -25,22 +27,29 @@ class CheckpointHandler:
azure config and model config.
"""
def __init__(self, model_config: DeepLearningConfig, azure_config: AzureConfig,
def __init__(self, container: LightningContainer, azure_config: AzureConfig,
project_root: Path, run_context: Optional[Run] = None):
self.azure_config = azure_config
self.model_config = model_config
self.container = container
self.run_recovery: Optional[RunRecovery] = None
self.project_root = project_root
self.run_context = run_context
self.local_weights_path: Optional[Path] = None
self.has_continued_training = False
@property
def output_params(self) -> OutputParams:
"""
Gets the part of the configuration that is responsible for output paths.
"""
return self.container
def download_checkpoints_from_hyperdrive_child_runs(self, hyperdrive_parent_run: Run) -> None:
"""
Downloads the best checkpoints from all child runs of a Hyperdrive parent run. This is used to gather results
for ensemble creation.
"""
self.run_recovery = RunRecovery.download_best_checkpoints_from_child_runs(self.model_config,
self.run_recovery = RunRecovery.download_best_checkpoints_from_child_runs(self.output_params,
hyperdrive_parent_run)
# Check paths are good, just in case
for path in self.run_recovery.checkpoints_roots:
@ -55,11 +64,11 @@ class CheckpointHandler:
"""
if self.azure_config.run_recovery_id:
run_to_recover = self.azure_config.fetch_run(self.azure_config.run_recovery_id.strip())
self.run_recovery = RunRecovery.download_all_checkpoints_from_run(self.model_config, run_to_recover)
self.run_recovery = RunRecovery.download_all_checkpoints_from_run(self.output_params, run_to_recover)
else:
self.run_recovery = None
if self.model_config.weights_url or self.model_config.local_weights_path:
if self.container.weights_url or self.container.local_weights_path:
self.local_weights_path = self.get_and_save_modified_weights()
def additional_training_done(self) -> None:
@ -74,11 +83,11 @@ class CheckpointHandler:
checkpoint from there, otherwise use the checkpoints from the current run.
:return: Constructed checkpoint path to recover from.
"""
if self.model_config.start_epoch > 0 and not self.run_recovery:
start_epoch = self.container.start_epoch
if start_epoch > 0 and not self.run_recovery:
raise ValueError("Start epoch is > 0, but no run recovery object has been provided to resume training.")
if self.run_recovery and self.model_config.start_epoch == 0:
if self.run_recovery and start_epoch == 0:
raise ValueError("Run recovery set, but start epoch is 0. Please provide start epoch > 0 (for which a "
"checkpoint was saved in the previous run) to resume training from that run.")
@ -88,7 +97,7 @@ class CheckpointHandler:
checkpoints = self.run_recovery.get_recovery_checkpoint_paths()
if len(checkpoints) > 1:
raise ValueError(f"Recovering training of ensemble runs is not supported. Found more than one "
f"checkpoint for epoch {self.model_config.start_epoch}")
f"checkpoint for epoch {start_epoch}")
return checkpoints[0]
elif self.local_weights_path:
return self.local_weights_path
@ -129,7 +138,7 @@ class CheckpointHandler:
if self.has_continued_training:
# Checkpoint is from the current run, whether a new run or a run recovery which has been doing more
# training, so we look for it there.
checkpoint_from_current_run = self.model_config.get_path_to_best_checkpoint()
checkpoint_from_current_run = self.output_params.get_path_to_best_checkpoint()
if checkpoint_from_current_run.is_file():
logging.info("Using checkpoints from current run.")
checkpoint_paths = [checkpoint_from_current_run]
@ -172,7 +181,7 @@ class CheckpointHandler:
target_folder = self.project_root / fixed_paths.MODEL_WEIGHTS_DIR_NAME
target_folder.mkdir(exist_ok=True)
url = self.model_config.weights_url
url = self.container.weights_url
# assign the same filename as in the download url if possible, so that we can check for duplicates
# If that fails, map to a random uuid
@ -198,9 +207,9 @@ class CheckpointHandler:
"""
Get the path to the local weights to use or download them and set local_weights_path
"""
if self.model_config.local_weights_path:
weights_path = self.model_config.local_weights_path
elif self.model_config.weights_url:
if self.container.local_weights_path:
weights_path = self.container.local_weights_path
elif self.container.weights_url:
weights_path = self.download_weights()
else:
raise ValueError("Cannot download/modify weights - neither local_weights_path nor weights_url is set in"
@ -219,8 +228,8 @@ class CheckpointHandler:
if not weights_path or not weights_path.is_file():
raise FileNotFoundError(f"Could not find the weights file at {weights_path}")
modified_weights = self.model_config.load_checkpoint_and_modify(weights_path)
target_file = self.model_config.outputs_folder / WEIGHTS_FILE
modified_weights = self.container.load_checkpoint_and_modify(weights_path)
target_file = self.output_params.outputs_folder / WEIGHTS_FILE
torch.save(modified_weights, target_file)
return target_file
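The call above delegates to the container's load_checkpoint_and_modify, which gives bring-your-own-model users a hook to adapt third-party weights before they are saved as WEIGHTS_FILE. A sketch of such an override follows; the parameter name is assumed from the call site above, and the key renaming is purely illustrative.

from pathlib import Path
from typing import Any, Dict

import torch

from InnerEye.ML.lightning_container import LightningContainer


class ContainerWithPretrainedWeights(LightningContainer):  # hypothetical user container
    def load_checkpoint_and_modify(self, path_to_checkpoint: Path) -> Dict[str, Any]:
        checkpoint = torch.load(str(path_to_checkpoint), map_location="cpu")
        if "state_dict" in checkpoint:
            # Example adaptation: drop a "module." prefix left over from DataParallel training.
            checkpoint["state_dict"] = {key.replace("module.", "", 1): value
                                        for key, value in checkpoint["state_dict"].items()}
        return checkpoint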
@ -228,4 +237,4 @@ class CheckpointHandler:
"""
Returns true if the optimizer should be loaded from checkpoint. Looks at the model config to determine this.
"""
return self.model_config.start_epoch > 0
return self.container.start_epoch > 0

View file

@ -5,21 +5,19 @@
import importlib
import inspect
import logging
from importlib._bootstrap import ModuleSpec
from importlib.util import find_spec
from pathlib import Path
from typing import Any, Dict, Generic, List, Optional, TypeVar
from typing import Any, Dict, List, Optional
import param
from importlib._bootstrap import ModuleSpec
from InnerEye.Common.common_util import path_to_namespace
from InnerEye.Common.generic_parsing import GenericConfig
from InnerEye.ML.model_config_base import ModelConfigBase
C = TypeVar('C', bound=ModelConfigBase)
from InnerEye.ML.deep_learning_config import DeepLearningConfig
class ModelConfigLoader(GenericConfig, Generic[C]):
class ModelConfigLoader(GenericConfig):
"""
Helper class to manage model config loading
"""
@ -43,20 +41,21 @@ class ModelConfigLoader(GenericConfig, Generic[C]):
from InnerEye.ML import configs
return configs.__name__
def create_model_config_from_name(self, model_name: str, overrides: Optional[Dict[str, Any]] = None) -> C:
def create_model_config_from_name(self, model_name: str) -> DeepLearningConfig:
"""
Returns a segmentation or classification model configuration for a model of the given name.
Returns a model configuration for a model of the given name. This can be either a segmentation or
classification configuration for an InnerEye built-in model, or a LightningContainer.
To avoid having to import torch here, there are no references to LightningContainer.
Searches the provided search modules recursively for a class member called <model_name>.
:param model_name: Name of the model for which to get the config.
:param overrides: Model properties to override.
"""
if not model_name:
raise ValueError("Unable to load a model configuration because the model name is missing.")
configs: Dict[str, C] = {}
configs: Dict[str, DeepLearningConfig] = {}
def _get_model_config(module_spec: ModuleSpec) -> Optional[C]:
def _get_model_config(module_spec: ModuleSpec) -> Optional[DeepLearningConfig]:
"""
Given a module specification check to see if it has a class property with
the <model_name> provided, and instantiate that config class with the
@ -66,6 +65,7 @@ class ModelConfigLoader(GenericConfig, Generic[C]):
"""
# noinspection PyBroadException
try:
logging.debug(f"Importing {module_spec.name}")
target_module = importlib.import_module(module_spec.name)
# The "if" clause checks that obj is a class, of the desired name, that is
# defined in this module rather than being imported into it (and hence potentially
@ -74,7 +74,7 @@ class ModelConfigLoader(GenericConfig, Generic[C]):
if inspect.isclass(obj)
and name == model_name
and inspect.getmodule(obj) == target_module)
logging.info(f"Found class {_class.name} in file {module_spec.origin}")
logging.info(f"Found class {_class} in file {module_spec.origin}")
# ignore the exception which will occur if the provided module cannot be loaded
# or the loaded module does not have the required class as a member
except Exception as e:
@ -82,13 +82,7 @@ class ModelConfigLoader(GenericConfig, Generic[C]):
if exception_text != "":
logging.warning(f"(from attempt to import module {module_spec.name}): {exception_text}")
return None
model_config: ModelConfigBase = _class()
# apply the overrides to the model
if overrides is not None:
model_config.apply_overrides(overrides)
# The parameters have presumably changed, so we need to re-validate.
model_config.validate()
model_config: DeepLearningConfig = _class()
return model_config
def _search_recursively_and_store(module_search_spec: ModuleSpec) -> None:
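For orientation, a short usage sketch of this loader, matching how it is exercised in the tests later in this commit (the namespace and model name below are examples, not requirements):

from InnerEye.ML.utils.config_loader import ModelConfigLoader

# Search the default InnerEye.ML.configs namespace plus an additional namespace.
loader = ModelConfigLoader(model_configs_namespace="Tests.ML.configs")
# The result is either a built-in InnerEye model configuration or a LightningContainer.
config = loader.create_model_config_from_name("DummyModel")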


@ -261,7 +261,7 @@ def load_dicom_image(path: PathOrString) -> np.ndarray:
"""
ds = dicom.dcmread(path)
pixels = ds.pixel_array
bits_stored = ds.BitsStored
bits_stored = int(ds.BitsStored) # type: ignore
if ds.PhotometricInterpretation == PhotometricInterpretation.MONOCHROME1.value:
pixel_repr = ds.PixelRepresentation
if pixel_repr == 0: # unsigned
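As background for the MONOCHROME1 branch above: in that photometric interpretation the minimum stored value is rendered as white, so images are conventionally inverted. A sketch of that inversion for unsigned data, stated as the general DICOM convention rather than the exact code in this function:

import numpy as np

def invert_monochrome1_unsigned(pixels: np.ndarray, bits_stored: int) -> np.ndarray:
    # For unsigned MONOCHROME1 data, map each value v to (2**bits_stored - 1) - v,
    # so that low stored values become dark and high values become bright.
    max_value = 2 ** bits_stored - 1
    return max_value - pixels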


@ -9,7 +9,7 @@ from typing import Dict, List
from torch.optim.lr_scheduler import CosineAnnealingLR, ExponentialLR, LambdaLR, MultiStepLR, StepLR, _LRScheduler
from torch.optim.optimizer import Optimizer
from InnerEye.ML.deep_learning_config import DeepLearningConfig, LRSchedulerType, LRWarmUpType
from InnerEye.ML.deep_learning_config import LRSchedulerType, LRWarmUpType, OptimizerParams
def get_current_learning_rates(optimizer: Optimizer) -> List[float]:
@ -23,6 +23,7 @@ class LinearWarmUp(_LRScheduler):
"""
Implements linear warmup up to a given initial learning rate.
"""
def __init__(self, optimizer: Optimizer, warmup_epochs: int, final_lr: float, last_epoch: int = -1):
if warmup_epochs < 0:
raise ValueError("The number of warmup epochs must be >= 0.")
@ -60,9 +61,10 @@ class SchedulerWithWarmUp(_LRScheduler):
of the normal schedulers.
"""
def __init__(self, args: DeepLearningConfig, optimizer: Optimizer, last_epoch: int = -1):
def __init__(self, args: OptimizerParams, optimizer: Optimizer, num_epochs: int, last_epoch: int = -1):
self.optimizer = optimizer
self.last_epoch = last_epoch
self.num_epochs = num_epochs
self.warmup_epochs = 0 if args.l_rate_warmup == LRWarmUpType.NoWarmUp else args.l_rate_warmup_epochs
self._scheduler = self.get_scheduler(args)
# This must be called after self.get_scheduler, because we want the optimizer to have the learning rate
@ -75,12 +77,12 @@ class SchedulerWithWarmUp(_LRScheduler):
self.min_l_rate = args.min_l_rate
super().__init__(optimizer, last_epoch)
def get_scheduler(self, args: DeepLearningConfig) -> _LRScheduler:
def get_scheduler(self, args: OptimizerParams) -> _LRScheduler:
"""
Create the LR scheduler that will be used after warmup, based on the config params.
"""
scheduler: _LRScheduler
epochs_after_warmup = args.num_epochs - self.warmup_epochs
epochs_after_warmup = self.num_epochs - self.warmup_epochs
if args.l_rate_scheduler == LRSchedulerType.Exponential:
scheduler = ExponentialLR(optimizer=self.optimizer,
gamma=args.l_rate_exponential_gamma,
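To make the warmup bookkeeping concrete, here is a small standalone sketch of linear warmup implemented with a plain LambdaLR, independent of the SchedulerWithWarmUp class above (the model, learning rate and epoch counts are placeholders):

import torch
from torch.optim.lr_scheduler import LambdaLR

model = torch.nn.Linear(4, 1)  # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
warmup_epochs = 5

def warmup_factor(epoch: int) -> float:
    # Scale the learning rate linearly from 1/warmup_epochs up to 1.0, then keep it constant.
    return (epoch + 1) / warmup_epochs if epoch < warmup_epochs else 1.0

scheduler = LambdaLR(optimizer, lr_lambda=warmup_factor)
for epoch in range(10):
    # ... one epoch of training would go here ...
    optimizer.step()
    scheduler.step()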


@ -125,7 +125,7 @@ def get_number_of_voxels_per_class(labels: torch.Tensor) -> torch.Tensor:
if len(labels.shape) == 4:
labels = labels[None, ...]
return torch.tensor(np.count_nonzero(labels.cpu().numpy(), axis=(2, 3, 4)))
return torch.count_nonzero(labels, dim=(2, 3, 4))
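A quick self-contained check of the torch.count_nonzero call above, assuming labels of shape [batch, classes, Z, Y, X]; shapes and values are made up, and a PyTorch version where count_nonzero accepts a tuple of dimensions is assumed:

import torch

labels = torch.zeros(2, 3, 4, 4, 4)  # [batch=2, classes=3, Z=4, Y=4, X=4]
labels[0, 1, :2] = 1                 # 2 * 4 * 4 = 32 foreground voxels for class 1 of sample 0
per_class = torch.count_nonzero(labels, dim=(2, 3, 4))
assert per_class.shape == (2, 3)
assert per_class[0, 1].item() == 32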
def get_label_overlap_stats(labels: np.ndarray, label_names: List[str]) -> Dict[str, int]:


@ -19,7 +19,7 @@ from InnerEye.ML.config import ModelArchitectureConfig, PaddingMode, Segmentatio
basic_size_shrinkage
from InnerEye.ML.dataset.scalar_sample import ScalarItem
from InnerEye.ML.dataset.sequence_sample import ClassificationItemSequence
from InnerEye.ML.deep_learning_config import DeepLearningConfig, OptimizerType
from InnerEye.ML.deep_learning_config import OptimizerParams, OptimizerType
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.models.architectures.base_model import BaseSegmentationModel, CropSizeConstraints
from InnerEye.ML.models.architectures.complex import ComplexModel
@ -38,7 +38,7 @@ from InnerEye.ML.utils.temperature_scaling import ModelWithTemperature
from InnerEye.ML.visualizers.model_summary import ModelSummary
def create_optimizer(config: DeepLearningConfig, parameters: Iterator[Parameter]) -> torch.optim.Optimizer:
def create_optimizer(config: OptimizerParams, parameters: Iterator[Parameter]) -> torch.optim.Optimizer:
# Select optimizer type
if config.optimizer_type in [OptimizerType.Adam, OptimizerType.AMSGrad]:
return torch.optim.Adam(parameters, config.l_rate,


@ -15,7 +15,7 @@ from InnerEye.Azure.azure_util import RUN_CONTEXT, download_outputs_from_run, fe
from InnerEye.Common.common_util import OTHER_RUNS_SUBDIR_NAME, check_properties_are_not_none
from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, \
create_recovery_checkpoint_path, get_best_checkpoint_path
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, DeepLearningConfig
from InnerEye.ML.deep_learning_config import CHECKPOINT_FOLDER, OutputParams
@dataclass(frozen=True)
@ -26,7 +26,7 @@ class RunRecovery:
checkpoints_roots: List[Path]
@staticmethod
def download_best_checkpoints_from_child_runs(config: DeepLearningConfig, run: Run) -> RunRecovery:
def download_best_checkpoints_from_child_runs(config: OutputParams, run: Run) -> RunRecovery:
"""
Downloads the best checkpoints from all child runs of the provided Hyperdrive parent run.
The checkpoints for the sibling runs will go into folder 'OTHER_RUNS/<cross_validation_split>'
@ -61,7 +61,7 @@ class RunRecovery:
return RunRecovery(checkpoints_roots=child_runs_checkpoints_roots)
@staticmethod
def download_all_checkpoints_from_run(config: DeepLearningConfig, run: Run) -> RunRecovery:
def download_all_checkpoints_from_run(config: OutputParams, run: Run) -> RunRecovery:
"""
Downloads all checkpoints of the provided run: The best checkpoint and the recovery checkpoint.
A single folder inside the checkpoints folder will be created that contains the downloaded checkpoints.


@ -1,51 +0,0 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from dataclasses import dataclass, field
from typing import Any, List
from InnerEye.Common.type_annotations import DictStrFloat
@dataclass(frozen=True)
class ModelTrainingResults:
"""
Stores the results from training, with the results on training and validation data for each training epoch.
"""
train_results_per_epoch: List[DictStrFloat]
val_results_per_epoch: List[DictStrFloat]
train_diagnostics: Any
val_diagnostics: Any
optimal_temperature_scale_values_per_checkpoint_epoch: List[float] = field(default_factory=list)
def get_metric(self, is_training: bool, metric_type: str) -> List[float]:
"""
Gets a scalar metric out of either the list of training or the list of validation results. This returns
the value that a specific metric attains in all of the epochs.
:param is_training: If True, read metrics from the `train_results_per_epoch` field, if False read from the
`val_results_per_epoch` field.
:param metric_type: The metric to extract.
:return: A list of floating point numbers, with one entry per entry in the training or validation results.
"""
metrics = self.train_results_per_epoch if is_training else self.val_results_per_epoch
return [m[metric_type] for m in metrics]
def get_training_metric(self, metric_type: str) -> List[float]:
"""
Gets a scalar metric from the list of training results. This returns
the value that a specific metric attains in all of the epochs.
:param metric_type: The metric to extract.
:return: A list of floating point numbers, with one entry per entry in the training results.
"""
return self.get_metric(is_training=True, metric_type=metric_type)
def get_validation_metric(self, metric_type: str) -> List[float]:
"""
Gets a scalar metric from the list of validation results. This returns
the value that a specific metric attains in all of the epochs.
:param metric_type: The metric to extract.
:return: A list of floating point numbers, with one entry per entry in the validation results.
"""
return self.get_metric(is_training=False, metric_type=metric_type)


@ -16,8 +16,7 @@ from InnerEye.ML.dataset.cropping_dataset import CroppingDataset
from InnerEye.ML.dataset.full_image_dataset import FullImageDataset
from InnerEye.ML.dataset.sample import Sample
from InnerEye.ML.plotting import resize_and_save, scan_with_transparent_overlay
from InnerEye.ML.utils import augmentation, io_util, ml_util
from InnerEye.ML.utils.config_util import ModelConfigLoader
from InnerEye.ML.utils import augmentation, io_util
# The name of the folder inside the default outputs folder that will hold plots that show the effect of
# sampling random patches
from InnerEye.ML.utils.image_util import get_unit_image_header
@ -113,7 +112,7 @@ def visualize_random_crops_for_dataset(config: SegmentationModelBase, output_fol
for training. Visualizations are stored in both Nifti format, and as 3 PNG thumbnail files, in the output folder.
:param config: The model configuration.
:param output_folder: The folder in which the visualizations should be written. If not provided, use a subfolder
"patch_sampling" in the models's default output folder
"patch_sampling" in the model's default output folder
"""
dataset_splits = config.get_dataset_splits()
# Load a sample using the full image data loader
@ -123,24 +122,3 @@ def visualize_random_crops_for_dataset(config: SegmentationModelBase, output_fol
for sample_index in range(count):
sample = full_image_dataset.get_samples_at_index(index=sample_index)[0]
visualize_random_crops(sample, config, output_folder=output_folder)
def main(args: CheckPatchSamplingConfig) -> None:
# Identify paths to inputs and outputs
commandline_args = {
"train_batch_size": 1,
"local_dataset": Path(args.local_dataset)
}
output_folder = Path(args.output_folder)
output_folder.mkdir(parents=True, exist_ok=True)
# Create a config file
config = ModelConfigLoader[SegmentationModelBase]().create_model_config_from_name(
args.model_name, overrides=commandline_args)
config.show_patch_sampling = args.number_samples
ml_util.set_random_seed(config.random_seed)
visualize_random_crops_for_dataset(config, output_folder=output_folder)
if __name__ == "__main__":
main(CheckPatchSamplingConfig.parse_args())
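With the main() entry point above removed, a hedged sketch of how visualize_random_crops_for_dataset could still be driven from a small script; the model name, dataset path and the module path of this file are assumptions, while the loader call follows the new signature used elsewhere in this commit:

from pathlib import Path

from InnerEye.ML.utils import ml_util
from InnerEye.ML.utils.config_loader import ModelConfigLoader
# Assumed module path for the function defined in this file:
from InnerEye.ML.visualizers.patch_sampling import visualize_random_crops_for_dataset

config = ModelConfigLoader().create_model_config_from_name("Lung")  # placeholder model name
config.local_dataset = Path("/data/lung")                           # placeholder dataset folder
config.show_patch_sampling = 10
ml_util.set_random_seed(config.random_seed)
visualize_random_crops_for_dataset(config, output_folder=Path("patch_sampling"))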


@ -11,6 +11,7 @@ On the modelling side, this toolbox supports
- Segmentation models
- Classification and regression models
- Sequence models
- Adding cloud support to any PyTorch Lightning model, via a [bring-your-own-model setup](docs/bring_your_own_model.md)
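To give a flavour of that bring-your-own-model setup, a minimal toy container is sketched below; the model and data module are placeholders, and docs/bring_your_own_model.md remains the authoritative reference:

import torch
from pytorch_lightning import LightningDataModule, LightningModule
from torch.utils.data import DataLoader, TensorDataset

from InnerEye.ML.lightning_container import LightningContainer

class TinyRegression(LightningModule):
    """A toy 1D regression model, for illustration only."""

    def __init__(self) -> None:
        super().__init__()
        self.model = torch.nn.Linear(1, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.model(x)

    def training_step(self, batch, batch_idx):  # type: ignore
        x, y = batch
        loss = torch.nn.functional.mse_loss(self.forward(x), y)
        self.log("loss", loss)
        return loss

    def configure_optimizers(self) -> torch.optim.Optimizer:
        return torch.optim.Adam(self.parameters(), lr=1e-2)

class TinyDataModule(LightningDataModule):
    """A toy data module that serves a handful of (x, x) pairs."""

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        x = torch.arange(10, dtype=torch.float32).unsqueeze(1)
        return DataLoader(TensorDataset(x, x), batch_size=2)

class TinyContainer(LightningContainer):
    """A hypothetical container that makes the toy model trainable via the InnerEye runner."""

    def __init__(self) -> None:
        super().__init__()
        self.num_epochs = 10

    def create_model(self) -> LightningModule:
        return TinyRegression()

    def get_data_module(self) -> LightningDataModule:
        return TinyDataModule()

Such a container is then selected on the runner commandline by its class name, in the same way as the built-in configurations.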
Classification, regression, and sequence models can be built with only images as inputs, or a combination of images
and non-imaging data as input. This supports typical use cases on medical data where measurements, biomarkers,


@ -41,7 +41,7 @@ from InnerEye.Scripts import submit_for_inference
from Tests.ML.util import assert_nifti_content, get_default_azure_config, get_nifti_shape
FALLBACK_ENSEMBLE_RUN = "refs_pull_432_merge:HD_3af84e4a-0043-4260-8be2-04ce9ab09b1f"
FALLBACK_SINGLE_RUN = "refs_pull_407_merge_1614271518_cdbeb28e"
FALLBACK_SINGLE_RUN = "refs_pull_407_merge:refs_pull_407_merge_1614271518_cdbeb28e"
FALLBACK_2NODE_RUN = "refs_pull_385_merge:refs_pull_385_merge_1612421371_ba12a007"
FALLBACK_CV_GLAUCOMA = "refs_pull_432_merge_1618332810_b5d10d74"


@ -1,68 +0,0 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import pytest
from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY
from InnerEye.Common.build_config import BUILDINFORMATION_JSON, ExperimentResultLocation, \
build_information_to_dot_net_json, build_information_to_dot_net_json_file
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.scalar_config import ScalarModelBase
def test_build_config(test_output_dirs: OutputFolderForTests) -> None:
"""
Test that json with build information is created correctly.
"""
config = AzureConfig(
build_number=42,
build_user="user",
build_branch="branch",
build_source_id="00deadbeef",
build_source_author="author",
tag="tag",
model="model")
result_location = ExperimentResultLocation(azure_job_name="job")
net_json = build_information_to_dot_net_json(config, result_location)
expected = '{"BuildNumber": 42, "BuildRequestedFor": "user", "BuildSourceBranchName": "branch", ' \
'"BuildSourceVersion": "00deadbeef", "BuildSourceAuthor": "author", "ModelName": "model", ' \
'"ResultsContainerName": null, "ResultsUri": null, "DatasetFolder": null, "DatasetFolderUri": null, ' \
'"AzureBatchJobName": "job"}'
assert expected == net_json
result_folder = test_output_dirs.root_dir / "buildinfo"
build_information_to_dot_net_json_file(config, result_location, folder=result_folder)
result_file = result_folder / BUILDINFORMATION_JSON
assert result_file.exists()
assert result_file.read_text() == expected
def test_fields_are_set() -> None:
"""
Tests that expected fields are set when creating config classes.
"""
expected = [("hello", None), ("world", None)]
config = SegmentationModelBase(
should_validate=False,
ground_truth_ids=[x[0] for x in expected],
largest_connected_component_foreground_classes=expected
)
assert hasattr(config, CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY)
assert config.largest_connected_component_foreground_classes == expected
@pytest.mark.cpu_and_gpu
def test_dataset_reader_workers() -> None:
"""
Test to make sure the number of dataset reader workers are set correctly
"""
config = ScalarModelBase(
should_validate=False,
num_dataset_reader_workers=-1
)
if config.is_offline_run:
assert config.num_dataset_reader_workers == -1
else:
assert config.num_dataset_reader_workers == 0


@ -2,6 +2,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from pathlib import Path
from unittest import mock
import pytest
@ -12,62 +13,71 @@ from InnerEye.Common.fixed_paths import DEFAULT_AML_UPLOAD_DIR, DEFAULT_LOGS_DIR
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.config import PhotometricNormalizationMethod, SegmentationModelBase
from InnerEye.ML.runner import Runner
from Tests.ML.configs.DummyModel import DummyModel
@pytest.mark.parametrize("is_default_namespace", [True, False])
@pytest.mark.parametrize("is_container", [True, False])
@pytest.mark.parametrize("is_offline_run", [True, False])
@pytest.mark.parametrize("set_output_to", [True, False])
def test_create_ml_runner_args(is_default_namespace: bool,
def test_create_ml_runner_args(is_container: bool,
test_output_dirs: OutputFolderForTests,
is_offline_run: bool,
set_output_to: bool) -> None:
"""Test round trip parsing of commandline arguments:
From arguments to the Azure runner to the arguments of the ML runner, checking that
whatever is passed on can be correctly parsed."""
whatever is passed on can be correctly parsed. It also checks that the output files go into the right place
in local runs and in AzureML."""
logging_to_stdout()
model_name = "Lung"
model_name = "DummyContainerWithPlainLightning" if is_container else "DummyModel"
if is_container:
dataset_folder = Path("download")
else:
local_dataset = DummyModel().local_dataset
assert local_dataset is not None
dataset_folder = local_dataset
outputs_folder = test_output_dirs.root_dir
project_root = fixed_paths.repository_root_directory()
if is_default_namespace:
model_configs_namespace = None
else:
model_configs_namespace = "Tests.ML.configs"
model_name = "DummyModel"
model_configs_namespace = "Tests.ML.configs"
args_list = [f"--model={model_name}", "--train=True", "--l_rate=100.0",
"--norm_method=Simple Norm", "--subscription_id", "Test1", "--tenant_id=Test2",
"--application_id", "Test3", "--azureml_datastore", "Test5",
"--pytest_mark", "gpu"]
"--subscription_id", "Test1", "--tenant_id=Test2",
"--application_id", "Test3", "--azureml_datastore", "Test5"]
# toggle the output_to flag off only for online runs
if set_output_to or is_offline_run:
args_list.append(f"--output_to={outputs_folder}")
if not is_container:
args_list.append("--norm_method=Simple Norm")
if not is_default_namespace:
args_list.append(f"--model_configs_namespace={model_configs_namespace}")
args_list.append(f"--model_configs_namespace={model_configs_namespace}")
with mock.patch("sys.argv", [""] + args_list):
with mock.patch("InnerEye.ML.deep_learning_config.is_offline_run_context", return_value=is_offline_run):
runner = Runner(project_root=project_root, yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)
runner.parse_and_load_model()
azure_config = runner.azure_config
model_config = runner.model_config
with mock.patch("InnerEye.ML.run_ml.MLRunner.run", return_value=None):
with mock.patch("InnerEye.ML.run_ml.MLRunner.mount_or_download_dataset", return_value=dataset_folder):
runner = Runner(project_root=project_root, yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)
runner.parse_and_load_model()
# Only when calling config.create_filesystem we expect to see the correct paths, and this happens
# inside run_in_situ
runner.run_in_situ()
azure_config = runner.azure_config
container_or_legacy_config = runner.lightning_container if is_container else runner.model_config
assert azure_config.model == model_name
assert model_config.l_rate == 100.0
assert model_config.norm_method == PhotometricNormalizationMethod.SimpleNorm
if not is_container:
assert container_or_legacy_config.norm_method == PhotometricNormalizationMethod.SimpleNorm
if set_output_to or is_offline_run:
# The actual output folder must be a subfolder of the folder given on the commandline. The folder will contain
# a timestamp, that will start with the year number, hence will start with 20...
assert str(model_config.outputs_folder).startswith(str(outputs_folder / "20"))
assert model_config.logs_folder == (model_config.outputs_folder / DEFAULT_LOGS_DIR_NAME)
assert str(container_or_legacy_config.outputs_folder).startswith(str(outputs_folder / "20"))
assert container_or_legacy_config.logs_folder == \
(container_or_legacy_config.outputs_folder / DEFAULT_LOGS_DIR_NAME)
else:
# For runs inside AzureML, the output folder is the project root (the root of the folders that are
# included in the snapshot). The "outputs_to" argument will be ignored.
assert model_config.outputs_folder == (project_root / DEFAULT_AML_UPLOAD_DIR)
assert model_config.logs_folder == (project_root / DEFAULT_LOGS_DIR_NAME)
assert container_or_legacy_config.outputs_folder == (project_root / DEFAULT_AML_UPLOAD_DIR)
assert container_or_legacy_config.logs_folder == (project_root / DEFAULT_LOGS_DIR_NAME)
assert not hasattr(model_config, "azureml_datastore")
assert azure_config.pytest_mark == "gpu"
assert not hasattr(container_or_legacy_config, "azureml_datastore")
def test_overridable_properties() -> None:
@ -146,6 +156,7 @@ def test_parsing_with_custom_yaml(test_output_dirs: OutputFolderForTests) -> Non
yaml_config_file=yaml_file)
loader_result = runner.parse_and_load_model()
assert runner.azure_config is not None
assert runner.model_config is not None
# This is only present in yaml
# This is present in yaml and command line, and the latter should be used.
assert runner.azure_config.tenant_id == "bar"


@ -8,7 +8,7 @@ from typing import Any, List, Optional, Tuple
import param
import pytest
from InnerEye.Common.generic_parsing import GenericConfig, IntTuple
from InnerEye.Common.generic_parsing import GenericConfig, IntTuple, create_from_matching_params
class ParamEnum(Enum):
@ -57,6 +57,7 @@ def test_create_parser() -> None:
"""
Check that parse_args works as expected, with both non default and default values.
"""
def check(arg: List[str], expected_key: str, expected_value: Any) -> None:
parsed = ParamClass.parse_args(arg)
assert getattr(parsed, expected_key) == expected_value
@ -127,3 +128,44 @@ def test_int_tuple_validation(value_idx_0: Any, value_idx_1: Any, value_idx_2: A
m.int_tuple = (value_idx_0, value_idx_1, value_idx_2)
else:
m.int_tuple = (value_idx_0, value_idx_1, value_idx_2)
class ClassFrom(param.Parameterized):
foo = param.String("foo")
bar = param.Integer(1)
baz = param.String("baz")
_private = param.String("private")
constant = param.String("constant", constant=True)
class ClassTo(param.Parameterized):
foo = param.String("foo2")
bar = param.Integer(2)
_private = param.String("private2")
constant = param.String("constant2", constant=True)
class NotParameterized:
foo = 1
def test_create_from_matching_params() -> None:
"""
Test if Parameterized objects can be cloned by looking at matching fields.
"""
class_from = ClassFrom()
class_to = create_from_matching_params(class_from, cls_=ClassTo)
assert isinstance(class_to, ClassTo)
assert class_to.foo == "foo"
assert class_to.bar == 1
# Constant fields should not be touched
assert class_to.constant == "constant2"
# Private fields must be copied over.
assert class_to._private == "private"
# Baz is only present in the "from" object, and should not be copied to the new object
assert not hasattr(class_to, "baz")
with pytest.raises(ValueError) as ex:
create_from_matching_params(class_from, NotParameterized)
assert "subclass of param.Parameterized" in str(ex)
assert "NotParameterized" in str(ex)


@ -2,11 +2,14 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import os
from pathlib import Path
import pytest
from InnerEye.Common import common_util
from InnerEye.Common.common_util import check_is_any_of, is_private_field_name, namespace_to_path, \
from InnerEye.Common.common_util import change_working_directory, check_is_any_of, is_private_field_name, \
namespace_to_path, \
path_to_namespace, print_exception
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path, tests_root_directory
from InnerEye.Common.output_directories import OutputFolderForTests
@ -106,3 +109,18 @@ def test_path_to_namespace(is_external: bool, test_output_dirs: OutputFolderForT
path=full_ml_test_data_path(),
root=tests_root_directory().parent
) == test_data.__name__
def test_change_dir(test_output_dirs: OutputFolderForTests) -> None:
"""
Test the context manager for changing directories.
"""
os.chdir(test_output_dirs.root_dir)
assert Path.cwd() == test_output_dirs.root_dir
new_dir = test_output_dirs.root_dir / "foo"
new_dir.mkdir()
with change_working_directory(new_dir):
assert Path.cwd() == new_dir
Path("bar.txt").touch()
assert Path.cwd() == test_output_dirs.root_dir
assert (new_dir / "bar.txt").is_file()


@ -0,0 +1,78 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
# Suppress all flake8 errors here: the imports that come after executable code below cause loads of warnings,
# and we can't suppress individual warnings only.
# flake8: noqa
import shutil
from pathlib import Path
from typing import Any, Optional
from _pytest.monkeypatch import MonkeyPatch
from pytorch_lightning import LightningDataModule, LightningModule
from InnerEye.Common.common_util import add_folder_to_sys_path_if_needed
from InnerEye.ML.configs.other.fastmri_varnet import VarNetWithImageLogging
from InnerEye.ML.lightning_container import LightningContainer
add_folder_to_sys_path_if_needed("fastMRI")
from fastmri.data import SliceDataset
from fastmri.data.subsample import create_mask_for_mask_type
from fastmri.data.transforms import VarNetDataTransform
from fastmri.pl_modules import FastMriDataModule
# This import can fail if written as "from tests.create_temp_data", even though fastMRI is already in the path.
from fastMRI.tests.create_temp_data import create_temp_data
class FastMriRandomData(FastMriDataModule):
def __init__(self) -> None:
data_path = Path.cwd() / "data"
if data_path.is_dir():
shutil.rmtree(str(data_path))
data_path.mkdir(exist_ok=False, parents=True)
_, _, metadata = create_temp_data(data_path)
def retrieve_metadata_mock(a: Any, fname: Any) -> Any:
return metadata[str(fname)]
# This is a bit flaky: we should undo this patch afterwards, but there is no obvious place to do so.
MonkeyPatch().setattr(SliceDataset, "_retrieve_metadata", retrieve_metadata_mock)
mask = create_mask_for_mask_type(mask_type_str="equispaced",
center_fractions=[0.08],
accelerations=[4])
# use random masks for train transform, fixed masks for val transform
train_transform = VarNetDataTransform(mask_func=mask, use_seed=False)
val_transform = VarNetDataTransform(mask_func=mask)
test_transform = VarNetDataTransform()
FastMriDataModule.__init__(self,
data_path=data_path / "knee_data",
challenge="multicoil",
train_transform=train_transform,
val_transform=val_transform,
test_transform=test_transform)
def prepare_data(self, *args: Any, **kwargs: Any) -> None:
print("FastMriRandomData.prepare_data")
def setup(self, stage: Optional[str] = None) -> None:
print("FastMriRandomData.setup")
class FastMriOnRandomData(LightningContainer):
def __init__(self) -> None:
super().__init__()
self.num_epochs = 1
# Restrict to a single GPU, because we have code in dataset creation that could cause race conditions
self.max_num_gpus = 1
def create_model(self) -> LightningModule:
return VarNetWithImageLogging()
def get_data_module(self) -> LightningDataModule:
# local_dataset is set via the commandline to a random folder for unit tests
return FastMriRandomData()


@ -0,0 +1,230 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from pathlib import Path
from typing import Any, Dict, List, Tuple
import pandas as pd
import param
import torch
from pytorch_lightning import LightningDataModule, LightningModule
from pytorch_lightning.metrics import MeanSquaredError
from torch import Tensor
from torch.nn import Identity
from torch.utils.data import DataLoader, Dataset
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.lightning_container import InnerEyeInference, LightningContainer, LightningModuleWithOptimizer
class DummyContainerWithDatasets(LightningContainer):
def __init__(self, has_local_dataset: bool = False, has_azure_dataset: bool = False):
super().__init__()
self.local_dataset = full_ml_test_data_path("lightning_module_data") if has_local_dataset else None
self.azure_dataset_id = "azure_dataset" if has_azure_dataset else ""
def create_model(self) -> LightningModule:
return LightningModuleWithOptimizer()
class DummyContainerWithAzureDataset(DummyContainerWithDatasets):
def __init__(self) -> None:
super().__init__(has_azure_dataset=True)
class DummyContainerWithoutDataset(DummyContainerWithDatasets):
pass
class DummyContainerWithLocalDataset(DummyContainerWithDatasets):
def __init__(self) -> None:
super().__init__(has_local_dataset=True)
class DummyContainerWithAzureAndLocalDataset(DummyContainerWithDatasets):
def __init__(self) -> None:
super().__init__(has_local_dataset=True, has_azure_dataset=True)
class InferenceWithParameters(LightningModule):
model_param = param.String(default="bar")
def __init__(self, container_param: str):
super().__init__()
class DummyContainerWithParameters(LightningContainer):
container_param = param.String(default="foo")
def __init__(self) -> None:
super().__init__()
def create_model(self) -> LightningModule:
return InferenceWithParameters(self.container_param)
class DummyRegressionPlainLightning(LightningModuleWithOptimizer):
"""
A class that only implements plain Lightning training and test steps. Ideally, we want to support importing any
plain Lightning module without further methods added. This class inherits from LightningModuleWithOptimizer, but
does not implement the inference_step method.
"""
def __init__(self, in_features: int = 1, *args: Any, **kwargs: Any):
super().__init__(*args, **kwargs)
self.l_rate = 1e-1
activation = Identity()
layers = [
torch.nn.Linear(in_features=in_features, out_features=1, bias=True),
activation
]
self.model = torch.nn.Sequential(*layers) # type: ignore
def forward(self, x: Tensor) -> Tensor: # type: ignore
return self.model(x)
def training_step(self, batch: Any, *args: Any, **kwargs: Any) -> torch.Tensor: # type: ignore
input, target = batch
prediction = self.forward(input)
loss = torch.nn.functional.mse_loss(prediction, target)
self.log("loss", loss, on_epoch=True, on_step=True)
return loss
def test_step(self, batch, batch_idx) -> torch.Tensor: # type: ignore
Path("test_step.txt").touch()
input, target = batch
prediction = self.forward(input)
loss = torch.nn.functional.mse_loss(prediction, target)
self.log("test_loss", loss, on_epoch=True, on_step=True)
return loss
def on_test_epoch_end(self) -> None:
Path("on_test_epoch_end.txt").touch()
pass
class DummyRegression(DummyRegressionPlainLightning, InnerEyeInference):
def __init__(self, in_features: int = 1, *args, **kwargs) -> None: # type: ignore
super().__init__(in_features=in_features, *args, **kwargs) # type: ignore
self.l_rate = 1e-1
self.dataset_split = ModelExecutionMode.TRAIN
activation = Identity()
layers = [
torch.nn.Linear(in_features=in_features, out_features=1, bias=True),
activation
]
self.model = torch.nn.Sequential(*layers) # type: ignore
def forward(self, x: Tensor) -> Tensor: # type: ignore
return self.model(x)
def training_step(self, batch, *args, **kwargs) -> torch.Tensor: # type: ignore
input, target = batch
prediction = self.forward(input)
loss = torch.nn.functional.mse_loss(prediction, target)
self.log("loss", loss, on_epoch=True, on_step=True)
return loss
def on_inference_start(self) -> None:
Path("on_inference_start.txt").touch()
self.inference_mse: Dict[ModelExecutionMode, float] = {}
def on_inference_epoch_start(self, dataset_split: ModelExecutionMode, is_ensemble_model: bool) -> None:
self.dataset_split = dataset_split
Path(f"on_inference_start_{self.dataset_split.value}.txt").touch()
self.mse = MeanSquaredError()
def inference_step(self, item: Tuple[Tensor, Tensor], batch_idx: int, model_output: torch.Tensor) -> None:
input, target = item
prediction = self.forward(input)
self.mse(prediction, target)
with Path(f"inference_step_{self.dataset_split.value}.txt").open(mode="a") as f:
f.write(f"{prediction.item()},{target.item()}\n")
def on_inference_epoch_end(self) -> None:
Path(f"on_inference_end_{self.dataset_split.value}.txt").touch()
self.inference_mse[self.dataset_split] = self.mse.compute().item()
self.mse.reset()
def on_inference_end(self) -> None:
Path("on_inference_end.txt").touch()
df = pd.DataFrame(columns=["Split", "MSE"],
data=[[split.value, mse] for split, mse in self.inference_mse.items()])
df.to_csv("metrics_per_split.csv", index=False)
class FixedDataset(Dataset):
def __init__(self, inputs_and_targets: List[Tuple[Any, Any]]):
super().__init__()
self.inputs_and_targets = inputs_and_targets
def __len__(self) -> int:
return len(self.inputs_and_targets)
def __getitem__(self, item: int) -> Tuple[Tensor, Tensor]:
input = torch.tensor([float(self.inputs_and_targets[item][0])])
target = torch.tensor([float(self.inputs_and_targets[item][1])])
return input, target
class FixedRegressionData(LightningDataModule):
def __init__(self) -> None:
super().__init__()
self.train_data = [(i, i) for i in range(1, 20, 3)]
self.val_data = [(i, i) for i in range(2, 20, 3)]
self.test_data = [(i, i) for i in range(3, 20, 3)]
def train_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
return DataLoader(FixedDataset(self.train_data)) # type: ignore
def val_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
return DataLoader(FixedDataset(self.val_data)) # type: ignore
def test_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
return DataLoader(FixedDataset(self.test_data)) # type: ignore
class DummyContainerWithModel(LightningContainer):
def __init__(self) -> None:
super().__init__()
self.perform_training_set_inference = True
self.num_epochs = 50
self.l_rate = 1e-1
def setup(self) -> None:
assert self.local_dataset is not None
(self.local_dataset / "setup.txt").touch()
def create_model(self) -> LightningModule:
return DummyRegression()
def get_data_module(self) -> LightningDataModule:
return FixedRegressionData() # type: ignore
def create_report(self) -> None:
Path("create_report.txt").touch()
class DummyContainerWithInvalidTrainerArguments(LightningContainer):
def create_model(self) -> LightningModule:
return DummyRegression()
def get_trainer_arguments(self) -> Dict[str, Any]:
return {"no_such_argument": 1}
class DummyContainerWithPlainLightning(LightningContainer):
def __init__(self) -> None:
super().__init__()
self.num_epochs = 100
self.l_rate = 1e-2
def create_model(self) -> LightningModule:
return DummyRegressionPlainLightning()
def get_data_module(self) -> LightningDataModule:
return FixedRegressionData() # type: ignore


@ -21,7 +21,6 @@ from InnerEye.ML.deep_learning_config import TemperatureScalingConfig
from InnerEye.ML.lightning_models import transfer_batch_to_device
from InnerEye.ML.model_config_base import ModelTransformsPerExecutionMode
from InnerEye.ML.model_testing import create_metrics_dict_for_scalar_models
from InnerEye.ML.model_training import model_train
from InnerEye.ML.models.architectures.classification.image_encoder_with_mlp import ImageEncoder, ImagingFeatureType
from InnerEye.ML.models.architectures.sequential.rnn_classifier import RNNClassifier, RNNClassifierWithEncoder
from InnerEye.ML.run_ml import MLRunner
@ -34,7 +33,7 @@ from InnerEye.ML.utils.io_util import ImageAndSegmentations
from InnerEye.ML.utils.model_util import create_model_with_temperature_scaling, get_scalar_model_inputs_and_labels
from InnerEye.ML.utils.split_dataset import DatasetSplits
from InnerEye.ML.visualizers.grad_cam_hooks import VisualizationMaps
from Tests.ML.util import get_default_azure_config, get_default_checkpoint_handler
from Tests.ML.util import get_default_azure_config, model_train_unittest
SCAN_SIZE = (6, 64, 60)
@ -213,8 +212,7 @@ def test_rnn_classifier_via_config_1(use_combined_model: bool,
image_and_seg = ImageAndSegmentations[np.ndarray](images=np.random.uniform(0, 1, SCAN_SIZE),
segmentations=np.random.randint(0, 2, SCAN_SIZE))
with mock.patch('InnerEye.ML.utils.io_util.load_image_in_known_formats', return_value=image_and_seg):
model_train(config, get_default_checkpoint_handler(model_config=config,
project_root=test_output_dirs.root_dir))
model_train_unittest(config, dirs=test_output_dirs)
@pytest.mark.skipif(common_util.is_windows(), reason="Has issues on windows build")
@ -247,7 +245,7 @@ def test_run_ml_with_sequence_model(use_combined_model: bool,
with mock.patch('InnerEye.ML.utils.io_util.load_image_in_known_formats', return_value=image_and_seg):
azure_config = get_default_azure_config()
azure_config.train = True
MLRunner(config, azure_config).run()
MLRunner(config, azure_config=azure_config).run()
@pytest.mark.skipif(common_util.is_windows(), reason="Too slow on windows")
@ -379,8 +377,7 @@ def test_rnn_classifier_via_config_2(test_output_dirs: OutputFolderForTests) ->
config.num_epochs = 2
config.set_output_to(test_output_dirs.root_dir)
config.dataset_data_frame = _get_mock_sequence_dataset(dataset_contents)
results = model_train(config, get_default_checkpoint_handler(model_config=config,
project_root=test_output_dirs.root_dir))
results, _ = model_train_unittest(config, dirs=test_output_dirs)
actual_train_loss = results.get_metric(is_training=True, metric_type=MetricType.LOSS.value)[-1]
actual_val_loss = results.get_metric(is_training=False, metric_type=MetricType.LOSS.value)[-1]
@ -455,7 +452,7 @@ def test_run_ml_with_multi_label_sequence_model(test_output_dirs: OutputFolderFo
config.max_batch_grad_cam = 1
azure_config = get_default_azure_config()
azure_config.train = True
MLRunner(config, azure_config).run()
MLRunner(config, azure_config=azure_config).run()
# The metrics file should have one entry per epoch per subject per prediction target,
# for all the 3 prediction targets.
metrics_file = config.outputs_folder / "Train" / SUBJECT_METRICS_FILE_NAME


@ -20,7 +20,6 @@ from InnerEye.Common.type_annotations import TupleInt3
from InnerEye.ML.dataset.scalar_dataset import ScalarDataset
from InnerEye.ML.lightning_models import transfer_batch_to_device
from InnerEye.ML.model_config_base import ModelTransformsPerExecutionMode
from InnerEye.ML.model_training import model_train
from InnerEye.ML.models.architectures.classification.image_encoder_with_mlp import ImageEncoderWithMlp, \
ImagingFeatureType
from InnerEye.ML.run_ml import MLRunner
@ -34,7 +33,7 @@ from InnerEye.ML.utils.model_util import create_model_with_temperature_scaling,
from InnerEye.ML.utils.split_dataset import DatasetSplits
from InnerEye.ML.visualizers.grad_cam_hooks import VisualizationMaps
from InnerEye.ML.visualizers.model_summary import ModelSummary
from Tests.ML.util import get_default_azure_config, get_default_checkpoint_handler
from Tests.ML.util import get_default_azure_config, model_train_unittest
class ImageEncoder(ScalarModelBase):
@ -222,8 +221,7 @@ S3,week1,scan3.npy,True,6,60,Male,Val2
summarizer.generate_summary(input_sizes=input_size)
config.local_dataset = dataset_folder
config.validate()
model_train(config, checkpoint_handler=get_default_checkpoint_handler(model_config=config,
project_root=Path(test_output_dirs.root_dir)))
model_train_unittest(config, dirs=test_output_dirs)
# No further asserts here because the models are still in experimental state. Most errors would come
# from having invalid model architectures, which would throw runtime errors during training.
@ -231,13 +229,13 @@ S3,week1,scan3.npy,True,6,60,Male,Val2
@pytest.mark.skipif(common_util.is_windows(), reason="Too slow on windows")
@pytest.mark.gpu
@pytest.mark.parametrize(["encode_channels_jointly", "aggregation_type", "imaging_feature_type"],
[(False, AggregationType.Average, ImagingFeatureType.Segmentation),
(True, AggregationType.Average, ImagingFeatureType.Segmentation),
(False, AggregationType.Average, ImagingFeatureType.ImageAndSegmentation),
(True, AggregationType.Average, ImagingFeatureType.ImageAndSegmentation),
(True, AggregationType.GatedPooling, ImagingFeatureType.ImageAndSegmentation),
(True, AggregationType.MixPooling, ImagingFeatureType.ImageAndSegmentation),
(True, AggregationType.ZAdaptive3dAvg, ImagingFeatureType.ImageAndSegmentation)])
[(False, AggregationType.Average, ImagingFeatureType.Segmentation),
(True, AggregationType.Average, ImagingFeatureType.Segmentation),
(False, AggregationType.Average, ImagingFeatureType.ImageAndSegmentation),
(True, AggregationType.Average, ImagingFeatureType.ImageAndSegmentation),
(True, AggregationType.GatedPooling, ImagingFeatureType.ImageAndSegmentation),
(True, AggregationType.MixPooling, ImagingFeatureType.ImageAndSegmentation),
(True, AggregationType.ZAdaptive3dAvg, ImagingFeatureType.ImageAndSegmentation)])
def test_image_encoder_with_segmentation(test_output_dirs: OutputFolderForTests,
encode_channels_jointly: bool,
aggregation_type: AggregationType,
@ -274,7 +272,7 @@ def test_image_encoder_with_segmentation(test_output_dirs: OutputFolderForTests,
with mock.patch('InnerEye.ML.utils.io_util.load_image_in_known_formats', return_value=image_and_seg):
azure_config = get_default_azure_config()
azure_config.train = True
MLRunner(config, azure_config).run()
MLRunner(config, azure_config=azure_config).run()
# No further asserts here because the models are still in experimental state. Most errors would come
# from having invalid model architectures, which would throw runtime errors during training.
# Verified manually that the cross entropy on the Val set that appears during training, and the


@ -11,13 +11,11 @@ import pytest
from InnerEye.Common import common_util
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.common import DATASET_CSV_FILE_NAME
from InnerEye.ML.model_training import model_train
from InnerEye.ML.models.architectures.classification.image_encoder_with_mlp import create_mlp
from InnerEye.ML.run_ml import MLRunner
from InnerEye.ML.scalar_config import ScalarLoss, ScalarModelBase
from InnerEye.ML.utils.split_dataset import DatasetSplits
from Tests.ML.util import get_default_checkpoint_handler
from Tests.ML.util import model_train_unittest
class NonImageEncoder(ScalarModelBase):
@ -73,11 +71,11 @@ def test_non_image_encoder(test_output_dirs: OutputFolderForTests,
config.max_batch_grad_cam = 1
config.validate()
# run model training
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
project_root=Path(test_output_dirs.root_dir))
model_train(config, checkpoint_handler=checkpoint_handler)
_, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
# run model inference
MLRunner(config).model_inference_train_and_test(checkpoint_handler=checkpoint_handler)
runner = MLRunner(config)
runner.setup()
runner.model_inference_train_and_test(checkpoint_handler=checkpoint_handler)
assert config.get_total_number_of_non_imaging_features() == 18


@ -4,18 +4,17 @@
# ------------------------------------------------------------------------------------------
import logging
from pathlib import Path
import pytest
from InnerEye.Common.common_util import logging_to_stdout
from InnerEye.Common.metrics_constants import MetricType
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML import model_testing, model_training
from InnerEye.ML import model_testing
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.metrics import InferenceMetricsForClassification
from Tests.ML.configs.ClassificationModelForTesting2D import ClassificationModelForTesting2D
from Tests.ML.util import get_default_checkpoint_handler
from Tests.ML.util import model_train_unittest
@pytest.mark.parametrize("use_mixed_precision", [False])
@ -31,10 +30,7 @@ def test_train_2d_classification_model(test_output_dirs: OutputFolderForTests,
# Train for 4 epochs, checkpoints at epochs 2 and 4
config.num_epochs = 4
config.use_mixed_precision = use_mixed_precision
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
project_root=Path(test_output_dirs.root_dir))
model_training_result = model_training.model_train(config, checkpoint_handler=checkpoint_handler)
model_training_result, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
assert model_training_result is not None
expected_learning_rates = [0.0001, 9.99971e-05, 9.99930e-05, 9.99861e-05]


@ -4,16 +4,20 @@
# ------------------------------------------------------------------------------------------
import logging
from typing import List
from unittest import mock
import pytest
from azureml.core import Run
from InnerEye.Common.common_util import logging_to_stdout, namespace_to_path
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.model_training import generate_and_print_model_summary
from InnerEye.ML.utils.config_util import ModelConfigLoader
from InnerEye.ML.utils.model_util import create_model_with_temperature_scaling
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.lightning_container import LightningContainer
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.utils.model_util import create_model_with_temperature_scaling, generate_and_print_model_summary
from Tests.ML.configs.DummyModel import DummyModel
from Tests.ML.util import get_model_loader
from Tests.ML.configs.lightning_test_containers import DummyContainerWithInvalidTrainerArguments, \
DummyContainerWithParameters
from Tests.ML.util import default_runner, get_model_loader, model_loader_including_tests, model_train_unittest
def find_models() -> List[str]:
@ -48,7 +52,7 @@ def test_load_all_configs(model_name: str) -> None:
"""
logger = logging.getLogger()
logger.setLevel(logging.INFO)
config = ModelConfigLoader[SegmentationModelBase]().create_model_config_from_name(model_name)
config = ModelConfigLoader().create_model_config_from_name(model_name)
assert config.model_name == model_name, "Mismatch between definition .py file and model name"
if config.is_segmentation_model:
# Reduce the feature channels to a minimum, to make tests run fast on CPU.
@ -98,12 +102,90 @@ def test_config_loader_as_in_registration() -> None:
During model registration, the model config namespace is read out from the present model. Ensure that we
can create a config loader that has that value as an input.
"""
loader1 = ModelConfigLoader[SegmentationModelBase]()
loader1 = ModelConfigLoader()
model_name = "BasicModel2Epochs"
model = loader1.create_model_config_from_name(model_name)
assert model is not None
namespace = model.__module__
loader2 = ModelConfigLoader[SegmentationModelBase](model_configs_namespace=namespace)
loader2 = ModelConfigLoader(model_configs_namespace=namespace)
assert len(loader2.module_search_specs) == 2
model2 = loader2.create_model_config_from_name(model_name)
assert model2 is not None
def test_config_loader_on_lightning_container() -> None:
"""
Test if the config loader can load a model that is neither classification nor segmentation.
"""
# First test if the container can be instantiated at all (it is tricky to get that right when the inheritance changes)
DummyContainerWithParameters()
logging_to_stdout(log_level=logging.DEBUG)
model = model_loader_including_tests.create_model_config_from_name("DummyContainerWithParameters")
assert model is not None
@pytest.mark.parametrize("container_name", ["DummyContainerWithAzureDataset",
"DummyContainerWithoutDataset",
"DummyContainerWithLocalDataset",
"DummyContainerWithAzureAndLocalDataset"])
def test_submit_container_to_azureml(container_name: str) -> None:
"""
Test if we can get the config loader to load a Lightning container model, and get it through the AzureML
submission process.
"""
runner = default_runner()
mock_run = Run.get_context()
args = ["", f"--model={container_name}", "--azureml=True", "--model_configs_namespace=Tests.ML.configs"]
with mock.patch("sys.argv", args):
with mock.patch("InnerEye.Azure.azure_runner.get_dataset_consumption", return_value=None):
with mock.patch("azureml.core.Experiment.submit", return_value=mock_run):
loaded_config, actual_run = runner.run()
assert actual_run == mock_run
assert loaded_config is None
assert isinstance(runner.lightning_container, LightningContainer)
def test_load_container_with_arguments() -> None:
"""
Test if we can load a container and override a value in it via the commandline. Parameters can only be set at
container level, not at model level.
"""
DummyContainerWithParameters()
runner = default_runner()
args = ["", "--model=DummyContainerWithParameters", "--container_param=param1",
"--model_configs_namespace=Tests.ML.configs"]
with mock.patch("sys.argv", args):
runner.parse_and_load_model()
assert isinstance(runner.lightning_container, DummyContainerWithParameters)
assert runner.lightning_container.container_param == "param1"
# Overriding model parameters should not work
args = ["", "--model=DummyContainerWithParameters", "--model_param=param2",
"--model_configs_namespace=Tests.ML.configs"]
with pytest.raises(ValueError) as ex:
with mock.patch("sys.argv", args):
runner.parse_and_load_model()
assert "model_param" in str(ex)
def test_load_invalid_container() -> None:
"""
Test that loading a container fails if one of the parameters is not valid.
"""
DummyContainerWithParameters()
runner = default_runner()
args = ["", "--model=DummyContainerWithParameters", "--number_of_cross_validation_splits=1",
"--model_configs_namespace=Tests.ML.configs"]
with pytest.raises(ValueError) as ex:
with mock.patch("sys.argv", args):
runner.parse_and_load_model()
assert "At least two splits required to perform cross validation, but got 1" in str(ex)
def test_run_model_with_invalid_trainer_arguments(test_output_dirs: OutputFolderForTests) -> None:
"""
Test that the trainer_arguments in a LightningContainer are passed to the trainer, by checking that an invalid argument raises an error.
"""
container = DummyContainerWithInvalidTrainerArguments()
with pytest.raises(Exception) as ex:
model_train_unittest(config=None, dirs=test_output_dirs, lightning_container=container)
assert "no_such_argument" in str(ex)


@ -16,10 +16,11 @@ import torch
from InnerEye.Common import common_util, fixed_paths
from InnerEye.Common.common_util import BEST_EPOCH_FOLDER_NAME, CROSSVAL_RESULTS_FOLDER, EPOCH_METRICS_FILE_NAME, \
METRICS_AGGREGATES_FILE, SUBJECT_METRICS_FILE_NAME, get_best_epoch_results_path, logging_to_stdout
from InnerEye.Common.fixed_paths import LOG_FILE_NAME
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.metrics_constants import LoggingColumns, MetricType
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML import model_testing, model_training, runner
from InnerEye.ML import model_testing, runner
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.configs.classification.DummyMulticlassClassification import DummyMulticlassClassification
from InnerEye.ML.dataset.scalar_dataset import ScalarDataset
@ -30,12 +31,13 @@ from InnerEye.ML.reports.notebook_report import generate_classification_multilab
generate_classification_notebook, get_html_report_name, get_ipynb_report_name
from InnerEye.ML.run_ml import MLRunner
from InnerEye.ML.scalar_config import ScalarLoss, ScalarModelBase
from InnerEye.ML.utils.config_util import ModelConfigLoader
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.visualizers.plot_cross_validation import EpochMetricValues, get_config_and_results_for_offline_runs, \
unroll_aggregate_metrics
from Tests.ML.configs.ClassificationModelForTesting import ClassificationModelForTesting
from Tests.ML.configs.DummyModel import DummyModel
from Tests.ML.util import get_default_azure_config, get_default_checkpoint_handler, machine_has_gpu
from Tests.ML.util import get_default_azure_config, machine_has_gpu, \
model_train_unittest
@pytest.mark.cpu_and_gpu
@ -50,20 +52,20 @@ def test_train_classification_model(class_name: str, test_output_dirs: OutputFol
config = ClassificationModelForTesting()
config.class_names = [class_name]
config.set_output_to(test_output_dirs.root_dir)
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
project_root=Path(test_output_dirs.root_dir))
# Train for 4 epochs, checkpoints at epochs 2 and 4
config.num_epochs = 4
model_training_result = model_training.model_train(config, checkpoint_handler=checkpoint_handler)
model_training_result, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
assert model_training_result is not None
expected_learning_rates = [0.0001, 9.99971e-05, 9.99930e-05, 9.99861e-05]
expected_train_loss = [0.686614, 0.686465, 0.686316, 0.686167]
expected_val_loss = [0.737061, 0.736691, 0.736321, 0.735952]
# Ensure that all metrics are computed on both training and validation set
assert len(model_training_result.train_results_per_epoch) == config.num_epochs
assert len(model_training_result.val_results_per_epoch) == config.num_epochs
assert len(model_training_result.train_results_per_epoch[0]) >= 11
assert len(model_training_result.val_results_per_epoch[0]) >= 11
train_results_per_epoch = model_training_result.train_results_per_epoch()
val_results_per_epoch = model_training_result.val_results_per_epoch()
assert len(train_results_per_epoch) == config.num_epochs
assert len(val_results_per_epoch) == config.num_epochs
assert len(train_results_per_epoch[0]) >= 11
assert len(val_results_per_epoch[0]) >= 11
for metric in [MetricType.ACCURACY_AT_THRESHOLD_05,
MetricType.ACCURACY_AT_OPTIMAL_THRESHOLD,
@ -74,10 +76,8 @@ def test_train_classification_model(class_name: str, test_output_dirs: OutputFol
MetricType.SECONDS_PER_BATCH,
MetricType.SECONDS_PER_EPOCH,
MetricType.SUBJECT_COUNT]:
assert metric.value in model_training_result.train_results_per_epoch[0], \
f"{metric.value} not in training"
assert metric.value in model_training_result.val_results_per_epoch[0], \
f"{metric.value} not in validation"
assert metric.value in train_results_per_epoch[0], f"{metric.value} not in training"
assert metric.value in val_results_per_epoch[0], f"{metric.value} not in validation"
actual_train_loss = model_training_result.get_metric(is_training=True, metric_type=MetricType.LOSS.value)
actual_val_loss = model_training_result.get_metric(is_training=False, metric_type=MetricType.LOSS.value)
@ -144,6 +144,7 @@ def test_train_classification_model(class_name: str, test_output_dirs: OutputFol
"""
check_log_file(inference_metrics_path, inference_metrics_expected, ignore_columns=[])
@pytest.mark.skipif(common_util.is_windows(), reason="Has OOM issues on windows build")
@pytest.mark.cpu_and_gpu
def test_train_classification_multilabel_model(test_output_dirs: OutputFolderForTests) -> None:
@ -155,35 +156,33 @@ def test_train_classification_multilabel_model(test_output_dirs: OutputFolderFor
logging_to_stdout(logging.DEBUG)
config = DummyMulticlassClassification()
config.set_output_to(test_output_dirs.root_dir)
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
project_root=Path(test_output_dirs.root_dir))
# Train for 4 epochs, checkpoints at epochs 2 and 4
config.num_epochs = 4
model_training_result = model_training.model_train(config, checkpoint_handler=checkpoint_handler)
model_training_result, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
assert model_training_result is not None
expected_learning_rates = [0.0001, 9.99971e-05, 9.99930e-05, 9.99861e-05]
expected_train_loss = [0.699870228767395, 0.6239662170410156, 0.551329493522644, 0.4825132489204407]
expected_val_loss = [0.6299371719360352, 0.5546272993087769, 0.4843321740627289, 0.41909298300743103]
# Ensure that all metrics are computed on both training and validation set
assert len(model_training_result.train_results_per_epoch) == config.num_epochs
assert len(model_training_result.val_results_per_epoch) == config.num_epochs
assert len(model_training_result.train_results_per_epoch[0]) >= 11
assert len(model_training_result.val_results_per_epoch[0]) >= 11
train_results_per_epoch = model_training_result.train_results_per_epoch()
val_results_per_epoch = model_training_result.val_results_per_epoch()
assert len(train_results_per_epoch) == config.num_epochs
assert len(val_results_per_epoch) == config.num_epochs
assert len(train_results_per_epoch[0]) >= 11
assert len(val_results_per_epoch[0]) >= 11
for class_name in config.class_names:
for metric in [MetricType.ACCURACY_AT_THRESHOLD_05,
MetricType.ACCURACY_AT_OPTIMAL_THRESHOLD,
MetricType.AREA_UNDER_PR_CURVE,
MetricType.AREA_UNDER_ROC_CURVE,
MetricType.CROSS_ENTROPY]:
assert f'{metric.value}/{class_name}' in model_training_result.train_results_per_epoch[
0], f"{metric.value} not in training"
assert f'{metric.value}/{class_name}' in model_training_result.val_results_per_epoch[
0], f"{metric.value} not in validation"
assert f'{metric.value}/{class_name}' in train_results_per_epoch[0], f"{metric.value} not in training"
assert f'{metric.value}/{class_name}' in val_results_per_epoch[0], f"{metric.value} not in validation"
for metric in [MetricType.LOSS,
MetricType.SECONDS_PER_EPOCH,
MetricType.SUBJECT_COUNT]:
assert metric.value in model_training_result.train_results_per_epoch[0], f"{metric.value} not in training"
assert metric.value in model_training_result.val_results_per_epoch[0], f"{metric.value} not in validation"
assert metric.value in train_results_per_epoch[0], f"{metric.value} not in training"
assert metric.value in val_results_per_epoch[0], f"{metric.value} not in validation"
actual_train_loss = model_training_result.get_metric(is_training=True, metric_type=MetricType.LOSS.value)
actual_val_loss = model_training_result.get_metric(is_training=False, metric_type=MetricType.LOSS.value)
@ -265,13 +264,12 @@ def test_run_ml_with_classification_model(test_output_dirs: OutputFolderForTests
logging_to_stdout()
azure_config = get_default_azure_config()
azure_config.train = True
config: ScalarModelBase = ModelConfigLoader[ScalarModelBase]() \
.create_model_config_from_name(model_name)
config: ScalarModelBase = ModelConfigLoader().create_model_config_from_name(model_name)
config.number_of_cross_validation_splits = number_of_offline_cross_validation_splits
config.set_output_to(test_output_dirs.root_dir)
# Trying to run DDP from the test suite hangs, hence restrict to single GPU.
config.max_num_gpus = 1
MLRunner(config, azure_config).run()
MLRunner(config, azure_config=azure_config).run()
_check_offline_cross_validation_output_files(config)
if config.perform_cross_validation:
@ -306,7 +304,7 @@ def test_run_ml_with_segmentation_model(test_output_dirs: OutputFolderForTests)
config.set_output_to(test_output_dirs.root_dir)
azure_config = get_default_azure_config()
azure_config.train = True
MLRunner(config, azure_config).run()
MLRunner(config, azure_config=azure_config).run()
@pytest.mark.skipif(common_util.is_windows(), reason="Has OOM issues on windows build")
@ -319,7 +317,7 @@ def test_runner1(test_output_dirs: OutputFolderForTests) -> None:
set_from_commandline = 12345
scalar1 = '["label"]'
model_name = "DummyClassification"
initial_config = ModelConfigLoader[ScalarModelBase]().create_model_config_from_name(model_name)
initial_config = ModelConfigLoader().create_model_config_from_name(model_name)
assert initial_config.non_image_feature_channels == []
output_root = str(test_output_dirs.root_dir)
args = ["",
@ -338,7 +336,7 @@ def test_runner1(test_output_dirs: OutputFolderForTests) -> None:
assert config.get_effective_random_seed() == set_from_commandline
assert config.non_image_feature_channels == ["label"]
assert str(config.outputs_folder).startswith(output_root)
assert (config.logs_folder / runner.LOG_FILE_NAME).exists()
assert (config.logs_folder / LOG_FILE_NAME).exists()
@pytest.mark.skipif(common_util.is_windows(), reason="Has OOM issues on windows build")
@ -456,7 +454,9 @@ def _compute_scalar_metrics(output_values_list: List[List[float]],
def test_is_offline_cross_val_parent_run(offline_parent_cv_run: bool) -> None:
train_config = DummyModel()
train_config.number_of_cross_validation_splits = 2 if offline_parent_cv_run else 0
assert MLRunner(train_config).is_offline_cross_val_parent_run() == offline_parent_cv_run
runner = MLRunner(train_config)
runner.setup()
assert runner.is_offline_cross_val_parent_run() == offline_parent_cv_run
def _check_offline_cross_validation_output_files(train_config: ScalarModelBase) -> None:
@ -487,12 +487,15 @@ def _check_offline_cross_validation_output_files(train_config: ScalarModelBase)
_dataset_splits.train[train_config.subject_column].unique())
_test_dataset_split_count = len(_dataset_splits.test[train_config.subject_column].unique())
_aggregates_csv = pd.read_csv(aggregate_metrics_path)
_aggregates_csv_test = _aggregates_csv.loc[_aggregates_csv[LoggingColumns.DataSplit.value] == ModelExecutionMode.TEST.value]
_aggregates_csv_train_val = _aggregates_csv.loc[_aggregates_csv[LoggingColumns.DataSplit.value] != ModelExecutionMode.TEST.value]
_aggregates_csv_test = _aggregates_csv.loc[
_aggregates_csv[LoggingColumns.DataSplit.value] == ModelExecutionMode.TEST.value]
_aggregates_csv_train_val = _aggregates_csv.loc[
_aggregates_csv[LoggingColumns.DataSplit.value] != ModelExecutionMode.TEST.value]
_counts_for_splits_train_val = list(_aggregates_csv_train_val[LoggingColumns.SubjectCount.value])
_counts_for_splits_test = list(_aggregates_csv_test[LoggingColumns.SubjectCount.value])
assert all([x == _val_dataset_split_count for x in _counts_for_splits_train_val])
assert all([x == _test_dataset_split_count * train_config.number_of_cross_validation_splits for x in _counts_for_splits_test])
assert all([x == _test_dataset_split_count * train_config.number_of_cross_validation_splits for x in
_counts_for_splits_test])
_epochs = list(_aggregates_csv_train_val[LoggingColumns.Epoch.value].astype(int))
# Each epoch is recorded twice: once for the training split and once for the validation split

Просмотреть файл

@ -11,8 +11,8 @@ from InnerEye.Common.common_util import is_windows
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.Common.type_annotations import TupleInt3
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.lightning_helpers import create_lightning_model, load_from_checkpoint_and_adjust_for_inference
from InnerEye.ML.lightning_models import SegmentationLightning
from InnerEye.ML.lightning_helpers import load_from_checkpoint_and_adjust_for_inference
from InnerEye.ML.lightning_models import SegmentationLightning, create_lightning_model
from InnerEye.ML.pipelines.inference import InferencePipeline
from InnerEye.ML.utils import image_util
from Tests.ML.utils.test_model_util import create_model_and_store_checkpoint

Просмотреть файл

@ -12,8 +12,7 @@ from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.Common.type_annotations import TupleInt3
from InnerEye.ML.dataset.sample import GeneralSampleMetadata
from InnerEye.ML.dataset.scalar_sample import ScalarItem
from InnerEye.ML.lightning_helpers import create_lightning_model
from InnerEye.ML.lightning_models import ScalarLightning
from InnerEye.ML.lightning_models import ScalarLightning, create_lightning_model
from InnerEye.ML.models.architectures.base_model import DeviceAwareModule
from InnerEye.ML.pipelines.scalar_inference import ScalarEnsemblePipeline, ScalarInferencePipeline, \
ScalarInferencePipelineBase

Просмотреть файл

@ -1,30 +0,0 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import argparse
from InnerEye.ML.model_config_base import ModelConfigBase
from Tests.ML.util import get_model_loader
MODEL_NAME = "DummyModelWithOverrideGroups"
LOADER = get_model_loader("Tests.ML.configs")
def test_script_params_override() -> None:
# these are the parameters from the command line that should override
# the initial parameters
parser = argparse.ArgumentParser()
parser.add_argument("--l_rate",
help="The name of the model to train/test.",
type=float,
default=1.0)
args = parser.parse_args("")
try:
config: ModelConfigBase = LOADER.create_model_config_from_name(model_name=MODEL_NAME, overrides=vars(args))
# check that the values were changed
assert config.l_rate == args.l_rate
except ValueError:
# (Temporarily) handle the case where there is no Lung config.
pass

Просмотреть файл

@ -7,10 +7,13 @@ from typing import List, Optional, Union
import pytest
import torch
from pandas import DataFrame
from InnerEye.Azure.azure_util import CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.config import ModelArchitectureConfig, SegmentationModelBase, equally_weighted_classes
from InnerEye.ML.deep_learning_config import DeepLearningConfig
from InnerEye.ML.models.architectures.base_model import BaseSegmentationModel
from InnerEye.ML.scalar_config import ScalarModelBase
from InnerEye.ML.utils import ml_util
@ -127,6 +130,35 @@ def test_equally_weighted_classes_fails(num_fg_clases: int, background_weight: O
equally_weighted_classes(classes, background_weight)
def test_fields_are_set() -> None:
"""
Tests that expected fields are set when creating config classes.
"""
expected = [("hello", None), ("world", None)]
config = SegmentationModelBase(
should_validate=False,
ground_truth_ids=[x[0] for x in expected],
largest_connected_component_foreground_classes=expected
)
assert hasattr(config, CROSS_VALIDATION_SPLIT_INDEX_TAG_KEY)
assert config.largest_connected_component_foreground_classes == expected
@pytest.mark.cpu_and_gpu
def test_dataset_reader_workers() -> None:
"""
Test to make sure the number of dataset reader workers is set correctly
"""
config = ScalarModelBase(
should_validate=False,
num_dataset_reader_workers=-1
)
if config.is_offline_run:
assert config.num_dataset_reader_workers == -1
else:
assert config.num_dataset_reader_workers == 0
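# Editorial sketch, not part of the diff: the behaviour the test above pins down, expressed as a
# hypothetical standalone helper. Inside AzureML the number of dataset reader workers is forced
# to 0, while offline runs keep the user-provided value.
def clamp_num_dataset_reader_workers(requested: int, is_offline_run: bool) -> int:
    """Return the number of dataloader worker processes to use for this run."""
    return requested if is_offline_run else 0

assert clamp_num_dataset_reader_workers(-1, is_offline_run=True) == -1
assert clamp_num_dataset_reader_workers(-1, is_offline_run=False) == 0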
def create_dataset_csv(test_output_dirs: OutputFolderForTests) -> Path:
"""Create dummy dataset csv file for tests,
deleting any pre-existing file."""
@ -176,34 +208,46 @@ def test_dataset_csv_with_ScalarModelBase(
assert model_config.dataset_data_frame is not None
validate_dataset_paths(model_config)
def test_unet3_num_downsampling_paths() -> None:
for num_downsampling_paths in range(1, 5):
j = int(2**num_downsampling_paths)
j = int(2 ** num_downsampling_paths)
# Test that num_downsampling_paths for the built UNet3D
# is set via the model configuration
crop_size = (j, j, j)
config = SegmentationModelBase(
architecture=ModelArchitectureConfig.UNet3D,
image_channels=["ct"],
feature_channels=[1],
crop_size=crop_size,
num_downsampling_paths=num_downsampling_paths,
should_validate=False)
architecture=ModelArchitectureConfig.UNet3D,
image_channels=["ct"],
feature_channels=[1],
crop_size=crop_size,
num_downsampling_paths=num_downsampling_paths,
should_validate=False)
network = build_net(config)
assert network.num_downsampling_paths == num_downsampling_paths
# Test that exception is raised if crop size is smaller than is allowed
# by num_downsampling_paths
too_small_crop_size = (j//2, j//2, j//2)
too_small_crop_size = (j // 2, j // 2, j // 2)
ex_msg = f"Crop size is not valid. The required minimum is {crop_size}"
config = SegmentationModelBase(
architecture=ModelArchitectureConfig.UNet3D,
image_channels=["ct"],
feature_channels=[1],
crop_size=too_small_crop_size,
num_downsampling_paths=num_downsampling_paths,
should_validate=False)
architecture=ModelArchitectureConfig.UNet3D,
image_channels=["ct"],
feature_channels=[1],
crop_size=too_small_crop_size,
num_downsampling_paths=num_downsampling_paths,
should_validate=False)
with pytest.raises(ValueError) as ex:
network = build_net(config)
build_net(config)
assert ex_msg in str(ex)
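# Editorial sketch, not part of the diff: the constraint exercised above. With N downsampling
# paths, a UNet3D halves the spatial resolution N times, so every crop dimension must be at
# least 2 ** N.
def minimum_crop_edge(num_downsampling_paths: int) -> int:
    return 2 ** num_downsampling_paths

assert minimum_crop_edge(1) == 2
assert minimum_crop_edge(4) == 16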
def test_config_str() -> None:
"""
Check if dataframe fields are omitted from the string conversion of a config object.
"""
config = DeepLearningConfig()
df = DataFrame(columns=["foobar"], data=[1.0, 2.0])
config.dataset_data_frame = df
s = str(config)
assert "foobar" not in s, f"Incorrect output: {s}"

Просмотреть файл

@ -0,0 +1,100 @@
5.749202728271484375e+00,1.045434713363647461e+00
7.637505531311035156e+00,1.481036424636840820e+00
9.329424858093261719e+00,1.917572140693664551e+00
7.494448661804199219e+00,1.591190218925476074e+00
9.689485549926757812e+00,1.971193432807922363e+00
1.534464955329895020e+00,2.667523026466369629e-01
9.561051368713378906e+00,1.685984492301940918e+00
8.846873283386230469e+00,1.842781662940979004e+00
5.290946006774902344e+00,1.096875548362731934e+00
6.057560443878173828e+00,1.215266227722167969e+00
9.478215217590332031e+00,2.150293827056884766e+00
3.349403142929077148e+00,6.767664551734924316e-01
4.251931190490722656e+00,8.223311901092529297e-01
9.133169174194335938e+00,1.817616820335388184e+00
5.868637561798095703e-01,7.238323986530303955e-02
4.395127296447753906e-02,-1.203215569257736206e-01
4.473074913024902344e+00,1.031657218933105469e+00
9.513977050781250000e+00,1.858699083328247070e+00
1.762510538101196289e+00,5.511313676834106445e-01
3.059309720993041992e+00,7.563391923904418945e-01
1.646029353141784668e+00,1.496532708406448364e-01
4.897529602050781250e+00,1.009216904640197754e+00
5.573034763336181641e+00,1.063154220581054688e+00
6.369268894195556641e-01,1.111971661448478699e-01
4.231463432312011719e+00,7.547589540481567383e-01
2.101852893829345703e+00,4.300535917282104492e-01
6.729665279388427734e+00,1.531016945838928223e+00
7.961721420288085938e+00,1.478816986083984375e+00
2.669513702392578125e+00,5.368775129318237305e-01
4.574956893920898438e+00,8.576059341430664062e-01
7.437694072723388672e-01,2.781907916069030762e-01
5.983660221099853516e+00,1.187077164649963379e+00
9.752596855163574219e+00,2.027858018875122070e+00
9.338418960571289062e+00,2.025038957595825195e+00
6.751420497894287109e+00,1.303008437156677246e+00
9.554377555847167969e+00,1.930274367332458496e+00
3.040063381195068359e-01,1.029240339994430542e-02
2.818799018859863281e-01,2.614871561527252197e-01
5.994813919067382812e+00,1.070150852203369141e+00
5.795848369598388672e-01,-4.384742677211761475e-02
3.211182355880737305e+00,5.655854344367980957e-01
8.615511894226074219e+00,1.730698943138122559e+00
4.950296401977539062e+00,9.849137663841247559e-01
1.632133126258850098e+00,4.643072187900543213e-01
5.252981662750244141e+00,1.020202517509460449e+00
6.792118072509765625e+00,1.392252922058105469e+00
2.290313720703125000e+00,2.924301028251647949e-01
3.329365253448486328e+00,8.425519466400146484e-01
3.469936370849609375e+00,6.026793718338012695e-01
8.790910243988037109e-02,-2.861313149333000183e-02
3.999347686767578125e+00,7.818984389305114746e-01
2.891576290130615234e-01,1.004043519496917725e-01
5.804258346557617188e+00,1.216606140136718750e+00
3.836791992187500000e+00,9.209365844726562500e-01
5.516016006469726562e+00,1.077136993408203125e+00
7.987973213195800781e+00,1.436331987380981445e+00
8.790102958679199219e+00,1.966201663017272949e+00
7.359976291656494141e+00,1.414163231849670410e+00
3.334070444107055664e+00,6.647097468376159668e-01
6.998687744140625000e+00,1.236205577850341797e+00
4.366195201873779297e+00,8.169019818305969238e-01
5.620658397674560547e-02,1.258963048458099365e-01
4.941163539886474609e+00,8.424454331398010254e-01
5.723971366882324219e+00,1.067836642265319824e+00
6.601081371307373047e+00,1.462573528289794922e+00
2.285490512847900391e+00,3.970748484134674072e-01
3.233198642730712891e+00,5.250911116600036621e-01
8.005992889404296875e+00,1.783252477645874023e+00
9.988401412963867188e+00,2.108428716659545898e+00
3.176209926605224609e+00,6.435566544532775879e-01
8.746008872985839844e+00,1.664346933364868164e+00
1.034811139106750488e+00,7.043153047561645508e-02
4.228623390197753906e+00,7.815699577331542969e-01
8.936402320861816406e+00,1.719643831253051758e+00
6.312811374664306641e+00,1.115693926811218262e+00
2.998808622360229492e+00,6.670392155647277832e-01
4.147662162780761719e+00,8.732877969741821289e-01
7.289369106292724609e+00,1.516813516616821289e+00
7.059363842010498047e+00,1.442374944686889648e+00
8.922320365905761719e+00,1.986880540847778320e+00
2.708734989166259766e+00,5.354607105255126953e-01
7.997574329376220703e+00,1.465035080909729004e+00
3.965347290039062500e+00,8.214159011840820312e-01
6.266443729400634766e+00,1.197527050971984863e+00
1.101133823394775391e+00,6.622195243835449219e-02
5.336141586303710938e+00,9.389448165893554688e-01
9.122401237487792969e+00,1.791326642036437988e+00
5.871895790100097656e+00,1.136480450630187988e+00
7.560163497924804688e+00,1.564274787902832031e+00
9.699577093124389648e-01,3.698585033416748047e-01
6.118352413177490234e+00,1.192414522171020508e+00
2.855588197708129883e+00,7.185647487640380859e-01
7.776415348052978516e-01,1.955290585756301880e-01
5.909432888031005859e+00,1.285851240158081055e+00
1.471748352050781250e+00,4.017572104930877686e-01
7.064949989318847656e+00,1.341601967811584473e+00
4.807097911834716797e+00,1.043601632118225098e+00
6.570946693420410156e+00,1.190045952796936035e+00
4.360112667083740234e+00,7.959681153297424316e-01
9.832940101623535156e+00,1.984794259071350098e+00

Просмотреть файл

@ -3,21 +3,27 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import logging
import shutil
from pathlib import Path
from typing import List
from typing import Any, List, Optional
from unittest import mock
import pytest
from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.Azure.azure_util import get_results_blob_path
from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import OTHER_RUNS_SUBDIR_NAME, logging_section, logging_to_stdout
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.common import DATASET_CSV_FILE_NAME
from InnerEye.ML.deep_learning_config import DeepLearningConfig
from InnerEye.ML.lightning_container import LightningContainer
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.run_ml import MLRunner
from InnerEye.ML.utils.run_recovery import RunRecovery
from Tests.AfterTraining.test_after_training import FALLBACK_ENSEMBLE_RUN, FALLBACK_SINGLE_RUN, get_most_recent_run
from Tests.ML.configs.DummyModel import DummyModel
from Tests.ML.configs.lightning_test_containers import DummyContainerWithDatasets
from Tests.ML.util import get_default_azure_config
logging_to_stdout(logging.DEBUG)
@ -35,10 +41,6 @@ def runner_config() -> AzureConfig:
return config
def test_get_results_blob_path() -> None:
assert get_results_blob_path("some_run_id") == "azureml/ExperimentRun/dcid.some_run_id"
def check_single_checkpoint(downloaded_checkpoints: List[Path]) -> None:
assert len(downloaded_checkpoints) == 1
assert downloaded_checkpoints[0].is_file()
@ -77,18 +79,22 @@ def test_download_best_checkpoints_ensemble_run(test_output_dirs: OutputFolderFo
def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> None:
dataset_name = "test-dataset"
config = ModelConfigBase(should_validate=False)
config = DummyModel()
config.local_dataset = None
config.azure_dataset_id = ""
azure_config = get_default_azure_config()
runner = MLRunner(config, azure_config)
runner.project_root = test_output_dirs.root_dir
runner = MLRunner(config, azure_config=azure_config)
# If the model has neither local_dataset or azure_dataset_id, mount_or_download_dataset should fail.
with pytest.raises(ValueError):
runner.mount_or_download_dataset()
# This mounting call must happen before any other operations on the container, because even the model
# creation may already need access to the dataset.
with pytest.raises(ValueError) as ex:
runner.setup()
assert ex.value.args[0] == "The model must contain either local_dataset or azure_dataset_id."
runner.project_root = test_output_dirs.root_dir
# Pointing the model to a dataset folder that does not exist should raise an Exception
fake_folder = runner.project_root / "foo"
runner.model_config.local_dataset = fake_folder
runner.container.local_dataset = fake_folder
with pytest.raises(FileNotFoundError):
runner.mount_or_download_dataset()
@ -98,8 +104,8 @@ def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> Non
assert local_dataset == fake_folder
# Pointing the model to a dataset in Azure should trigger a download
runner.model_config.local_dataset = None
runner.model_config.azure_dataset_id = dataset_name
runner.container.local_dataset = None
runner.container.azure_dataset_id = dataset_name
with logging_section("Starting download"):
result_path = runner.mount_or_download_dataset()
# Download goes into <project_root> / "datasets" / "test_dataset"
@ -115,3 +121,127 @@ def test_download_azureml_dataset(test_output_dirs: OutputFolderForTests) -> Non
for file in ["ct", "esophagus", "heart", "lung_l", "lung_r", "spinalcord"]:
f = (sub_folder / file).with_suffix(".nii.gz")
assert f.is_file()
def _test_mount_for_lightning_container(test_output_dirs: OutputFolderForTests,
is_offline_run: bool,
local_dataset: Optional[Path],
azure_dataset: str,
is_lightning_model: bool) -> LightningContainer:
config: Optional[DeepLearningConfig] = None
container: Optional[LightningContainer] = None
if is_lightning_model:
container = DummyContainerWithDatasets()
container.azure_dataset_id = azure_dataset
container.local_dataset = local_dataset
else:
config = DummyModel()
config.azure_dataset_id = azure_dataset
config.local_dataset = local_dataset
# The legacy InnerEye models require an existing dataset_csv file present in the dataset folder. Create that.
download_path = test_output_dirs.root_dir / "downloaded"
mount_path = test_output_dirs.root_dir / "mounted"
if not is_lightning_model:
for path in [download_path, mount_path]:
path.mkdir(exist_ok=True)
shutil.copy(full_ml_test_data_path(DATASET_CSV_FILE_NAME), path / DATASET_CSV_FILE_NAME)
with mock.patch("InnerEye.ML.run_ml.MLRunner.is_offline_run", is_offline_run):
with mock.patch("InnerEye.ML.run_ml.download_dataset", return_value=download_path):
with mock.patch("InnerEye.ML.run_ml.try_to_mount_input_dataset", return_value=mount_path):
runner = MLRunner(config, container=container,
azure_config=None, project_root=test_output_dirs.root_dir)
runner.setup()
return runner.container
@pytest.mark.parametrize(("is_lightning_model", "expected_error"),
[
# A built-in InnerEye model must have either local dataset or azure dataset provided.
(False, "The model must contain either local_dataset or azure_dataset_id"),
# ... but this is OK for Lightning container models. A Lightning container could simply
# download its data from the web before training.
(True, "")
])
def test_mount_failing_offline_runs(test_output_dirs: OutputFolderForTests,
is_lightning_model: bool,
expected_error: str) -> None:
"""
Test cases where MLRunner.mount_or_download_dataset raises an exception when running outside AzureML.
"""
def run() -> Any:
return _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
is_offline_run=True,
local_dataset=None,
azure_dataset="",
is_lightning_model=is_lightning_model)
if expected_error:
with pytest.raises(ValueError) as ex:
run()
assert expected_error in str(ex)
else:
assert run().local_dataset is None
def test_mount_in_azureml1(test_output_dirs: OutputFolderForTests) -> None:
"""
Test that, when running inside AzureML, an InnerEye built-in model with an Azure dataset gets that dataset mounted.
"""
container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
is_offline_run=False,
local_dataset=None,
azure_dataset="foo",
is_lightning_model=False)
assert "mounted" in str(container.local_dataset)
def test_mount_in_azureml2(test_output_dirs: OutputFolderForTests) -> None:
"""
Test that, when running inside AzureML, a Lightning container without any dataset keeps local_dataset set to None.
"""
container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
is_offline_run=False,
local_dataset=None,
azure_dataset="",
is_lightning_model=True)
assert container.local_dataset is None
def test_mount_or_download(test_output_dirs: OutputFolderForTests) -> None:
"""
Tests the different combinations of local and Azure datasets, with InnerEye built-in models and Lightning container models.
"""
root = test_output_dirs.root_dir
for is_lightning_model in [True, False]:
# With runs outside of AzureML, an AML dataset should get downloaded.
container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
is_offline_run=True,
local_dataset=None,
azure_dataset="foo",
is_lightning_model=is_lightning_model)
assert "downloaded" in str(container.local_dataset)
# For all InnerEye built-in models, the paths from container level need to be copied down to legacy config
# level.
if not is_lightning_model:
assert container.config.local_dataset == container.local_dataset
# With runs in AzureML, an AML dataset should get mounted.
container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
is_offline_run=False,
local_dataset=None,
azure_dataset="foo",
is_lightning_model=is_lightning_model)
assert "mounted" in str(container.local_dataset)
if not is_lightning_model:
assert container.config.local_dataset == container.local_dataset
# With runs outside of AzureML, a local dataset should be used as-is. Azure dataset ID is ignored here.
shutil.copy(full_ml_test_data_path(DATASET_CSV_FILE_NAME), root / DATASET_CSV_FILE_NAME)
container = _test_mount_for_lightning_container(test_output_dirs=test_output_dirs,
is_offline_run=True,
local_dataset=root,
azure_dataset="",
is_lightning_model=is_lightning_model)
assert container.local_dataset == root
if not is_lightning_model:
assert container.config.local_dataset == container.local_dataset
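# Editorial sketch, not part of the diff: the decision matrix these tests pin down, written as a
# small standalone function (field names mirror the ones used above; the string return values are
# placeholders for the download/mount actions).
from pathlib import Path
from typing import Optional, Union

def resolve_dataset(local_dataset: Optional[Path],
                    azure_dataset_id: str,
                    is_offline_run: bool) -> Optional[Union[Path, str]]:
    if local_dataset is not None:
        # An explicit local dataset is always used as-is; any Azure dataset ID is ignored.
        return local_dataset
    if not azure_dataset_id:
        # Lightning containers may legitimately run without any dataset.
        return None
    # Azure datasets are downloaded when running outside AzureML, and mounted inside AzureML.
    return "download" if is_offline_run else "mount"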

Просмотреть файл

@ -0,0 +1,229 @@
# ------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from io import StringIO
from unittest import mock
import pandas as pd
import pytest
from pytorch_lightning import LightningModule
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.deep_learning_config import ARGS_TXT, DatasetParams, WorkflowParams
from InnerEye.ML.lightning_base import InnerEyeContainer
from InnerEye.ML.lightning_container import LightningContainer
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.run_ml import MLRunner
from Tests.ML.configs.DummyModel import DummyModel
from Tests.ML.configs.lightning_test_containers import DummyContainerWithModel, DummyContainerWithPlainLightning
from Tests.ML.util import default_runner
def test_run_container_in_situ(test_output_dirs: OutputFolderForTests) -> None:
"""
Test if we can get the config loader to load a Lightning container model, and then train locally.
"""
runner = default_runner()
local_dataset = test_output_dirs.root_dir / "dataset"
local_dataset.mkdir()
args = ["", "--model=DummyContainerWithModel", "--model_configs_namespace=Tests.ML.configs",
f"--output_to={test_output_dirs.root_dir}", f"--local_dataset={local_dataset}"]
with mock.patch("sys.argv", args):
loaded_config, actual_run = runner.run()
assert actual_run is None
assert isinstance(runner.lightning_container, DummyContainerWithModel)
# Test if the outputs folder is relative to the folder that we specified via the commandline
runner.lightning_container.outputs_folder.relative_to(test_output_dirs.root_dir)
results = runner.lightning_container.outputs_folder
# Test that the setup method has been called
assert runner.lightning_container.local_dataset is not None
assert (runner.lightning_container.local_dataset / "setup.txt").is_file()
# Test if all the files that are written during inference exist. Data for all 3 splits must be processed
assert (results / "on_inference_start.txt").is_file()
assert (results / "on_inference_end.txt").is_file()
for mode in ModelExecutionMode:
assert (results / f"on_inference_start_{mode.value}.txt").is_file()
assert (results / f"on_inference_end_{mode.value}.txt").is_file()
step_results = results / f"inference_step_{mode.value}.txt"
assert step_results.is_file()
# We should have one line per data item, and there are around 6 of them
result_lines = [line for line in step_results.read_text().splitlines() if line.strip()]
assert len(result_lines) >= 5
metrics_per_split = pd.read_csv(results / "metrics_per_split.csv")
# Training should have reduced the MSE to pretty much zero.
expected = pd.read_csv(StringIO("""Split,MSE
Test,1e-7
Val,1e-7
Train,1e-7"""))
pd.testing.assert_frame_equal(metrics_per_split, expected, check_less_precise=True)
# Test if we have an args file that lists all parameters
args_file = (results / ARGS_TXT).read_text()
assert "Container:" in args_file
assert "adam_betas" in args_file
# Report generation must run
assert (results / "create_report.txt").is_file()
def test_run_container_with_plain_lightning_in_situ(test_output_dirs: OutputFolderForTests) -> None:
"""
Test if we can train a plain Lightning model, without any additional methods defined, end-to-end.
"""
runner = default_runner()
local_dataset = test_output_dirs.root_dir / "dataset"
local_dataset.mkdir()
args = ["", "--model=DummyContainerWithPlainLightning", "--model_configs_namespace=Tests.ML.configs",
f"--output_to={test_output_dirs.root_dir}", f"--local_dataset={local_dataset}"]
with mock.patch("sys.argv", args):
loaded_config, actual_run = runner.run()
assert actual_run is None
assert isinstance(runner.lightning_container, DummyContainerWithPlainLightning)
# Test if the outputs folder is relative to the folder that we specified via the commandline
runner.lightning_container.outputs_folder.relative_to(test_output_dirs.root_dir)
results = runner.lightning_container.outputs_folder
# Test if all the files that are written during inference exist.
assert not (results / "on_inference_start.txt").is_file()
assert (results / "test_step.txt").is_file()
def test_innereye_container_init() -> None:
"""
Test if the constructor of the InnerEye container copies attributes as expected.
"""
# The constructor should copy all fields that belong to either WorkflowParams or DatasetParams from the
# config object to the container.
for (attrib, type_) in [("weights_url", WorkflowParams), ("azure_dataset_id", DatasetParams)]:
config = ModelConfigBase()
assert hasattr(type_, attrib)
assert hasattr(config, attrib)
setattr(config, attrib, "foo")
container = InnerEyeContainer(config)
assert getattr(container, attrib) == "foo"
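# Editorial sketch, not part of the diff: the copying behaviour checked above, assuming plain
# attribute assignment for each shared field name.
from typing import Tuple

def copy_shared_fields(config: ModelConfigBase,
                       container: InnerEyeContainer,
                       field_names: Tuple[str, ...] = ("weights_url", "azure_dataset_id")) -> None:
    for name in field_names:
        setattr(container, name, getattr(config, name))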
def test_create_fastmri_container() -> None:
"""
Test if we can create a model that uses the fastMRI submodule. This is effectively just testing module imports,
and that the submodule has been checked out correctly.
"""
from InnerEye.ML.configs.other.fastmri_varnet import VarNetWithImageLogging
from Tests.ML.configs.fastmri_random import FastMriOnRandomData
FastMriOnRandomData()
VarNetWithImageLogging()
@pytest.mark.gpu
def test_run_fastmri_container(test_output_dirs: OutputFolderForTests) -> None:
"""
Test if we can run the fastMRI model end-to-end. This takes about 2 minutes on a CPU machine, hence it is only
run in AzureML.
"""
runner = default_runner()
dataset_dir = test_output_dirs.root_dir / "dataset"
dataset_dir.mkdir(parents=True)
args = ["", "--model=FastMriOnRandomData",
f"--output_to={test_output_dirs.root_dir}",
"--model_configs_namespace=Tests.ML.configs"]
with mock.patch("sys.argv", args):
loaded_config, actual_run = runner.run()
assert actual_run is None
from Tests.ML.configs.fastmri_random import FastMriOnRandomData
assert isinstance(runner.lightning_container, FastMriOnRandomData)
def test_model_name_is_set(test_output_dirs: OutputFolderForTests) -> None:
container = DummyContainerWithModel()
container.local_dataset = test_output_dirs.root_dir
runner = MLRunner(model_config=None, container=container)
runner.setup()
expected_name = "DummyContainerWithModel"
assert runner.container._model_name == expected_name
assert expected_name in str(runner.container.outputs_folder)
def test_model_name_for_innereye_container() -> None:
"""
Test if the InnerEye container picks up the name of the model correctly. The name will impact the output folder
structure that is created.
"""
expected_name = "DummyModel"
model = DummyModel()
assert model.model_name == expected_name
container = InnerEyeContainer(model)
assert container.model_name == expected_name
class DummyContainerWithFields(LightningContainer):
def __init__(self) -> None:
super().__init__()
self.perform_training_set_inference = True
self.num_epochs = 123456
self.l_rate = 1e-2
def create_model(self) -> LightningModule:
return LightningModule()
def test_container_to_str() -> None:
"""
Test what the string representation of a container looks like.
"""
c = DummyContainerWithFields()
# Set any other field that is not done via the params library
c.foo = "bar"
s = str(c)
print(s)
assert "foo" in s
assert "bar" in s
assert "param" not in s
assert "initialized" not in s
assert "123456" in s
def test_file_system_with_subfolders(test_output_dirs: OutputFolderForTests) -> None:
"""
Test if a subfolder can be created within the output folder structure, for use with cross validation.
"""
model = DummyModel()
model.set_output_to(test_output_dirs.root_dir)
container = InnerEyeContainer(model)
# File system should be copied from model config to container
assert container.file_system_config == model.file_system_config
runner = MLRunner(model_config=model)
runner.setup()
assert str(runner.container.outputs_folder).endswith(model.model_name)
output_subfolder = "foo"
expected_folder = runner.container.outputs_folder / output_subfolder
runner = MLRunner(model_config=model, output_subfolder=output_subfolder)
runner.setup()
assert runner.container.outputs_folder == expected_folder
def test_optim_params1(test_output_dirs: OutputFolderForTests) -> None:
"""
Test if the optimizer parameters are read correctly for InnerEye configs.
"""
model = DummyModel()
model.set_output_to(test_output_dirs.root_dir)
runner = MLRunner(model_config=model)
runner.setup()
lightning_model = runner.container.model
optim, _ = lightning_model.configure_optimizers()
assert optim[0].param_groups[0]["lr"] == 1e-3
def test_optim_params2(test_output_dirs: OutputFolderForTests) -> None:
"""
Test if the optimizer parameters are read correctly for containers.
"""
container = DummyContainerWithModel()
container.local_dataset = test_output_dirs.root_dir
runner = MLRunner(model_config=None, container=container)
runner.setup()
lightning_model = runner.container.model
optim, _ = lightning_model.configure_optimizers()
expected_lr = 1e-1
assert container.l_rate == expected_lr
assert optim[0].param_groups[0]["lr"] == expected_lr
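# Editorial sketch, not part of the diff: what the assertions above inspect. configure_optimizers
# returns optimizers whose param_groups expose the learning rate under the "lr" key.
import torch

_params = [torch.nn.Parameter(torch.zeros(1))]
_optimizer = torch.optim.Adam(_params, lr=1e-1)
assert _optimizer.param_groups[0]["lr"] == 1e-1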

Просмотреть файл

@ -14,9 +14,8 @@ from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, ModelExecu
from InnerEye.ML.configs.classification.DummyClassification import DummyClassification
from InnerEye.ML.metrics import InferenceMetricsForClassification
from InnerEye.ML.model_testing import model_test
from InnerEye.ML.model_training import model_train
from InnerEye.ML.utils.run_recovery import RunRecovery
from Tests.ML.util import get_default_checkpoint_handler
from Tests.ML.util import get_default_checkpoint_handler, model_train_unittest
# @pytest.mark.parametrize("mean_teacher_model", [True, False])
@ -35,10 +34,8 @@ def test_recover_testing_from_run_recovery(mean_teacher_model: bool,
os.makedirs(str(config.outputs_folder))
config.recovery_checkpoint_save_interval = 2
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
project_root=test_output_dirs.root_dir)
train_results = model_train(config, checkpoint_handler=checkpoint_handler)
assert len(train_results.train_results_per_epoch) == config.num_epochs
train_results, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
assert len(train_results.train_results_per_epoch()) == config.num_epochs
# Run inference on this
test_results = model_test(config=config, data_split=ModelExecutionMode.TEST, checkpoint_handler=checkpoint_handler)

Просмотреть файл

@ -19,7 +19,6 @@ from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME, is_windows, l
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.metrics_constants import MetricType, TrackedMetrics, VALIDATION_PREFIX
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML import model_training
from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, DATASET_CSV_FILE_NAME, ModelExecutionMode, \
RECOVERY_CHECKPOINT_FILE_NAME_WITH_SUFFIX, \
STORED_CSV_FILE_NAMES
@ -27,15 +26,15 @@ from InnerEye.ML.config import MixtureLossComponent, SegmentationLoss
from InnerEye.ML.configs.classification.DummyClassification import DummyClassification
from InnerEye.ML.dataset.sample import CroppedSample
from InnerEye.ML.deep_learning_config import DeepLearningConfig
from InnerEye.ML.model_training import aggregate_and_create_subject_metrics_file, model_train
from InnerEye.ML.lightning_loggers import StoringLogger
from InnerEye.ML.model_training import aggregate_and_create_subject_metrics_file
from InnerEye.ML.models.losses.mixture import MixtureLoss
from InnerEye.ML.utils.io_util import load_nifti_image
from InnerEye.ML.utils.model_util import create_segmentation_loss_function
from InnerEye.ML.utils.run_recovery import RunRecovery
from InnerEye.ML.utils.training_util import ModelTrainingResults
from InnerEye.ML.visualizers.patch_sampling import PATCH_SAMPLING_FOLDER
from Tests.ML.configs.DummyModel import DummyModel
from Tests.ML.util import get_default_checkpoint_handler, machine_has_gpu
from Tests.ML.util import get_default_checkpoint_handler, machine_has_gpu, model_train_unittest
config_path = full_ml_test_data_path()
base_path = full_ml_test_data_path()
@ -102,22 +101,24 @@ def _test_model_train(output_dirs: OutputFolderForTests,
train_config.recovery_checkpoint_save_interval = 1
if machine_has_gpu:
expected_train_losses = [0.4553468, 0.454904]
expected_val_losses = [0.4553881, 0.4553041]
expected_train_losses = [0.4552919, 0.4548529]
expected_val_losses = [0.455389, 0.455306]
else:
expected_train_losses = [0.4553469, 0.4548947]
expected_val_losses = [0.4553880, 0.4553041]
expected_train_losses = [0.4552919, 0.4548538]
expected_val_losses = [0.4553891, 0.4553060]
loss_absolute_tolerance = 1e-6
expected_learning_rates = [train_config.l_rate, 5.3589e-4]
checkpoint_handler = get_default_checkpoint_handler(model_config=train_config,
project_root=Path(output_dirs.root_dir))
model_training_result = model_training.model_train(train_config,
checkpoint_handler=checkpoint_handler)
assert isinstance(model_training_result, ModelTrainingResults)
model_training_result, _ = model_train_unittest(train_config, dirs=output_dirs)
assert isinstance(model_training_result, StoringLogger)
actual_train_losses = model_training_result.get_train_metric(MetricType.LOSS.value)
actual_val_losses = model_training_result.get_val_metric(MetricType.LOSS.value)
print("actual_train_losses = {}".format(actual_train_losses))
print("actual_val_losses = {}".format(actual_val_losses))
def assert_all_close(metric: str, expected: List[float], **kwargs: Any) -> None:
actual = model_training_result.get_training_metric(metric)
actual = model_training_result.get_train_metric(metric)
assert np.allclose(actual, expected, **kwargs), f"Mismatch for {metric}: Got {actual}, expected {expected}"
# check to make sure training batches are NOT all the same across epochs
@ -135,28 +136,24 @@ def _test_model_train(output_dirs: OutputFolderForTests,
# and be the same across 'region' and 'region_1' because they derive from the same Nifti files.
# The following values are read off directly from the results of compute_dice_across_patches in the training loop
# This checks that averages are computed correctly, and that metric computers are reset after each epoch.
train_voxels = [[83092.0, 83212.0, 82946.0], [83000.0, 82881.0, 83309.0]]
train_voxels = [[82860.0, 83212.0, 83087.0], [82831.0, 82900.0, 83212.0]]
val_voxels = [[82765.0, 83212.0], [82765.0, 83212.0]]
_check_voxel_count(model_training_result.train_results_per_epoch, _mean_list(train_voxels), "Train")
_check_voxel_count(model_training_result.val_results_per_epoch, _mean_list(val_voxels), "Val")
_check_voxel_count(model_training_result.train_results_per_epoch(), _mean_list(train_voxels), "Train")
_check_voxel_count(model_training_result.val_results_per_epoch(), _mean_list(val_voxels), "Val")
actual_train_losses = model_training_result.get_training_metric(MetricType.LOSS.value)
actual_val_losses = model_training_result.get_validation_metric(MetricType.LOSS.value)
print("actual_train_losses = {}".format(actual_train_losses))
print("actual_val_losses = {}".format(actual_val_losses))
assert np.allclose(actual_train_losses, expected_train_losses, atol=loss_absolute_tolerance), "Train losses"
assert np.allclose(actual_val_losses, expected_val_losses, atol=loss_absolute_tolerance), "Val losses"
# Check that the metric we track for Hyperdrive runs is actually written.
assert TrackedMetrics.Val_Loss.value.startswith(VALIDATION_PREFIX)
tracked_metric = TrackedMetrics.Val_Loss.value[len(VALIDATION_PREFIX):]
for val_result in model_training_result.val_results_per_epoch:
for val_result in model_training_result.val_results_per_epoch():
assert tracked_metric in val_result
# The following values are read off directly from the results of compute_dice_across_patches in the
# training loop. Results are slightly different for CPU, hence use a larger tolerance there.
dice_tolerance = 1e-4 if machine_has_gpu else 4.5e-4
train_dice_region = [[0.0, 0.0, 4.0282e-04], [0.0309, 0.0334, 0.0961]]
train_dice_region1 = [[0.4806, 0.4800, 0.4832], [0.4812, 0.4842, 0.4663]]
# training loop. Results are slightly different for GPU, hence use a larger tolerance there.
dice_tolerance = 1e-3 if machine_has_gpu else 4.5e-4
train_dice_region = [[0.0, 0.0, 4.0282e-04], [0.0372, 0.0388, 0.1091]]
train_dice_region1 = [[0.4785, 0.4807, 0.4834], [0.4832, 0.4800, 0.4628]]
# There appears to be some amount of non-determinism here: when using a tolerance of 1e-4, we get occasional
# test failures on Linux in the cloud (not on Windows, not on AzureML). It is unclear where this comes from. Even
# when failing here, the losses match up to the expected tolerance.
@ -192,10 +189,10 @@ def _test_model_train(output_dirs: OutputFolderForTests,
assert len(list(sampling_folder.rglob("*.png"))) == 3 * train_config.show_patch_sampling
# Time per epoch: Test that we have all these times logged.
model_training_result.get_training_metric(MetricType.SECONDS_PER_EPOCH.value)
model_training_result.get_validation_metric(MetricType.SECONDS_PER_EPOCH.value)
model_training_result.get_validation_metric(MetricType.SECONDS_PER_BATCH.value)
model_training_result.get_training_metric(MetricType.SECONDS_PER_BATCH.value)
model_training_result.get_train_metric(MetricType.SECONDS_PER_EPOCH.value)
model_training_result.get_val_metric(MetricType.SECONDS_PER_EPOCH.value)
model_training_result.get_val_metric(MetricType.SECONDS_PER_BATCH.value)
model_training_result.get_train_metric(MetricType.SECONDS_PER_BATCH.value)
# Issue #372
# # Test for saving of example images
@ -323,9 +320,7 @@ def test_recover_training_mean_teacher_model(test_output_dirs: OutputFolderForTe
# First round of training
config.num_epochs = 2
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
project_root=test_output_dirs.root_dir)
model_train(config, checkpoint_handler=checkpoint_handler)
model_train_unittest(config, dirs=test_output_dirs)
assert len(list(config.checkpoint_folder.glob("*.*"))) == 2
# Restart training from previous run
@ -336,9 +331,13 @@ def test_recover_training_mean_teacher_model(test_output_dirs: OutputFolderForTe
# make it seem like run recovery objects have been downloaded
checkpoint_root = config.checkpoint_folder / "old_run"
shutil.copytree(str(original_checkpoint_folder), str(checkpoint_root))
# Create a new checkpoint handler and set run_recovery to the copied checkpoints
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
project_root=test_output_dirs.root_dir)
checkpoint_handler.run_recovery = RunRecovery([checkpoint_root])
model_train(config, checkpoint_handler=checkpoint_handler)
model_train_unittest(config, dirs=test_output_dirs, checkpoint_handler=checkpoint_handler)
# remove recovery checkpoints
shutil.rmtree(checkpoint_root)
assert len(list(config.checkpoint_folder.glob("*.*"))) == 2

Просмотреть файл

@ -4,7 +4,7 @@
# ------------------------------------------------------------------------------------------
import logging
from pathlib import Path
from typing import Any, List, Optional, Union
from typing import Any, List, Optional, Tuple, Union
import numpy as np
import pytest
@ -15,15 +15,21 @@ from azureml.core import Workspace
from InnerEye.Azure.azure_config import AzureConfig
from InnerEye.Common import fixed_paths
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.Common.type_annotations import PathOrString, TupleInt3
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.dataset.full_image_dataset import PatientDatasetSource
from InnerEye.ML.dataset.sample import PatientMetadata, Sample
from InnerEye.ML.deep_learning_config import DeepLearningConfig
from InnerEye.ML.lightning_base import InnerEyeContainer
from InnerEye.ML.lightning_container import LightningContainer
from InnerEye.ML.lightning_loggers import StoringLogger
from InnerEye.ML.model_training import model_train
from InnerEye.ML.photometric_normalization import PhotometricNormalization
from InnerEye.ML.run_ml import MLRunner
from InnerEye.ML.runner import Runner
from InnerEye.ML.utils import io_util
from InnerEye.ML.utils.checkpoint_handling import CheckpointHandler
from InnerEye.ML.utils.config_util import ModelConfigLoader
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.utils.io_util import ImageHeader, ImageWithHeader
from InnerEye.ML.utils.ml_util import is_gpu_available
@ -185,12 +191,12 @@ def assert_binary_files_match(actual_file: Path, expected_file: Path) -> None:
DummyPatientMetadata = PatientMetadata(patient_id='42')
def get_model_loader(namespace: Optional[str] = None) -> ModelConfigLoader[SegmentationModelBase]:
def get_model_loader(namespace: Optional[str] = None) -> ModelConfigLoader:
"""
Returns a ModelConfigLoader for segmentation models, with the given non-default namespace (if not None)
to search under.
"""
return ModelConfigLoader[SegmentationModelBase](model_configs_namespace=namespace)
return ModelConfigLoader(model_configs_namespace=namespace)
def get_default_azure_config() -> AzureConfig:
@ -206,7 +212,9 @@ def get_default_checkpoint_handler(model_config: DeepLearningConfig, project_roo
Gets a checkpoint handler, using the given model config and the default azure configuration.
"""
azure_config = get_default_azure_config()
return CheckpointHandler(azure_config=azure_config, model_config=model_config,
lightning_container = InnerEyeContainer(model_config)
return CheckpointHandler(azure_config=azure_config,
container=lightning_container,
project_root=project_root)
@ -216,3 +224,46 @@ def get_default_workspace() -> Workspace:
:return:
"""
return get_default_azure_config().get_workspace()
def model_train_unittest(config: Optional[DeepLearningConfig],
dirs: OutputFolderForTests,
checkpoint_handler: Optional[CheckpointHandler] = None,
lightning_container: Optional[LightningContainer] = None) -> \
Tuple[StoringLogger, CheckpointHandler]:
"""
A shortcut for running model training in the unit test suite. It runs training for the given config, with the
default checkpoint handler initialized to point to the test output folder specified in dirs.
:param config: The configuration of the model to train.
:param dirs: The test fixture that provides an output folder for the test.
:param checkpoint_handler: The checkpoint handler that should be used for training. If not provided, it will be
created via get_default_checkpoint_handler.
:param lightning_container: An optional LightningContainer object that will be passed through to the training routine.
:return: A tuple of (storing logger that holds all training metrics, the checkpoint handler that was used).
"""
runner = MLRunner(model_config=config, container=lightning_container)
# Setup will set random seeds before model creation, and store the created model in the container.
# The container initialized in this way is then used for training below.
# For all tests running in AzureML, we need to skip the downloading of datasets that would otherwise happen,
# because all unit test configs come with their own local dataset already.
runner.setup(use_mount_or_download_dataset=False)
if checkpoint_handler is None:
azure_config = get_default_azure_config()
checkpoint_handler = CheckpointHandler(azure_config=azure_config,
container=runner.container,
project_root=dirs.root_dir)
_, storing_logger = model_train(checkpoint_handler=checkpoint_handler,
container=runner.container)
return storing_logger, checkpoint_handler # type: ignore
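# Editorial sketch, not part of the diff: typical use of model_train_unittest inside a test,
# assuming DummyClassification is acceptable to import here for illustration.
def _example_training_smoke_test(test_output_dirs: OutputFolderForTests) -> None:
    from InnerEye.ML.configs.classification.DummyClassification import DummyClassification
    config = DummyClassification()
    config.set_output_to(test_output_dirs.root_dir)
    config.num_epochs = 2
    storing_logger, checkpoint_handler = model_train_unittest(config, dirs=test_output_dirs)
    assert len(storing_logger.train_results_per_epoch()) == config.num_epochs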
def default_runner() -> Runner:
"""
Create an InnerEye Runner object with the default settings, pointing to the repository root and
default settings files.
"""
return Runner(project_root=fixed_paths.repository_root_directory(),
yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)
model_loader_including_tests = get_model_loader(namespace="Tests.ML.configs")

Просмотреть файл

@ -5,6 +5,7 @@
import os
from pathlib import Path
from unittest import mock
from urllib.parse import urlparse
import pytest
@ -49,20 +50,20 @@ def test_use_local_weights_file(test_output_dirs: OutputFolderForTests) -> None:
assert not checkpoint_handler.local_weights_path
# weights from local_weights_path and weights_url will be modified if needed and stored at this location
expected_path = checkpoint_handler.model_config.outputs_folder / WEIGHTS_FILE
expected_path = checkpoint_handler.output_params.outputs_folder / WEIGHTS_FILE
# Set a weights_path
checkpoint_handler.azure_config.run_recovery_id = ""
config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
checkpoint_handler.container.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
checkpoint_handler.download_recovery_checkpoints_or_weights()
assert checkpoint_handler.local_weights_path == expected_path
assert checkpoint_handler.local_weights_path.is_file()
# set a local_weights_path
config.weights_url = ""
checkpoint_handler.container.weights_url = ""
local_weights_path = test_output_dirs.root_dir / "exist.pth"
create_checkpoint_file(local_weights_path)
config.local_weights_path = local_weights_path
checkpoint_handler.container.local_weights_path = local_weights_path
checkpoint_handler.download_recovery_checkpoints_or_weights()
assert checkpoint_handler.local_weights_path == expected_path
@ -113,35 +114,35 @@ def test_get_recovery_path_train(test_output_dirs: OutputFolderForTests) -> None
assert checkpoint_handler.get_recovery_path_train() is None
# weights from local_weights_path and weights_url will be modified if needed and stored at this location
expected_path = checkpoint_handler.model_config.outputs_folder / WEIGHTS_FILE
expected_path = checkpoint_handler.output_params.outputs_folder / WEIGHTS_FILE
# Set a weights_url to get checkpoint from
checkpoint_handler.azure_config.run_recovery_id = ""
config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
checkpoint_handler.container.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
checkpoint_handler.download_recovery_checkpoints_or_weights()
assert checkpoint_handler.local_weights_path == expected_path
config.start_epoch = 0
checkpoint_handler.container.start_epoch = 0
assert checkpoint_handler.get_recovery_path_train() == expected_path
# Can't resume training from an external checkpoint
config.start_epoch = 20
checkpoint_handler.container.start_epoch = 20
with pytest.raises(ValueError) as ex:
checkpoint_handler.get_recovery_path_train()
assert ex.value.args == "Start epoch is > 0, but no run recovery object has been provided to resume training."
assert ex.value.args[0] == "Start epoch is > 0, but no run recovery object has been provided to resume training."
# Set a local_weights_path to get checkpoint from
config.weights_url = ""
checkpoint_handler.container.weights_url = ""
local_weights_path = test_output_dirs.root_dir / "exist.pth"
create_checkpoint_file(local_weights_path)
config.local_weights_path = local_weights_path
checkpoint_handler.container.local_weights_path = local_weights_path
checkpoint_handler.download_recovery_checkpoints_or_weights()
assert checkpoint_handler.local_weights_path == expected_path
config.start_epoch = 0
checkpoint_handler.container.start_epoch = 0
assert checkpoint_handler.get_recovery_path_train() == expected_path
# Can't resume training from an external checkpoint
config.start_epoch = 20
checkpoint_handler.container.start_epoch = 20
with pytest.raises(ValueError) as ex:
checkpoint_handler.get_recovery_path_train()
assert ex.value.args == "Start epoch is > 0, but no run recovery object has been provided to resume training."
assert ex.value.args[0] == "Start epoch is > 0, but no run recovery object has been provided to resume training."
@pytest.mark.after_training_single_run
@ -162,7 +163,7 @@ def test_get_recovery_path_train_single_run(test_output_dirs: OutputFolderForTes
assert "Run recovery set, but start epoch is 0" in ex.value.args[0]
# Run recovery with start epoch provided should succeed
config.start_epoch = 20
checkpoint_handler.container.start_epoch = 20
expected_path = create_recovery_checkpoint_path(path=config.checkpoint_folder / run_recovery_id.split(":")[1])
assert checkpoint_handler.get_recovery_path_train() == expected_path
@ -200,7 +201,7 @@ def test_get_best_checkpoint_single_run(test_output_dirs: OutputFolderForTests)
checkpoint_handler.azure_config.run_recovery_id = run_recovery_id
checkpoint_handler.download_recovery_checkpoints_or_weights()
config.start_epoch = 1
checkpoint_handler.container.start_epoch = 1
# There is no checkpoint in the current run - use the one from run_recovery
checkpoint_paths = checkpoint_handler.get_best_checkpoint()
expected_checkpoint = config.checkpoint_folder / run_recovery_id.split(":")[1] \
@ -246,16 +247,16 @@ def test_get_checkpoints_to_test(test_output_dirs: OutputFolderForTests) -> None
# so the local weights should be used ignoring any epochs to test
local_weights_path = test_output_dirs.root_dir / "exist.pth"
create_checkpoint_file(local_weights_path)
config.local_weights_path = local_weights_path
manage_recovery.container.local_weights_path = local_weights_path
manage_recovery.download_recovery_checkpoints_or_weights()
checkpoint_and_paths = manage_recovery.get_checkpoints_to_test()
assert checkpoint_and_paths
assert len(checkpoint_and_paths) == 1
assert checkpoint_and_paths[0] == manage_recovery.model_config.outputs_folder / WEIGHTS_FILE
assert checkpoint_and_paths[0] == manage_recovery.output_params.outputs_folder / WEIGHTS_FILE
config.start_epoch = 1
manage_recovery.container.start_epoch = 1
manage_recovery.additional_training_done()
config.checkpoint_folder.mkdir()
manage_recovery.container.checkpoint_folder.mkdir()
# Copy checkpoint to make it seem like training has happened
expected_checkpoint = config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX
@ -325,13 +326,13 @@ def test_get_local_weights_path_or_download(test_output_dirs: OutputFolderForTes
# If local_weights_path folder exists, get_local_weights_path_or_download should not do anything.
local_weights_path = manage_recovery.project_root / "exist.pth"
create_checkpoint_file(local_weights_path)
manage_recovery.model_config.local_weights_path = local_weights_path
manage_recovery.container.local_weights_path = local_weights_path
returned_weights_path = manage_recovery.get_local_weights_path_or_download()
assert local_weights_path == returned_weights_path
# Pointing the model to a URL should trigger a download
config.local_weights_path = None
config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
manage_recovery.container.local_weights_path = None
manage_recovery.container.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
downloaded_weights = manage_recovery.get_local_weights_path_or_download()
# Download goes into <project_root> / "modelweights" / "resnet18-5c106cde.pth"
expected_path = manage_recovery.project_root / MODEL_WEIGHTS_DIR_NAME / \
@ -361,7 +362,7 @@ def test_get_and_modify_local_weights(test_output_dirs: OutputFolderForTests) ->
assert "neither local_weights_path nor weights_url is set in the model config" in ex.value.args[0]
# Pointing the model to a local_weights_path that does not exist will raise an error.
config.local_weights_path = manage_recovery.project_root / "non_exist"
manage_recovery.container.local_weights_path = manage_recovery.project_root / "non_exist"
with pytest.raises(FileNotFoundError) as file_ex:
manage_recovery.get_and_save_modified_weights()
assert "Could not find the weights file" in file_ex.value.args[0]
@ -369,39 +370,42 @@ def test_get_and_modify_local_weights(test_output_dirs: OutputFolderForTests) ->
# Test that weights are properly modified when a local_weights_path is set
# set a method to modify weights:
ModelConfigBase.load_checkpoint_and_modify = lambda self, path_to_checkpoint: {"modified": "local", # type: ignore
"path": path_to_checkpoint}
# Set the local_weights_path to an empty file, which will be passed to modify_checkpoint
local_weights_path = manage_recovery.project_root / "exist.pth"
create_checkpoint_file(local_weights_path)
config.local_weights_path = local_weights_path
weights_path = manage_recovery.get_and_save_modified_weights()
expected_path = manage_recovery.model_config.outputs_folder / WEIGHTS_FILE
# read from weights_path and check that the dict has been written
assert weights_path.is_file()
assert expected_path == weights_path
read = torch.load(str(weights_path))
assert read.keys() == {"modified", "path"}
assert read["modified"] == "local"
assert read["path"] == local_weights_path
# clean up
weights_path.unlink()
with mock.patch.object(ModelConfigBase,
'load_checkpoint_and_modify',
lambda self, path_to_checkpoint: {"modified": "local", # type: ignore
"path": path_to_checkpoint}):
# Set the local_weights_path to an empty file, which will be passed to modify_checkpoint
local_weights_path = manage_recovery.project_root / "exist.pth"
create_checkpoint_file(local_weights_path)
manage_recovery.container.local_weights_path = local_weights_path
weights_path = manage_recovery.get_and_save_modified_weights()
expected_path = manage_recovery.output_params.outputs_folder / WEIGHTS_FILE
# read from weights_path and check that the dict has been written
assert weights_path.is_file()
assert expected_path == weights_path
read = torch.load(str(weights_path))
assert read.keys() == {"modified", "path"}
assert read["modified"] == "local"
assert read["path"] == local_weights_path
# clean up
weights_path.unlink()
# Test that weights are properly modified when weights_url is set
# set a different method to modify weights, to avoid using old files from other tests:
ModelConfigBase.load_checkpoint_and_modify = lambda self, path_to_checkpoint: {"modified": "url", # type: ignore
"path": path_to_checkpoint}
# Set the weights_url to the sample pytorch URL, which will be passed to modify_checkpoint
config.local_weights_path = None
config.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
weights_path = manage_recovery.get_and_save_modified_weights()
expected_path = manage_recovery.model_config.outputs_folder / WEIGHTS_FILE
# read from weights_path and check that the dict has been written
assert weights_path.is_file()
assert expected_path == weights_path
read = torch.load(str(weights_path))
assert read.keys() == {"modified", "path"}
assert read["modified"] == "url"
assert read["path"] == manage_recovery.project_root / MODEL_WEIGHTS_DIR_NAME / \
os.path.basename(urlparse(EXTERNAL_WEIGHTS_URL_EXAMPLE).path)
with mock.patch.object(ModelConfigBase,
'load_checkpoint_and_modify',
lambda self, path_to_checkpoint: {"modified": "url", "path": path_to_checkpoint}):
# Set the weights_url to the sample pytorch URL, which will be passed to modify_checkpoint
manage_recovery.container.local_weights_path = None
manage_recovery.container.weights_url = EXTERNAL_WEIGHTS_URL_EXAMPLE
weights_path = manage_recovery.get_and_save_modified_weights()
expected_path = manage_recovery.output_params.outputs_folder / WEIGHTS_FILE
# read from weights_path and check that the dict has been written
assert weights_path.is_file()
assert expected_path == weights_path
read = torch.load(str(weights_path))
assert read.keys() == {"modified", "path"}
assert read["modified"] == "url"
assert read["path"] == manage_recovery.project_root / MODEL_WEIGHTS_DIR_NAME / \
os.path.basename(urlparse(EXTERNAL_WEIGHTS_URL_EXAMPLE).path)


@ -161,7 +161,7 @@ def _create_lr_scheduler_and_optimizer(config: SegmentationModelBase, optimizer:
if optimizer is None:
optimizer = _create_dummy_optimizer(config)
# create lr scheduler
lr_scheduler = SchedulerWithWarmUp(config, optimizer)
lr_scheduler = SchedulerWithWarmUp(config, optimizer, num_epochs=config.num_epochs)
return lr_scheduler, optimizer
@ -215,7 +215,7 @@ def test_lr_scheduler_with_warmup(warmup_epochs: int, expected_values: List[floa
l_rate_warmup_epochs=warmup_epochs,
l_rate_warmup=LRWarmUpType.Linear,
should_validate=False)
scheduler = SchedulerWithWarmUp(config, optimizer)
scheduler = SchedulerWithWarmUp(config, optimizer, num_epochs=config.num_epochs)
lrs = enumerate_scheduler(scheduler, 4)
assert lrs == expected_values


@ -15,7 +15,9 @@ from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, LAST_CHECK
LAST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, RECOVERY_CHECKPOINT_FILE_NAME, RECOVERY_CHECKPOINT_FILE_NAME_WITH_SUFFIX, \
cleanup_checkpoint_folder, keep_best_checkpoint, keep_latest
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.lightning_helpers import create_lightning_model, load_from_checkpoint_and_adjust_for_inference
from InnerEye.ML.lightning_base import InnerEyeContainer
from InnerEye.ML.lightning_helpers import load_from_checkpoint_and_adjust_for_inference
from InnerEye.ML.lightning_models import create_lightning_model
from InnerEye.ML.model_config_base import ModelConfigBase
from InnerEye.ML.model_training import create_lightning_trainer
from Tests.ML.configs.ClassificationModelForTesting import ClassificationModelForTesting
@ -34,7 +36,8 @@ def create_model_and_store_checkpoint(config: ModelConfigBase, checkpoint_path:
:param config: The model configuration.
:param checkpoint_path: The path and filename of the checkpoint file.
"""
trainer, _ = create_lightning_trainer(config)
container = InnerEyeContainer(config)
trainer, _ = create_lightning_trainer(container)
model = create_lightning_model(config)
if machine_has_gpu:
model = model.cuda() # type: ignore


@ -397,7 +397,7 @@ def test_run_ml_with_multi_label_sequence_in_crossval(test_output_dirs: OutputFo
config.number_of_cross_validation_splits = 2
azure_config = get_default_azure_config()
azure_config.train = True
MLRunner(config, azure_config).run()
MLRunner(config, azure_config=azure_config).run()
def test_load_files_with_prediction_target() -> None:


@ -148,3 +148,18 @@ jobs:
parameters:
pytest_mark: after_training_glaucoma_cv_run
test_run_title: tests_after_training_glaucoma_cv_run
- job: TrainHelloWorld
variables:
- name: model
value: 'HelloWorld'
- name: tag
value: 'HelloWorldPR'
pool:
vmImage: 'ubuntu-18.04'
steps:
- template: train_template.yml
parameters:
wait_for_completion: 'True'
pytest_mark: ''
max_run_duration: '1h'


@ -1,6 +1,7 @@
steps:
- checkout: self
lfs: true
submodules: true
- bash: |
if [ $(Agent.OS) = 'Windows_NT' ]


@ -0,0 +1,229 @@
# Bring Your Own PyTorch Lightning Model
The InnerEye toolbox is capable of training any PyTorch Lightning (PL) model inside of AzureML, making
use of all the usual InnerEye toolbox features:
- Working with different models in the same codebase, and selecting one by name
- Distributed training in AzureML
- Logging via AzureML's native capabilities
- Training on a local GPU machine or inside of AzureML without code changes
- Supplying commandline overrides for model configuration elements, to quickly queue many jobs
This can be used by:
- Defining a special container class that encapsulates the PyTorch Lightning model to train, and the data that should
be used for training and testing.
- Adding essential trainer parameters like the number of epochs to that container.
- Invoking the InnerEye runner and providing the name of the container class, like this:
`python InnerEye/ML/runner.py --model=MyContainer`. To train in AzureML, just add a `--azureml=True` flag.
There is a fully working example, [HelloContainer](../InnerEye/ML/configs/other/HelloContainer.py), that implements
a simple 1-dimensional regression model from data stored in a CSV file. You can run it
from the command line with `python InnerEye/ML/runner.py --model=HelloContainer`.
## Setup
In order to use these capabilities, you need to implement a class deriving from `LightningContainer`. This class
encapsulates everything that is needed for training with PyTorch Lightning:
- The `create_model` method needs to return an instance of a `LightningModule` subclass that has
all the usual PyTorch Lightning methods required for training, like the `training_step` and `forward` methods. This
object needs to adhere to additional constraints; see below.
- The `get_data_module` method of the container needs to return a `LightningDataModule` that has the data loaders for
training and validation data.
- The optional `get_inference_data_module` method returns a `LightningDataModule` that is used to read the data for inference
(that is, evaluating the trained model). By default, this returns the same data as `get_data_module`, but you
can override this for special models like segmentation models that are trained on equal-sized image patches, but
evaluated on full images of varying size.
Your class needs to be defined in a Python file in the `InnerEye/ML/configs` folder; otherwise it won't be picked up
correctly. If you'd like to have your model defined in a different folder, please specify the Python namespace via
the `--model_configs_namespace` argument. For example, use `--model_configs_namespace=My.Own.configs` if your
model configuration classes reside in folder `My/Own/configs` from the repository root.
*Example*:
```python
from pathlib import Path
from torch.utils.data import DataLoader
from pytorch_lightning import LightningModule, LightningDataModule
from InnerEye.ML.lightning_container import LightningContainer
class MyLightningModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = ...

    def training_step(self, *args, **kwargs):
        ...

    def forward(self, *args, **kwargs):
        ...

    def configure_optimizers(self):
        ...

    def test_step(self, *args, **kwargs):
        ...

class MyDataModule(LightningDataModule):
    def __init__(self, root_path: Path):
        super().__init__()
        # All data should be read from the folder given in self.root_path
        self.root_path = root_path

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        ...

    def val_dataloader(self, *args, **kwargs) -> DataLoader:
        # The data should be read off self.root_path
        ...

    def test_dataloader(self, *args, **kwargs) -> DataLoader:
        # The data should be read off self.root_path
        ...

class MyContainer(LightningContainer):
    def __init__(self):
        super().__init__()
        self.azure_dataset_id = "folder_name_in_azure_blob_storage"
        self.local_dataset = Path("/some/local/path")
        self.num_epochs = 42

    def create_model(self) -> LightningModule:
        return MyLightningModel()

    def get_data_module(self) -> LightningDataModule:
        return MyDataModule(root_path=self.local_dataset)
```
Where does the data for training come from?
- When training a model on a local box or VM, the data is read from the `local_dataset` folder that you define in the
container.
- When training a model in AzureML, the code searches for a folder called `folder_name_in_azure_blob_storage` in
Azure blob storage. That folder is then downloaded or mounted. The local download path is then written to the `local_dataset`
field of the container, and hence you can always read data from `self.local_dataset`.
- Alternatively, you can use the `prepare_data` method of a `LightningDataModule` to download data from the web,
for example (see the sketch below). In this case, you don't need to define any of the `local_dataset` or `azure_dataset_id` fields.
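As a rough illustration of that alternative, a self-downloading data module could look like the sketch below. The URL
and file names are placeholders for this example only, not something provided by the toolbox:
```python
from pathlib import Path
from urllib.request import urlretrieve

from torch.utils.data import DataLoader
from pytorch_lightning import LightningDataModule

# Placeholder URL, purely for illustration.
DATA_URL = "https://example.com/my_dataset.csv"

class SelfDownloadingDataModule(LightningDataModule):
    def __init__(self, download_dir: Path = Path("downloaded_data")):
        super().__init__()
        self.download_dir = download_dir

    def prepare_data(self, *args, **kwargs) -> None:
        # Lightning calls this once per node before training starts: a good place to download data.
        self.download_dir.mkdir(parents=True, exist_ok=True)
        target = self.download_dir / "my_dataset.csv"
        if not target.exists():
            urlretrieve(DATA_URL, str(target))

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        ...  # Build the DataLoader from the files in self.download_dir

    def val_dataloader(self, *args, **kwargs) -> DataLoader:
        ...
```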
In the `MyContainer` example above, training is done for 42 epochs. After the model is trained, it will be evaluated on the test set
via PyTorch Lightning's [built-in test functionality](https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html?highlight=trainer.test#test).
See below for an alternative way of running the evaluation on the test set.
### Outputting files during training
The Lightning model returned by `create_model` needs to write its output files to the current working directory.
When running the InnerEye toolbox outside of AzureML, the toolbox will change the current working directory to a
newly created output folder, with a name that contains the timestamp and the model name.
When running the InnerEye toolbox in AzureML, the folder structure will be set up such that all files written
to the current working directory are later uploaded to Azure blob storage at the end of the AzureML job. The files
will also be available later via the AzureML UI.
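As a minimal sketch (the hook and file name below are illustrative, not prescribed by the toolbox), a model could
append a small summary file in the working directory at the end of every epoch:
```python
from pathlib import Path
from pytorch_lightning import LightningModule

class ModelThatWritesFiles(LightningModule):
    # Only the output-related hook is shown; training_step etc. are omitted for brevity.
    # The file name "epoch_summary.txt" is just an example.
    def training_epoch_end(self, outputs) -> None:
        # The toolbox has already changed the working directory to the run's output folder,
        # so relative paths end up in the right place (and get uploaded when running in AzureML).
        summary_file = Path.cwd() / "epoch_summary.txt"
        with summary_file.open("a") as f:
            f.write(f"Finished epoch {self.current_epoch}\n")
```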
### Trainer arguments
All arguments that control the PyTorch Lightning `Trainer` object are defined in the class `TrainerParams`. A
`LightningContainer` object inherits from this class. The most essential one is the `num_epochs` field, which controls
the `max_epochs` argument of the `Trainer`.
Usage example:
```python
from pytorch_lightning import LightningModule, LightningDataModule
from InnerEye.ML.lightning_container import LightningContainer

class MyContainer(LightningContainer):
    def __init__(self):
        super().__init__()
        self.num_epochs = 42

    def create_model(self) -> LightningModule:
        return MyLightningModel()

    def get_data_module(self) -> LightningDataModule:
        return MyDataModule(root_path=self.local_dataset)
```
For further details on how the `TrainerParams` are used, refer to the `create_lightning_trainer` method in
[InnerEye/ML/model_training.py](../InnerEye/ML/model_training.py).
### Optimizer and LR scheduler arguments
There are two possible ways of choosing the optimizer and LR scheduler:
- The Lightning model returned by `create_model` can define its own `configure_optimizers` method, with the same
signature as `LightningModule.configure_optimizers`. This is the typical way of configuring it for Lightning models.
- Alternatively, the model can inherit from `LightningModuleWithOptimizer`. This class implements a
`configure_optimizers` method that uses settings defined in the `OptimizerParams` class. These settings are all
available from the command line, and you can, for example, start a new run with a different learning rate by
supplying the additional commandline flag `--l_rate=1e-2`.
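A rough sketch of the second option is shown below. It assumes that `LightningModuleWithOptimizer` can be imported
from `InnerEye.ML.lightning_container`, like the other classes used in this document:
```python
import torch
# Assumption: LightningModuleWithOptimizer lives in InnerEye.ML.lightning_container.
from InnerEye.ML.lightning_container import LightningModuleWithOptimizer

class ModelWithDefaultOptimizer(LightningModuleWithOptimizer):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(2, 1)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self(x), y)

    # No configure_optimizers method needed: the inherited implementation builds the
    # optimizer and LR scheduler from the OptimizerParams settings, so flags like
    # --l_rate=1e-2 take effect without further code.
```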
### Evaluating the trained model
The InnerEye toolbox provides two possible routes for evaluating the trained model:
You can either use PyTorch Lightning's built-in capabilities, via the `test_step` method. If the model that is
returned by `create_model` implements the `test_step` method, the InnerEye toolbox will use the `trainer.test` method
(see [docs](https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html?highlight=trainer.test#test)).
In this case, the best checkpoint from training will be used. The test data is read via the data loader created
by the `test_dataloader` method of the `LightningDataModule` that is used for training/validation.
Alternatively, the model can implement the methods defined in `InnerEyeInference`. In this case, the methods will be
called in this order:
```
model.on_inference_start()
for dataset_split in [Train, Val, Test]:
    model.on_inference_epoch_start(dataset_split, is_ensemble_model=False)
    for batch_idx, item in enumerate(dataloader[dataset_split]):
        model_outputs = model.forward(item)
        model.inference_step(item, batch_idx, model_outputs)
    model.on_inference_epoch_end()
model.on_inference_end()
```
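A skeleton of that second route could look like the following. The method names follow the call order above; the exact
signatures are defined by the `InnerEyeInference` class in the toolbox, so treat this as an outline rather than a
drop-in implementation:
```python
# Signatures sketched from the call order above; check InnerEyeInference for the exact definitions.
class ModelWithInference(MyLightningModel):  # MyLightningModel as defined earlier in this document
    def on_inference_start(self) -> None:
        # Reset any state that accumulates results across all dataset splits.
        ...

    def on_inference_epoch_start(self, dataset_split, is_ensemble_model: bool) -> None:
        # Called once per dataset split (Train/Val/Test).
        ...

    def inference_step(self, item, batch_idx, model_outputs) -> None:
        # Compute and accumulate whatever metrics you need from the model outputs.
        ...

    def on_inference_epoch_end(self) -> None:
        # Aggregate the metrics for the current split, e.g. write a file to Path.cwd().
        ...

    def on_inference_end(self) -> None:
        ...
```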
## Overriding properties on the commandline
You can define hyperparameters that affect data and/or model, as in the following code snippet:
```python
import param
from pytorch_lightning import LightningModule
from InnerEye.ML.lightning_container import LightningContainer

class DummyContainerWithParameters(LightningContainer):
    num_layers = param.Integer(default=4)

    def create_model(self) -> LightningModule:
        return MyLightningModel(self.num_layers)
    ...
```
All parameters added in this form will be automatically accessible from the commandline; there is no need to define
a separate argument parser. When starting training, you can add a flag like `--num_layers=7`.
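For example, assuming the `DummyContainerWithParameters` class above is saved in a file under `InnerEye/ML/configs`,
you could start a run with a different number of layers via
`python InnerEye/ML/runner.py --model=DummyContainerWithParameters --num_layers=7`.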
## Examples
### Setting only the required fields
```python
from pytorch_lightning import LightningModule, LightningDataModule
from InnerEye.ML.lightning_container import LightningContainer

class Container1(LightningContainer):
    def __init__(self):
        super().__init__()
        self.azure_dataset_id = "some_folder_in_azure"
        self.num_epochs = 20

    def create_model(self) -> LightningModule:
        return MyLightningModel()

    def get_data_module(self) -> LightningDataModule:
        # This should read data from self.local_dataset. Before training, the data folder "some_folder_in_azure"
        # (given by self.azure_dataset_id) will be downloaded or mounted, and its local path set in
        # self.local_dataset
        return MyDataModule(root_folder=self.local_dataset)
```
### Adding additional arguments for the PyTorch Lightning trainer
```python
from typing import Dict, Any
from pytorch_lightning import LightningModule, LightningDataModule
from InnerEye.ML.lightning_container import LightningContainer

class Container2(LightningContainer):
    def __init__(self):
        super().__init__()
        self.azure_dataset_id = "some_folder_in_azure"
        self.num_epochs = 20

    def create_model(self) -> LightningModule:
        return MyLightningModel()

    def get_data_module(self) -> LightningDataModule:
        # This should read data from self.local_dataset. Before training, the data folder "some_folder_in_azure"
        # (given by self.azure_dataset_id) will be downloaded or mounted, and its local path set in
        # self.local_dataset
        return MyDataModule(root_folder=self.local_dataset)

    def get_trainer_arguments(self) -> Dict[str, Any]:
        # These arguments will be passed through to the Lightning trainer.
        return {"gradient_clip_val": 1, "limit_train_batches": 10}
```


@ -27,7 +27,7 @@ dependencies:
- lightning-bolts==0.3.1
- matplotlib==3.3.0
- mlflow==1.12.1
- mypy==0.770
- mypy==0.812
- mypy-extensions==0.4.3
- numba==0.51.2
- numpy==1.19.1
@ -47,6 +47,7 @@ dependencies:
- pytorch-lightning==1.2.8
- rich==5.1.1
- rpdb==0.1.6
- runstats==1.8.0
- scikit-image==0.17.2
- scikit-learn==0.23.2
- scipy==1.5.2

fastMRI Submodule

@ -0,0 +1 @@
Subproject commit f2070aeb7a5e7d1b0e45c6aad247d18d074705a8


@ -5,10 +5,10 @@
import os
import subprocess
import sys
from pathlib import Path
from typing import List
from shutil import which
from argparse import ArgumentParser
from pathlib import Path
from shutil import which
from typing import List
def run_mypy(files: List[str], mypy_executable_path: str) -> int:
@ -23,51 +23,24 @@ def run_mypy(files: List[str], mypy_executable_path: str) -> int:
:return: maximum return code from any of the mypy runs
"""
return_code = 0
iteration = 1
while files:
dirs = sorted(set(os.path.dirname(file) or "." for file in files))
print(f"Iteration {iteration}: running mypy on {len(files)} files in {len(dirs)} directories")
# Set of files we are hoping to see mentioned in the mypy log.
files_to_do = set(files)
for index, dir in enumerate(dirs, 1):
# Adding "--no-site-packages" might be necessary if there are errors in site packages,
# but it may stop inconsistencies with site packages being spotted.
command = [mypy_executable_path, "--config=mypy.ini", "--verbose", dir]
print(f"Processing directory {index:2d} of {len(dirs)}: {Path(dir).absolute()}")
print(f"Running mypy on {len(files)} files")
for index, file in enumerate(files):
print(f"Processing {(index+1):2d} of {len(files)}: {file}")
file_path = Path(file)
mypy_args = []
if file_path.is_file():
mypy_args = [file]
elif file_path.is_dir():
# There is a bug in recent mypy versions, complaining about duplicate files when telling
# mypy to scan a directory. Telling it to scan a namespace avoids this bug.
mypy_args = ["-p", file.replace(os.path.sep, ".")]
else:
print("Skipping.")
if mypy_args:
command = [mypy_executable_path, "--config=mypy.ini", *mypy_args]
# We pipe stdout and then print it, otherwise lines can appear in the wrong order in builds.
process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
process = subprocess.run(command)
return_code = max(return_code, process.returncode)
for line in process.stdout.split("\n"):
if line and not line.startswith("Success: "):
tokens = line.split(":")
if line.startswith("Found") or len(tokens) < 2:
print(line)
else:
print(f"{Path.cwd() / tokens[0]}:{':'.join(tokens[1:])}")
# Remove from files_to_do every Python file that's reported as processed in the log.
for line in process.stderr.split("\n"):
tokens = line.split()
if len(tokens) == 4 and tokens[0] == "LOG:" and tokens[1] == "Parsing":
name = tokens[2]
elif len(tokens) == 7 and tokens[:4] == ["LOG:", "Metadata", "fresh", "for"]:
name = tokens[-1]
else:
continue
if name.endswith(".py"):
if name.startswith("./") or name.startswith(".\\"):
name = name[2:]
files_to_do.discard(name)
# If we didn't manage to discard any files, there's no point continuing. This should not occur, but if
# it does, we don't want to continue indefinitely.
if len(files_to_do) == len(files):
print("No further files appear to have been checked! Unchecked files are:")
for file in sorted(files_to_do):
print(f" {file}")
return_code = max(return_code, 1)
break
files = sorted(files_to_do)
iteration += 1
return return_code
@ -83,17 +56,11 @@ def main() -> int:
args = parser.parse_args()
current_dir = Path(".")
if args.files:
file_list = [Path(arg) for arg in args.files if arg.endswith(".py")]
file_list = args.files
else:
# We don't want to check the files in the submodule if any, partly because they should already have
# been checked in the original repo, and partly because we don't want the module name clashes mypy would
# otherwise report.
submodule_name = "innereye-deeplearning"
files = set(current_dir.glob('*.py'))
for path in current_dir.glob('*'):
if path.name != submodule_name:
files.update(path.rglob('*.py'))
file_list = list(files)
file_list = list(str(f) for f in current_dir.glob('*.py'))
for dir in ["InnerEye", "Tests", "TestsOutsidePackage", "TestSubmodule"]:
file_list.append(dir)
mypy = args.mypy or which("mypy")
if not mypy:


@ -26,7 +26,7 @@ from InnerEye.ML.model_testing import DEFAULT_RESULT_IMAGE_NAME
from InnerEye.ML.photometric_normalization import PhotometricNormalization
from InnerEye.ML.pipelines.ensemble import EnsemblePipeline
from InnerEye.ML.pipelines.inference import FullImageInferencePipelineBase, InferencePipeline
from InnerEye.ML.utils.config_util import ModelConfigLoader
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.utils.io_util import ImageWithHeader, load_nifti_image, reverse_tuple_float3, store_as_ubyte_nifti, \
load_dicom_series_and_save
@ -67,8 +67,7 @@ def init_from_model_inference_json(model_folder: Path, use_gpu: bool = True) ->
logging.info(f'model_inference_config: {model_inference_config}')
full_path_to_checkpoints = [model_folder / x for x in model_inference_config.checkpoint_paths]
logging.info(f'full_path_to_checkpoints: {full_path_to_checkpoints}')
loader = ModelConfigLoader[SegmentationModelBase](
model_configs_namespace=model_inference_config.model_configs_namespace)
loader = ModelConfigLoader(model_configs_namespace=model_inference_config.model_configs_namespace)
model_config = loader.create_model_config_from_name(model_name=model_inference_config.model_name)
return create_inference_pipeline(model_config, full_path_to_checkpoints, use_gpu)