Upgrade to Pytorch Lightning 1.5.5 (#591)

Anton Schwaighofer 2021-12-15 10:48:35 +00:00 committed by GitHub
Parent 4aa84b9f36
Commit e477c9dd83
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
35 changed files: 323 additions and 201 deletions


@ -13,7 +13,7 @@
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="InnerEye/ML/runner.py" />
<option name="PARAMETERS" value="--azureml --model=HelloContainer --cluster=training-nc12" />
<option name="PARAMETERS" value="--azureml --model=HelloContainer --cluster=training-nc12 --num_nodes=2 --log_level=DEBUG" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />


@ -13,7 +13,7 @@
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="InnerEye/ML/runner.py" />
<option name="PARAMETERS" value="--model=HelloWorld" />
<option name="PARAMETERS" value="--model=HelloContainer" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />


@ -63,6 +63,7 @@ gets uploaded to AzureML, by skipping all test folders.
- ([#584](https://github.com/microsoft/InnerEye-DeepLearning/pull/584)) SSL models write the optimizer state for the linear head to the checkpoint now.
- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) Pytorch is now non-deterministic by default. Upgrade to AzureML-SDK 1.36.
- ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
- ([#591](https://github.com/microsoft/InnerEye-DeepLearning/pull/591)) Upgrade Pytorch Lightning to 1.5.5
- ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to a new version of the hi-ml package
- ([#617](https://github.com/microsoft/InnerEye-DeepLearning/pull/617)) Provide an easier way for LightningContainers to add callbacks (see the sketch after this list).
- ([#596](https://github.com/microsoft/InnerEye-DeepLearning/pull/596)) Add `cudatoolkit=11.1` specification to environment.yml.
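The #617 entry above refers to the `get_callbacks` hook that `create_lightning_trainer` consumes later in this diff (`callbacks.extend(container.get_callbacks())`). A minimal sketch of a container using it; the container and callback names are hypothetical, only the `get_callbacks` method name is taken from this commit:

```python
from typing import List
from pytorch_lightning import Callback
from InnerEye.ML.lightning_container import LightningContainer


class PrintEpochCallback(Callback):
    # Hypothetical callback: report the epoch number when each training epoch ends.
    def on_train_epoch_end(self, trainer, pl_module, unused=None) -> None:
        print(f"Finished epoch {trainer.current_epoch}")


class MyContainer(LightningContainer):
    # Hypothetical container: any callbacks returned here are appended to the trainer's callback list.
    def get_callbacks(self) -> List[Callback]:
        return [PrintEpochCallback()]
```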


@ -5,10 +5,9 @@
from typing import Any, List, Optional
import torch
from torchmetrics import Metric
from pl_bolts.models.self_supervised import SSLEvaluator
from health_ml.utils import log_on_epoch
from torch.nn import functional as F
from pl_bolts.models.self_supervised import SSLEvaluator
from torch.nn import ModuleList, functional as F
from InnerEye.ML.SSL.encoders import get_encoder_output_dim
from InnerEye.ML.dataset.scalar_sample import ScalarItem
@ -38,18 +37,12 @@ class SSLClassifier(LightningModuleWithOptimizer, DeviceAwareModule):
n_classes=num_classes,
p=0.20)
if self.num_classes == 2:
self.train_metrics: List[Metric] = \
[AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()]
self.val_metrics: List[Metric] = \
[AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()]
self.train_metrics = ModuleList([AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()])
self.val_metrics = ModuleList([AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()])
else:
# Note that for multi-class, Accuracy05 is the standard multi-class accuracy.
self.train_metrics = [Accuracy05()]
self.val_metrics = [Accuracy05()]
def on_train_start(self) -> None:
for metric in [*self.train_metrics, *self.val_metrics]:
metric.to(device=self.device) # type: ignore
self.train_metrics = ModuleList([Accuracy05()])
self.val_metrics = ModuleList([Accuracy05()])
def train(self, mode: bool = True) -> Any:
self.classifier_head.train(mode)
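The switch from plain lists to `ModuleList` above registers the metrics as child modules, so they move together with the model and the manual device copy in `on_train_start` is no longer needed. A small sketch of that behaviour with a toy module (not code from this repository):

```python
import torch
from torch.nn import Linear, Module, ModuleList
from InnerEye.ML.lightning_metrics import Accuracy05


class ToyClassifier(Module):
    def __init__(self) -> None:
        super().__init__()
        self.head = Linear(8, 2)
        # As a ModuleList, the metric is a registered submodule: .to()/.cuda() on the
        # parent also moves its internal state, which a plain Python list would not do.
        self.val_metrics = ModuleList([Accuracy05()])


model = ToyClassifier()
if torch.cuda.is_available():
    model = model.cuda()  # the metric state moves along with the classifier head
```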


@ -3,8 +3,6 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from typing import Any, Dict, List, Optional, Set, Tuple, Union
import pytorch_lightning as pl
import torch
from pl_bolts.callbacks.ssl_online import SSLOnlineEvaluator
@ -14,8 +12,9 @@ from torch import Tensor as T
from torch.nn import SyncBatchNorm, functional as F
from torch.nn.parallel import DistributedDataParallel
from torchmetrics import Metric
from typing import Any, Dict, List, Optional, Set, Tuple, Union
from InnerEye.ML.SSL.utils import SSLDataModuleType
from InnerEye.ML.SSL.utils import SSLDataModuleType, add_submodules_to_same_device
from InnerEye.ML.lightning_metrics import Accuracy05, AreaUnderPrecisionRecallCurve, AreaUnderRocCurve
from InnerEye.ML.utils.layer_util import set_model_to_eval_mode
from health_ml.utils import log_on_epoch
@ -81,10 +80,17 @@ class SSLOnlineEvaluatorInnerEye(SSLOnlineEvaluator):
If training happens via DDP, SyncBatchNorm is enabled for the online evaluator, and it is converted to
a DDP module.
"""
for metric in [*self.train_metrics, *self.val_metrics]:
metric.to(device=pl_module.device) # type: ignore
for prefix, metrics in [("train", self.train_metrics), ("val", self.val_metrics)]:
add_submodules_to_same_device(pl_module, metrics, prefix=prefix)
self.evaluator.to(pl_module.device)
accelerator = trainer.accelerator_connector
if hasattr(trainer, "accelerator_connector"):
# This works with Lightning 1.3.8
accelerator = trainer.accelerator_connector
elif hasattr(trainer, "_accelerator_connector"):
# This works with Lightning 1.5.5
accelerator = trainer._accelerator_connector
else:
raise ValueError("Unable to retrieve the accelerator information")
if accelerator.is_distributed:
if accelerator.use_ddp:
self.evaluator = SyncBatchNorm.convert_sync_batchnorm(self.evaluator)
@ -152,7 +158,7 @@ class SSLOnlineEvaluatorInnerEye(SSLOnlineEvaluator):
for metric in self.val_metrics:
log_on_epoch(pl_module, f"ssl_online_evaluator/val/{metric.name}", metric)
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: # type: ignore
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None: # type: ignore
"""
Get and log training metrics, perform network update.
"""


@ -6,7 +6,7 @@
import logging
from enum import Enum
from pathlib import Path
from typing import Any, Optional
from typing import Any, Iterable, Optional
import torch
from yacs.config import CfgNode
@ -119,3 +119,23 @@ def SSLModelLoader(ssl_class: Any, num_classes: int) -> Any:
n_hidden=None)
return _wrap
def add_submodules_to_same_device(module: torch.nn.Module,
submodules: Iterable[torch.nn.Module],
prefix: str = "") -> None:
"""
Adds each of the given submodules to the "main" module, and moves them to the same device as the "main"
module. The submodules get a name derived from their class name, with the given prefix.
:param module: The module to which submodules should be added.
:param submodules: The submodules to add.
:param prefix: A string prefix that will be used to create the name of the submodule.
"""
def _class_name(o: Any) -> str:
return type(o).__name__
for m in submodules:
m.to(device=module.device) # type: ignore
module.add_module(f"{prefix}{_class_name(m)}", m)
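For context, this helper is what the SSL online evaluator earlier in this diff calls for its train/val metrics. A standalone sketch; the dummy module is hypothetical, while the metric classes and the "val" prefix mirror the calling code above:

```python
import pytorch_lightning as pl
import torch
from InnerEye.ML.SSL.utils import add_submodules_to_same_device
from InnerEye.ML.lightning_metrics import Accuracy05, AreaUnderRocCurve


class DummyModule(pl.LightningModule):
    # Hypothetical stand-in for the LightningModule that the callback receives.
    def __init__(self) -> None:
        super().__init__()
        self.layer = torch.nn.Linear(4, 2)


pl_module = DummyModule()
val_metrics = [AreaUnderRocCurve(), Accuracy05()]
# Moves each metric to pl_module.device and registers it under a name such as "valAreaUnderRocCurve".
add_submodules_to_same_device(pl_module, val_metrics, prefix="val")
```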


@ -32,8 +32,10 @@ REGRESSION_TEST_OUTPUT_FOLDER = "OUTPUT"
REGRESSION_TEST_AZUREML_FOLDER = "AZUREML_OUTPUT"
REGRESSION_TEST_AZUREML_PARENT_FOLDER = "AZUREML_PARENT_OUTPUT"
CONTENTS_MISMATCH = "Contents mismatch"
FILE_FORMAT_ERROR = "File format error"
MISSING_FILE = "Missing"
TEXT_FILE_SUFFIXES = [".txt", ".csv", ".json", ".html", ".md"]
CSV_SUFFIX = ".csv"
TEXT_FILE_SUFFIXES = [".txt", ".json", ".html", ".md"]
INFERENCE_DISABLED_WARNING = "Not performing comparison of model against baseline(s), because inference is currently " \
"disabled. If comparison is required, use either the inference_on_test_set or " \
@ -185,7 +187,7 @@ def get_comparison_baselines(outputs_folder: Path, azure_config: AzureConfig,
return comparison_baselines
def compare_files(expected: Path, actual: Path) -> str:
def compare_files(expected: Path, actual: Path, csv_relative_tolerance: float = 0.0) -> str:
"""
Compares two individual files for regression testing. It returns an empty string if the two files appear identical.
If the files are not identical, an error message with details is returned. This handles known text file formats,
@ -195,16 +197,35 @@ def compare_files(expected: Path, actual: Path) -> str:
:param expected: A file that contains the expected contents. The type of comparison (text or binary) is chosen
based on the extension of this file.
:param actual: A file that contains the actual contents.
:param csv_relative_tolerance: When comparing CSV files, use this as the maximum allowed relative discrepancy.
If 0.0, do not allow any discrepancy.
:return: An empty string if the files appear identical, or otherwise an error message with details.
"""
def print_lines(prefix: str, lines: List[str]) -> None:
num_lines = len(lines)
count = min(5, num_lines)
logging.debug(f"{prefix} {num_lines} lines, first {count} of those:")
logging.debug(os.linesep.join(lines[:count]))
logging.info(f"{prefix} {num_lines} lines, first {count} of those:")
logging.info(os.linesep.join(lines[:count]))
if expected.suffix in TEXT_FILE_SUFFIXES:
def try_read_csv(prefix: str, file: Path) -> Optional[pd.DataFrame]:
try:
return pd.read_csv(file)
except Exception as ex:
logging.info(f"{prefix} file can't be read as CSV: {str(ex)}")
return None
if expected.suffix == CSV_SUFFIX:
expected_df = try_read_csv("Expected", expected)
actual_df = try_read_csv("Actual", actual)
if expected_df is None or actual_df is None:
return FILE_FORMAT_ERROR
try:
pd.testing.assert_frame_equal(actual_df, expected_df, rtol=csv_relative_tolerance)
except Exception as ex:
logging.info(str(ex))
return CONTENTS_MISMATCH
elif expected.suffix in TEXT_FILE_SUFFIXES:
# Compare line-by-line to avoid issues with line separators
expected_lines = expected.read_text().splitlines()
actual_lines = actual.read_text().splitlines()
@ -216,12 +237,13 @@ def compare_files(expected: Path, actual: Path) -> str:
expected_binary = expected.read_bytes()
actual_binary = actual.read_bytes()
if expected_binary != actual_binary:
logging.debug(f"Expected {len(expected_binary)} bytes, actual {len(actual_binary)} bytes")
logging.info(f"Expected {len(expected_binary)} bytes, actual {len(actual_binary)} bytes")
return CONTENTS_MISMATCH
return ""
def compare_folder_contents(expected_folder: Path,
csv_relative_tolerance: float,
actual_folder: Optional[Path] = None,
run: Optional[Run] = None) -> List[str]:
"""
@ -230,9 +252,12 @@ def compare_folder_contents(expected_folder: Path,
(or the AzureML run), with exactly the same contents, in the same folder structure.
For example, if there is a file "<expected>/foo/bar/contents.txt", then there must also be a file
"<actual>/foo/bar/contents.txt"
:param expected_folder: A folder with files that are expected to be present.
:param actual_folder: The output folder with the actually produced files.
:param run: An AzureML run
:param csv_relative_tolerance: When comparing CSV files, use this as the maximum allowed relative discrepancy.
If 0.0, do not allow any discrepancy.
:return: A list of human readable error messages, with message and file path. If no errors are found, the list is
empty.
"""
@ -256,7 +281,8 @@ def compare_folder_contents(expected_folder: Path,
run.download_file(name=str(file_relative), output_file_path=str(actual_file))
else:
raise ValueError("One of the two arguments run, actual_folder must be provided.")
message = compare_files(expected=file, actual=actual_file) if actual_file.exists() else MISSING_FILE
message = compare_files(expected=file, actual=actual_file,
csv_relative_tolerance=csv_relative_tolerance) if actual_file.exists() else MISSING_FILE
if message:
messages.append(f"{message}: {file_relative}")
logging.info(f"File {file_relative}: {message or 'OK'}")
@ -265,15 +291,18 @@ def compare_folder_contents(expected_folder: Path,
return messages
def compare_folders_and_run_outputs(expected: Path, actual: Path) -> None:
def compare_folders_and_run_outputs(expected: Path, actual: Path, csv_relative_tolerance: float) -> None:
"""
Compares the actual set of run outputs in the `actual` folder against an expected set of files in the `expected`
folder. The `expected` folder can have two special subfolders AZUREML_OUTPUT and AZUREML_PARENT_OUTPUT, that
contain files that are expected to be present in the AzureML run context of the present run (AZUREML_OUTPUT)
or the run context of the parent run (AZUREML_PARENT_OUTPUT).
If a file is missing, or does not have the expected contents, an exception is raised.
:param expected: A folder with files that are expected to be present.
:param actual: The output folder with the actually produced files.
:param csv_relative_tolerance: When comparing CSV files, use this as the maximum allowed relative discrepancy.
If 0.0, do not allow any discrepancy.
"""
if not expected.is_dir():
raise ValueError(f"Folder with expected files does not exist: {expected}")
@ -289,7 +318,10 @@ def compare_folders_and_run_outputs(expected: Path, actual: Path) -> None:
if actual_folder is None and run_to_compare is None:
raise ValueError(f"The set of expected test results in {expected} contains a folder "
f"{subfolder}, but there is no (parent) run to compare against.")
new_messages = compare_folder_contents(folder, actual_folder=actual_folder, run=run_to_compare)
new_messages = compare_folder_contents(folder,
actual_folder=actual_folder,
run=run_to_compare,
csv_relative_tolerance=csv_relative_tolerance)
if new_messages:
messages.append(f"Issues in {message_prefix}:")
messages.extend(new_messages)
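The tolerance-based CSV comparison above relies on `pandas.testing.assert_frame_equal` with a relative tolerance. A standalone illustration with made-up numbers, mirroring the unit test added later in this diff:

```python
import pandas as pd

expected_df = pd.DataFrame({"foo": [1.0], "bar": [10.0]})
actual_df = pd.DataFrame({"foo": [1.0001], "bar": [10.001]})

# Each value differs by a relative 1e-4, so a tolerance of 1e-3 passes silently ...
pd.testing.assert_frame_equal(actual_df, expected_df, rtol=1e-3)

# ... whereas 9e-5 is too strict and raises, which compare_files reports as CONTENTS_MISMATCH.
try:
    pd.testing.assert_frame_equal(actual_df, expected_df, rtol=9e-5)
except AssertionError as ex:
    print(f"Mismatch detected: {ex}")
```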


@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple
import numpy as np
import torch
from pytorch_lightning import LightningDataModule, LightningModule
from pytorch_lightning.metrics import MeanAbsoluteError
from torchmetrics.regression import MeanAbsoluteError
from torch.optim import Adam, Optimizer
from torch.optim.lr_scheduler import StepLR, _LRScheduler
from torch.utils.data import DataLoader, Dataset


@ -243,6 +243,10 @@ class WorkflowParams(param.Parameterized):
"folder, and their contents must match exactly. When running in AzureML, you need to "
"ensure that this folder is part of the snapshot that gets uploaded. The path should "
"be relative to the repository root directory.")
regression_test_csv_tolerance: float = \
param.Number(default=0.0, allow_None=False,
doc="When comparing CSV files during regression tests, use this value as the maximum allowed "
"relative difference of actual and expected results. Default: 0.0 (must match exactly)")
def validate(self) -> None:
if sum([bool(param) for param in [self.weights_url, self.local_weights_path, self.model_id]]) > 1:
@ -583,7 +587,7 @@ class TrainerParams(param.Parameterized):
param.Boolean(default=False,
doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If "
"'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, but "
"you may see training speed increases.")
"you may see significant training speed increases.")
pl_find_unused_parameters: bool = \
param.Boolean(default=False,
doc="Controls the PyTorch Lightning flag 'find_unused_parameters' for the DDP plugin. "
@ -606,7 +610,7 @@ class TrainerParams(param.Parameterized):
monitor_gpu: bool = param.Boolean(default=False,
doc="If True, add the GPUStatsMonitor callback to the Lightning trainer object. "
"This will write GPU utilization metrics every 50 batches by default.")
monitor_loading: bool = param.Boolean(default=True,
monitor_loading: bool = param.Boolean(default=False,
doc="If True, add the BatchTimeCallback callback to the Lightning trainer "
"object. This will monitor how long individual batches take to load.")


@ -253,14 +253,6 @@ class InnerEyeLightning(LightningModule):
self.train_epoch_metrics_logger.flush()
self.val_epoch_metrics_logger.flush()
@property
def use_sync_dist(self) -> bool:
"""
Returns True if metric logging should use sync_dist=True. This is read off from the use_ddp flag of the trainer.
"""
assert isinstance(self.trainer, Trainer)
return self.trainer.accelerator_connector.use_ddp
def training_epoch_end(self, outputs: List[Any]) -> None:
# Write out all the metrics that have been accumulated in the StoringLogger in the previous epoch.
# Metrics for the very last epoch are written in on_train_end
@ -314,32 +306,29 @@ class InnerEyeLightning(LightningModule):
name: Union[MetricType, str],
value: Any,
is_training: bool,
reduce_fx: Callable = torch.mean,
sync_dist_override: Optional[bool] = None,
sync_dist_op: Any = "mean") -> None:
reduce_fx: Union[str, Callable] = "mean",
sync_dist_override: Optional[bool] = None) -> None:
"""
Logs a metric to Pytorch Lightning with the on_epoch flag set. The metric will get a prefix indicating
if it is a training or a validation metric. A custom reducer function can be provided.
The method also ensures that the correct synchronization across nodes is used. If the value to log is a
floating point, it is converted to a Tensor on the current device to enable synchronization.
:param sync_dist_override: If not None, use this value for the sync_dist argument to self.log. If None,
set it automatically depending on the use of DDP.
:param name: The name of the metric to log
:param value: The value of the metric. This can be a tensor, floating point value, or a Metric class.
:param is_training: If true, give the metric a "train/" prefix, otherwise a "val/" prefix.
:param reduce_fx: The reduce function to apply to step values. Default: torch.mean
:param sync_dist_op: The reduce operation to use when synchronizing the tensors across GPUs. This must be
a value recognized by sync_ddp: Either 'None' to use 'sum' as aggregate, or 'mean' or 'avg'
:param reduce_fx: The reduce function to use when synchronizing the tensors across GPUs. This must be
a value recognized by sync_ddp: "sum", "mean"
"""
metric_name = name if isinstance(name, str) else name.value
prefix = TRAIN_PREFIX if is_training else VALIDATION_PREFIX
sync_dist = self.use_sync_dist if sync_dist_override is None else sync_dist_override
log_on_epoch(self,
name=prefix + metric_name,
value=value,
sync_dist=sync_dist,
reduce_fx=reduce_fx,
sync_dist_op=sync_dist_op)
sync_dist=sync_dist_override,
reduce_fx=reduce_fx)
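As a usage sketch of the method documented above: a subclass would call it from a training or validation step with one of the string reducers ("mean", "sum") or a callable. The hi-ml `log_on_epoch` helper that it delegates to accepts the same keywords, so a minimal hypothetical example (metric name and value are made up) looks like this:

```python
import torch
from pytorch_lightning import LightningModule
from health_ml.utils import log_on_epoch


class ExampleModule(LightningModule):
    # Hypothetical training_step: the per-step value is aggregated over the epoch with the given reducer.
    def training_step(self, batch, batch_idx):
        loss = torch.tensor(0.0, requires_grad=True)
        log_on_epoch(self, name="train/loss", value=loss, reduce_fx="mean")
        return loss
```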
def store_epoch_results(self, metrics: DictStrFloat, epoch: int, is_training: bool) -> None:
"""


@ -10,7 +10,7 @@ import torch
import torch.nn.functional as F
import torchmetrics as metrics
from torchmetrics import Metric
from pytorch_lightning.metrics.functional import accuracy, auc, auroc, precision_recall_curve, roc
from torchmetrics.functional import accuracy, auc, auroc, precision_recall_curve, roc
from torch.nn import ModuleList
from InnerEye.Common.metrics_constants import AVERAGE_DICE_SUFFIX, MetricType, TRAIN_PREFIX, VALIDATION_PREFIX


@ -92,11 +92,11 @@ class SegmentationLightning(InnerEyeLightning):
# apply mask if required
if mask is not None:
posteriors = image_util.apply_mask_to_posteriors(posteriors=posteriors, mask=mask)
posteriors = image_util.apply_mask_to_posteriors(posteriors=posteriors, mask=mask) # type: ignore
# post process posteriors to compute result
segmentation = image_util.posteriors_to_segmentation(posteriors=posteriors)
self.compute_metrics(cropped_sample, segmentation, is_training)
segmentation = image_util.posteriors_to_segmentation(posteriors=posteriors) # type: ignore
self.compute_metrics(cropped_sample, segmentation, is_training) # type: ignore
self.write_loss(is_training, loss)
return loss
@ -114,10 +114,10 @@ class SegmentationLightning(InnerEyeLightning):
# Dice NaN means that both ground truth and prediction are empty.
dice_per_crop_and_class = compute_dice_across_patches(
segmentation=segmentation,
ground_truth=cropped_sample.labels_center_crop,
ground_truth=cropped_sample.labels_center_crop, # type: ignore
allow_multiple_classes_for_each_pixel=True)[:, 1:]
# Number of foreground voxels per class, across all crops
foreground_voxels = metrics_util.get_number_of_voxels_per_class(cropped_sample.labels)[:, 1:]
foreground_voxels = metrics_util.get_number_of_voxels_per_class(cropped_sample.labels)[:, 1:] # type: ignore
# Store Dice and voxel count per sample in the minibatch. We need a custom aggregation logic for Dice
# because it can be NaN. Also use custom logging for voxel count because Lightning's batch-size weighted
# average has a bug.
@ -150,8 +150,7 @@ class SegmentationLightning(InnerEyeLightning):
self.log_on_epoch(name=MetricType.SUBJECT_COUNT,
value=num_subjects,
is_training=is_training,
reduce_fx=torch.sum,
sync_dist_op="sum")
reduce_fx=torch.sum)
def training_or_validation_epoch_end(self, is_training: bool) -> None:
"""
@ -292,7 +291,7 @@ class ScalarLightning(InnerEyeLightning):
logger = self.train_subject_outputs_logger if is_training else self.val_subject_outputs_logger # type: ignore
logger.flush()
def transfer_batch_to_device(self, batch: Any, device: torch.device) -> Any: # type: ignore
def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_idx: int) -> Any: # type: ignore
"""
For sequence models, transfer the nested lists of items to the given GPU device.
For all other models, this relies on the superclass to move the batch of data to the GPU.


@ -14,7 +14,6 @@ import numpy as np
import torch
import torch.nn.functional as F
from azureml.core import Run
from numpy.core.numeric import NaN
from InnerEye.Azure.azure_util import get_run_context_or_default
from InnerEye.Common.metrics_constants import LoggingColumns, MetricType
@ -174,9 +173,9 @@ def calculate_metrics_per_class(segmentation: np.ndarray,
continue
# Skip but record if nan_image
elif nan_images[i]:
add_metric(MetricType.DICE, NaN)
add_metric(MetricType.HAUSDORFF_mm, NaN)
add_metric(MetricType.MEAN_SURFACE_DIST_mm, NaN)
add_metric(MetricType.DICE, np.nan)
add_metric(MetricType.HAUSDORFF_mm, np.nan)
add_metric(MetricType.MEAN_SURFACE_DIST_mm, np.nan)
continue
check_size_matches(prediction, ground_truth[i], arg1_name="prediction", arg2_name="ground_truth")
if not is_binary_array(prediction):


@ -9,7 +9,7 @@ from pathlib import Path
from typing import Any, List, Optional, Tuple, TypeVar
from pytorch_lightning import Callback, LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import GPUStatsMonitor, ModelCheckpoint
from pytorch_lightning.callbacks import GPUStatsMonitor, ModelCheckpoint, TQDMProgressBar
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.plugins import DDPPlugin
@ -18,14 +18,14 @@ from InnerEye.Azure.azure_util import RUN_CONTEXT, is_offline_run_context
from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME, change_working_directory
from InnerEye.Common.resource_monitor import ResourceMonitor
from InnerEye.ML.common import ARGS_TXT, ModelExecutionMode, RECOVERY_CHECKPOINT_FILE_NAME, VISUALIZATION_FOLDER
from InnerEye.ML.utils.checkpoint_handling import create_best_checkpoint
from InnerEye.ML.lightning_base import InnerEyeContainer, InnerEyeLightning
from InnerEye.ML.lightning_container import LightningContainer
from InnerEye.ML.lightning_loggers import StoringLogger
from InnerEye.ML.lightning_models import SUBJECT_OUTPUT_PER_RANK_PREFIX, ScalarLightning, \
get_subject_output_file_per_rank
from InnerEye.ML.utils.checkpoint_handling import create_best_checkpoint
from health_azure.utils import is_global_rank_zero, is_local_rank_zero
from health_ml.utils import AzureMLLogger, AzureMLProgressBar, BatchTimeCallback, log_on_epoch
from health_ml.utils import AzureMLLogger, AzureMLProgressBar, log_on_epoch
TEMP_PREFIX = "temp/"
@ -66,7 +66,7 @@ class InnerEyeRecoveryCheckpointCallback(ModelCheckpoint):
super().__init__(dirpath=str(container.checkpoint_folder),
monitor="epoch_started",
filename=RECOVERY_CHECKPOINT_FILE_NAME + "_{epoch}",
period=container.recovery_checkpoint_save_interval,
every_n_epochs=container.recovery_checkpoint_save_interval,
save_top_k=container.recovery_checkpoints_save_last_k,
mode="max",
save_last=False)
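The rename from `period` to `every_n_epochs` above follows the ModelCheckpoint API in Lightning 1.5. A minimal sketch with placeholder values for the folder and interval:

```python
from pytorch_lightning.callbacks import ModelCheckpoint

recovery_checkpoint = ModelCheckpoint(
    dirpath="outputs/checkpoints",  # placeholder folder
    filename="recovery_{epoch}",
    every_n_epochs=1,               # this keyword replaces `period` from earlier Lightning versions
    save_top_k=1,
    save_last=False,
)
```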
@ -74,6 +74,7 @@ class InnerEyeRecoveryCheckpointCallback(ModelCheckpoint):
def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule, unused: bool = None) -> None:
# The metric to monitor must be logged on all ranks in distributed training
log_on_epoch(pl_module, name="epoch_started", value=trainer.current_epoch, sync_dist=False) # type: ignore
super().on_train_epoch_end(trainer, pl_module)
def create_lightning_trainer(container: LightningContainer,
@ -92,17 +93,23 @@ def create_lightning_trainer(container: LightningContainer,
logging.debug(f"resume_from_checkpoint: {resume_from_checkpoint}")
num_gpus = container.num_gpus_per_node()
effective_num_gpus = num_gpus * num_nodes
# Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of GPU memory).
if effective_num_gpus > 1:
accelerator: Optional[str] = "ddp"
# Initialize the DDP plugin. The default for pl_find_unused_parameters is False. If True, the plugin prints out
# lengthy warnings about the performance impact of find_unused_parameters.
plugins = [DDPPlugin(num_nodes=num_nodes, sync_batchnorm=True,
find_unused_parameters=container.pl_find_unused_parameters)]
strategy = None
if effective_num_gpus == 0:
accelerator = "cpu"
devices = 1
message = "CPU"
else:
accelerator = None
plugins = []
logging.info(f"Using {num_gpus} GPUs per node with accelerator '{accelerator}'")
accelerator = "gpu"
devices = num_gpus
message = f"{devices} GPU"
if effective_num_gpus > 1:
# Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of
# GPU memory).
# Initialize the DDP plugin. The default for pl_find_unused_parameters is False. If True, the plugin
# prints out lengthy warnings about the performance impact of find_unused_parameters.
strategy = DDPPlugin(find_unused_parameters=container.pl_find_unused_parameters)
message += "s per node with DDP"
logging.info(f"Using {message}")
tensorboard_logger = TensorBoardLogger(save_dir=str(container.logs_folder), name="Lightning", version="")
loggers = [tensorboard_logger, AzureMLLogger(False)]
storing_logger = StoringLogger()
@ -111,8 +118,7 @@ def create_lightning_trainer(container: LightningContainer,
precision = 32 if num_gpus == 0 else 16 if container.use_mixed_precision else 32
# The next two flags control the settings in torch.backends.cudnn.deterministic and torch.backends.cudnn.benchmark
# https://pytorch.org/docs/stable/notes/randomness.html
# For the classification models, we observed only a small performance deterioration (increase in 10sec on total
# training time of 22min) when switching to deterministic.
# Note that switching to deterministic models can have a large performance downside.
if container.pl_deterministic:
deterministic = True
benchmark = False
@ -134,7 +140,9 @@ def create_lightning_trainer(container: LightningContainer,
recovery_checkpoint_callback,
]
if container.monitor_loading:
callbacks.append(BatchTimeCallback())
# TODO antonsc: Remove after fixing the callback.
raise NotImplementedError("Monitoring batch loading times has been temporarily disabled.")
# callbacks.append(BatchTimeCallback())
if num_gpus > 0 and container.monitor_gpu:
logging.info("Adding monitoring for GPU utilization")
callbacks.append(GPUStatsMonitor(intra_step_time=True, inter_step_time=True))
@ -150,19 +158,23 @@ def create_lightning_trainer(container: LightningContainer,
callbacks.extend(container.get_callbacks())
is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
progress_bar_refresh_rate = container.pl_progress_bar_refresh_rate
if progress_bar_refresh_rate is None:
progress_bar_refresh_rate = 50
logging.info(f"The progress bar refresh rate is not set. Using a default of {progress_bar_refresh_rate}. "
f"To change, modify the pl_progress_bar_refresh_rate field of the container.")
if is_azureml_run:
if progress_bar_refresh_rate is None:
progress_bar_refresh_rate = 50
logging.info(f"The progress bar refresh rate is not set. Using a default of {progress_bar_refresh_rate}. "
f"To change, modify the pl_progress_bar_refresh_rate field of the container.")
callbacks.append(AzureMLProgressBar(refresh_rate=progress_bar_refresh_rate,
write_to_logging_info=True,
print_timestamp=False))
else:
callbacks.append(TQDMProgressBar(refresh_rate=progress_bar_refresh_rate))
# Read out additional model-specific args here.
# We probably want to keep essential ones like numgpu and logging.
trainer = Trainer(default_root_dir=str(container.outputs_folder),
deterministic=deterministic,
benchmark=benchmark,
accelerator=accelerator,
plugins=plugins,
strategy=strategy,
max_epochs=container.num_epochs,
# Both these arguments can be integers or floats. If integers, it is the number of batches.
# If float, it's the fraction of batches. We default to 1.0 (processing all batches).
@ -172,12 +184,11 @@ def create_lightning_trainer(container: LightningContainer,
check_val_every_n_epoch=container.pl_check_val_every_n_epoch,
callbacks=callbacks,
logger=loggers,
progress_bar_refresh_rate=progress_bar_refresh_rate,
num_nodes=num_nodes,
gpus=num_gpus,
devices=devices,
precision=precision,
sync_batchnorm=True,
terminate_on_nan=container.detect_anomaly,
detect_anomaly=container.detect_anomaly,
profiler=container.pl_profiler,
resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None,
**additional_args)
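A condensed sketch of the Trainer arguments the function above now assembles under Lightning 1.5: `accelerator`/`devices`/`strategy` take over from the old `gpus`/`accelerator`/`plugins` combination, and `detect_anomaly` replaces `terminate_on_nan`. The concrete values are placeholders:

```python
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DDPPlugin

num_gpus = torch.cuda.device_count()
trainer = Trainer(
    accelerator="gpu" if num_gpus > 0 else "cpu",
    devices=num_gpus if num_gpus > 0 else 1,
    strategy=DDPPlugin(find_unused_parameters=False) if num_gpus > 1 else None,
    precision=16 if num_gpus > 0 else 32,
    max_epochs=2,
    detect_anomaly=False,
)
```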


@ -377,7 +377,8 @@ class MLRunner:
logging.info("Comparing the current results against stored results")
if self.is_normal_run_or_crossval_child_0():
compare_folders_and_run_outputs(expected=self.container.regression_test_folder,
actual=self.container.outputs_folder)
actual=self.container.outputs_folder,
csv_relative_tolerance=self.container.regression_test_csv_tolerance)
else:
logging.info("Skipping because this is not cross-validation child run 0.")
@ -718,7 +719,9 @@ class MLRunner:
def are_sibling_runs_finished(self) -> bool:
"""
Checks if all child runs (except the current run) of the current run's parent are completed or failed.
Checks if all child runs (except the current run) of the current run's parent are completed, failed,
or cancelled.
:return: True if all sibling runs of the current run have finished (they either completed successfully,
failed, or were cancelled). False if any of them is still pending (running or queued).
"""
@ -729,7 +732,7 @@ class MLRunner:
expected_number_cross_validation_splits=n_splits)
pending_runs = [x.id for x in child_runs
if (x.id != RUN_CONTEXT.id)
and (x.get_status() not in [RunStatus.COMPLETED, RunStatus.FAILED])]
and (x.get_status() not in [RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.CANCELED])]
all_runs_finished = len(pending_runs) == 0
if not all_runs_finished:
logging.info(f"Waiting for sibling run(s) to finish: {pending_runs}")


@ -117,7 +117,6 @@ class Runner:
:param model_deployment_hook: an optional function for deploying a model in an application-specific way.
If present, it should take a model config (SegmentationModelBase), an AzureConfig, and an AzureML
Model as arguments, and return an optional Path and a further object of any type.
:param command_line_args: command-line arguments to use; if None, use sys.argv.
"""
def __init__(self,
@ -230,6 +229,8 @@ class Runner:
and not self.lightning_container.azure_dataset_id:
raise ValueError("When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
"property must be set.")
# https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
env_variables = {"CUBLAS_WORKSPACE_CONFIG": ":4096:8"} if self.lightning_container.pl_deterministic else {}
source_config = SourceConfig(
root_folder=self.project_root,
entry_script=Path(sys.argv[0]).resolve(),
@ -238,7 +239,8 @@ class Runner:
hyperdrive_config_func=(self.model_config.get_hyperdrive_config if self.model_config
else self.lightning_container.get_hyperdrive_config),
# For large jobs, upload of results can time out because of large checkpoint files. Default is 600
upload_timeout_seconds=86400
upload_timeout_seconds=86400,
environment_variables=env_variables
)
# Reduce the size of the snapshot by adding unused folders to amlignore. The Test* subfolders are only needed
# when running pytest.
@ -259,6 +261,10 @@ class Runner:
"""
A function that will be called right after job submission.
"""
# Set the default display name to what was provided as the "tag". This will affect single runs
# and Hyperdrive parent runs
if self.azure_config.tag:
azure_run.display_name = self.azure_config.tag
# Add an extra tag that depends on the run that was actually submitted. This is used for later filtering
# run in cross validation analysis
recovery_id = create_run_recovery_id(azure_run)
@ -332,11 +338,14 @@ class Runner:
commandline_args=" ".join(source_config.script_params)),
after_submission=after_submission_hook,
hyperdrive_config=hyperdrive_config)
# Set the default display name to what was provided as the "tag"
if self.azure_config.tag:
azure_run_info.run.display_name = self.azure_config.tag
if self.azure_config.tag and azure_run_info.run:
if self.lightning_container.perform_cross_validation:
# This code is only reached inside Azure. Set display name again - this will now affect
# Hyperdrive child runs (for other jobs, this has already been done after submission)
cv_index = self.lightning_container.cross_validation_split_index
full_display_name = f"{self.azure_config.tag} {cv_index}"
azure_run_info.run.display_name = full_display_name
else:
# compute_cluster_name is a required parameter in early versions of the HI-ML package
azure_run_info = submit_to_azure_if_needed(
input_datasets=input_datasets,
submit_to_azureml=False)
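The `CUBLAS_WORKSPACE_CONFIG` variable set above is what cuBLAS requires once deterministic algorithms are enforced, which is also why several tests later in this diff turn `pl_deterministic` off for operations without a deterministic CUDA kernel. A short sketch, under the assumption that Lightning's `deterministic` flag maps to torch's `use_deterministic_algorithms`:

```python
import os
import torch

# Per https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility, cuBLAS needs this
# workspace setting before the first CUDA call when deterministic algorithms are enforced;
# the PR therefore passes it to AzureML as an environment variable of the job.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
torch.use_deterministic_algorithms(True)
# CUDA ops without a deterministic implementation (e.g. index_add_) now raise instead of
# silently producing non-reproducible results.
```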


@ -224,9 +224,9 @@ def r2_score(model_output: Union[torch.Tensor, np.ndarray], label: Union[torch.T
Computes the coefficient of determination R2. Represents the proportion of variance explained
by the (independent) variables in the model. R2 = 1 - Mean(SquaredErrors) / Variance(Labels)
"""
if torch.is_tensor(label):
if isinstance(label, torch.Tensor):
label = label.detach().cpu().numpy()
if torch.is_tensor(model_output):
if isinstance(model_output, torch.Tensor):
model_output = model_output.detach().cpu().numpy()
return sklearn_r2_score(label, model_output)
@ -260,10 +260,10 @@ def convert_input_and_label(model_output: Union[torch.Tensor, np.ndarray],
model_output = torch.tensor(model_output)
if not torch.is_tensor(label):
label = torch.tensor(label)
return model_output.float(), label.float()
return model_output.float(), label.float() # type: ignore
def is_missing_ground_truth(ground_truth: np.array) -> bool:
def is_missing_ground_truth(ground_truth: np.ndarray) -> bool:
"""
calculate_metrics_per_class in metrics.py and plot_contours_for_all_classes in plotting.py both
check whether there is ground truth missing using this simple check for NaN value at 0, 0, 0.
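A quick numeric check (made-up values) of the R2 definition quoted in the docstring above, confirming that it agrees with scikit-learn's implementation:

```python
import numpy as np
from sklearn.metrics import r2_score as sklearn_r2_score

label = np.array([1.0, 2.0, 3.0, 4.0])
model_output = np.array([1.1, 1.9, 3.2, 3.8])

# R2 = 1 - Mean(SquaredErrors) / Variance(Labels)
manual_r2 = 1 - np.mean((label - model_output) ** 2) / np.var(label)
assert np.isclose(manual_r2, sklearn_r2_score(label, model_output))  # both evaluate to 0.98
```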


@ -1,3 +1,3 @@
subject_count,loss,learning_rate,Dice/AverageAcrossStructures,Dice/spinalcord,Dice/lung_r,Dice/lung_l,VoxelCount/spinalcord,VoxelCount/lung_r,VoxelCount/lung_l,epoch,cross_validation_split_index
2.000000,0.718559,0.000100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,98256.000000,0,-1
2.000000,0.792989,0.000090,0.000000,0.000000,0.000000,0.000000,0.000000,43307.000000,13992.500000,1,-1
2.000000,0.718717,0.000100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,126273.000000,0,-1
2.000000,0.775692,0.000090,0.000000,0.000000,0.000000,0.000000,0.000000,84030.000000,0.000000,1,-1



@ -1,3 +1,3 @@
subject_count,loss,Dice/AverageAcrossStructures,Dice/spinalcord,Dice/lung_r,Dice/lung_l,VoxelCount/spinalcord,VoxelCount/lung_r,VoxelCount/lung_l,epoch,cross_validation_split_index
2.000000,0.715468,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,89502.000000,0,-1
2.000000,0.715476,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,89502.000000,1,-1
2.000000,0.716739,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,84282.000000,0,-1
2.000000,0.716731,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,84282.000000,1,-1



@ -1,3 +1,3 @@
subject_count,loss,learning_rate,Dice/AverageAcrossStructures,Dice/spinalcord,Dice/lung_r,Dice/lung_l,VoxelCount/spinalcord,VoxelCount/lung_r,VoxelCount/lung_l,epoch,cross_validation_split_index
4.000000,0.753852,0.000100,0.000000,0.000000,0.000000,0.000000,0.000000,18609.000000,61179.000000,0,-1
4.000000,0.773389,0.000090,0.000000,0.000000,0.000000,0.000000,0.000000,33453.000000,24258.500000,1,-1
4.000000,0.716914,0.000100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,99342.000000,0,-1
4.000000,0.773825,0.000090,0.000000,0.000000,0.000000,0.000000,181.250000,83803.250000,122.250000,1,-1



@ -1,3 +1,3 @@
subject_count,loss,Dice/AverageAcrossStructures,Dice/spinalcord,Dice/lung_r,Dice/lung_l,VoxelCount/spinalcord,VoxelCount/lung_r,VoxelCount/lung_l,epoch,cross_validation_split_index
4.000000,0.758059,0.000000,0.000000,0.000000,0.000000,0.000000,32021.000000,48380.750000,0,-1
4.000000,0.758054,0.000000,0.000000,0.000000,0.000000,0.000000,32021.000000,48380.750000,1,-1
4.000000,0.729041,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,65335.000000,0,-1
4.000000,0.729027,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,65335.000000,1,-1



@ -1,3 +1,3 @@
subject_count,loss,learning_rate,Dice/AverageAcrossStructures,Dice/spinalcord,Dice/lung_r,Dice/lung_l,VoxelCount/spinalcord,VoxelCount/lung_r,VoxelCount/lung_l,epoch,cross_validation_split_index
2.000000,0.720039,0.000100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,110279.000000,0,0
2.000000,0.793326,0.000090,0.000000,0.000000,0.000000,0.000000,0.000000,43307.000000,3900.500000,1,0
2.000000,0.723947,0.000100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,148093.000000,0,0
2.000000,0.768604,0.000090,0.000000,0.000000,0.000000,0.000000,0.000000,86614.000000,0.000000,1,0



@ -1,3 +1,3 @@
subject_count,loss,Dice/AverageAcrossStructures,Dice/spinalcord,Dice/lung_r,Dice/lung_l,VoxelCount/spinalcord,VoxelCount/lung_r,VoxelCount/lung_l,epoch,cross_validation_split_index
2.000000,0.721449,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,54751.500000,0,0
2.000000,0.721449,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,54751.500000,1,0
2.000000,0.715168,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,94367.500000,0,0
2.000000,0.715166,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,94367.500000,1,0



@ -12,8 +12,9 @@ up the most recently run AzureML job from most_recent_run.txt
import os
import shutil
import sys
import tempfile
from pathlib import Path
from typing import List
from typing import List, Optional
from unittest import mock
import numpy as np
@ -57,7 +58,7 @@ from health_azure.himl import RUN_RECOVERY_FILE
FALLBACK_SINGLE_RUN = "refs_pull_606_merge:refs_pull_606_merge_1638867172_17ba8dc5"
FALLBACK_ENSEMBLE_RUN = "refs_pull_606_merge:HD_b8a6ad93-8c19-45de-8ea1-f87fce92c3bd"
FALLBACK_2NODE_RUN = "refs_pull_606_merge:refs_pull_606_merge_1638867224_8d8072fe"
FALLBACK_2NODE_RUN = "refs_pull_593_merge:refs_pull_591_merge_1639416130_e5d29ba7"
FALLBACK_CV_GLAUCOMA = "refs_pull_545_merge:HD_72ecc647-07c3-4353-a538-620346114ebd"
FALLBACK_HELLO_CONTAINER_RUN = "refs_pull_606_merge:refs_pull_606_merge_1638867108_789991ac"
@ -386,6 +387,28 @@ def test_register_and_score_model(test_output_dirs: OutputFolderForTests) -> Non
assert_nifti_content(str(expected_segmentation_path), expected_shape, image_header, [3], np.ubyte)
def get_job_log_file(run: Run, index: Optional[int] = None) -> str:
"""
Reads the job log file (70_driver_log.txt or std_log.txt) of the given job. If an index is provided, get
the matching file from a multi-node job.
:return: The contents of the job log file.
"""
assert run.status == RunStatus.COMPLETED
files = run.get_file_names()
suffix = (f"_{index}" if index is not None else "") + ".txt"
file1 = "azureml-logs/70_driver_log" + suffix
file2 = "user_logs/std_log" + suffix
if file1 in files:
file = file1
elif file2 in files:
file = file2
else:
raise ValueError(f"No log file ({file1} or {file2}) present in the run. Existing files: {files}")
downloaded = tempfile.NamedTemporaryFile().name
run.download_file(name=file, output_file_path=downloaded)
return Path(downloaded).read_text()
@pytest.mark.after_training_2node
def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None:
"""
@ -393,19 +416,9 @@ def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None:
"""
run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)
assert run.status == RunStatus.COMPLETED
files = run.get_file_names()
# There are two nodes, so there should be one log file per node.
log0_path = "azureml-logs/70_driver_log_0.txt"
log1_path = "azureml-logs/70_driver_log_1.txt"
assert log0_path in files, "Node rank 0 log file is missing"
assert log1_path in files, "Node rank 1 log file is missing"
# Download both log files and check their contents
log0 = test_output_dirs.root_dir / log0_path
log1 = test_output_dirs.root_dir / log1_path
run.download_file(log0_path, output_file_path=str(log0))
run.download_file(log1_path, output_file_path=str(log1))
log0_txt = log0.read_text()
log1_txt = log1.read_text()
log0_txt = get_job_log_file(run, index=0)
log1_txt = get_job_log_file(run, index=1)
# Only the node at rank 0 should perform certain startup activities, like visualizing crops.
# Similarly, inference should only run on one node.
for in_log0_only in ["Visualizing the effect of sampling random crops for training",
@ -418,8 +431,8 @@ def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None:
assert training_indicator in log1_txt
# Check diagnostic messages that show if DDP was set up correctly. This could fail if Lightning
# changes its diagnostic outputs.
assert "initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/4" in log0_txt
assert "initializing ddp: GLOBAL_RANK: 2, MEMBER: 3/4" in log1_txt
assert "initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4" in log0_txt
assert "initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/4" in log1_txt
@pytest.mark.skip("The recovery job hangs after completing on AML")
@ -443,19 +456,9 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None:
main()
run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)
assert run.status == RunStatus.COMPLETED
files = run.get_file_names()
# There are two nodes, so there should be one log file per node.
log0_path = "azureml-logs/70_driver_log_0.txt"
log1_path = "azureml-logs/70_driver_log_1.txt"
assert log0_path in files, "Node rank 0 log file is missing"
assert log1_path in files, "Node rank 1 log file is missing"
# Download both log files and check their contents
log0 = test_output_dirs.root_dir / log0_path
log1 = test_output_dirs.root_dir / log1_path
run.download_file(log0_path, output_file_path=str(log0))
run.download_file(log1_path, output_file_path=str(log1))
log0_txt = log0.read_text()
log1_txt = log1.read_text()
log0_txt = get_job_log_file(run, index=0)
log1_txt = get_job_log_file(run, index=1)
assert "Downloading multiple files from run" in log0_txt
assert "Downloading multiple files from run" not in log1_txt
assert "Loading checkpoint that was created at (epoch = 2, global_step = 2)" in log0_txt


@ -9,7 +9,7 @@ import pandas as pd
import param
import torch
from pytorch_lightning import LightningDataModule, LightningModule
from pytorch_lightning.metrics import MeanSquaredError
from torchmetrics.regression import MeanSquaredError
from torch import Tensor
from torch.nn import Identity
from torch.utils.data import DataLoader, Dataset


@ -209,6 +209,8 @@ def test_rnn_classifier_via_config_1(use_combined_model: bool,
use_mean_teacher_model=use_mean_teacher_model,
should_validate=False)
config.use_mixed_precision = True
# Necessary because torch otherwise says "index_add_cuda_ does not have a deterministic implementation"
config.pl_deterministic = False
config.set_output_to(test_output_dirs.root_dir)
config.dataset_data_frame = _get_mock_sequence_dataset()
# Patch the load_images function that will be called once we access a dataset item
@ -377,6 +379,8 @@ def test_rnn_classifier_via_config_2(test_output_dirs: OutputFolderForTests) ->
dataset_contents += f"S{subject},{i},{value},{label}\n"
logging_to_stdout()
config = ToySequenceModel2(should_validate=False)
# Necessary because torch otherwise says "index_add_cuda_ does not have a deterministic implementation"
config.pl_deterministic = False
config.num_epochs = 2
config.set_output_to(test_output_dirs.root_dir)
config.dataset_data_frame = _get_mock_sequence_dataset(dataset_contents)


@ -273,6 +273,8 @@ def test_image_encoder_with_segmentation(test_output_dirs: OutputFolderForTests,
aggregation_type=aggregation_type,
scan_size=scan_size)
config.use_mixed_precision = True
# Necessary because torch otherwise says "avg_pool3d_backward_cuda does not have a deterministic implementation"
config.pl_deterministic = False
config.set_output_to(test_output_dirs.root_dir)
config.num_epochs = 1
config.local_dataset = Path()


@ -2,23 +2,22 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import logging
import os
import shutil
from pathlib import Path
from typing import Any, Dict, List
import h5py
import logging
import numpy as np
import os
import pandas as pd
import pytest
from health_ml.utils import BatchTimeCallback
import shutil
from pathlib import Path
from torch.utils.data import DataLoader
from typing import Any, Dict, List
from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME, is_windows, logging_to_stdout
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
from InnerEye.Common.metrics_constants import MetricType, TRAIN_PREFIX, TrackedMetrics, VALIDATION_PREFIX
from InnerEye.Common.metrics_constants import MetricType, TrackedMetrics, VALIDATION_PREFIX
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, CHECKPOINT_SUFFIX, DATASET_CSV_FILE_NAME, \
ModelExecutionMode, \
@ -116,19 +115,20 @@ def _test_model_train(output_dirs: OutputFolderForTests,
model_training_result, _ = model_train_unittest(train_config, dirs=output_dirs)
assert isinstance(model_training_result, StoringLogger)
# Check that all metrics from the BatchTimeCallback are present
for epoch, epoch_results in model_training_result.results_per_epoch.items():
for prefix in [TRAIN_PREFIX, VALIDATION_PREFIX]:
for metric_type in [BatchTimeCallback.EPOCH_TIME,
BatchTimeCallback.BATCH_TIME + " avg",
BatchTimeCallback.BATCH_TIME + " max",
BatchTimeCallback.EXCESS_LOADING_TIME]:
expected = BatchTimeCallback.METRICS_PREFIX + prefix + metric_type
assert expected in epoch_results, f"Expected {expected} in results for epoch {epoch}"
# Excess loading time can be zero because that only measure batches over the threshold
if metric_type != BatchTimeCallback.EXCESS_LOADING_TIME:
value = epoch_results[expected]
assert isinstance(value, float)
assert value > 0.0, f"Time for {expected} should be > 0"
# # TODO: re-enable once the BatchTimeCallback is fixed
# for epoch, epoch_results in model_training_result.results_per_epoch.items():
# for prefix in [TRAIN_PREFIX, VALIDATION_PREFIX]:
# for metric_type in [BatchTimeCallback.EPOCH_TIME,
# BatchTimeCallback.BATCH_TIME + " avg",
# BatchTimeCallback.BATCH_TIME + " max",
# BatchTimeCallback.EXCESS_LOADING_TIME]:
# expected = BatchTimeCallback.METRICS_PREFIX + prefix + metric_type
# assert expected in epoch_results, f"Expected {expected} in results for epoch {epoch}"
# # Excess loading time can be zero because that only measures batches over the threshold
# if metric_type != BatchTimeCallback.EXCESS_LOADING_TIME:
# value = epoch_results[expected]
# assert isinstance(value, float)
# assert value > 0.0, f"Time for {expected} should be > 0"
actual_train_losses = model_training_result.get_train_metric(MetricType.LOSS.value)
actual_val_losses = model_training_result.get_val_metric(MetricType.LOSS.value)


@ -13,7 +13,8 @@ from InnerEye.Common.common_util import CROSSVAL_RESULTS_FOLDER, logging_to_stdo
from InnerEye.Common.fixed_paths import MODEL_INFERENCE_JSON_FILE_NAME
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML import baselines_util
from InnerEye.ML.baselines_util import REGRESSION_TEST_AZUREML_FOLDER, REGRESSION_TEST_AZUREML_PARENT_FOLDER, \
from InnerEye.ML.baselines_util import FILE_FORMAT_ERROR, REGRESSION_TEST_AZUREML_FOLDER, \
REGRESSION_TEST_AZUREML_PARENT_FOLDER, \
REGRESSION_TEST_OUTPUT_FOLDER, compare_files, compare_folder_contents, compare_folders_and_run_outputs
from InnerEye.ML.common import FINAL_MODEL_FOLDER
from InnerEye.ML.run_ml import MLRunner
@ -63,6 +64,40 @@ def test_compare_files_text(test_output_dirs: OutputFolderForTests, file_extensi
assert compare_files(expected=expected, actual=actual) == baselines_util.CONTENTS_MISMATCH
def test_compare_files_csv(test_output_dirs: OutputFolderForTests) -> None:
expected = test_output_dirs.root_dir / "expected.csv"
actual = test_output_dirs.root_dir / "actual.does_not_matter"
expected.write_text("""foo,bar
1.0,10.0""")
actual.write_text("""foo,bar
1.0001,10.001""")
assert compare_files(expected=expected, actual=actual, csv_relative_tolerance=1e-2) == ""
assert compare_files(expected=expected, actual=actual, csv_relative_tolerance=1e-3) == ""
assert compare_files(expected=expected, actual=actual, csv_relative_tolerance=2e-4) == ""
assert compare_files(expected=expected, actual=actual,
csv_relative_tolerance=9e-5) == baselines_util.CONTENTS_MISMATCH
def test_compare_files_empty_csv(test_output_dirs: OutputFolderForTests) -> None:
"""
If either of the two CSV files is empty, it should not raise an error, but exit gracefully.
"""
expected = test_output_dirs.root_dir / "expected.csv"
actual = test_output_dirs.root_dir / "actual.csv"
valid_csv = """foo,bar
1.0,10.0"""
empty_csv = ""
for expected_contents, actual_contents in [(empty_csv, empty_csv),
(valid_csv, empty_csv),
(empty_csv, valid_csv)]:
expected.write_text(expected_contents)
actual.write_text(actual_contents)
assert compare_files(expected=expected, actual=actual) == FILE_FORMAT_ERROR
expected.write_text(valid_csv)
actual.write_text(valid_csv)
assert compare_files(expected=expected, actual=actual) == ""
@pytest.mark.parametrize("file_extension", [".png", ".whatever"])
def test_compare_files_binary(test_output_dirs: OutputFolderForTests, file_extension: str) -> None:
"""
@ -107,7 +142,8 @@ def test_compare_folder(test_output_dirs: OutputFolderForTests) -> None:
(expected / subfolder / mismatch).write_text("contents1")
(actual / subfolder / mismatch).write_text("contents2")
messages = compare_folder_contents(expected_folder=expected, actual_folder=actual)
messages = compare_folder_contents(expected_folder=expected, actual_folder=actual,
csv_relative_tolerance=0.0)
all_messages = " ".join(messages)
# No issues expected
assert matching not in all_messages
@ -130,13 +166,14 @@ def test_compare_plain_outputs(test_output_dirs: OutputFolderForTests) -> None:
file1 = folder / "output.txt"
create_folder_and_write_text(file1, "Something")
# First comparison should pass
compare_folders_and_run_outputs(expected=expected, actual=actual)
compare_folders_and_run_outputs(expected=expected, actual=actual, csv_relative_tolerance=0.0)
# Now add a file to the set of expected files that does not exist in the run: comparison should now fail
no_such_file = "no_such_file.txt"
file2 = expected / no_such_file
create_folder_and_write_text(file2, "foo")
with pytest.raises(ValueError) as ex:
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
csv_relative_tolerance=0.0)
message = ex.value.args[0].splitlines()
assert f"{baselines_util.MISSING_FILE}: {no_such_file}" in message
@ -156,18 +193,21 @@ def test_compare_folder_against_run(test_output_dirs: OutputFolderForTests) -> N
'"model_configs_namespace": "InnerEye.ML.configs.segmentation.BasicModel2Epochs"}')
with mock.patch("InnerEye.ML.baselines_util.RUN_CONTEXT", run):
# First comparison only on the .json file should pass
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
csv_relative_tolerance=0.0)
# Now add a file to the set of expected files that does not exist in the run: comparison should now fail
no_such_file = "no_such_file.txt"
file2 = test_output_dirs.root_dir / REGRESSION_TEST_AZUREML_FOLDER / no_such_file
create_folder_and_write_text(file2, "foo")
with pytest.raises(ValueError) as ex:
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
csv_relative_tolerance=0.0)
message = ex.value.args[0].splitlines()
assert f"{baselines_util.MISSING_FILE}: {no_such_file}" in message
# Now run the same comparison that failed previously, without mocking the RUN_CONTEXT. This should now
# realize that the present run is an offline run, and skip the comparison
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
csv_relative_tolerance=0.0)
@pytest.mark.after_training_ensemble_run
@ -190,10 +230,12 @@ No outliers found
No outliers found""")
with mock.patch("InnerEye.ML.baselines_util.PARENT_RUN_CONTEXT", parent_run):
# No plain files to compare. The file Test_outliers.txt should be compared and found to match.
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
csv_relative_tolerance=0.0)
create_folder_and_write_text(file1, "foo")
with pytest.raises(ValueError) as ex:
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
csv_relative_tolerance=0.0)
message = ex.value.args[0].splitlines()
assert f"{baselines_util.CONTENTS_MISMATCH}: {CROSSVAL_RESULTS_FOLDER}/{file1.name}" in message
# Now add a file to the set of expected files that does not exist in the run: comparison should now fail
@@ -201,11 +243,13 @@ No outliers found""")
file2 = test_output_dirs.root_dir / REGRESSION_TEST_AZUREML_PARENT_FOLDER / no_such_file
create_folder_and_write_text(file2, "foo")
with pytest.raises(ValueError) as ex:
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
csv_relative_tolerance=0.0)
message = ex.value.args[0].splitlines()
assert f"{baselines_util.MISSING_FILE}: {no_such_file}" in message
# Now run the same comparison without mocking the PARENT_RUN_CONTEXT. This should now
# realize that the present run is a crossval child run
with pytest.raises(ValueError) as ex:
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
csv_relative_tolerance=0.0)
assert "no (parent) run to compare against" in str(ex)

View file

@@ -46,9 +46,9 @@ def create_model_and_store_checkpoint(config: ModelConfigBase, checkpoint_path:
model = model.cuda() # type: ignore
trainer.model = model
# Before saving, the values for epoch and step are incremented. Save them here in such a way that we can assert
# easily later.
trainer.current_epoch = FIXED_EPOCH - 1 # type: ignore
trainer.global_step = FIXED_GLOBAL_STEP - 1 # type: ignore
# easily later. We can't mock that because otherwise the mock object would be written to disk (that fails)
trainer.fit_loop.current_epoch = FIXED_EPOCH - 1 # type: ignore
trainer.fit_loop.global_step = FIXED_GLOBAL_STEP - 1 # type: ignore
# In PL, it is the Trainer's responsibility to save the model. Checkpoint handling refers back to the trainer
# to get a save_func. Mimicking that here.
trainer.save_checkpoint(checkpoint_path, weights_only=weights_only)
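
In Lightning 1.5 the epoch and step counters live on `trainer.fit_loop` rather than directly on the `Trainer`, which is why the test now writes to `trainer.fit_loop.current_epoch` and `trainer.fit_loop.global_step` before saving. A minimal sketch of reading those values back from a saved checkpoint (assuming PyTorch Lightning ~1.5; the path and constants below are hypothetical stand-ins for the test's `FIXED_EPOCH`/`FIXED_GLOBAL_STEP`):

    import torch

    FIXED_EPOCH = 42          # hypothetical value; the real constant lives in the test module
    FIXED_GLOBAL_STEP = 4242  # hypothetical value

    checkpoint = torch.load("outputs/checkpoint.ckpt", map_location="cpu")  # hypothetical path
    # Lightning increments the counters when writing the checkpoint, hence the -1 offsets
    # applied before trainer.save_checkpoint in the snippet above.
    assert checkpoint["epoch"] == FIXED_EPOCH
    assert checkpoint["global_step"] == FIXED_GLOBAL_STEP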

View file

@@ -2,11 +2,10 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import math
from pathlib import Path
from typing import Dict
from unittest import mock
import math
import numpy as np
import pandas as pd
import pytest
@@ -16,6 +15,7 @@ from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.nn import Module
from torch.optim.lr_scheduler import _LRScheduler
from typing import Dict
from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import is_windows
@@ -135,8 +135,9 @@ def test_innereye_ssl_container_cifar10_resnet_simclr() -> None:
assert len(checkpoint["optimizer_states"]) == 1
assert len(checkpoint["lr_schedulers"]) == 1
assert "callbacks" in checkpoint
assert SSLOnlineEvaluatorInnerEye in checkpoint["callbacks"]
callback_state = checkpoint["callbacks"][SSLOnlineEvaluatorInnerEye]
callback_name = SSLOnlineEvaluatorInnerEye.__name__
assert callback_name in checkpoint["callbacks"]
callback_state = checkpoint["callbacks"][callback_name]
assert SSLOnlineEvaluatorInnerEye.OPTIMIZER_STATE_NAME in callback_state
assert SSLOnlineEvaluatorInnerEye.EVALUATOR_STATE_NAME in callback_state
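
The callback entries in a checkpoint are no longer keyed by the callback class itself: in Lightning 1.5 the key is the callback's state key, which for this callback is simply the class name. A minimal sketch of inspecting that state (assuming PyTorch Lightning ~1.5; the checkpoint path is hypothetical and the import path is assumed from the module patched elsewhere in this diff):

    import torch

    from InnerEye.ML.SSL.lightning_modules.ssl_online_evaluator import SSLOnlineEvaluatorInnerEye

    checkpoint = torch.load("last.ckpt", map_location="cpu")  # hypothetical checkpoint path
    # In 1.3.x this dictionary was keyed by the class object; in 1.5 it is keyed by name.
    callback_state = checkpoint["callbacks"][SSLOnlineEvaluatorInnerEye.__name__]
    optimizer_state = callback_state[SSLOnlineEvaluatorInnerEye.OPTIMIZER_STATE_NAME]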
@@ -336,8 +337,9 @@ def test_online_evaluator_recovery(test_output_dirs: OutputFolderForTests) -> No
# It's somewhat obsolete, but we can now check that the checkpoint file really contained the optimizer and weights
checkpoint = torch.load(last_checkpoint)
assert "callbacks" in checkpoint
assert SSLOnlineEvaluatorInnerEye in checkpoint["callbacks"]
callback_state = checkpoint["callbacks"][SSLOnlineEvaluatorInnerEye]
callback_name = SSLOnlineEvaluatorInnerEye.__name__
assert callback_name in checkpoint["callbacks"]
callback_state = checkpoint["callbacks"][callback_name]
assert SSLOnlineEvaluatorInnerEye.OPTIMIZER_STATE_NAME in callback_state
assert SSLOnlineEvaluatorInnerEye.EVALUATOR_STATE_NAME in callback_state
@@ -359,7 +361,8 @@ def test_online_evaluator_not_distributed() -> None:
# Standard trainer without DDP
trainer = Trainer()
# Test the flag that the internal logic of on_pretrain_routine_start uses
assert not trainer.accelerator_connector.is_distributed
assert hasattr(trainer, "_accelerator_connector")
assert not trainer._accelerator_connector.is_distributed
mock_module = mock.MagicMock(device=torch.device("cpu"))
callback.on_pretrain_routine_start(trainer, mock_module)
assert isinstance(callback.evaluator, Module)
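
The accelerator connector became a private attribute in Lightning 1.5, so checks that previously read `trainer.accelerator_connector` now go through `trainer._accelerator_connector`. A minimal sketch (assuming PyTorch Lightning ~1.5):

    from pytorch_lightning import Trainer

    trainer = Trainer()  # plain single-process trainer, no DDP
    # In 1.5 the connector is reached via the private attribute; 1.3.x code used
    # the public trainer.accelerator_connector instead.
    connector = trainer._accelerator_connector
    assert not connector.is_distributed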
@@ -378,19 +381,19 @@ def test_online_evaluator_distributed() -> None:
with mock.patch("InnerEye.ML.SSL.lightning_modules.ssl_online_evaluator.DistributedDataParallel",
return_value=mock_ddp_result) as mock_ddp:
callback = SSLOnlineEvaluatorInnerEye(class_weights=None,
z_dim=1,
num_classes=2,
dataset="foo",
drop_p=0.2,
learning_rate=1e-5)
z_dim=1,
num_classes=2,
dataset="foo",
drop_p=0.2,
learning_rate=1e-5)
# Trainer with DDP
device = torch.device("cuda:0")
mock_module = mock.MagicMock(device=device)
trainer = Trainer(accelerator="ddp", gpus=2)
# Test the two flags that the internal logic of on_pretrain_routine_start uses
assert trainer.accelerator_connector.is_distributed
assert trainer.accelerator_connector.use_ddp
assert trainer._accelerator_connector.is_distributed
assert trainer._accelerator_connector.use_ddp
original_evaluator = callback.evaluator
callback.on_pretrain_routine_start(trainer, mock_module)
# Check that SyncBatchNorm has been turned on

View file

@@ -98,7 +98,7 @@ jobs:
- name: tag
value: 'TrainEnsemble'
- name: more_switches
value: '--pl_deterministic --regression_test_folder=RegressionTestResults/PR_TrainEnsemble'
value: '--pl_deterministic --log_level=DEBUG --regression_test_folder=RegressionTestResults/PR_TrainEnsemble'
pool:
vmImage: 'ubuntu-18.04'
steps:

View file

@@ -35,7 +35,7 @@ steps:
# work on the package
- bash: |
source activate InnerEye
pytest ./Tests/ -m "not (gpu or azureml or after_training_single_run or after_training_ensemble_run or inference or after_training_2node or after_training_glaucoma_cv_run or after_training_hello_container)" --doctest-modules --junitxml=junit/test-results.xml --cov=. --cov-config=.coveragerc --cov-report=xml -n 2 --dist=loadscope --verbose
pytest ./Tests/ -m "not (gpu or azureml or after_training_single_run or after_training_ensemble_run or inference or after_training_2node or after_training_glaucoma_cv_run or after_training_hello_container)" --doctest-modules --junitxml=junit/test-results.xml --cov=. --cov-config=.coveragerc --cov-report=xml --verbose
env:
APPLICATION_KEY: $(InnerEyeDeepLearningServicePrincipalKey)
DATASETS_ACCOUNT_KEY: $(InnerEyePublicDatasetsStorageKey)

View file

@@ -8,7 +8,7 @@ dependencies:
- pip=20.1.1
- python=3.7.3
- pytorch=1.8.0
- python-blosc==1.7.0
- python-blosc=1.7.0
- torchvision=0.9.0
- pip:
- git+https://github.com/analysiscenter/radio.git@6d53e25#egg=radio
@@ -52,8 +52,8 @@ dependencies:
- pytest-cov==2.10.1
- pytest-forked==1.3.0
- pytest-xdist==1.34.0
- pytorch-lightning==1.3.8
- rich==5.1.1
- pytorch-lightning==1.5.5
- rich==10.13.0
- rpdb==0.1.6
- ruamel.yaml==0.16.12
- runstats==1.8.0
@@ -68,6 +68,6 @@ dependencies:
- tensorboard==2.3.0
- tensorboardX==2.1
- torchprof==1.3.3
- torchmetrics==0.4.1
- torchmetrics==0.6.0
- umap-learn==0.5.2
- yacs==0.1.8
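
A quick way to confirm that a rebuilt environment actually resolved to these pins is to check the package versions at runtime; a minimal sketch (illustrative only):

    import pytorch_lightning
    import rich
    import torchmetrics

    # The pins above request pytorch-lightning 1.5.5, torchmetrics 0.6.0 and rich 10.13.0.
    assert pytorch_lightning.__version__ == "1.5.5"
    assert torchmetrics.__version__ == "0.6.0"
    print(pytorch_lightning.__version__, torchmetrics.__version__, rich.__version__)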

@@ -1 +1 @@
Subproject commit f2070aeb7a5e7d1b0e45c6aad247d18d074705a8
Subproject commit 13560d2f198cc72f06e01675e9ecee509ce5639a