Upgrade to Pytorch Lightning 1.5.5 (#591)
This commit is contained in:
Родитель
4aa84b9f36
Коммит
e477c9dd83
|
@ -13,7 +13,7 @@
|
|||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="InnerEye/ML/runner.py" />
|
||||
<option name="PARAMETERS" value="--azureml --model=HelloContainer --cluster=training-nc12" />
|
||||
<option name="PARAMETERS" value="--azureml --model=HelloContainer --cluster=training-nc12 --num_nodes=2 --log_level=DEBUG" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="InnerEye/ML/runner.py" />
|
||||
<option name="PARAMETERS" value="--model=HelloWorld" />
|
||||
<option name="PARAMETERS" value="--model=HelloContainer" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
|
|
|
@ -63,6 +63,7 @@ gets uploaded to AzureML, by skipping all test folders.
|
|||
- ([#584](https://github.com/microsoft/InnerEye-DeepLearning/pull/584)) SSL models write the optimizer state for the linear head to the checkpoint now.
|
||||
- ([#594](https://github.com/microsoft/InnerEye-DeepLearning/pull/594)) Pytorch is now non-deterministic by default. Upgrade to AzureML-SDK 1.36
|
||||
- ([#566](https://github.com/microsoft/InnerEye-DeepLearning/pull/566)) Update `hi-ml` dependency to `hi-ml-azure`.
|
||||
- ([#591](https://github.com/microsoft/InnerEye-DeepLearning/pull/591)) Upgrade Pytorch Lightning to 1.5.0
|
||||
- ([#572](https://github.com/microsoft/InnerEye-DeepLearning/pull/572)) Updated to new version of hi-ml package
|
||||
- ([#617](https://github.com/microsoft/InnerEye-DeepLearning/pull/617)) Provide an easier way for LightningContainers to add callbacks.
|
||||
- ([#596](https://github.com/microsoft/InnerEye-DeepLearning/pull/596)) Add `cudatoolkit=11.1` specification to environment.yml.
|
||||
|
|
|
@ -5,10 +5,9 @@
|
|||
from typing import Any, List, Optional
|
||||
|
||||
import torch
|
||||
from torchmetrics import Metric
|
||||
from pl_bolts.models.self_supervised import SSLEvaluator
|
||||
from health_ml.utils import log_on_epoch
|
||||
from torch.nn import functional as F
|
||||
from pl_bolts.models.self_supervised import SSLEvaluator
|
||||
from torch.nn import ModuleList, functional as F
|
||||
|
||||
from InnerEye.ML.SSL.encoders import get_encoder_output_dim
|
||||
from InnerEye.ML.dataset.scalar_sample import ScalarItem
|
||||
|
@ -38,18 +37,12 @@ class SSLClassifier(LightningModuleWithOptimizer, DeviceAwareModule):
|
|||
n_classes=num_classes,
|
||||
p=0.20)
|
||||
if self.num_classes == 2:
|
||||
self.train_metrics: List[Metric] = \
|
||||
[AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()]
|
||||
self.val_metrics: List[Metric] = \
|
||||
[AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()]
|
||||
self.train_metrics = ModuleList([AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()])
|
||||
self.val_metrics = ModuleList([AreaUnderRocCurve(), AreaUnderPrecisionRecallCurve(), Accuracy05()])
|
||||
else:
|
||||
# Note that for multi-class, Accuracy05 is the standard multi-class accuracy.
|
||||
self.train_metrics = [Accuracy05()]
|
||||
self.val_metrics = [Accuracy05()]
|
||||
|
||||
def on_train_start(self) -> None:
|
||||
for metric in [*self.train_metrics, *self.val_metrics]:
|
||||
metric.to(device=self.device) # type: ignore
|
||||
self.train_metrics = ModuleList([Accuracy05()])
|
||||
self.val_metrics = ModuleList([Accuracy05()])
|
||||
|
||||
def train(self, mode: bool = True) -> Any:
|
||||
self.classifier_head.train(mode)
|
||||
|
|
|
@ -3,8 +3,6 @@
|
|||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
||||
|
||||
import pytorch_lightning as pl
|
||||
import torch
|
||||
from pl_bolts.callbacks.ssl_online import SSLOnlineEvaluator
|
||||
|
@ -14,8 +12,9 @@ from torch import Tensor as T
|
|||
from torch.nn import SyncBatchNorm, functional as F
|
||||
from torch.nn.parallel import DistributedDataParallel
|
||||
from torchmetrics import Metric
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
||||
|
||||
from InnerEye.ML.SSL.utils import SSLDataModuleType
|
||||
from InnerEye.ML.SSL.utils import SSLDataModuleType, add_submodules_to_same_device
|
||||
from InnerEye.ML.lightning_metrics import Accuracy05, AreaUnderPrecisionRecallCurve, AreaUnderRocCurve
|
||||
from InnerEye.ML.utils.layer_util import set_model_to_eval_mode
|
||||
from health_ml.utils import log_on_epoch
|
||||
|
@ -81,10 +80,17 @@ class SSLOnlineEvaluatorInnerEye(SSLOnlineEvaluator):
|
|||
If training happens via DDP, SyncBatchNorm is enabled for the online evaluator, and it is converted to
|
||||
a DDP module.
|
||||
"""
|
||||
for metric in [*self.train_metrics, *self.val_metrics]:
|
||||
metric.to(device=pl_module.device) # type: ignore
|
||||
for prefix, metrics in [("train", self.train_metrics), ("val", self.val_metrics)]:
|
||||
add_submodules_to_same_device(pl_module, metrics, prefix=prefix)
|
||||
self.evaluator.to(pl_module.device)
|
||||
accelerator = trainer.accelerator_connector
|
||||
if hasattr(trainer, "accelerator_connector"):
|
||||
# This works with Lightning 1.3.8
|
||||
accelerator = trainer.accelerator_connector
|
||||
elif hasattr(trainer, "_accelerator_connector"):
|
||||
# This works with Lightning 1.5.5
|
||||
accelerator = trainer._accelerator_connector
|
||||
else:
|
||||
raise ValueError("Unable to retrieve the accelerator information")
|
||||
if accelerator.is_distributed:
|
||||
if accelerator.use_ddp:
|
||||
self.evaluator = SyncBatchNorm.convert_sync_batchnorm(self.evaluator)
|
||||
|
@ -152,7 +158,7 @@ class SSLOnlineEvaluatorInnerEye(SSLOnlineEvaluator):
|
|||
for metric in self.val_metrics:
|
||||
log_on_epoch(pl_module, f"ssl_online_evaluator/val/{metric.name}", metric)
|
||||
|
||||
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx) -> None: # type: ignore
|
||||
def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx) -> None: # type: ignore
|
||||
"""
|
||||
Get and log training metrics, perform network update.
|
||||
"""
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
import logging
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
from typing import Any, Iterable, Optional
|
||||
|
||||
import torch
|
||||
from yacs.config import CfgNode
|
||||
|
@ -119,3 +119,23 @@ def SSLModelLoader(ssl_class: Any, num_classes: int) -> Any:
|
|||
n_hidden=None)
|
||||
|
||||
return _wrap
|
||||
|
||||
|
||||
def add_submodules_to_same_device(module: torch.nn.Module,
|
||||
submodules: Iterable[torch.nn.Module],
|
||||
prefix: str = "") -> None:
|
||||
"""
|
||||
Adds each of the given submodules to the "main" module, and moves them to the same device as the "main"
|
||||
module. The submodules get a name derived from their class name, with the given prefix.
|
||||
|
||||
:param module: The module to which submodules should be added.
|
||||
:param submodules: The submodules to add.
|
||||
:param prefix: A string prefix that will be used to create the name of the submodule.
|
||||
"""
|
||||
|
||||
def _class_name(o: Any) -> str:
|
||||
return type(o).__name__
|
||||
|
||||
for m in submodules:
|
||||
m.to(device=module.device) # type: ignore
|
||||
module.add_module(f"{prefix}{_class_name(m)}", m)
|
||||
|
|
|
@ -32,8 +32,10 @@ REGRESSION_TEST_OUTPUT_FOLDER = "OUTPUT"
|
|||
REGRESSION_TEST_AZUREML_FOLDER = "AZUREML_OUTPUT"
|
||||
REGRESSION_TEST_AZUREML_PARENT_FOLDER = "AZUREML_PARENT_OUTPUT"
|
||||
CONTENTS_MISMATCH = "Contents mismatch"
|
||||
FILE_FORMAT_ERROR = "File format error"
|
||||
MISSING_FILE = "Missing"
|
||||
TEXT_FILE_SUFFIXES = [".txt", ".csv", ".json", ".html", ".md"]
|
||||
CSV_SUFFIX = ".csv"
|
||||
TEXT_FILE_SUFFIXES = [".txt", ".json", ".html", ".md"]
|
||||
|
||||
INFERENCE_DISABLED_WARNING = "Not performing comparison of model against baseline(s), because inference is currently " \
|
||||
"disabled. If comparison is required, use either the inference_on_test_set or " \
|
||||
|
@ -185,7 +187,7 @@ def get_comparison_baselines(outputs_folder: Path, azure_config: AzureConfig,
|
|||
return comparison_baselines
|
||||
|
||||
|
||||
def compare_files(expected: Path, actual: Path) -> str:
|
||||
def compare_files(expected: Path, actual: Path, csv_relative_tolerance: float = 0.0) -> str:
|
||||
"""
|
||||
Compares two individual files for regression testing. It returns an empty string if the two files appear identical.
|
||||
If the files are not identical, an error message with details is return. This handles known text file formats,
|
||||
|
@ -195,16 +197,35 @@ def compare_files(expected: Path, actual: Path) -> str:
|
|||
:param expected: A file that contains the expected contents. The type of comparison (text or binary) is chosen
|
||||
based on the extension of this file.
|
||||
:param actual: A file that contains the actual contents.
|
||||
:param csv_relative_tolerance: When comparing CSV files, use this as the maximum allowed relative discrepancy.
|
||||
If 0.0, do not allow any discrepancy.
|
||||
:return: An empty string if the files appear identical, or otherwise an error message with details.
|
||||
"""
|
||||
|
||||
def print_lines(prefix: str, lines: List[str]) -> None:
|
||||
num_lines = len(lines)
|
||||
count = min(5, num_lines)
|
||||
logging.debug(f"{prefix} {num_lines} lines, first {count} of those:")
|
||||
logging.debug(os.linesep.join(lines[:count]))
|
||||
logging.info(f"{prefix} {num_lines} lines, first {count} of those:")
|
||||
logging.info(os.linesep.join(lines[:count]))
|
||||
|
||||
if expected.suffix in TEXT_FILE_SUFFIXES:
|
||||
def try_read_csv(prefix: str, file: Path) -> Optional[pd.DataFrame]:
|
||||
try:
|
||||
return pd.read_csv(file)
|
||||
except Exception as ex:
|
||||
logging.info(f"{prefix} file can't be read as CSV: {str(ex)}")
|
||||
return None
|
||||
|
||||
if expected.suffix == CSV_SUFFIX:
|
||||
expected_df = try_read_csv("Expected", expected)
|
||||
actual_df = try_read_csv("Actual", actual)
|
||||
if expected_df is None or actual_df is None:
|
||||
return FILE_FORMAT_ERROR
|
||||
try:
|
||||
pd.testing.assert_frame_equal(actual_df, expected_df, rtol=csv_relative_tolerance)
|
||||
except Exception as ex:
|
||||
logging.info(str(ex))
|
||||
return CONTENTS_MISMATCH
|
||||
elif expected.suffix in TEXT_FILE_SUFFIXES:
|
||||
# Compare line-by-line to avoid issues with line separators
|
||||
expected_lines = expected.read_text().splitlines()
|
||||
actual_lines = actual.read_text().splitlines()
|
||||
|
@ -216,12 +237,13 @@ def compare_files(expected: Path, actual: Path) -> str:
|
|||
expected_binary = expected.read_bytes()
|
||||
actual_binary = actual.read_bytes()
|
||||
if expected_binary != actual_binary:
|
||||
logging.debug(f"Expected {len(expected_binary)} bytes, actual {len(actual_binary)} bytes")
|
||||
logging.info(f"Expected {len(expected_binary)} bytes, actual {len(actual_binary)} bytes")
|
||||
return CONTENTS_MISMATCH
|
||||
return ""
|
||||
|
||||
|
||||
def compare_folder_contents(expected_folder: Path,
|
||||
csv_relative_tolerance: float,
|
||||
actual_folder: Optional[Path] = None,
|
||||
run: Optional[Run] = None) -> List[str]:
|
||||
"""
|
||||
|
@ -230,9 +252,12 @@ def compare_folder_contents(expected_folder: Path,
|
|||
(or the AzureML run), with exactly the same contents, in the same folder structure.
|
||||
For example, if there is a file "<expected>/foo/bar/contents.txt", then there must also be a file
|
||||
"<actual>/foo/bar/contents.txt"
|
||||
|
||||
:param expected_folder: A folder with files that are expected to be present.
|
||||
:param actual_folder: The output folder with the actually produced files.
|
||||
:param run: An AzureML run
|
||||
:param csv_relative_tolerance: When comparing CSV files, use this as the maximum allowed relative discrepancy.
|
||||
If 0.0, do not allow any discrepancy.
|
||||
:return: A list of human readable error messages, with message and file path. If no errors are found, the list is
|
||||
empty.
|
||||
"""
|
||||
|
@ -256,7 +281,8 @@ def compare_folder_contents(expected_folder: Path,
|
|||
run.download_file(name=str(file_relative), output_file_path=str(actual_file))
|
||||
else:
|
||||
raise ValueError("One of the two arguments run, actual_folder must be provided.")
|
||||
message = compare_files(expected=file, actual=actual_file) if actual_file.exists() else MISSING_FILE
|
||||
message = compare_files(expected=file, actual=actual_file,
|
||||
csv_relative_tolerance=csv_relative_tolerance) if actual_file.exists() else MISSING_FILE
|
||||
if message:
|
||||
messages.append(f"{message}: {file_relative}")
|
||||
logging.info(f"File {file_relative}: {message or 'OK'}")
|
||||
|
@ -265,15 +291,18 @@ def compare_folder_contents(expected_folder: Path,
|
|||
return messages
|
||||
|
||||
|
||||
def compare_folders_and_run_outputs(expected: Path, actual: Path) -> None:
|
||||
def compare_folders_and_run_outputs(expected: Path, actual: Path, csv_relative_tolerance: float) -> None:
|
||||
"""
|
||||
Compares the actual set of run outputs in the `actual` folder against an expected set of files in the `expected`
|
||||
folder. The `expected` folder can have two special subfolders AZUREML_OUTPUT and AZUREML_PARENT_OUTPUT, that
|
||||
contain files that are expected to be present in the AzureML run context of the present run (AZUREML_OUTPUT)
|
||||
or the run context of the parent run (AZUREML_PARENT_OUTPUT).
|
||||
If a file is missing, or does not have the expected contents, an exception is raised.
|
||||
|
||||
:param expected: A folder with files that are expected to be present.
|
||||
:param actual: The output folder with the actually produced files.
|
||||
:param csv_relative_tolerance: When comparing CSV files, use this as the maximum allowed relative discrepancy.
|
||||
If 0.0, do not allow any discrepancy.
|
||||
"""
|
||||
if not expected.is_dir():
|
||||
raise ValueError(f"Folder with expected files does not exist: {expected}")
|
||||
|
@ -289,7 +318,10 @@ def compare_folders_and_run_outputs(expected: Path, actual: Path) -> None:
|
|||
if actual_folder is None and run_to_compare is None:
|
||||
raise ValueError(f"The set of expected test results in {expected} contains a folder "
|
||||
f"{subfolder}, but there is no (parent) run to compare against.")
|
||||
new_messages = compare_folder_contents(folder, actual_folder=actual_folder, run=run_to_compare)
|
||||
new_messages = compare_folder_contents(folder,
|
||||
actual_folder=actual_folder,
|
||||
run=run_to_compare,
|
||||
csv_relative_tolerance=csv_relative_tolerance)
|
||||
if new_messages:
|
||||
messages.append(f"Issues in {message_prefix}:")
|
||||
messages.extend(new_messages)
|
||||
|
|
|
@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
|||
import numpy as np
|
||||
import torch
|
||||
from pytorch_lightning import LightningDataModule, LightningModule
|
||||
from pytorch_lightning.metrics import MeanAbsoluteError
|
||||
from torchmetrics.regression import MeanAbsoluteError
|
||||
from torch.optim import Adam, Optimizer
|
||||
from torch.optim.lr_scheduler import StepLR, _LRScheduler
|
||||
from torch.utils.data import DataLoader, Dataset
|
||||
|
|
|
@ -243,6 +243,10 @@ class WorkflowParams(param.Parameterized):
|
|||
"folder, and their contents must match exactly. When running in AzureML, you need to "
|
||||
"ensure that this folder is part of the snapshot that gets uploaded. The path should "
|
||||
"be relative to the repository root directory.")
|
||||
regression_test_csv_tolerance: float = \
|
||||
param.Number(default=0.0, allow_None=False,
|
||||
doc="When comparing CSV files during regression tests, use this value as the maximum allowed "
|
||||
"relative difference of actual and expected results. Default: 0.0 (must match exactly)")
|
||||
|
||||
def validate(self) -> None:
|
||||
if sum([bool(param) for param in [self.weights_url, self.local_weights_path, self.model_id]]) > 1:
|
||||
|
@ -583,7 +587,7 @@ class TrainerParams(param.Parameterized):
|
|||
param.Boolean(default=False,
|
||||
doc="Controls the PyTorch Lightning trainer flags 'deterministic' and 'benchmark'. If "
|
||||
"'pl_deterministic' is True, results are perfectly reproducible. If False, they are not, but "
|
||||
"you may see training speed increases.")
|
||||
"you may see significant training speed increases.")
|
||||
pl_find_unused_parameters: bool = \
|
||||
param.Boolean(default=False,
|
||||
doc="Controls the PyTorch Lightning flag 'find_unused_parameters' for the DDP plugin. "
|
||||
|
@ -606,7 +610,7 @@ class TrainerParams(param.Parameterized):
|
|||
monitor_gpu: bool = param.Boolean(default=False,
|
||||
doc="If True, add the GPUStatsMonitor callback to the Lightning trainer object. "
|
||||
"This will write GPU utilization metrics every 50 batches by default.")
|
||||
monitor_loading: bool = param.Boolean(default=True,
|
||||
monitor_loading: bool = param.Boolean(default=False,
|
||||
doc="If True, add the BatchTimeCallback callback to the Lightning trainer "
|
||||
"object. This will monitor how long individual batches take to load.")
|
||||
|
||||
|
|
|
@ -253,14 +253,6 @@ class InnerEyeLightning(LightningModule):
|
|||
self.train_epoch_metrics_logger.flush()
|
||||
self.val_epoch_metrics_logger.flush()
|
||||
|
||||
@property
|
||||
def use_sync_dist(self) -> bool:
|
||||
"""
|
||||
Returns True if metric logging should use sync_dist=True. This is read off from the use_ddp flag of the trainer.
|
||||
"""
|
||||
assert isinstance(self.trainer, Trainer)
|
||||
return self.trainer.accelerator_connector.use_ddp
|
||||
|
||||
def training_epoch_end(self, outputs: List[Any]) -> None:
|
||||
# Write out all the metrics that have been accumulated in the StoringLogger in the previous epoch.
|
||||
# Metrics for the very last epoch are written in on_train_end
|
||||
|
@ -314,32 +306,29 @@ class InnerEyeLightning(LightningModule):
|
|||
name: Union[MetricType, str],
|
||||
value: Any,
|
||||
is_training: bool,
|
||||
reduce_fx: Callable = torch.mean,
|
||||
sync_dist_override: Optional[bool] = None,
|
||||
sync_dist_op: Any = "mean") -> None:
|
||||
reduce_fx: Union[str, Callable] = "mean",
|
||||
sync_dist_override: Optional[bool] = None) -> None:
|
||||
"""
|
||||
Logs a metrics to Pytorch Lightning with the on_epoch flag set. The metric will get a prefix indicating
|
||||
if it is a training or a validation metric. A custom reducer function can be provided.
|
||||
The method also ensures that the correct synchronization across nodes is used. If the value to log is a
|
||||
floating point, it is converted to a Tensor on the current device to enable synchronization.
|
||||
|
||||
:param sync_dist_override: If not None, use this value for the sync_dist argument to self.log. If None,
|
||||
set it automatically depending on the use of DDP.
|
||||
:param name: The name of the metric to log
|
||||
:param value: The value of the metric. This can be a tensor, floating point value, or a Metric class.
|
||||
:param is_training: If true, give the metric a "train/" prefix, otherwise a "val/" prefix.
|
||||
:param reduce_fx: The reduce function to apply to step values. Default: torch.mean
|
||||
:param sync_dist_op: The reduce operation to use when synchronizing the tensors across GPUs. This must be
|
||||
a value recognized by sync_ddp: Either 'None' to use 'sum' as aggregate, or 'mean' or 'avg'
|
||||
:param reduce_fx: The reduce function to use when synchronizing the tensors across GPUs. This must be
|
||||
a value recognized by sync_ddp: "sum", "mean"
|
||||
"""
|
||||
metric_name = name if isinstance(name, str) else name.value
|
||||
prefix = TRAIN_PREFIX if is_training else VALIDATION_PREFIX
|
||||
sync_dist = self.use_sync_dist if sync_dist_override is None else sync_dist_override
|
||||
log_on_epoch(self,
|
||||
name=prefix + metric_name,
|
||||
value=value,
|
||||
sync_dist=sync_dist,
|
||||
reduce_fx=reduce_fx,
|
||||
sync_dist_op=sync_dist_op)
|
||||
sync_dist=sync_dist_override,
|
||||
reduce_fx=reduce_fx)
|
||||
|
||||
def store_epoch_results(self, metrics: DictStrFloat, epoch: int, is_training: bool) -> None:
|
||||
"""
|
||||
|
|
|
@ -10,7 +10,7 @@ import torch
|
|||
import torch.nn.functional as F
|
||||
import torchmetrics as metrics
|
||||
from torchmetrics import Metric
|
||||
from pytorch_lightning.metrics.functional import accuracy, auc, auroc, precision_recall_curve, roc
|
||||
from torchmetrics.functional import accuracy, auc, auroc, precision_recall_curve, roc
|
||||
from torch.nn import ModuleList
|
||||
|
||||
from InnerEye.Common.metrics_constants import AVERAGE_DICE_SUFFIX, MetricType, TRAIN_PREFIX, VALIDATION_PREFIX
|
||||
|
|
|
@ -92,11 +92,11 @@ class SegmentationLightning(InnerEyeLightning):
|
|||
|
||||
# apply mask if required
|
||||
if mask is not None:
|
||||
posteriors = image_util.apply_mask_to_posteriors(posteriors=posteriors, mask=mask)
|
||||
posteriors = image_util.apply_mask_to_posteriors(posteriors=posteriors, mask=mask) # type: ignore
|
||||
|
||||
# post process posteriors to compute result
|
||||
segmentation = image_util.posteriors_to_segmentation(posteriors=posteriors)
|
||||
self.compute_metrics(cropped_sample, segmentation, is_training)
|
||||
segmentation = image_util.posteriors_to_segmentation(posteriors=posteriors) # type: ignore
|
||||
self.compute_metrics(cropped_sample, segmentation, is_training) # type: ignore
|
||||
|
||||
self.write_loss(is_training, loss)
|
||||
return loss
|
||||
|
@ -114,10 +114,10 @@ class SegmentationLightning(InnerEyeLightning):
|
|||
# Dice NaN means that both ground truth and prediction are empty.
|
||||
dice_per_crop_and_class = compute_dice_across_patches(
|
||||
segmentation=segmentation,
|
||||
ground_truth=cropped_sample.labels_center_crop,
|
||||
ground_truth=cropped_sample.labels_center_crop, # type: ignore
|
||||
allow_multiple_classes_for_each_pixel=True)[:, 1:]
|
||||
# Number of foreground voxels per class, across all crops
|
||||
foreground_voxels = metrics_util.get_number_of_voxels_per_class(cropped_sample.labels)[:, 1:]
|
||||
foreground_voxels = metrics_util.get_number_of_voxels_per_class(cropped_sample.labels)[:, 1:] # type: ignore
|
||||
# Store Dice and voxel count per sample in the minibatch. We need a custom aggregation logic for Dice
|
||||
# because it can be NaN. Also use custom logging for voxel count because Lightning's batch-size weighted
|
||||
# average has a bug.
|
||||
|
@ -150,8 +150,7 @@ class SegmentationLightning(InnerEyeLightning):
|
|||
self.log_on_epoch(name=MetricType.SUBJECT_COUNT,
|
||||
value=num_subjects,
|
||||
is_training=is_training,
|
||||
reduce_fx=torch.sum,
|
||||
sync_dist_op="sum")
|
||||
reduce_fx=torch.sum)
|
||||
|
||||
def training_or_validation_epoch_end(self, is_training: bool) -> None:
|
||||
"""
|
||||
|
@ -292,7 +291,7 @@ class ScalarLightning(InnerEyeLightning):
|
|||
logger = self.train_subject_outputs_logger if is_training else self.val_subject_outputs_logger # type: ignore
|
||||
logger.flush()
|
||||
|
||||
def transfer_batch_to_device(self, batch: Any, device: torch.device) -> Any: # type: ignore
|
||||
def transfer_batch_to_device(self, batch: Any, device: torch.device, dataloader_idx: int) -> Any: # type: ignore
|
||||
"""
|
||||
For sequence models, transfer the nested lists of items to the given GPU device.
|
||||
For all other models, this relies on the superclass to move the batch of data to the GPU.
|
||||
|
|
|
@ -14,7 +14,6 @@ import numpy as np
|
|||
import torch
|
||||
import torch.nn.functional as F
|
||||
from azureml.core import Run
|
||||
from numpy.core.numeric import NaN
|
||||
|
||||
from InnerEye.Azure.azure_util import get_run_context_or_default
|
||||
from InnerEye.Common.metrics_constants import LoggingColumns, MetricType
|
||||
|
@ -174,9 +173,9 @@ def calculate_metrics_per_class(segmentation: np.ndarray,
|
|||
continue
|
||||
# Skip but record if nan_image
|
||||
elif nan_images[i]:
|
||||
add_metric(MetricType.DICE, NaN)
|
||||
add_metric(MetricType.HAUSDORFF_mm, NaN)
|
||||
add_metric(MetricType.MEAN_SURFACE_DIST_mm, NaN)
|
||||
add_metric(MetricType.DICE, np.nan)
|
||||
add_metric(MetricType.HAUSDORFF_mm, np.nan)
|
||||
add_metric(MetricType.MEAN_SURFACE_DIST_mm, np.nan)
|
||||
continue
|
||||
check_size_matches(prediction, ground_truth[i], arg1_name="prediction", arg2_name="ground_truth")
|
||||
if not is_binary_array(prediction):
|
||||
|
|
|
@ -9,7 +9,7 @@ from pathlib import Path
|
|||
from typing import Any, List, Optional, Tuple, TypeVar
|
||||
|
||||
from pytorch_lightning import Callback, LightningModule, Trainer, seed_everything
|
||||
from pytorch_lightning.callbacks import GPUStatsMonitor, ModelCheckpoint
|
||||
from pytorch_lightning.callbacks import GPUStatsMonitor, ModelCheckpoint, TQDMProgressBar
|
||||
from pytorch_lightning.loggers import TensorBoardLogger
|
||||
from pytorch_lightning.plugins import DDPPlugin
|
||||
|
||||
|
@ -18,14 +18,14 @@ from InnerEye.Azure.azure_util import RUN_CONTEXT, is_offline_run_context
|
|||
from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME, change_working_directory
|
||||
from InnerEye.Common.resource_monitor import ResourceMonitor
|
||||
from InnerEye.ML.common import ARGS_TXT, ModelExecutionMode, RECOVERY_CHECKPOINT_FILE_NAME, VISUALIZATION_FOLDER
|
||||
from InnerEye.ML.utils.checkpoint_handling import create_best_checkpoint
|
||||
from InnerEye.ML.lightning_base import InnerEyeContainer, InnerEyeLightning
|
||||
from InnerEye.ML.lightning_container import LightningContainer
|
||||
from InnerEye.ML.lightning_loggers import StoringLogger
|
||||
from InnerEye.ML.lightning_models import SUBJECT_OUTPUT_PER_RANK_PREFIX, ScalarLightning, \
|
||||
get_subject_output_file_per_rank
|
||||
from InnerEye.ML.utils.checkpoint_handling import create_best_checkpoint
|
||||
from health_azure.utils import is_global_rank_zero, is_local_rank_zero
|
||||
from health_ml.utils import AzureMLLogger, AzureMLProgressBar, BatchTimeCallback, log_on_epoch
|
||||
from health_ml.utils import AzureMLLogger, AzureMLProgressBar, log_on_epoch
|
||||
|
||||
TEMP_PREFIX = "temp/"
|
||||
|
||||
|
@ -66,7 +66,7 @@ class InnerEyeRecoveryCheckpointCallback(ModelCheckpoint):
|
|||
super().__init__(dirpath=str(container.checkpoint_folder),
|
||||
monitor="epoch_started",
|
||||
filename=RECOVERY_CHECKPOINT_FILE_NAME + "_{epoch}",
|
||||
period=container.recovery_checkpoint_save_interval,
|
||||
every_n_epochs=container.recovery_checkpoint_save_interval,
|
||||
save_top_k=container.recovery_checkpoints_save_last_k,
|
||||
mode="max",
|
||||
save_last=False)
|
||||
|
@ -74,6 +74,7 @@ class InnerEyeRecoveryCheckpointCallback(ModelCheckpoint):
|
|||
def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule, unused: bool = None) -> None:
|
||||
# The metric to monitor must be logged on all ranks in distributed training
|
||||
log_on_epoch(pl_module, name="epoch_started", value=trainer.current_epoch, sync_dist=False) # type: ignore
|
||||
super().on_train_epoch_end(trainer, pl_module)
|
||||
|
||||
|
||||
def create_lightning_trainer(container: LightningContainer,
|
||||
|
@ -92,17 +93,23 @@ def create_lightning_trainer(container: LightningContainer,
|
|||
logging.debug(f"resume_from_checkpoint: {resume_from_checkpoint}")
|
||||
num_gpus = container.num_gpus_per_node()
|
||||
effective_num_gpus = num_gpus * num_nodes
|
||||
# Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of GPU memory).
|
||||
if effective_num_gpus > 1:
|
||||
accelerator: Optional[str] = "ddp"
|
||||
# Initialize the DDP plugin. The default for pl_find_unused_parameters is False. If True, the plugin prints out
|
||||
# lengthy warnings about the performance impact of find_unused_parameters.
|
||||
plugins = [DDPPlugin(num_nodes=num_nodes, sync_batchnorm=True,
|
||||
find_unused_parameters=container.pl_find_unused_parameters)]
|
||||
strategy = None
|
||||
if effective_num_gpus == 0:
|
||||
accelerator = "cpu"
|
||||
devices = 1
|
||||
message = "CPU"
|
||||
else:
|
||||
accelerator = None
|
||||
plugins = []
|
||||
logging.info(f"Using {num_gpus} GPUs per node with accelerator '{accelerator}'")
|
||||
accelerator = "gpu"
|
||||
devices = num_gpus
|
||||
message = f"{devices} GPU"
|
||||
if effective_num_gpus > 1:
|
||||
# Accelerator should be "ddp" when running large models in AzureML (when using DDP_spawn, we get out of
|
||||
# GPU memory).
|
||||
# Initialize the DDP plugin. The default for pl_find_unused_parameters is False. If True, the plugin
|
||||
# prints out lengthy warnings about the performance impact of find_unused_parameters.
|
||||
strategy = DDPPlugin(find_unused_parameters=container.pl_find_unused_parameters)
|
||||
message += "s per node with DDP"
|
||||
logging.info(f"Using {message}")
|
||||
tensorboard_logger = TensorBoardLogger(save_dir=str(container.logs_folder), name="Lightning", version="")
|
||||
loggers = [tensorboard_logger, AzureMLLogger(False)]
|
||||
storing_logger = StoringLogger()
|
||||
|
@ -111,8 +118,7 @@ def create_lightning_trainer(container: LightningContainer,
|
|||
precision = 32 if num_gpus == 0 else 16 if container.use_mixed_precision else 32
|
||||
# The next two flags control the settings in torch.backends.cudnn.deterministic and torch.backends.cudnn.benchmark
|
||||
# https://pytorch.org/docs/stable/notes/randomness.html
|
||||
# For the classification models, we observed only a small performance deterioration (increase in 10sec on total
|
||||
# training time of 22min) when switching to deterministic.
|
||||
# Note that switching to deterministic models can have large performance downside.
|
||||
if container.pl_deterministic:
|
||||
deterministic = True
|
||||
benchmark = False
|
||||
|
@ -134,7 +140,9 @@ def create_lightning_trainer(container: LightningContainer,
|
|||
recovery_checkpoint_callback,
|
||||
]
|
||||
if container.monitor_loading:
|
||||
callbacks.append(BatchTimeCallback())
|
||||
# TODO antonsc: Remove after fixing the callback.
|
||||
raise NotImplementedError("Monitoring batch loading times has been temporarily disabled.")
|
||||
# callbacks.append(BatchTimeCallback())
|
||||
if num_gpus > 0 and container.monitor_gpu:
|
||||
logging.info("Adding monitoring for GPU utilization")
|
||||
callbacks.append(GPUStatsMonitor(intra_step_time=True, inter_step_time=True))
|
||||
|
@ -150,19 +158,23 @@ def create_lightning_trainer(container: LightningContainer,
|
|||
callbacks.extend(container.get_callbacks())
|
||||
is_azureml_run = not is_offline_run_context(RUN_CONTEXT)
|
||||
progress_bar_refresh_rate = container.pl_progress_bar_refresh_rate
|
||||
if progress_bar_refresh_rate is None:
|
||||
progress_bar_refresh_rate = 50
|
||||
logging.info(f"The progress bar refresh rate is not set. Using a default of {progress_bar_refresh_rate}. "
|
||||
f"To change, modify the pl_progress_bar_refresh_rate field of the container.")
|
||||
if is_azureml_run:
|
||||
if progress_bar_refresh_rate is None:
|
||||
progress_bar_refresh_rate = 50
|
||||
logging.info(f"The progress bar refresh rate is not set. Using a default of {progress_bar_refresh_rate}. "
|
||||
f"To change, modify the pl_progress_bar_refresh_rate field of the container.")
|
||||
callbacks.append(AzureMLProgressBar(refresh_rate=progress_bar_refresh_rate,
|
||||
write_to_logging_info=True,
|
||||
print_timestamp=False))
|
||||
else:
|
||||
callbacks.append(TQDMProgressBar(refresh_rate=progress_bar_refresh_rate))
|
||||
# Read out additional model-specific args here.
|
||||
# We probably want to keep essential ones like numgpu and logging.
|
||||
trainer = Trainer(default_root_dir=str(container.outputs_folder),
|
||||
deterministic=deterministic,
|
||||
benchmark=benchmark,
|
||||
accelerator=accelerator,
|
||||
plugins=plugins,
|
||||
strategy=strategy,
|
||||
max_epochs=container.num_epochs,
|
||||
# Both these arguments can be integers or floats. If integers, it is the number of batches.
|
||||
# If float, it's the fraction of batches. We default to 1.0 (processing all batches).
|
||||
|
@ -172,12 +184,11 @@ def create_lightning_trainer(container: LightningContainer,
|
|||
check_val_every_n_epoch=container.pl_check_val_every_n_epoch,
|
||||
callbacks=callbacks,
|
||||
logger=loggers,
|
||||
progress_bar_refresh_rate=progress_bar_refresh_rate,
|
||||
num_nodes=num_nodes,
|
||||
gpus=num_gpus,
|
||||
devices=devices,
|
||||
precision=precision,
|
||||
sync_batchnorm=True,
|
||||
terminate_on_nan=container.detect_anomaly,
|
||||
detect_anomaly=container.detect_anomaly,
|
||||
profiler=container.pl_profiler,
|
||||
resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None,
|
||||
**additional_args)
|
||||
|
|
|
@ -377,7 +377,8 @@ class MLRunner:
|
|||
logging.info("Comparing the current results against stored results")
|
||||
if self.is_normal_run_or_crossval_child_0():
|
||||
compare_folders_and_run_outputs(expected=self.container.regression_test_folder,
|
||||
actual=self.container.outputs_folder)
|
||||
actual=self.container.outputs_folder,
|
||||
csv_relative_tolerance=self.container.regression_test_csv_tolerance)
|
||||
else:
|
||||
logging.info("Skipping because this is not cross-validation child run 0.")
|
||||
|
||||
|
@ -718,7 +719,9 @@ class MLRunner:
|
|||
|
||||
def are_sibling_runs_finished(self) -> bool:
|
||||
"""
|
||||
Checks if all child runs (except the current run) of the current run's parent are completed or failed.
|
||||
Checks if all child runs (except the current run) of the current run's parent are completed, failed,
|
||||
or cancelled.
|
||||
|
||||
:return: True if all sibling runs of the current run have finished (they either completed successfully,
|
||||
or failed). False if any of them is still pending (running or queued).
|
||||
"""
|
||||
|
@ -729,7 +732,7 @@ class MLRunner:
|
|||
expected_number_cross_validation_splits=n_splits)
|
||||
pending_runs = [x.id for x in child_runs
|
||||
if (x.id != RUN_CONTEXT.id)
|
||||
and (x.get_status() not in [RunStatus.COMPLETED, RunStatus.FAILED])]
|
||||
and (x.get_status() not in [RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.CANCELED])]
|
||||
all_runs_finished = len(pending_runs) == 0
|
||||
if not all_runs_finished:
|
||||
logging.info(f"Waiting for sibling run(s) to finish: {pending_runs}")
|
||||
|
|
|
@ -117,7 +117,6 @@ class Runner:
|
|||
:param model_deployment_hook: an optional function for deploying a model in an application-specific way.
|
||||
If present, it should take a model config (SegmentationModelBase), an AzureConfig, and an AzureML
|
||||
Model as arguments, and return an optional Path and a further object of any type.
|
||||
:param command_line_args: command-line arguments to use; if None, use sys.argv.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
|
@ -230,6 +229,8 @@ class Runner:
|
|||
and not self.lightning_container.azure_dataset_id:
|
||||
raise ValueError("When running an InnerEye built-in model in AzureML, the 'azure_dataset_id' "
|
||||
"property must be set.")
|
||||
# https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
|
||||
env_variables = {"CUBLAS_WORKSPACE_CONFIG": ":4096:8"} if self.lightning_container.pl_deterministic else {}
|
||||
source_config = SourceConfig(
|
||||
root_folder=self.project_root,
|
||||
entry_script=Path(sys.argv[0]).resolve(),
|
||||
|
@ -238,7 +239,8 @@ class Runner:
|
|||
hyperdrive_config_func=(self.model_config.get_hyperdrive_config if self.model_config
|
||||
else self.lightning_container.get_hyperdrive_config),
|
||||
# For large jobs, upload of results can time out because of large checkpoint files. Default is 600
|
||||
upload_timeout_seconds=86400
|
||||
upload_timeout_seconds=86400,
|
||||
environment_variables=env_variables
|
||||
)
|
||||
# Reduce the size of the snapshot by adding unused folders to amlignore. The Test* subfolders are only needed
|
||||
# when running pytest.
|
||||
|
@ -259,6 +261,10 @@ class Runner:
|
|||
"""
|
||||
A function that will be called right after job submission.
|
||||
"""
|
||||
# Set the default display name to what was provided as the "tag". This will affect single runs
|
||||
# and Hyperdrive parent runs
|
||||
if self.azure_config.tag:
|
||||
azure_run.display_name = self.azure_config.tag
|
||||
# Add an extra tag that depends on the run that was actually submitted. This is used for later filtering
|
||||
# run in cross validation analysis
|
||||
recovery_id = create_run_recovery_id(azure_run)
|
||||
|
@ -332,11 +338,14 @@ class Runner:
|
|||
commandline_args=" ".join(source_config.script_params)),
|
||||
after_submission=after_submission_hook,
|
||||
hyperdrive_config=hyperdrive_config)
|
||||
# Set the default display name to what was provided as the "tag"
|
||||
if self.azure_config.tag:
|
||||
azure_run_info.run.display_name = self.azure_config.tag
|
||||
if self.azure_config.tag and azure_run_info.run:
|
||||
if self.lightning_container.perform_cross_validation:
|
||||
# This code is only reached inside Azure. Set display name again - this will now affect
|
||||
# Hypdrive child runs (for other jobs, this has already been done after submission)
|
||||
cv_index = self.lightning_container.cross_validation_split_index
|
||||
full_display_name = f"{self.azure_config.tag} {cv_index}"
|
||||
azure_run_info.run.display_name = full_display_name
|
||||
else:
|
||||
# compute_cluster_name is a required parameter in early versions of the HI-ML package
|
||||
azure_run_info = submit_to_azure_if_needed(
|
||||
input_datasets=input_datasets,
|
||||
submit_to_azureml=False)
|
||||
|
|
|
@ -224,9 +224,9 @@ def r2_score(model_output: Union[torch.Tensor, np.ndarray], label: Union[torch.T
|
|||
Computes the coefficient of determination R2. Represents the proportion of variance explained
|
||||
by the (independent) variables in the model. R2 = 1 - Mean(SquaredErrors) / Variance(Labels)
|
||||
"""
|
||||
if torch.is_tensor(label):
|
||||
if isinstance(label, torch.Tensor):
|
||||
label = label.detach().cpu().numpy()
|
||||
if torch.is_tensor(model_output):
|
||||
if isinstance(model_output, torch.Tensor):
|
||||
model_output = model_output.detach().cpu().numpy()
|
||||
return sklearn_r2_score(label, model_output)
|
||||
|
||||
|
@ -260,10 +260,10 @@ def convert_input_and_label(model_output: Union[torch.Tensor, np.ndarray],
|
|||
model_output = torch.tensor(model_output)
|
||||
if not torch.is_tensor(label):
|
||||
label = torch.tensor(label)
|
||||
return model_output.float(), label.float()
|
||||
return model_output.float(), label.float() # type: ignore
|
||||
|
||||
|
||||
def is_missing_ground_truth(ground_truth: np.array) -> bool:
|
||||
def is_missing_ground_truth(ground_truth: np.ndarray) -> bool:
|
||||
"""
|
||||
calculate_metrics_per_class in metrics.py and plot_contours_for_all_classes in plotting.py both
|
||||
check whether there is ground truth missing using this simple check for NaN value at 0, 0, 0.
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
subject_count,loss,learning_rate,Dice/AverageAcrossStructures,Dice/spinalcord,Dice/lung_r,Dice/lung_l,VoxelCount/spinalcord,VoxelCount/lung_r,VoxelCount/lung_l,epoch,cross_validation_split_index
|
||||
2.000000,0.718559,0.000100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,98256.000000,0,-1
|
||||
2.000000,0.792989,0.000090,0.000000,0.000000,0.000000,0.000000,0.000000,43307.000000,13992.500000,1,-1
|
||||
2.000000,0.718717,0.000100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,126273.000000,0,-1
|
||||
2.000000,0.775692,0.000090,0.000000,0.000000,0.000000,0.000000,0.000000,84030.000000,0.000000,1,-1
|
||||
|
|
|
|
@ -1,3 +1,3 @@
|
|||
subject_count,loss,Dice/AverageAcrossStructures,Dice/spinalcord,Dice/lung_r,Dice/lung_l,VoxelCount/spinalcord,VoxelCount/lung_r,VoxelCount/lung_l,epoch,cross_validation_split_index
|
||||
2.000000,0.715468,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,89502.000000,0,-1
|
||||
2.000000,0.715476,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,89502.000000,1,-1
|
||||
2.000000,0.716739,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,84282.000000,0,-1
|
||||
2.000000,0.716731,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,84282.000000,1,-1
|
||||
|
|
|
|
@ -1,3 +1,3 @@
|
|||
subject_count,loss,learning_rate,Dice/AverageAcrossStructures,Dice/spinalcord,Dice/lung_r,Dice/lung_l,VoxelCount/spinalcord,VoxelCount/lung_r,VoxelCount/lung_l,epoch,cross_validation_split_index
|
||||
4.000000,0.753852,0.000100,0.000000,0.000000,0.000000,0.000000,0.000000,18609.000000,61179.000000,0,-1
|
||||
4.000000,0.773389,0.000090,0.000000,0.000000,0.000000,0.000000,0.000000,33453.000000,24258.500000,1,-1
|
||||
4.000000,0.716914,0.000100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,99342.000000,0,-1
|
||||
4.000000,0.773825,0.000090,0.000000,0.000000,0.000000,0.000000,181.250000,83803.250000,122.250000,1,-1
|
||||
|
|
|
|
@ -1,3 +1,3 @@
|
|||
subject_count,loss,Dice/AverageAcrossStructures,Dice/spinalcord,Dice/lung_r,Dice/lung_l,VoxelCount/spinalcord,VoxelCount/lung_r,VoxelCount/lung_l,epoch,cross_validation_split_index
|
||||
4.000000,0.758059,0.000000,0.000000,0.000000,0.000000,0.000000,32021.000000,48380.750000,0,-1
|
||||
4.000000,0.758054,0.000000,0.000000,0.000000,0.000000,0.000000,32021.000000,48380.750000,1,-1
|
||||
4.000000,0.729041,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,65335.000000,0,-1
|
||||
4.000000,0.729027,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,65335.000000,1,-1
|
||||
|
|
|
|
@ -1,3 +1,3 @@
|
|||
subject_count,loss,learning_rate,Dice/AverageAcrossStructures,Dice/spinalcord,Dice/lung_r,Dice/lung_l,VoxelCount/spinalcord,VoxelCount/lung_r,VoxelCount/lung_l,epoch,cross_validation_split_index
|
||||
2.000000,0.720039,0.000100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,110279.000000,0,0
|
||||
2.000000,0.793326,0.000090,0.000000,0.000000,0.000000,0.000000,0.000000,43307.000000,3900.500000,1,0
|
||||
2.000000,0.723947,0.000100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,148093.000000,0,0
|
||||
2.000000,0.768604,0.000090,0.000000,0.000000,0.000000,0.000000,0.000000,86614.000000,0.000000,1,0
|
||||
|
|
|
|
@ -1,3 +1,3 @@
|
|||
subject_count,loss,Dice/AverageAcrossStructures,Dice/spinalcord,Dice/lung_r,Dice/lung_l,VoxelCount/spinalcord,VoxelCount/lung_r,VoxelCount/lung_l,epoch,cross_validation_split_index
|
||||
2.000000,0.721449,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,54751.500000,0,0
|
||||
2.000000,0.721449,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,54751.500000,1,0
|
||||
2.000000,0.715168,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,94367.500000,0,0
|
||||
2.000000,0.715166,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,94367.500000,1,0
|
||||
|
|
|
|
@ -12,8 +12,9 @@ up the most recently run AzureML job from most_recent_run.txt
|
|||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from typing import List, Optional
|
||||
from unittest import mock
|
||||
|
||||
import numpy as np
|
||||
|
@ -57,7 +58,7 @@ from health_azure.himl import RUN_RECOVERY_FILE
|
|||
|
||||
FALLBACK_SINGLE_RUN = "refs_pull_606_merge:refs_pull_606_merge_1638867172_17ba8dc5"
|
||||
FALLBACK_ENSEMBLE_RUN = "refs_pull_606_merge:HD_b8a6ad93-8c19-45de-8ea1-f87fce92c3bd"
|
||||
FALLBACK_2NODE_RUN = "refs_pull_606_merge:refs_pull_606_merge_1638867224_8d8072fe"
|
||||
FALLBACK_2NODE_RUN = "refs_pull_593_merge:refs_pull_591_merge_1639416130_e5d29ba7"
|
||||
FALLBACK_CV_GLAUCOMA = "refs_pull_545_merge:HD_72ecc647-07c3-4353-a538-620346114ebd"
|
||||
FALLBACK_HELLO_CONTAINER_RUN = "refs_pull_606_merge:refs_pull_606_merge_1638867108_789991ac"
|
||||
|
||||
|
@ -386,6 +387,28 @@ def test_register_and_score_model(test_output_dirs: OutputFolderForTests) -> Non
|
|||
assert_nifti_content(str(expected_segmentation_path), expected_shape, image_header, [3], np.ubyte)
|
||||
|
||||
|
||||
def get_job_log_file(run: Run, index: Optional[int] = None) -> str:
|
||||
"""
|
||||
Reads the job log file (70_driver_log.txt or std_log.txt) of the given job. If an index is provided, get
|
||||
the matching file from a multi-node job.
|
||||
:return: The contents of the job log file.
|
||||
"""
|
||||
assert run.status == RunStatus.COMPLETED
|
||||
files = run.get_file_names()
|
||||
suffix = (f"_{index}" if index is not None else "") + ".txt"
|
||||
file1 = "azureml-logs/70_driver_log" + suffix
|
||||
file2 = "user_logs/std_log" + suffix
|
||||
if file1 in files:
|
||||
file = file1
|
||||
elif file2 in files:
|
||||
file = file2
|
||||
else:
|
||||
raise ValueError(f"No log file ({file1} or {file2}) present in the run. Existing files: {files}")
|
||||
downloaded = tempfile.NamedTemporaryFile().name
|
||||
run.download_file(name=file, output_file_path=downloaded)
|
||||
return Path(downloaded).read_text()
|
||||
|
||||
|
||||
@pytest.mark.after_training_2node
|
||||
def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
|
@ -393,19 +416,9 @@ def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None:
|
|||
"""
|
||||
run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)
|
||||
assert run.status == RunStatus.COMPLETED
|
||||
files = run.get_file_names()
|
||||
# There are two nodes, so there should be one log file per node.
|
||||
log0_path = "azureml-logs/70_driver_log_0.txt"
|
||||
log1_path = "azureml-logs/70_driver_log_1.txt"
|
||||
assert log0_path in files, "Node rank 0 log file is missing"
|
||||
assert log1_path in files, "Node rank 1 log file is missing"
|
||||
# Download both log files and check their contents
|
||||
log0 = test_output_dirs.root_dir / log0_path
|
||||
log1 = test_output_dirs.root_dir / log1_path
|
||||
run.download_file(log0_path, output_file_path=str(log0))
|
||||
run.download_file(log1_path, output_file_path=str(log1))
|
||||
log0_txt = log0.read_text()
|
||||
log1_txt = log1.read_text()
|
||||
log0_txt = get_job_log_file(run, index=0)
|
||||
log1_txt = get_job_log_file(run, index=1)
|
||||
# Only the node at rank 0 should be done certain startup activities, like visualizing crops.
|
||||
# Running inference similarly should only run on one node.
|
||||
for in_log0_only in ["Visualizing the effect of sampling random crops for training",
|
||||
|
@ -418,8 +431,8 @@ def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None:
|
|||
assert training_indicator in log1_txt
|
||||
# Check diagnostic messages that show if DDP was set up correctly. This could fail if Lightning
|
||||
# changes its diagnostic outputs.
|
||||
assert "initializing ddp: GLOBAL_RANK: 0, MEMBER: 1/4" in log0_txt
|
||||
assert "initializing ddp: GLOBAL_RANK: 2, MEMBER: 3/4" in log1_txt
|
||||
assert "initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4" in log0_txt
|
||||
assert "initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/4" in log1_txt
|
||||
|
||||
|
||||
@pytest.mark.skip("The recovery job hangs after completing on AML")
|
||||
|
@ -443,19 +456,9 @@ def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None:
|
|||
main()
|
||||
run = get_most_recent_run(fallback_run_id_for_local_execution=FALLBACK_2NODE_RUN)
|
||||
assert run.status == RunStatus.COMPLETED
|
||||
files = run.get_file_names()
|
||||
# There are two nodes, so there should be one log file per node.
|
||||
log0_path = "azureml-logs/70_driver_log_0.txt"
|
||||
log1_path = "azureml-logs/70_driver_log_1.txt"
|
||||
assert log0_path in files, "Node rank 0 log file is missing"
|
||||
assert log1_path in files, "Node rank 1 log file is missing"
|
||||
# Download both log files and check their contents
|
||||
log0 = test_output_dirs.root_dir / log0_path
|
||||
log1 = test_output_dirs.root_dir / log1_path
|
||||
run.download_file(log0_path, output_file_path=str(log0))
|
||||
run.download_file(log1_path, output_file_path=str(log1))
|
||||
log0_txt = log0.read_text()
|
||||
log1_txt = log1.read_text()
|
||||
log0_txt = get_job_log_file(run, index=0)
|
||||
log1_txt = get_job_log_file(run, index=1)
|
||||
assert "Downloading multiple files from run" in log0_txt
|
||||
assert "Downloading multiple files from run" not in log1_txt
|
||||
assert "Loading checkpoint that was created at (epoch = 2, global_step = 2)" in log0_txt
|
||||
|
|
|
@ -9,7 +9,7 @@ import pandas as pd
|
|||
import param
|
||||
import torch
|
||||
from pytorch_lightning import LightningDataModule, LightningModule
|
||||
from pytorch_lightning.metrics import MeanSquaredError
|
||||
from torchmetrics.regression import MeanSquaredError
|
||||
from torch import Tensor
|
||||
from torch.nn import Identity
|
||||
from torch.utils.data import DataLoader, Dataset
|
||||
|
|
|
@ -209,6 +209,8 @@ def test_rnn_classifier_via_config_1(use_combined_model: bool,
|
|||
use_mean_teacher_model=use_mean_teacher_model,
|
||||
should_validate=False)
|
||||
config.use_mixed_precision = True
|
||||
# Necessary because torch otherwise says "index_add_cuda_ does not have a deterministic implementation"
|
||||
config.pl_deterministic = False
|
||||
config.set_output_to(test_output_dirs.root_dir)
|
||||
config.dataset_data_frame = _get_mock_sequence_dataset()
|
||||
# Patch the load_images function that will be called once we access a dataset item
|
||||
|
@ -377,6 +379,8 @@ def test_rnn_classifier_via_config_2(test_output_dirs: OutputFolderForTests) ->
|
|||
dataset_contents += f"S{subject},{i},{value},{label}\n"
|
||||
logging_to_stdout()
|
||||
config = ToySequenceModel2(should_validate=False)
|
||||
# Necessary because torch otherwise says "index_add_cuda_ does not have a deterministic implementation"
|
||||
config.pl_deterministic = False
|
||||
config.num_epochs = 2
|
||||
config.set_output_to(test_output_dirs.root_dir)
|
||||
config.dataset_data_frame = _get_mock_sequence_dataset(dataset_contents)
|
||||
|
|
|
@ -273,6 +273,8 @@ def test_image_encoder_with_segmentation(test_output_dirs: OutputFolderForTests,
|
|||
aggregation_type=aggregation_type,
|
||||
scan_size=scan_size)
|
||||
config.use_mixed_precision = True
|
||||
# Necessary because torch otherwise says "avg_pool3d_backward_cuda does not have a deterministic implementation"
|
||||
config.pl_deterministic = False
|
||||
config.set_output_to(test_output_dirs.root_dir)
|
||||
config.num_epochs = 1
|
||||
config.local_dataset = Path()
|
||||
|
|
|
@ -2,23 +2,22 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import h5py
|
||||
import logging
|
||||
import numpy as np
|
||||
import os
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from health_ml.utils import BatchTimeCallback
|
||||
import shutil
|
||||
|
||||
from pathlib import Path
|
||||
from torch.utils.data import DataLoader
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME, is_windows, logging_to_stdout
|
||||
from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path
|
||||
from InnerEye.Common.metrics_constants import MetricType, TRAIN_PREFIX, TrackedMetrics, VALIDATION_PREFIX
|
||||
from InnerEye.Common.metrics_constants import MetricType, TrackedMetrics, VALIDATION_PREFIX
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, CHECKPOINT_SUFFIX, DATASET_CSV_FILE_NAME, \
|
||||
ModelExecutionMode, \
|
||||
|
@ -116,19 +115,20 @@ def _test_model_train(output_dirs: OutputFolderForTests,
|
|||
model_training_result, _ = model_train_unittest(train_config, dirs=output_dirs)
|
||||
assert isinstance(model_training_result, StoringLogger)
|
||||
# Check that all metrics from the BatchTimeCallback are present
|
||||
for epoch, epoch_results in model_training_result.results_per_epoch.items():
|
||||
for prefix in [TRAIN_PREFIX, VALIDATION_PREFIX]:
|
||||
for metric_type in [BatchTimeCallback.EPOCH_TIME,
|
||||
BatchTimeCallback.BATCH_TIME + " avg",
|
||||
BatchTimeCallback.BATCH_TIME + " max",
|
||||
BatchTimeCallback.EXCESS_LOADING_TIME]:
|
||||
expected = BatchTimeCallback.METRICS_PREFIX + prefix + metric_type
|
||||
assert expected in epoch_results, f"Expected {expected} in results for epoch {epoch}"
|
||||
# Excess loading time can be zero because that only measure batches over the threshold
|
||||
if metric_type != BatchTimeCallback.EXCESS_LOADING_TIME:
|
||||
value = epoch_results[expected]
|
||||
assert isinstance(value, float)
|
||||
assert value > 0.0, f"Time for {expected} should be > 0"
|
||||
# # TODO: re-enable once the BatchTimeCallback is fixed
|
||||
# for epoch, epoch_results in model_training_result.results_per_epoch.items():
|
||||
# for prefix in [TRAIN_PREFIX, VALIDATION_PREFIX]:
|
||||
# for metric_type in [BatchTimeCallback.EPOCH_TIME,
|
||||
# BatchTimeCallback.BATCH_TIME + " avg",
|
||||
# BatchTimeCallback.BATCH_TIME + " max",
|
||||
# BatchTimeCallback.EXCESS_LOADING_TIME]:
|
||||
# expected = BatchTimeCallback.METRICS_PREFIX + prefix + metric_type
|
||||
# assert expected in epoch_results, f"Expected {expected} in results for epoch {epoch}"
|
||||
# # Excess loading time can be zero because that only measure batches over the threshold
|
||||
# if metric_type != BatchTimeCallback.EXCESS_LOADING_TIME:
|
||||
# value = epoch_results[expected]
|
||||
# assert isinstance(value, float)
|
||||
# assert value > 0.0, f"Time for {expected} should be > 0"
|
||||
|
||||
actual_train_losses = model_training_result.get_train_metric(MetricType.LOSS.value)
|
||||
actual_val_losses = model_training_result.get_val_metric(MetricType.LOSS.value)
|
||||
|
|
|
@ -13,7 +13,8 @@ from InnerEye.Common.common_util import CROSSVAL_RESULTS_FOLDER, logging_to_stdo
|
|||
from InnerEye.Common.fixed_paths import MODEL_INFERENCE_JSON_FILE_NAME
|
||||
from InnerEye.Common.output_directories import OutputFolderForTests
|
||||
from InnerEye.ML import baselines_util
|
||||
from InnerEye.ML.baselines_util import REGRESSION_TEST_AZUREML_FOLDER, REGRESSION_TEST_AZUREML_PARENT_FOLDER, \
|
||||
from InnerEye.ML.baselines_util import FILE_FORMAT_ERROR, REGRESSION_TEST_AZUREML_FOLDER, \
|
||||
REGRESSION_TEST_AZUREML_PARENT_FOLDER, \
|
||||
REGRESSION_TEST_OUTPUT_FOLDER, compare_files, compare_folder_contents, compare_folders_and_run_outputs
|
||||
from InnerEye.ML.common import FINAL_MODEL_FOLDER
|
||||
from InnerEye.ML.run_ml import MLRunner
|
||||
|
@ -63,6 +64,40 @@ def test_compare_files_text(test_output_dirs: OutputFolderForTests, file_extensi
|
|||
assert compare_files(expected=expected, actual=actual) == baselines_util.CONTENTS_MISMATCH
|
||||
|
||||
|
||||
def test_compare_files_csv(test_output_dirs: OutputFolderForTests) -> None:
|
||||
expected = test_output_dirs.root_dir / "expected.csv"
|
||||
actual = test_output_dirs.root_dir / "actual.does_not_matter"
|
||||
expected.write_text("""foo,bar
|
||||
1.0,10.0""")
|
||||
actual.write_text("""foo,bar
|
||||
1.0001,10.001""")
|
||||
assert compare_files(expected=expected, actual=actual, csv_relative_tolerance=1e-2) == ""
|
||||
assert compare_files(expected=expected, actual=actual, csv_relative_tolerance=1e-3) == ""
|
||||
assert compare_files(expected=expected, actual=actual, csv_relative_tolerance=2e-4) == ""
|
||||
assert compare_files(expected=expected, actual=actual,
|
||||
csv_relative_tolerance=9e-5) == baselines_util.CONTENTS_MISMATCH
|
||||
|
||||
|
||||
def test_compare_files_empty_csv(test_output_dirs: OutputFolderForTests) -> None:
|
||||
"""
|
||||
If either of the two CSV files is empty, it should not raise an error, but exit gracefully.
|
||||
"""
|
||||
expected = test_output_dirs.root_dir / "expected.csv"
|
||||
actual = test_output_dirs.root_dir / "actual.csv"
|
||||
valid_csv = """foo,bar
|
||||
1.0,10.0"""
|
||||
empty_csv = ""
|
||||
for expected_contents, actual_contents in [(empty_csv, empty_csv),
|
||||
(valid_csv, empty_csv),
|
||||
(empty_csv, valid_csv)]:
|
||||
expected.write_text(expected_contents)
|
||||
actual.write_text(actual_contents)
|
||||
assert compare_files(expected=expected, actual=actual) == FILE_FORMAT_ERROR
|
||||
expected.write_text(valid_csv)
|
||||
actual.write_text(valid_csv)
|
||||
assert compare_files(expected=expected, actual=actual) == ""
|
||||
|
||||
|
||||
@pytest.mark.parametrize("file_extension", [".png", ".whatever"])
|
||||
def test_compare_files_binary(test_output_dirs: OutputFolderForTests, file_extension: str) -> None:
|
||||
"""
|
||||
|
@ -107,7 +142,8 @@ def test_compare_folder(test_output_dirs: OutputFolderForTests) -> None:
|
|||
(expected / subfolder / mismatch).write_text("contents1")
|
||||
(actual / subfolder / mismatch).write_text("contents2")
|
||||
|
||||
messages = compare_folder_contents(expected_folder=expected, actual_folder=actual)
|
||||
messages = compare_folder_contents(expected_folder=expected, actual_folder=actual,
|
||||
csv_relative_tolerance=0.0)
|
||||
all_messages = " ".join(messages)
|
||||
# No issues expected
|
||||
assert matching not in all_messages
|
||||
|
@ -130,13 +166,14 @@ def test_compare_plain_outputs(test_output_dirs: OutputFolderForTests) -> None:
|
|||
file1 = folder / "output.txt"
|
||||
create_folder_and_write_text(file1, "Something")
|
||||
# First comparison should pass
|
||||
compare_folders_and_run_outputs(expected=expected, actual=actual)
|
||||
compare_folders_and_run_outputs(expected=expected, actual=actual, csv_relative_tolerance=0.0)
|
||||
# Now add a file to the set of expected files that does not exist in the run: comparison should now fail
|
||||
no_such_file = "no_such_file.txt"
|
||||
file2 = expected / no_such_file
|
||||
create_folder_and_write_text(file2, "foo")
|
||||
with pytest.raises(ValueError) as ex:
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
|
||||
csv_relative_tolerance=0.0)
|
||||
message = ex.value.args[0].splitlines()
|
||||
assert f"{baselines_util.MISSING_FILE}: {no_such_file}" in message
|
||||
|
||||
|
@ -156,18 +193,21 @@ def test_compare_folder_against_run(test_output_dirs: OutputFolderForTests) -> N
|
|||
'"model_configs_namespace": "InnerEye.ML.configs.segmentation.BasicModel2Epochs"}')
|
||||
with mock.patch("InnerEye.ML.baselines_util.RUN_CONTEXT", run):
|
||||
# First comparison only on the .json file should pass
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
|
||||
csv_relative_tolerance=0.0)
|
||||
# Now add a file to the set of expected files that does not exist in the run: comparison should now fail
|
||||
no_such_file = "no_such_file.txt"
|
||||
file2 = test_output_dirs.root_dir / REGRESSION_TEST_AZUREML_FOLDER / no_such_file
|
||||
create_folder_and_write_text(file2, "foo")
|
||||
with pytest.raises(ValueError) as ex:
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
|
||||
csv_relative_tolerance=0.0)
|
||||
message = ex.value.args[0].splitlines()
|
||||
assert f"{baselines_util.MISSING_FILE}: {no_such_file}" in message
|
||||
# Now run the same comparison that failed previously, without mocking the RUN_CONTEXT. This should now
|
||||
# realize that the present run is an offline run, and skip the comparison
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
|
||||
csv_relative_tolerance=0.0)
|
||||
|
||||
|
||||
@pytest.mark.after_training_ensemble_run
|
||||
|
@ -190,10 +230,12 @@ No outliers found
|
|||
No outliers found""")
|
||||
with mock.patch("InnerEye.ML.baselines_util.PARENT_RUN_CONTEXT", parent_run):
|
||||
# No plain files to compare. The file Test_outliers.txt should be compared and found to match.
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
|
||||
csv_relative_tolerance=0.0)
|
||||
create_folder_and_write_text(file1, "foo")
|
||||
with pytest.raises(ValueError) as ex:
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
|
||||
csv_relative_tolerance=0.0)
|
||||
message = ex.value.args[0].splitlines()
|
||||
assert f"{baselines_util.CONTENTS_MISMATCH}: {CROSSVAL_RESULTS_FOLDER}/{file1.name}" in message
|
||||
# Now add a file to the set of expected files that does not exist in the run: comparison should now fail
|
||||
|
@ -201,11 +243,13 @@ No outliers found""")
|
|||
file2 = test_output_dirs.root_dir / REGRESSION_TEST_AZUREML_PARENT_FOLDER / no_such_file
|
||||
create_folder_and_write_text(file2, "foo")
|
||||
with pytest.raises(ValueError) as ex:
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
|
||||
csv_relative_tolerance=0.0)
|
||||
message = ex.value.args[0].splitlines()
|
||||
assert f"{baselines_util.MISSING_FILE}: {no_such_file}" in message
|
||||
# Now run the same comparison without mocking the PARENT_RUN_CONTEXT. This should now
|
||||
# realize that the present run is a crossval child run
|
||||
with pytest.raises(ValueError) as ex:
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd())
|
||||
compare_folders_and_run_outputs(expected=test_output_dirs.root_dir, actual=Path.cwd(),
|
||||
csv_relative_tolerance=0.0)
|
||||
assert "no (parent) run to compare against" in str(ex)
|
||||
|
|
|
@ -46,9 +46,9 @@ def create_model_and_store_checkpoint(config: ModelConfigBase, checkpoint_path:
|
|||
model = model.cuda() # type: ignore
|
||||
trainer.model = model
|
||||
# Before saving, the values for epoch and step are incremented. Save them here in such a way that we can assert
|
||||
# easily later.
|
||||
trainer.current_epoch = FIXED_EPOCH - 1 # type: ignore
|
||||
trainer.global_step = FIXED_GLOBAL_STEP - 1 # type: ignore
|
||||
# easily later. We can't mock that because otherwise the mock object would be written to disk (that fails)
|
||||
trainer.fit_loop.current_epoch = FIXED_EPOCH - 1 # type: ignore
|
||||
trainer.fit_loop.global_step = FIXED_GLOBAL_STEP - 1 # type: ignore
|
||||
# In PL, it is the Trainer's responsibility to save the model. Checkpoint handling refers back to the trainer
|
||||
# to get a save_func. Mimicking that here.
|
||||
trainer.save_checkpoint(checkpoint_path, weights_only=weights_only)
|
||||
|
|
|
@ -2,11 +2,10 @@
|
|||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
|
||||
# ------------------------------------------------------------------------------------------
|
||||
import math
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
from unittest import mock
|
||||
|
||||
import math
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
@ -16,6 +15,7 @@ from pytorch_lightning import Trainer
|
|||
from pytorch_lightning.callbacks import ModelCheckpoint
|
||||
from torch.nn import Module
|
||||
from torch.optim.lr_scheduler import _LRScheduler
|
||||
from typing import Dict
|
||||
|
||||
from InnerEye.Common import fixed_paths
|
||||
from InnerEye.Common.common_util import is_windows
|
||||
|
@ -135,8 +135,9 @@ def test_innereye_ssl_container_cifar10_resnet_simclr() -> None:
|
|||
assert len(checkpoint["optimizer_states"]) == 1
|
||||
assert len(checkpoint["lr_schedulers"]) == 1
|
||||
assert "callbacks" in checkpoint
|
||||
assert SSLOnlineEvaluatorInnerEye in checkpoint["callbacks"]
|
||||
callback_state = checkpoint["callbacks"][SSLOnlineEvaluatorInnerEye]
|
||||
callback_name = SSLOnlineEvaluatorInnerEye.__name__
|
||||
assert callback_name in checkpoint["callbacks"]
|
||||
callback_state = checkpoint["callbacks"][callback_name]
|
||||
assert SSLOnlineEvaluatorInnerEye.OPTIMIZER_STATE_NAME in callback_state
|
||||
assert SSLOnlineEvaluatorInnerEye.EVALUATOR_STATE_NAME in callback_state
|
||||
|
||||
|
@ -336,8 +337,9 @@ def test_online_evaluator_recovery(test_output_dirs: OutputFolderForTests) -> No
|
|||
# It's somewhat obsolete, but we can now check that the checkpoint file really contained the optimizer and weights
|
||||
checkpoint = torch.load(last_checkpoint)
|
||||
assert "callbacks" in checkpoint
|
||||
assert SSLOnlineEvaluatorInnerEye in checkpoint["callbacks"]
|
||||
callback_state = checkpoint["callbacks"][SSLOnlineEvaluatorInnerEye]
|
||||
callback_name = SSLOnlineEvaluatorInnerEye.__name__
|
||||
assert callback_name in checkpoint["callbacks"]
|
||||
callback_state = checkpoint["callbacks"][callback_name]
|
||||
assert SSLOnlineEvaluatorInnerEye.OPTIMIZER_STATE_NAME in callback_state
|
||||
assert SSLOnlineEvaluatorInnerEye.EVALUATOR_STATE_NAME in callback_state
|
||||
|
||||
|
@ -359,7 +361,8 @@ def test_online_evaluator_not_distributed() -> None:
|
|||
# Standard trainer without DDP
|
||||
trainer = Trainer()
|
||||
# Test the flag that the internal logic of on_pretrain_routine_start uses
|
||||
assert not trainer.accelerator_connector.is_distributed
|
||||
assert hasattr(trainer, "_accelerator_connector")
|
||||
assert not trainer._accelerator_connector.is_distributed
|
||||
mock_module = mock.MagicMock(device=torch.device("cpu"))
|
||||
callback.on_pretrain_routine_start(trainer, mock_module)
|
||||
assert isinstance(callback.evaluator, Module)
|
||||
|
@ -378,19 +381,19 @@ def test_online_evaluator_distributed() -> None:
|
|||
with mock.patch("InnerEye.ML.SSL.lightning_modules.ssl_online_evaluator.DistributedDataParallel",
|
||||
return_value=mock_ddp_result) as mock_ddp:
|
||||
callback = SSLOnlineEvaluatorInnerEye(class_weights=None,
|
||||
z_dim=1,
|
||||
num_classes=2,
|
||||
dataset="foo",
|
||||
drop_p=0.2,
|
||||
learning_rate=1e-5)
|
||||
z_dim=1,
|
||||
num_classes=2,
|
||||
dataset="foo",
|
||||
drop_p=0.2,
|
||||
learning_rate=1e-5)
|
||||
|
||||
# Trainer with DDP
|
||||
device = torch.device("cuda:0")
|
||||
mock_module = mock.MagicMock(device=device)
|
||||
trainer = Trainer(accelerator="ddp", gpus=2)
|
||||
# Test the two flags that the internal logic of on_pretrain_routine_start uses
|
||||
assert trainer.accelerator_connector.is_distributed
|
||||
assert trainer.accelerator_connector.use_ddp
|
||||
assert trainer._accelerator_connector.is_distributed
|
||||
assert trainer._accelerator_connector.use_ddp
|
||||
original_evaluator = callback.evaluator
|
||||
callback.on_pretrain_routine_start(trainer, mock_module)
|
||||
# Check that SyncBatchNorm has been turned on
|
||||
|
|
|
@ -98,7 +98,7 @@ jobs:
|
|||
- name: tag
|
||||
value: 'TrainEnsemble'
|
||||
- name: more_switches
|
||||
value: '--pl_deterministic --regression_test_folder=RegressionTestResults/PR_TrainEnsemble'
|
||||
value: '--pl_deterministic --log_level=DEBUG --regression_test_folder=RegressionTestResults/PR_TrainEnsemble'
|
||||
pool:
|
||||
vmImage: 'ubuntu-18.04'
|
||||
steps:
|
||||
|
|
|
@ -35,7 +35,7 @@ steps:
|
|||
# work on the package
|
||||
- bash: |
|
||||
source activate InnerEye
|
||||
pytest ./Tests/ -m "not (gpu or azureml or after_training_single_run or after_training_ensemble_run or inference or after_training_2node or after_training_glaucoma_cv_run or after_training_hello_container)" --doctest-modules --junitxml=junit/test-results.xml --cov=. --cov-config=.coveragerc --cov-report=xml -n 2 --dist=loadscope --verbose
|
||||
pytest ./Tests/ -m "not (gpu or azureml or after_training_single_run or after_training_ensemble_run or inference or after_training_2node or after_training_glaucoma_cv_run or after_training_hello_container)" --doctest-modules --junitxml=junit/test-results.xml --cov=. --cov-config=.coveragerc --cov-report=xml --verbose
|
||||
env:
|
||||
APPLICATION_KEY: $(InnerEyeDeepLearningServicePrincipalKey)
|
||||
DATASETS_ACCOUNT_KEY: $(InnerEyePublicDatasetsStorageKey)
|
||||
|
|
|
@ -8,7 +8,7 @@ dependencies:
|
|||
- pip=20.1.1
|
||||
- python=3.7.3
|
||||
- pytorch=1.8.0
|
||||
- python-blosc==1.7.0
|
||||
- python-blosc=1.7.0
|
||||
- torchvision=0.9.0
|
||||
- pip:
|
||||
- git+https://github.com/analysiscenter/radio.git@6d53e25#egg=radio
|
||||
|
@ -52,8 +52,8 @@ dependencies:
|
|||
- pytest-cov==2.10.1
|
||||
- pytest-forked==1.3.0
|
||||
- pytest-xdist==1.34.0
|
||||
- pytorch-lightning==1.3.8
|
||||
- rich==5.1.1
|
||||
- pytorch-lightning==1.5.5
|
||||
- rich==10.13.0
|
||||
- rpdb==0.1.6
|
||||
- ruamel.yaml==0.16.12
|
||||
- runstats==1.8.0
|
||||
|
@ -68,6 +68,6 @@ dependencies:
|
|||
- tensorboard==2.3.0
|
||||
- tensorboardX==2.1
|
||||
- torchprof==1.3.3
|
||||
- torchmetrics==0.4.1
|
||||
- torchmetrics==0.6.0
|
||||
- umap-learn==0.5.2
|
||||
- yacs==0.1.8
|
||||
|
|
2
fastMRI
2
fastMRI
|
@ -1 +1 @@
|
|||
Subproject commit f2070aeb7a5e7d1b0e45c6aad247d18d074705a8
|
||||
Subproject commit 13560d2f198cc72f06e01675e9ecee509ce5639a
|
Загрузка…
Ссылка в новой задаче