In this PR:
- A fix is introduced to handle missing cross-validation rounds (e.g., when a round fails)
- An extra logged validation epoch (N+1 metric values for N training epochs) is now handled when determining the best epoch
- Downloading the AML metrics json and parsing it into a dataframe are split into two functions (the current version raises a `KeyError` when the json is downloaded for the first time); see the sketch below
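
A minimal sketch of the resulting two-step flow (the run ID, download directory, and workspace handling below are placeholders, not values from this PR):

```python
from pathlib import Path

from azureml.core import Workspace
from health_cpath.utils.report_utils import (collect_hyperdrive_metrics,
                                             download_hyperdrive_metrics_if_required)

# Step 1: download the raw AML metrics json (skipped if the file already exists locally).
metrics_json = download_hyperdrive_metrics_if_required(parent_run_id="<parent-run-id>",  # placeholder
                                                       download_dir=Path("outputs/report"),  # placeholder
                                                       aml_workspace=Workspace.from_config(),  # assumes a local AML config
                                                       hyperdrive_arg_name="crossval_index",
                                                       overwrite=False)
# Step 2: parse the downloaded json into a dataframe in a separate call, so a first-time
# download no longer hits the KeyError described above.
metrics_df = collect_hyperdrive_metrics(metrics_json=metrics_json)
```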

Co-authored-by: Harshita Sharma <t-hsharma@microsoft.com>
Harshita Sharma authored on 2022-11-07 09:39:09 +00:00, committed by GitHub
Parent: 41d30807aa
Commit: 6ba3cfe685
5 changed files with 171 additions and 100 deletions

View file

@@ -18,8 +18,9 @@ from health_cpath.utils.output_utils import (AML_LEGACY_TEST_OUTPUTS_CSV, AML_TE
                                              AML_VAL_OUTPUTS_CSV)
 from health_cpath.utils.report_utils import (collect_hyperdrive_metrics, collect_hyperdrive_outputs,
                                              child_runs_have_val_and_test_outputs, get_best_epoch_metrics,
-                                             get_best_epochs, get_hyperdrive_metrics_table, get_formatted_run_info,
-                                             collect_class_info)
+                                             get_best_epochs, get_child_runs_hyperparams, get_hyperdrive_metrics_table,
+                                             get_formatted_run_info, collect_class_info, get_max_epochs,
+                                             download_hyperdrive_metrics_if_required)
 from health_cpath.utils.naming import MetricsKey, ModelKey
@@ -53,10 +54,16 @@ def generate_html_report(parent_run_id: str, output_dir: Path,
     report.add_heading("Azure ML metrics", level=2)
     # Download metrics from AML. Can take several seconds for each child run
-    metrics_df = collect_hyperdrive_metrics(
-        parent_run_id, report_dir, aml_workspace, overwrite=overwrite, hyperdrive_arg_name=hyperdrive_arg_name
-    )
-    best_epochs = get_best_epochs(metrics_df, f'{ModelKey.VAL}/{primary_metric}', maximise=True)
+    metrics_json = download_hyperdrive_metrics_if_required(parent_run_id, report_dir, aml_workspace,
+                                                           overwrite=overwrite, hyperdrive_arg_name=hyperdrive_arg_name)
+    # Get metrics dataframe from the downloaded json file
+    metrics_df = collect_hyperdrive_metrics(metrics_json=metrics_json)
+    hyperparameters_children = get_child_runs_hyperparams(metrics_df)
+    max_epochs_dict = get_max_epochs(hyperparams_children=hyperparameters_children)
+    best_epochs = get_best_epochs(metrics_df=metrics_df, primary_metric=f'{ModelKey.VAL}/{primary_metric}',
+                                  max_epochs_dict=max_epochs_dict, maximise=True)
     # Add training curves for loss and AUROC (train and val.)
     render_training_curves(report, heading="Training curves", level=3,
@@ -64,7 +71,7 @@ def generate_html_report(parent_run_id: str, output_dir: Path,
                            primary_metric=primary_metric)
     # Get metrics list with class names
-    num_classes, class_names = collect_class_info(metrics_df=metrics_df)
+    num_classes, class_names = collect_class_info(hyperparams_children=hyperparameters_children)
     base_metrics_list: List[str] = [MetricsKey.ACC, MetricsKey.AUROC, MetricsKey.AVERAGE_PRECISION,
                                     MetricsKey.COHENKAPPA]
@@ -87,56 +94,58 @@ def generate_html_report(parent_run_id: str, output_dir: Path,
                                   metrics_df=metrics_df, best_epochs=None,
                                   base_metrics_list=base_metrics_list, metrics_prefix=f'{ModelKey.TEST}/')

-    has_val_and_test_outputs = child_runs_have_val_and_test_outputs(parent_run)
-
-    # Get output data frames
-    if has_val_and_test_outputs:
-        output_filename_val = AML_VAL_OUTPUTS_CSV
-        outputs_dfs_val = collect_hyperdrive_outputs(parent_run_id=parent_run_id, download_dir=report_dir,
-                                                     aml_workspace=aml_workspace,
-                                                     output_filename=output_filename_val, overwrite=overwrite,
-                                                     hyperdrive_arg_name=hyperdrive_arg_name)
-    if include_test:
-        output_filename_test = AML_TEST_OUTPUTS_CSV if has_val_and_test_outputs else AML_LEGACY_TEST_OUTPUTS_CSV
-        outputs_dfs_test = collect_hyperdrive_outputs(parent_run_id=parent_run_id, download_dir=report_dir,
-                                                      aml_workspace=aml_workspace,
-                                                      output_filename=output_filename_test, overwrite=overwrite,
-                                                      hyperdrive_arg_name=hyperdrive_arg_name)
-
-    if num_classes == 1:
-        # Currently ROC and PR curves rendered only for binary case
-        # TODO: Enable rendering of multi-class ROC and PR curves
-        report.add_heading("ROC and PR curves", level=2)
-        if has_val_and_test_outputs:
-            # Add val. ROC and PR curves
-            render_roc_and_pr_curves(report=report, heading="Validation ROC and PR curves", level=3,
-                                     report_dir=report_dir,
-                                     outputs_dfs=outputs_dfs_val,
-                                     prefix=f'{ModelKey.VAL}_')
-        if include_test:
-            # Add test ROC and PR curves
-            render_roc_and_pr_curves(report=report, heading="Test ROC and PR curves", level=3,
-                                     report_dir=report_dir,
-                                     outputs_dfs=outputs_dfs_test,
-                                     prefix=f'{ModelKey.TEST}_')
-
-    # Add confusion matrices for each fold
-    report.add_heading("Confusion matrices", level=2)
-    if has_val_and_test_outputs:
-        # Add val. confusion matrices
-        render_confusion_matrices(report=report, heading="Validation confusion matrices", level=3,
-                                  class_names=class_names,
-                                  report_dir=report_dir, outputs_dfs=outputs_dfs_val,
-                                  prefix=f'{ModelKey.VAL}_')
-    if include_test:
-        # Add test confusion matrices
-        render_confusion_matrices(report=report, heading="Test confusion matrices", level=3,
-                                  class_names=class_names,
-                                  report_dir=report_dir, outputs_dfs=outputs_dfs_test,
-                                  prefix=f'{ModelKey.TEST}_')
+    # Get output data frames if available
+    try:
+        has_val_and_test_outputs = child_runs_have_val_and_test_outputs(parent_run)
+        if has_val_and_test_outputs:
+            output_filename_val = AML_VAL_OUTPUTS_CSV
+            outputs_dfs_val = collect_hyperdrive_outputs(parent_run_id=parent_run_id, download_dir=report_dir,
+                                                         aml_workspace=aml_workspace,
+                                                         output_filename=output_filename_val, overwrite=overwrite,
+                                                         hyperdrive_arg_name=hyperdrive_arg_name)
+        if include_test:
+            output_filename_test = AML_TEST_OUTPUTS_CSV if has_val_and_test_outputs else AML_LEGACY_TEST_OUTPUTS_CSV
+            outputs_dfs_test = collect_hyperdrive_outputs(parent_run_id=parent_run_id, download_dir=report_dir,
+                                                          aml_workspace=aml_workspace,
+                                                          output_filename=output_filename_test, overwrite=overwrite,
+                                                          hyperdrive_arg_name=hyperdrive_arg_name)
+
+        if num_classes == 1:
+            # Currently ROC and PR curves rendered only for binary case
+            # TODO: Enable rendering of multi-class ROC and PR curves
+            report.add_heading("ROC and PR curves", level=2)
+            if has_val_and_test_outputs:
+                # Add val. ROC and PR curves
+                render_roc_and_pr_curves(report=report, heading="Validation ROC and PR curves", level=3,
+                                         report_dir=report_dir,
+                                         outputs_dfs=outputs_dfs_val,
+                                         prefix=f'{ModelKey.VAL}_')
+            if include_test:
+                # Add test ROC and PR curves
+                render_roc_and_pr_curves(report=report, heading="Test ROC and PR curves", level=3,
+                                         report_dir=report_dir,
+                                         outputs_dfs=outputs_dfs_test,
+                                         prefix=f'{ModelKey.TEST}_')
+
+        # Add confusion matrices for each fold
+        report.add_heading("Confusion matrices", level=2)
+        if has_val_and_test_outputs:
+            # Add val. confusion matrices
+            render_confusion_matrices(report=report, heading="Validation confusion matrices", level=3,
+                                      class_names=class_names,
+                                      report_dir=report_dir, outputs_dfs=outputs_dfs_val,
+                                      prefix=f'{ModelKey.VAL}_')
+        if include_test:
+            # Add test confusion matrices
+            render_confusion_matrices(report=report, heading="Test confusion matrices", level=3,
+                                      class_names=class_names,
+                                      report_dir=report_dir, outputs_dfs=outputs_dfs_test,
+                                      prefix=f'{ModelKey.TEST}_')
+    except ValueError as e:
+        print(e)
+        print("Since all expected output files were not found, skipping ROC-PR curves and confusion matrices.")

     # TODO: Add qualitative model outputs
     # report.add_heading("Qualitative model outputs", level=2)

View file

@@ -229,14 +229,17 @@ def plot_hyperdrive_training_curves(metrics_df: pd.DataFrame, train_metric: str,
     for k in sorted(metrics_df.columns):
         train_values = metrics_df.loc[train_metric, k]
         val_values = metrics_df.loc[val_metric, k]
-        line, = ax.plot(train_values, **TRAIN_STYLE, label=f"Child {k}")
-        color = line.get_color()
-        ax.plot(val_values, color=color, **VAL_STYLE)
+        if train_values is not None:
+            line, = ax.plot(train_values, **TRAIN_STYLE, label=f"Child {k}")
+            color = line.get_color()
+        if val_values is not None:
+            ax.plot(val_values, color=color, **VAL_STYLE)
         if best_epochs is not None:
             best_epoch = best_epochs[k]
-            ax.plot(best_epoch, train_values[best_epoch], color=color, zorder=1000, **BEST_TRAIN_MARKER_STYLE)
-            ax.plot(best_epoch, val_values[best_epoch], color=color, zorder=1000, **BEST_VAL_MARKER_STYLE)
-            ax.axvline(best_epoch, color=color, **BEST_EPOCH_LINE_STYLE)
+            if best_epoch is not None:
+                ax.plot(best_epoch, train_values[best_epoch], color=color, zorder=1000, **BEST_TRAIN_MARKER_STYLE)
+                ax.plot(best_epoch, val_values[best_epoch], color=color, zorder=1000, **BEST_VAL_MARKER_STYLE)
+                ax.axvline(best_epoch, color=color, **BEST_EPOCH_LINE_STYLE)
     ax.grid(color='0.9')
     ax.set_xlabel("Epoch")
     if ylabel:
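
The None-guard introduced above, reduced to a standalone sketch (plain matplotlib; the per-child series are invented, with child 1 standing in for a failed round):

```python
import matplotlib.pyplot as plt

# Invented training-loss series per child run; a failed cross-validation round yields None.
train_series = {0: [0.9, 0.7, 0.5, 0.4], 1: None, 2: [0.8, 0.6, 0.5, 0.45]}

fig, ax = plt.subplots()
for k, train_values in sorted(train_series.items()):
    if train_values is not None:  # skip rounds that logged no metrics
        ax.plot(train_values, label=f"Child {k}")
ax.grid(color='0.9')
ax.set_xlabel("Epoch")
ax.legend()
fig.savefig("training_curves.png")
```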

View file

@@ -89,6 +89,7 @@ class AMLMetricsJsonKey(str, Enum):
     VALUE = 'value'
     N_CLASSES = 'n_classes'
     CLASS_NAMES = 'class_names'
+    MAX_EPOCHS = 'max_epochs'


 class PlotOption(Enum):

View file

@@ -4,7 +4,7 @@
 #  -------------------------------------------------------------------------------------------
 from pathlib import Path
-from typing import Dict, List, Sequence, Tuple
+from typing import Any, Dict, List, Sequence, Tuple

 import dateutil.parser
 import numpy as np
@@ -92,10 +92,10 @@ def collect_hyperdrive_outputs(parent_run_id: str, download_dir: Path, aml_works
     return dict(sorted(all_outputs_dfs.items()))  # type: ignore


-def collect_hyperdrive_metrics(parent_run_id: str, download_dir: Path, aml_workspace: Workspace,
-                               hyperdrive_arg_name: str = "crossval_index",
-                               overwrite: bool = False) -> pd.DataFrame:
-    """Fetch metrics logged to Azure ML from hyperdrive runs as a dataframe.
+def download_hyperdrive_metrics_if_required(parent_run_id: str, download_dir: Path, aml_workspace: Workspace,
+                                            hyperdrive_arg_name: str = "crossval_index",
+                                            overwrite: bool = False) -> Path:
+    """Fetch metrics logged to Azure ML from hyperdrive runs.

     Will only download the metrics if they do not already exist locally, as this can take several
     seconds for each child run.
@@ -105,12 +105,11 @@ def collect_hyperdrive_metrics(parent_run_id: str, download_dir: Path, aml_works
     :param aml_workspace: Azure ML workspace in which the runs were executed.
     :param hyperdrive_arg_name: Name of the Hyperdrive argument used for indexing the child runs.
     :param overwrite: Whether to force the download even if metrics are already saved locally.
-    :return: A dataframe in the format returned by :py:func:`~health_azure.aggregate_hyperdrive_metrics()`.
+    :return: The path of the downloaded json file.
     """
     metrics_json = download_dir / "aml_metrics.json"
     if not overwrite and metrics_json.is_file():
         print(f"AML metrics file already exists at {metrics_json}")
-        metrics_df = pd.read_json(metrics_json)
     else:
         metrics_df = aggregate_hyperdrive_metrics(run_id=parent_run_id,
                                                   child_run_arg_name=hyperdrive_arg_name,
@@ -118,7 +117,17 @@ def collect_hyperdrive_metrics(parent_run_id: str, download_dir: Path, aml_works
         metrics_json.parent.mkdir(parents=True, exist_ok=True)
         print(f"Writing AML metrics file to {metrics_json}")
         df_to_json(metrics_df, metrics_json)
-    return metrics_df.sort_index(axis='columns')
+    return metrics_json
+
+
+def collect_hyperdrive_metrics(metrics_json: Path) -> pd.DataFrame:
+    """
+    Collect the hyperdrive metrics from the downloaded metrics json file in a dataframe.
+
+    :param metrics_json: Path of the downloaded metrics file `aml_metrics.json`.
+    :return: A dataframe in the format returned by :py:func:`~health_azure.aggregate_hyperdrive_metrics()`.
+    """
+    metrics_df = pd.read_json(metrics_json).sort_index(axis='columns')
+    return metrics_df


 def get_hyperdrive_metrics_table(metrics_df: pd.DataFrame, metrics_list: Sequence[str]) -> pd.DataFrame:
@@ -139,13 +148,16 @@ def get_hyperdrive_metrics_table(metrics_df: pd.DataFrame, metrics_list: Sequenc
         values: pd.Series = metrics_df.loc[metric]
         mean = values.mean()
         std = values.std()
-        row = [metric] + [f"{v:.3f}" for v in values] + [f"{mean:.3f} ± {std:.3f}"]
+        round_values: List[str] = [f"{v:.3f}" if v is not None else str(np.nan) for v in values]
+        agg_values: List[str] = [f"{mean:.3f} ± {std:.3f}"]
+        row = [metric] + round_values + agg_values
         metrics_rows.append(row)
     table = pd.DataFrame(metrics_rows, columns=header).set_index(header[0])
     return table


-def get_best_epochs(metrics_df: pd.DataFrame, primary_metric: str, maximise: bool = True) -> Dict[int, int]:
+def get_best_epochs(metrics_df: pd.DataFrame, primary_metric: str, max_epochs_dict: Dict[int, int],
+                    maximise: bool = True) -> Dict[int, Any]:
     """Determine the best epoch for each hyperdrive child run based on a given metric.

     The returned epoch indices are relative to the logging frequency of the chosen metric, i.e.
@@ -154,16 +166,26 @@ def get_best_epochs(metrics_df: pd.DataFrame, primary_metric: str, maximise: boo
     :param metrics_df: Metrics dataframe, as returned by :py:func:`collect_hyperdrive_metrics()` and
         :py:func:`~health_azure.aggregate_hyperdrive_metrics()`.
     :param primary_metric: Name of the reference metric to optimise.
+    :param max_epochs_dict: A dictionary of the maximum number of epochs in each cross-validation round.
     :param maximise: Whether the given metric should be maximised (minimised if `False`).
     :return: Dictionary mapping each hyperdrive child index to its best epoch.
     """
-    best_fn = np.argmax if maximise else np.argmin
-    best_epochs = metrics_df.loc[primary_metric].apply(best_fn)
-    return best_epochs.to_dict()
+    best_epochs: Dict[int, Any] = {}
+    for child_index in metrics_df.columns:
+        primary_metric_list = metrics_df[child_index][primary_metric]
+        if primary_metric_list is not None:
+            # If extra validation epoch was logged (N+1), return only the first N elements
+            primary_metric_list = primary_metric_list[:-1] \
+                if (len(primary_metric_list) == max_epochs_dict[child_index] + 1) else primary_metric_list
+            best_epochs[child_index] = int(np.argmax(primary_metric_list)
+                                           if maximise else np.argmin(primary_metric_list))
+        else:
+            best_epochs[child_index] = None
+    return best_epochs


 def get_best_epoch_metrics(metrics_df: pd.DataFrame, metrics_list: Sequence[str],
-                           best_epochs: Dict[int, int]) -> pd.DataFrame:
+                           best_epochs: Dict[int, Any]) -> pd.DataFrame:
     """Extract the values of the selected hyperdrive metrics at the given best epochs.

     The `best_epoch` indices are relative to the logging frequency of the chosen primary metric,
@@ -179,7 +201,7 @@ def get_best_epoch_metrics(metrics_df: pd.DataFrame, metrics_list: Sequence[str]
         containing only scalar values.
     """
     best_metrics = [metrics_df.loc[metrics_list, k].apply(lambda values: values[epoch])
-                    for k, epoch in best_epochs.items()]
+                    if epoch is not None else metrics_df.loc[metrics_list, k] for k, epoch in best_epochs.items()]
     best_metrics_df = pd.DataFrame(best_metrics).T
     return best_metrics_df
@@ -216,20 +238,30 @@ def get_formatted_run_info(parent_run: Run) -> str:
     return html


-def collect_class_info(metrics_df: pd.DataFrame) -> Tuple[int, List[str]]:
+def get_child_runs_hyperparams(metrics_df: pd.DataFrame) -> Dict[int, Dict]:
     """
-    Get the class names from metrics dataframe
-    :param metrics_df: Metrics dataframe, as returned by :py:func:`collect_hyperdrive_metrics()` and
+    Get the hyperparameters of each child run from the metrics dataframe.
+    :param metrics_df: Metrics dataframe, as returned by :py:func:`collect_hyperdrive_metrics()` and
         :py:func:`~health_azure.aggregate_hyperdrive_metrics()`.
-    :return: Number of classes and list of class names
+    :return: A dictionary of hyperparameter dictionaries for the child runs.
     """
-    hyperparams = metrics_df[0][AMLMetricsJsonKey.HYPERPARAMS]
-    hyperparams_name = hyperparams[AMLMetricsJsonKey.NAME]
-    hyperparams_value = hyperparams[AMLMetricsJsonKey.VALUE]
-    num_classes_index = hyperparams_name.index(AMLMetricsJsonKey.N_CLASSES)
-    num_classes = int(hyperparams_value[num_classes_index])
-    class_names_index = hyperparams_name.index(AMLMetricsJsonKey.CLASS_NAMES)
-    class_names = hyperparams_value[class_names_index]
+    hyperparams_children = {}
+    for child_index in metrics_df.columns:
+        hyperparams = metrics_df[child_index][AMLMetricsJsonKey.HYPERPARAMS]
+        hyperparams_dict = dict(zip(hyperparams[AMLMetricsJsonKey.NAME], hyperparams[AMLMetricsJsonKey.VALUE]))
+        hyperparams_children[child_index] = hyperparams_dict
+    return hyperparams_children
+
+
+def collect_class_info(hyperparams_children: Dict[int, Dict]) -> Tuple[int, List[str]]:
+    """
+    Get the class names from the hyperparameters of child runs.
+    :param hyperparams_children: Dict of hyperparameter dicts, as returned by :py:func:`get_child_runs_hyperparams()`.
+    :return: Number of classes and list of class names.
+    """
+    hyperparams_single_run = list(hyperparams_children.values())[0]
+    num_classes = int(hyperparams_single_run[AMLMetricsJsonKey.N_CLASSES])
+    class_names = hyperparams_single_run[AMLMetricsJsonKey.CLASS_NAMES]
     if class_names == "None":
         class_names = None
     else:
@@ -237,3 +269,15 @@ def collect_class_info(metrics_df: pd.DataFrame) -> Tuple[int, List[str]]:
         class_names = [name.lstrip() for name in class_names[1:-1].replace("'", "").split(',')]
     class_names = validate_class_names(class_names=class_names, n_classes=num_classes)
     return (num_classes, list(class_names))
+
+
+def get_max_epochs(hyperparams_children: Dict[int, Dict]) -> Dict[int, int]:
+    """
+    Get the maximum number of epochs for each round from the hyperparameters of child runs.
+    :param hyperparams_children: Dict of hyperparameter dicts, as returned by :py:func:`get_child_runs_hyperparams()`.
+    :return: Dictionary with the number of epochs in each hyperdrive run.
+    """
+    max_epochs_dict = {}
+    for child_index in hyperparams_children.keys():
+        max_epochs_dict[child_index] = int(hyperparams_children[child_index][AMLMetricsJsonKey.MAX_EPOCHS])
+    return max_epochs_dict
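
To see how these helpers interact on the failure cases this PR targets, a made-up example (invented values; `get_best_epochs` imported as in the report script above):

```python
import pandas as pd

from health_cpath.utils.report_utils import get_best_epochs

# Metric names as index, child run indices as columns, matching the dataframe shape above.
# Child 0 logged one extra validation epoch (4 values for max_epochs=3);
# child 1 is a failed cross-validation round, so its metrics are None.
metrics_df = pd.DataFrame({0: {'val/accuracy': [0.7, 0.8, 0.9, 0.85]},
                           1: {'val/accuracy': None}})

best = get_best_epochs(metrics_df=metrics_df, primary_metric='val/accuracy',
                       max_epochs_dict={0: 3, 1: 3}, maximise=True)
# The extra epoch (index 3) is dropped before taking argmax, and the failed
# round maps to None instead of raising an error.
assert best == {0: 2, 1: None}
```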

View file

@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from typing import Dict, List, Sequence, Union
+from typing import Any, Dict, List, Sequence, Union
 from unittest.mock import MagicMock, patch

 import numpy as np
@@ -14,7 +14,7 @@ from health_cpath.utils.output_utils import (AML_LEGACY_TEST_OUTPUTS_CSV, AML_OU
 from health_cpath.utils.report_utils import (collect_hyperdrive_metrics, collect_hyperdrive_outputs,
                                              child_runs_have_val_and_test_outputs, get_best_epoch_metrics,
                                              get_best_epochs, get_hyperdrive_metrics_table,
-                                             run_has_val_and_test_outputs)
+                                             run_has_val_and_test_outputs, download_hyperdrive_metrics_if_required)


 def test_run_has_val_and_test_outputs() -> None:
@@ -176,13 +176,24 @@ def metrics_df() -> pd.DataFrame:
             'val/auroc': [0.8, 0.9, 0.7],
             'test/accuracy': 0.9,
             'test/auroc': 0.9
-        }
+        },
+        4: {'val/accuracy': None,
+            'val/auroc': None,
+            'test/accuracy': None,
+            'test/auroc': None
+            }
     })


 @pytest.fixture
-def best_epochs(metrics_df: pd.DataFrame) -> Dict[int, int]:
-    return get_best_epochs(metrics_df, 'val/accuracy', maximise=True)
+def max_epochs_dict() -> Dict[int, int]:
+    return {0: 3, 1: 10, 3: 3, 4: 3}
+
+
+@pytest.fixture
+def best_epochs(metrics_df: pd.DataFrame, max_epochs_dict: Dict[int, int]) -> Dict[int, Any]:
+    return get_best_epochs(metrics_df=metrics_df, primary_metric='val/accuracy',
+                           max_epochs_dict=max_epochs_dict, maximise=True)


 @pytest.fixture
@@ -195,13 +206,15 @@ def best_epoch_metrics(metrics_df: pd.DataFrame, best_epochs: Dict[int, int]) ->
 def test_collect_hyperdrive_metrics(metrics_df: pd.DataFrame, tmp_path: Path, overwrite: bool) -> None:
     with patch('health_cpath.utils.report_utils.aggregate_hyperdrive_metrics',
                return_value=metrics_df) as mock_aggregate:
-        returned_df = collect_hyperdrive_metrics(parent_run_id="", download_dir=tmp_path,
-                                                 aml_workspace=None, overwrite=overwrite)
+        returned_json = download_hyperdrive_metrics_if_required(parent_run_id="", download_dir=tmp_path,
+                                                                aml_workspace=None, overwrite=overwrite)
+        returned_df = collect_hyperdrive_metrics(metrics_json=returned_json)
         mock_aggregate.assert_called_once()
         mock_aggregate.reset_mock()
-        new_returned_df = collect_hyperdrive_metrics(parent_run_id="", download_dir=tmp_path,
-                                                     aml_workspace=None, overwrite=overwrite)
+        new_returned_json = download_hyperdrive_metrics_if_required(parent_run_id="", download_dir=tmp_path,
+                                                                    aml_workspace=None, overwrite=overwrite)
+        new_returned_df = collect_hyperdrive_metrics(metrics_json=new_returned_json)
         if overwrite:
             mock_aggregate.assert_called_once()
         else:
@@ -211,12 +224,13 @@ def test_collect_hyperdrive_metrics(metrics_df: pd.DataFrame, tmp_path: Path, ov

 @pytest.mark.parametrize('maximise', [True, False])
-def test_get_best_epochs(metrics_df: pd.DataFrame, maximise: bool) -> None:
-    best_epochs = get_best_epochs(metrics_df, 'val/accuracy', maximise=maximise)
+def test_get_best_epochs(metrics_df: pd.DataFrame, max_epochs_dict: Dict[int, int], maximise: bool) -> None:
+    best_epochs = get_best_epochs(metrics_df=metrics_df, primary_metric='val/accuracy',
+                                  max_epochs_dict=max_epochs_dict, maximise=maximise)
     assert list(best_epochs.keys()) == list(metrics_df.columns)
-    assert all(isinstance(epoch, int) for epoch in best_epochs.values())
+    assert all(isinstance(epoch, (int, type(None))) for epoch in best_epochs.values())

-    expected_best = {0: 0, 1: 1, 3: 2} if maximise else {0: 1, 1: 2, 3: 0}
+    expected_best = {0: 0, 1: 1, 3: 2, 4: None} if maximise else {0: 1, 1: 2, 3: 0, 4: None}
     for split in metrics_df.columns:
         assert best_epochs[split] == expected_best[split]
@@ -244,4 +258,4 @@ def test_get_hyperdrive_metrics_table(
     original_values = df.loc[metrics_list].values
     table_values = metrics_table.iloc[:, :-1].applymap(float).values
-    assert (table_values == original_values).all()
+    assert (table_values[~pd.isnull(table_values)] == original_values[~pd.isnull(original_values)]).all()