Improve mlos-viz for multiple repeats of a config and add tests (#633)

- Mark `mlos_viz` as `typed` for `mypy`
- Bump version
- Mock calls to matplotlib/dabl for testing
- Add plotting of top-N configs
- Improve plots for handling repeat config trials via variance error bars

---------

Co-authored-by: Sergiy Matusevych <sergiym@microsoft.com>
Authored by Brian Kroth on 2024-01-28 19:46:44 -06:00; committed by GitHub
Parent: 3a367972c6
Commit: a45f97dc01
No key found matching this signature
GPG key ID: B5690EEEBB952194
24 changed files: 739 additions and 73 deletions
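For reference, here is a minimal usage sketch (not part of this commit's diff) of the updated `mlos_viz.plot()` API, assuming an `ExperimentData` instance has already been loaded from the mlos_bench storage layer:

```python
from mlos_bench.storage.base_experiment_data import ExperimentData
from mlos_viz import MlosVizMethod, plot


def visualize(exp_data: ExperimentData) -> None:
    # Plot optimizer trends, the top-N configs, and the dabl summary plots,
    # suppressing noisy warnings from the third-party plotting libraries.
    plot(
        exp_data,
        plotter_method=MlosVizMethod.AUTO,
        filter_warnings=True,
        # Extra kwargs are forwarded to the underlying plotters
        # (e.g., limit_top_n_configs()):
        top_n_configs=5,
    )
```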


@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.4.0
current_version = 0.4.1
commit = True
tag = True


@ -38,6 +38,7 @@
"jupyterlab",
"keepalive",
"kwargs",
"kword",
"libmamba",
"linalg",
"llamatune",
@ -57,6 +58,7 @@
"pylint",
"pyplot",
"pytest",
"quantile",
"Quickstart",
"refcnt",
"rexec",
@ -82,6 +84,8 @@
"workerinput",
"xdist",
"xlabel",
"xlabels",
"xticks",
"ylabel"
]
// vim: set ft=jsonc:

.github/workflows/build-dist-test.ps1 (vendored)

@ -114,6 +114,8 @@ if ($LASTEXITCODE -ne 0) {
}
# Run a simple mlos_viz test.
# To do that, we need the fixtures from mlos_bench, so make those available too.
$env:PYTHONPATH = "mlos_bench"
conda run -n mlos-dist-test python -m pytest mlos_viz/mlos_viz/tests/test_dabl_plot.py
if ($LASTEXITCODE -ne 0) {
Write-Error "Failed to run mlos_viz tests."


@ -335,7 +335,8 @@ build/dist-test.$(PYTHON_VERSION).build-stamp: $(PYTHON_FILES) build/dist-test-e
# Run a simple test that uses the mlos_bench wheel (full tests can be checked with `make test`).
conda run -n mlos-dist-test-$(PYTHON_VERSION) python3 -m pytest mlos_bench/mlos_bench/tests/environments/mock_env_test.py
# Run a simple test that uses the mlos_viz wheel (full tests can be checked with `make test`).
conda run -n mlos-dist-test-$(PYTHON_VERSION) python3 -m pytest mlos_viz/mlos_viz/tests/test_dabl_plot.py
# To do that, we need the fixtures from mlos_bench, so make those available too.
PYTHONPATH=mlos_bench conda run -n mlos-dist-test-$(PYTHON_VERSION) python3 -m pytest mlos_viz/mlos_viz/tests/test_dabl_plot.py
touch $@
dist-test-clean: dist-test-env-clean


@ -36,7 +36,7 @@ copyright = '2024, GSL'
author = 'GSL'
# The full version, including alpha/beta/rc tags
release = '0.4.0'
release = '0.4.1'
try:
from setuptools_scm import get_version


@ -7,4 +7,4 @@ Version number for the mlos_bench package.
"""
# NOTE: This should be managed by bumpversion.
_VERSION = '0.4.0'
_VERSION = '0.4.1'


@ -8,7 +8,7 @@ Base interface for accessing the stored benchmark experiment data.
from abc import ABCMeta, abstractmethod
from distutils.util import strtobool # pylint: disable=deprecated-module
from typing import Dict, Optional, Tuple, TYPE_CHECKING
from typing import Dict, Literal, Optional, Tuple, TYPE_CHECKING
import pandas
@ -73,7 +73,7 @@ class ExperimentData(metaclass=ABCMeta):
@property
@abstractmethod
def objectives(self) -> Dict[str, str]:
def objectives(self) -> Dict[str, Literal["min", "max"]]:
"""
Retrieve the experiment's objectives data from the storage.


@ -5,7 +5,7 @@
"""
An interface to access the experiment benchmark data stored in SQL DB.
"""
from typing import Dict, Optional
from typing import Dict, Literal, Optional
import logging
@ -51,8 +51,8 @@ class ExperimentSqlData(ExperimentData):
self._schema = schema
@property
def objectives(self) -> Dict[str, str]:
objectives: Dict[str, str] = {}
def objectives(self) -> Dict[str, Literal["min", "max"]]:
objectives: Dict[str, Literal["min", "max"]] = {}
# First try to lookup the objectives from the experiment metadata in the storage layer.
if hasattr(self._schema, "objectives"):
with self._engine.connect() as conn:
@ -60,6 +60,7 @@ class ExperimentSqlData(ExperimentData):
self._schema.objectives.select().where(
self._schema.objectives.c.exp_id == self._experiment_id,
).order_by(
# TODO: return weight as well
self._schema.objectives.c.weight.desc(),
self._schema.objectives.c.optimization_target.asc(),
)
@ -98,6 +99,8 @@ class ExperimentSqlData(ExperimentData):
elif opt_direction != objectives[opt_target]:
_LOG.warning("Experiment %s has multiple trial optimization directions for optimization_target %s=%s",
self, opt_target, objectives[opt_target])
for opt_tgt, opt_dir in objectives.items():
assert opt_dir in {None, "min", "max"}, f"Unexpected opt_dir {opt_dir} for opt_tgt {opt_tgt}."
return objectives
# TODO: provide a way to get individual data to avoid repeated bulk fetches where only small amounts of data are accessed.


@ -52,7 +52,8 @@ class TunableConfigTrialGroupSqlData(TunableConfigTrialGroupData):
with self._engine.connect() as conn:
tunable_config_trial_group = conn.execute(
self._schema.trial.select().with_only_columns(
func.min(self._schema.trial.c.trial_id).cast(Integer).label('tunable_config_trial_group_id'),
func.min(self._schema.trial.c.trial_id).cast(Integer).label( # pylint: disable=not-callable
'tunable_config_trial_group_id'),
).where(
self._schema.trial.c.exp_id == self._experiment_id,
self._schema.trial.c.config_id == self._tunable_config_id,


@ -3,5 +3,5 @@
# Licensed under the MIT License.
#
"""
Test for mlos_bench sql storage.
Tests for mlos_bench sql storage.
"""


@ -63,10 +63,14 @@ def exp_storage_with_trials(exp_storage: SqlStorage.Experiment) -> SqlStorage.Ex
"""
# Add some trials to that experiment.
# Note: we're just fabricating some made up function for the ML libraries to try and learn.
base_score = 5.0
base_score = 10.0
tunable_name = "kernel_sched_latency_ns"
tunable_default = exp_storage.tunables.get_tunable(tunable_name)[0].default
tunable = exp_storage.tunables.get_tunable(tunable_name)[0]
tunable_default = tunable.default
assert isinstance(tunable_default, int)
tunable_min = tunable.range[0]
tunable_max = tunable.range[1]
tunable_range = tunable_max - tunable_min
seed = 42
rand_seed(seed)
opt = MockOptimizer(tunables=exp_storage.tunables, config={
@ -85,14 +89,15 @@ def exp_storage_with_trials(exp_storage: SqlStorage.Experiment) -> SqlStorage.Ex
"trial_number": config_i * CONFIG_TRIAL_REPEAT_COUNT + repeat_j + 1,
})
assert trial.tunable_config_id == config_i + 1
trial.update_telemetry(status=Status.RUNNING, metrics=[
(datetime.utcnow(), "some-metric", base_score + random() / 10),
])
tunable_value = float(tunables.get_tunable(tunable_name)[0].numerical_value)
tunable_value_norm = base_score * (tunable_value - tunable_min) / tunable_range
trial.update_telemetry(status=Status.RUNNING, metrics=[
(datetime.utcnow(), "some-metric", tunable_value_norm + random() / 100),
])
trial.update(Status.SUCCEEDED, datetime.utcnow(), metrics={
# Give some variance on the score.
# And some influence from the tunable value.
"score": base_score + 10 * ((tunable_value / tunable_default) - 1) + random() / 10,
"score": tunable_value_norm + random() / 100
})
return exp_storage


@ -26,7 +26,7 @@ def test_exp_trial_data(exp_data: ExperimentData) -> None:
assert trial.status == Status.SUCCEEDED
assert trial.metadata_dict["trial_number"] == trial_id
assert list(trial.results_dict.keys()) == ["score"]
assert trial.results_dict["score"] == pytest.approx(5.0, rel=0.1)
assert trial.results_dict["score"] == pytest.approx(0.0, abs=0.1)
assert isinstance(trial.ts_start, datetime)
assert isinstance(trial.ts_end, datetime)
# Note: tests for telemetry are in test_update_telemetry()


@ -7,4 +7,4 @@ Version number for the mlos_core package.
"""
# NOTE: This should be managed by bumpversion.
_VERSION = '0.4.0'
_VERSION = '0.4.1'


@ -8,13 +8,13 @@ from the mlos_bench framework for benchmarking and optimization automation.
"""
from enum import Enum
from typing import Any, Dict, Literal, Optional
import warnings
from matplotlib import pyplot as plt
import seaborn as sns
import pandas
from mlos_bench.storage.base_experiment_data import ExperimentData
from mlos_viz import base
from mlos_viz.util import expand_results_data_args
class MlosVizMethod(Enum):
@ -22,41 +22,8 @@ class MlosVizMethod(Enum):
What method to use for visualizing the experiment results.
"""
AUTO = "dabl" # use dabl as the current default
DABL = "dabl"
def _plot_optimizer_trends(exp_data: ExperimentData) -> None:
"""
Plots the optimizer trends for the Experiment.
Intended to be used from a Jupyter notebook.
Parameters
----------
exp_data: ExperimentData
The experiment data to plot.
"""
for objective in exp_data.objectives:
objective_column = ExperimentData.RESULT_COLUMN_PREFIX + objective
results_df = exp_data.results_df
plt.rcParams["figure.figsize"] = (10, 4)
sns.scatterplot(
x=results_df.trial_id, y=results_df[objective_column],
alpha=0.7, label="Trial") # Result of each trial
sns.lineplot(
x=results_df.trial_id, y=results_df[objective_column].cummin(),
label="Incumbent") # the best result so far (cummin)
plt.yscale('log')
plt.xlabel("Trial number")
plt.ylabel(objective)
plt.title("Optimizer Trends for Experiment: " + exp_data.experiment_id)
plt.grid()
plt.show() # type: ignore[no-untyped-call]
AUTO = DABL # use dabl as the current default
def ignore_plotter_warnings(plotter_method: MlosVizMethod = MlosVizMethod.AUTO) -> None:
@ -69,8 +36,7 @@ def ignore_plotter_warnings(plotter_method: MlosVizMethod = MlosVizMethod.AUTO)
plotter_method: MlosVizMethod
The method to use for visualizing the experiment results.
"""
warnings.filterwarnings("ignore", category=FutureWarning)
base.ignore_plotter_warnings()
if plotter_method == MlosVizMethod.DABL:
import mlos_viz.dabl # pylint: disable=import-outside-toplevel
mlos_viz.dabl.ignore_plotter_warnings()
@ -78,9 +44,12 @@ def ignore_plotter_warnings(plotter_method: MlosVizMethod = MlosVizMethod.AUTO)
raise NotImplementedError(f"Unhandled method: {plotter_method}")
def plot(exp_data: ExperimentData,
def plot(exp_data: Optional[ExperimentData] = None, *,
results_df: Optional[pandas.DataFrame] = None,
objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
plotter_method: MlosVizMethod = MlosVizMethod.AUTO,
filter_warnings: bool = True) -> None:
filter_warnings: bool = True,
**kwargs: Any) -> None:
"""
Plots the results of the experiment.
@ -90,18 +59,28 @@ def plot(exp_data: ExperimentData,
----------
exp_data: ExperimentData
The experiment data to plot.
results_df : Optional["pandas.DataFrame"]
Optional results_df to plot.
If not provided, defaults to exp_data.results_df property.
objectives : Optional[Dict[str, Literal["min", "max"]]]
Optional objectives to plot.
If not provided, defaults to exp_data.objectives property.
plotter_method: MlosVizMethod
The method to use for visualizing the experiment results.
filter_warnings: bool
Whether or not to filter some warnings from the plotter.
kwargs : dict
Remaining keyword arguments are passed along to the underlying plotter(s).
"""
_plot_optimizer_trends(exp_data)
if filter_warnings:
ignore_plotter_warnings(plotter_method)
(results_df, _obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
base.plot_optimizer_trends(exp_data, results_df=results_df, objectives=objectives)
base.plot_top_n_configs(exp_data, results_df=results_df, objectives=objectives, **kwargs)
if plotter_method == MlosVizMethod.DABL:
import mlos_viz.dabl # pylint: disable=import-outside-toplevel
mlos_viz.dabl.plot(exp_data)
mlos_viz.dabl.plot(exp_data, results_df=results_df, objectives=objectives)
else:
raise NotImplementedError(f"Unhandled method: {plotter_method}")

mlos_viz/mlos_viz/base.py (new file)

@ -0,0 +1,439 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""
Base functions for visualizing, explaining, and gaining insights from results.
"""
from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Tuple, Union
import re
import warnings
from importlib.metadata import version
from matplotlib import pyplot as plt
import pandas
from pandas.api.types import is_numeric_dtype
from pandas.core.groupby.generic import SeriesGroupBy
import seaborn as sns
from mlos_bench.storage.base_experiment_data import ExperimentData
from mlos_viz.util import expand_results_data_args
_SEABORN_VERS = version('seaborn')
def _get_kwarg_defaults(target: Callable, **kwargs: Any) -> Dict[str, Any]:
"""
Assembles a smaller kwargs dict for the specified target function.
Note: this only works with non-positional kwargs (e.g., those after a * arg).
"""
target_kwargs = {}
for kword in target.__kwdefaults__: # or {} # intentionally omitted for now
if kword in kwargs:
target_kwargs[kword] = kwargs[kword]
return target_kwargs
def ignore_plotter_warnings() -> None:
"""
Suppress some annoying warnings from third-party data visualization packages by
adding them to the warnings filter.
"""
warnings.filterwarnings("ignore", category=FutureWarning)
if _SEABORN_VERS <= '0.13.1':
warnings.filterwarnings("ignore", category=DeprecationWarning, module="seaborn", # but actually comes from pandas
message="is_categorical_dtype is deprecated and will be removed in a future version.")
def _add_groupby_desc_column(results_df: pandas.DataFrame,
groupby_columns: Optional[List[str]] = None,
) -> Tuple[pandas.DataFrame, List[str], str]:
"""
Adds a group descriptor column to the results_df.
Parameters
----------
results_df: pandas.DataFrame
The results dataframe to add the descriptor column to.
groupby_columns: Optional[List[str]]
The columns to group by; defaults to ["tunable_config_trial_group_id", "tunable_config_id"].
"""
# Compose a new groupby_column for display purposes that is the
# concatenation of the min trial_id (the first one) of each config trial
# group and the config_id.
# Note: it needs to be a string (e.g., categorical) for boxplot and lineplot to be on the same axis anyways.
if groupby_columns is None:
groupby_columns = ["tunable_config_trial_group_id", "tunable_config_id"]
groupby_column = ",".join(groupby_columns)
results_df[groupby_column] = results_df[groupby_columns].astype(str).apply(
lambda x: ",".join(x), axis=1) # pylint: disable=unnecessary-lambda
groupby_columns.append(groupby_column)
return (results_df, groupby_columns, groupby_column)
def augment_results_df_with_config_trial_group_stats(exp_data: Optional[ExperimentData] = None,
*,
results_df: Optional[pandas.DataFrame] = None,
requested_result_cols: Optional[Iterable[str]] = None,
) -> pandas.DataFrame:
# pylint: disable=too-complex
"""
Add a number of useful statistical measure columns to the results dataframe.
In particular, for each numeric result, we add the following columns for each
requested result column:
- ".p50": the median of each config trial group results
- ".p75": the p75 of each config trial group results
- ".p90": the p90 of each config trial group results
- ".p95": the p95 of each config trial group results
- ".p99": the p95 of each config trial group results
- ".mean": the mean of each config trial group results
- ".stddev": the mean of each config trial group results
- ".var": the variance of each config trial group results
- ".var_zscore": the zscore of this group (i.e., variance relative to the stddev
of all group variances). This can be useful for filtering out outliers (e.g.,
configs with high variance relative to others by restricting to abs < 2 to
remove those two standard deviations from the mean variance across all config
trial groups).
Additionally, we add a "tunable_config_trial_group_size" column that indicates
the number of trials using a particular config.
Parameters
----------
exp_data : ExperimentData
The ExperimentData (e.g., obtained from the storage layer) to plot.
results_df : Optional[pandas.DataFrame]
The results dataframe to augment, by default None to use the results_df property.
requested_result_cols : Optional[Iterable[str]]
Which results columns to augment, by default None to use all results columns
that look numeric.
Returns
-------
pandas.DataFrame
The augmented results dataframe.
"""
if results_df is None:
if exp_data is None:
raise ValueError("Either exp_data or results_df must be provided.")
results_df = exp_data.results_df
results_groups = results_df.groupby("tunable_config_id")
if len(results_groups) <= 1:
raise ValueError(f"Not enough data: {len(results_groups)}")
if requested_result_cols is None:
result_cols = set(col for col in results_df.columns if col.startswith(ExperimentData.RESULT_COLUMN_PREFIX))
else:
result_cols = set(col for col in requested_result_cols
if col.startswith(ExperimentData.RESULT_COLUMN_PREFIX) and col in results_df.columns)
result_cols.update(set(ExperimentData.RESULT_COLUMN_PREFIX + col for col in requested_result_cols
if ExperimentData.RESULT_COLUMN_PREFIX in results_df.columns))
def compute_zscore_for_group_agg(
results_groups_perf: "SeriesGroupBy",
stats_df: pandas.DataFrame,
result_col: str,
agg: Union[Literal["mean"], Literal["var"], Literal["std"]]
) -> None:
results_groups_perf_aggs = results_groups_perf.agg(agg) # TODO: avoid recalculating?
# Compute the zscore of the chosen aggregate performance of each group into each row in the dataframe.
stats_df[result_col + f".{agg}_mean"] = results_groups_perf_aggs.mean()
stats_df[result_col + f".{agg}_stddev"] = results_groups_perf_aggs.std()
stats_df[result_col + f".{agg}_zscore"] = \
(stats_df[result_col + f".{agg}"] - stats_df[result_col + f".{agg}_mean"]) \
/ stats_df[result_col + f".{agg}_stddev"]
stats_df.drop(columns=[result_col + ".var_" + agg for agg in ("mean", "stddev")], inplace=True)
augmented_results_df = results_df
augmented_results_df["tunable_config_trial_group_size"] = results_groups["trial_id"].transform("count")
for result_col in result_cols:
if not result_col.startswith(ExperimentData.RESULT_COLUMN_PREFIX):
continue
if re.search(r"(start|end).*time", result_col, flags=re.IGNORECASE):
# Ignore computing variance on things that look like timestamps.
continue
if not is_numeric_dtype(results_df[result_col]):
continue
if results_df[result_col].unique().size == 1:
continue
results_groups_perf = results_groups[result_col]
stats_df = pandas.DataFrame()
stats_df[result_col + ".mean"] = results_groups_perf.transform("mean", numeric_only=True)
stats_df[result_col + ".var"] = results_groups_perf.transform("var")
stats_df[result_col + ".stddev"] = stats_df[result_col + ".var"].apply(lambda x: x**0.5)
compute_zscore_for_group_agg(results_groups_perf, stats_df, result_col, "var")
quantiles = [0.50, 0.75, 0.90, 0.95, 0.99]
for quantile in quantiles: # TODO: can we do this in one pass?
quantile_col = result_col + f".p{int(quantile*100)}"
stats_df[quantile_col] = results_groups_perf.transform("quantile", quantile)
augmented_results_df = pandas.concat([augmented_results_df, stats_df], axis=1)
return augmented_results_df
def limit_top_n_configs(exp_data: Optional[ExperimentData] = None,
*,
results_df: Optional[pandas.DataFrame] = None,
objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
top_n_configs: int = 10,
method: Literal["mean", "p50", "p75", "p90", "p95", "p99"] = "mean",
) -> Tuple[pandas.DataFrame, List[int], Dict[str, bool]]:
# pylint: disable=too-many-locals
"""
Utility function to process the results and determine the best performing
configs including potential repeats to help assess variability.
Parameters
----------
exp_data : Optional[ExperimentData]
The ExperimentData (e.g., obtained from the storage layer) to operate on.
results_df : Optional[pandas.DataFrame]
The results dataframe to augment, by default None to use the results_df property.
objectives : Optional[Dict[str, Literal["min", "max"]]]
Which result column(s) to use for sorting the configs, and in which direction ("min" or "max").
By default None to automatically select the experiment objectives.
top_n_configs : int, optional
How many configs to return, including the default, by default 10.
method : Literal["mean", "p50", "p75", "p90", "p95", "p99"], optional
Which statistical method to use when sorting the config groups before determining the cutoff, by default "mean".
Returns
-------
(top_n_config_results_df, top_n_config_ids, orderby_cols) : Tuple[pandas.DataFrame, List[int], Dict[str, bool]]
The filtered results dataframe, the config ids, and the columns used to order the configs.
"""
# Do some input checking first.
if method not in ["mean", "median", "p50", "p75", "p90", "p95", "p99"]:
raise ValueError(f"Invalid method: {method}")
# Prepare the orderby columns.
(results_df, objs_cols) = expand_results_data_args(exp_data, results_df=results_df, objectives=objectives)
assert isinstance(results_df, pandas.DataFrame)
# Augment the results dataframe with some useful stats.
results_df = augment_results_df_with_config_trial_group_stats(
exp_data=exp_data,
results_df=results_df,
requested_result_cols=objs_cols.keys(),
)
# Note: mypy seems to lose its mind for some reason and keeps forgetting that
# results_df is not None and is in fact a DataFrame, so we periodically assert
# it in this func for now.
assert results_df is not None
orderby_cols: Dict[str, bool] = {obj_col + f".{method}": ascending for (obj_col, ascending) in objs_cols.items()}
config_id_col = "tunable_config_id"
group_id_col = "tunable_config_trial_group_id" # first trial_id per config group
trial_id_col = "trial_id"
default_config_id = results_df[trial_id_col].min() if exp_data is None else exp_data.default_tunable_config_id
assert default_config_id is not None, "Failed to determine default config id."
# Filter out configs whose variance is too large.
# But also make sure the default config is still in the resulting dataframe
# (for comparison purposes).
for obj_col in objs_cols:
assert results_df is not None
if method == "mean":
singletons_mask = results_df["tunable_config_trial_group_size"] == 1
else:
singletons_mask = results_df["tunable_config_trial_group_size"] > 1
results_df = results_df.loc[(
(results_df[f"{obj_col}.var_zscore"].abs() < 2)
| (singletons_mask)
| (results_df[config_id_col] == default_config_id)
)]
assert results_df is not None
# Also, filter results that are worse than the default.
default_config_results_df = results_df.loc[results_df[config_id_col] == default_config_id]
for (orderby_col, ascending) in orderby_cols.items():
default_vals = default_config_results_df[orderby_col].unique()
assert len(default_vals) == 1
default_val = default_vals[0]
assert results_df is not None
if ascending:
results_df = results_df.loc[(results_df[orderby_col] <= default_val)]
else:
results_df = results_df.loc[(results_df[orderby_col] >= default_val)]
# Now regroup and filter to the top-N configs by their group performance dimensions.
assert results_df is not None
group_results_df: pandas.DataFrame = results_df.groupby(config_id_col).first()[orderby_cols.keys()]
top_n_config_ids: List[int] = group_results_df.sort_values(
by=list(orderby_cols.keys()), ascending=list(orderby_cols.values())).head(top_n_configs).index.tolist()
# Remove the default config if it's included. We'll add it back later.
if default_config_id in top_n_config_ids:
top_n_config_ids.remove(default_config_id)
# Get just the top-n config results.
# Sort by the group ids.
top_n_config_results_df = results_df.loc[(
results_df[config_id_col].isin(top_n_config_ids)
)].sort_values([group_id_col, config_id_col, trial_id_col])
# Place the default config at the top of the list.
top_n_config_ids.insert(0, default_config_id)
top_n_config_results_df = pandas.concat([default_config_results_df, top_n_config_results_df], axis=0)
return (top_n_config_results_df, top_n_config_ids, orderby_cols)
def plot_optimizer_trends(
exp_data: Optional[ExperimentData] = None,
*,
results_df: Optional[pandas.DataFrame] = None,
objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
) -> None:
"""
Plots the optimizer trends for the Experiment.
Parameters
----------
exp_data : ExperimentData
The ExperimentData (e.g., obtained from the storage layer) to plot.
results_df : Optional["pandas.DataFrame"]
Optional results_df to plot.
If not provided, defaults to exp_data.results_df property.
objectives : Optional[Dict[str, Literal["min", "max"]]]
Optional objectives to plot.
If not provided, defaults to exp_data.objectives property.
"""
(results_df, obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
(results_df, groupby_columns, groupby_column) = _add_groupby_desc_column(results_df)
for (objective_column, ascending) in obj_cols.items():
incumbent_column = objective_column + ".incumbent"
# Determine the mean of each config trial group to match the box plots.
group_results_df = results_df.groupby(groupby_columns)[objective_column].mean()\
.reset_index().sort_values(groupby_columns)
#
# Note: technically the optimizer (usually) uses the *first* result for a
# given config trial group before moving on to a new config (x-axis), so
# plotting the mean may be slightly misleading when trying to understand the
# actual path taken by the optimizer in case of high variance samples.
# Here's a way to do that, though it can also be misleading if the optimizer
# later gets a worse value for that config group as well.
#
# group_results_df = results_df.sort_values(groupby_columns + ["trial_id"]).groupby(
# groupby_columns).head(1)[groupby_columns + [objective_column]].reset_index()
# Calculate the incumbent (best seen so far)
if ascending:
group_results_df[incumbent_column] = group_results_df[objective_column].cummin()
else:
group_results_df[incumbent_column] = group_results_df[objective_column].cummax()
(_fig, axis) = plt.subplots(figsize=(15, 5))
# Result of each set of trials for a config
sns.boxplot(
data=results_df,
x=groupby_column,
y=objective_column,
ax=axis,
)
# Results of the best so far.
axis = sns.lineplot(
data=group_results_df,
x=groupby_column,
y=incumbent_column,
alpha=0.7,
label="Mean of Incumbent Config Trial Group",
ax=axis,
)
plt.yscale('log')
plt.ylabel(objective_column.replace(ExperimentData.RESULT_COLUMN_PREFIX, ""))
plt.xlabel("Config Trial Group ID, Config ID")
plt.xticks(rotation=90, fontsize=8)
plt.title("Optimizer Trends for Experiment: " + exp_data.experiment_id if exp_data is not None else "")
plt.grid()
plt.show() # type: ignore[no-untyped-call]
def plot_top_n_configs(exp_data: Optional[ExperimentData] = None,
*,
results_df: Optional[pandas.DataFrame] = None,
objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
with_scatter_plot: bool = False,
**kwargs: Any,
) -> None:
# pylint: disable=too-many-locals
"""
Plots the top-N configs along with the default config for the given ExperimentData.
Intended to be used from a Jupyter notebook.
Parameters
----------
exp_data: ExperimentData
The experiment data to plot.
results_df : Optional["pandas.DataFrame"]
Optional results_df to plot.
If not provided, defaults to exp_data.results_df property.
objectives : Optional[Dict[str, Literal["min", "max"]]]
Optional objectives to plot.
If not provided, defaults to exp_data.objectives property.
with_scatter_plot : bool
Whether to also add scatter plot to the output figure.
kwargs : dict
Remaining keyword arguments are passed along to the limit_top_n_configs function.
"""
(results_df, _obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
top_n_config_args = _get_kwarg_defaults(limit_top_n_configs, **kwargs)
if "results_df" not in top_n_config_args:
top_n_config_args["results_df"] = results_df
if "objectives" not in top_n_config_args:
top_n_config_args["objectives"] = objectives
(top_n_config_results_df, _top_n_config_ids, orderby_cols) = limit_top_n_configs(exp_data=exp_data, **top_n_config_args)
(top_n_config_results_df, _groupby_columns, groupby_column) = _add_groupby_desc_column(top_n_config_results_df)
top_n = len(top_n_config_results_df[groupby_column].unique()) - 1
for (orderby_col, ascending) in orderby_cols.items():
opt_tgt = orderby_col.replace(ExperimentData.RESULT_COLUMN_PREFIX, "")
(_fig, axis) = plt.subplots()
sns.violinplot(
data=top_n_config_results_df,
x=groupby_column,
y=orderby_col,
ax=axis,
)
if with_scatter_plot:
sns.scatterplot(
data=top_n_config_results_df,
x=groupby_column,
y=orderby_col,
legend=None,
ax=axis,
)
plt.grid()
(xticks, xlabels) = plt.xticks()
# default should be in the first position based on top_n_configs() return
xlabels[0] = "default" # type: ignore[call-overload]
plt.xticks(xticks, xlabels) # type: ignore[arg-type]
plt.xlabel("Config Trial Group, Config ID")
plt.xticks(rotation=90)
plt.ylabel(opt_tgt)
plt.yscale('log')
extra_title = "(lower is better)" if ascending else "(higher is better)"
plt.title(f"Top {top_n} configs {opt_tgt} {extra_title}")
plt.show() # type: ignore[no-untyped-call]
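As a rough usage sketch (not part of the diff), the helpers above can be combined as follows, again assuming an `ExperimentData` instance obtained from the storage layer:

```python
from mlos_bench.storage.base_experiment_data import ExperimentData
from mlos_viz.base import (
    augment_results_df_with_config_trial_group_stats,
    limit_top_n_configs,
    plot_top_n_configs,
)


def summarize_top_configs(exp_data: ExperimentData) -> None:
    # Per config trial group stats (mean, stddev, quantiles, var_zscore, ...)
    # are appended as extra columns alongside the original results.
    stats_df = augment_results_df_with_config_trial_group_stats(exp_data)
    print(stats_df.filter(regex=r"\.p(50|95)$").head())

    # Keep the 5 best config trial groups by mean score (the default config is
    # always retained for comparison), then plot them with per-group variance.
    (top_df, top_config_ids, orderby_cols) = limit_top_n_configs(
        exp_data, top_n_configs=5, method="mean")
    print(top_config_ids, orderby_cols)

    plot_top_n_configs(exp_data, top_n_configs=5, with_scatter_plot=True)
```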


@ -5,14 +5,22 @@
"""
Small wrapper functions for dabl plotting functions via mlos_bench data.
"""
from typing import Dict, Optional, Literal
import warnings
import dabl
import pandas
from mlos_bench.storage.base_experiment_data import ExperimentData
from mlos_viz.util import expand_results_data_args
def plot(exp_data: ExperimentData) -> None:
def plot(exp_data: Optional[ExperimentData] = None, *,
results_df: Optional[pandas.DataFrame] = None,
objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
) -> None:
"""
Plots the Experiment results data using dabl.
@ -20,20 +28,35 @@ def plot(exp_data: ExperimentData) -> None:
----------
exp_data : ExperimentData
The ExperimentData (e.g., obtained from the storage layer) to plot.
results_df : Optional["pandas.DataFrame"]
Optional results_df to plot.
If not provided, defaults to exp_data.results_df property.
objectives : Optional[Dict[str, Literal["min", "max"]]]
Optional objectives to plot.
If not provided, defaults to exp_data.objectives property.
"""
for objective in exp_data.objectives:
objective_column = ExperimentData.RESULT_COLUMN_PREFIX + objective
dabl.plot(exp_data.results_df, objective_column)
(results_df, obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
for obj_col in obj_cols:
dabl.plot(X=results_df, target_col=obj_col)
def ignore_plotter_warnings() -> None:
"""
Add some filters to ignore warnings from the plotter.
"""
# pylint: disable=import-outside-toplevel
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", module="dabl", category=UserWarning, message="Could not infer format")
warnings.filterwarnings("ignore", module="dabl", category=UserWarning, message="(Dropped|Discarding) .* outliers")
warnings.filterwarnings("ignore", module="dabl", category=UserWarning, message="Not plotting highly correlated")
warnings.filterwarnings("ignore", module="dabl", category=UserWarning,
message="Missing values in target_col have been removed for regression")
from sklearn.exceptions import UndefinedMetricWarning # pylint: disable=import-outside-toplevel
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", module="sklearn", category=UndefinedMetricWarning, message="Recall is ill-defined")
warnings.filterwarnings("ignore", category=DeprecationWarning,
message="is_categorical_dtype is deprecated and will be removed in a future version.")
warnings.filterwarnings("ignore", category=DeprecationWarning, module="sklearn",
message="is_sparse is deprecated and will be removed in a future version.")
from matplotlib._api.deprecation import MatplotlibDeprecationWarning
warnings.filterwarnings("ignore", category=MatplotlibDeprecationWarning, module="dabl",
message="The legendHandles attribute was deprecated in Matplotlib 3.7 and will be removed")


@ -0,0 +1,9 @@
# `mlos-viz` tests
For now we only check plotting by running the core APIs with `DISPLAY` disabled and with basic mocking of the underlying libraries.
In the future we may want to add more fully fledged testing and checking infrastructure for the graphs produced.
## See Also
- [How can I write unit tests against code that uses matplotlib?](https://stackoverflow.com/questions/27948126/how-can-i-write-unit-tests-against-code-that-uses-matplotlib)
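As a generic illustration of the display-free approach (a sketch, not code from this repo), matplotlib can be switched to a non-interactive backend before `pyplot` is imported so that rendering never needs a `DISPLAY`:

```python
import matplotlib

# Select a non-interactive backend so plots render to an in-memory canvas.
# This must happen before importing pyplot.
matplotlib.use("Agg")

import matplotlib.pyplot as plt  # noqa: E402

plt.plot([1, 2, 3], [1, 4, 9])
plt.savefig("/tmp/smoke-test.png")  # hypothetical output path; nothing is shown on screen
```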


@ -0,0 +1,19 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""
Unit tests for mlos_viz.
"""
import sys
import seaborn # pylint: disable=unused-import # (used by patch) # noqa: unused
BASE_MATPLOTLIB_SHOW_PATCH = "mlos_viz.base.plt.show"
if sys.version_info >= (3, 11):
SEABORN_BOXPLOT_PATCH = "dabl.plot.supervised.sns.boxplot"
else:
SEABORN_BOXPLOT_PATCH = "seaborn.boxplot"


@ -0,0 +1,20 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""
Export test fixtures for mlos_viz.
"""
from mlos_bench.tests import tunable_groups_fixtures
from mlos_bench.tests.storage.sql import fixtures as sql_storage_fixtures
# Expose some of those as local names so they can be picked up as fixtures by pytest.
storage = sql_storage_fixtures.storage
exp_storage = sql_storage_fixtures.exp_storage
exp_storage_with_trials = sql_storage_fixtures.exp_storage_with_trials
exp_data = sql_storage_fixtures.exp_data
tunable_groups_config = tunable_groups_fixtures.tunable_groups_config
tunable_groups = tunable_groups_fixtures.tunable_groups


@ -0,0 +1,41 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""
Unit tests for mlos_viz.
"""
import warnings
from unittest.mock import patch, Mock
from mlos_bench.storage.base_experiment_data import ExperimentData
from mlos_viz.base import ignore_plotter_warnings, plot_optimizer_trends, plot_top_n_configs
from mlos_viz.tests import BASE_MATPLOTLIB_SHOW_PATCH
@patch(BASE_MATPLOTLIB_SHOW_PATCH)
def test_plot_optimizer_trends(mock_show: Mock, exp_data: ExperimentData) -> None:
"""Tests plotting optimizer trends."""
# For now, just ensure that no errors are thrown.
# TODO: Check that a plot was actually produced matching our specifications.
with warnings.catch_warnings():
warnings.simplefilter("error")
ignore_plotter_warnings()
plot_optimizer_trends(exp_data)
assert mock_show.call_count == 1
@patch(BASE_MATPLOTLIB_SHOW_PATCH)
def test_plot_top_n_configs(mock_show: Mock, exp_data: ExperimentData) -> None:
"""Tests plotting top N configs."""
# For now, just ensure that no errors are thrown.
# TODO: Check that a plot was actually produced matching our specifications.
with warnings.catch_warnings():
warnings.simplefilter("error")
ignore_plotter_warnings()
plot_top_n_configs(exp_data)
assert mock_show.call_count == 1


@ -6,8 +6,24 @@
Unit tests for mlos_viz.dabl.plot.
"""
import warnings
def test_placeholder() -> None:
"""Placeholder test."""
# TODO: Remove this and implement real tests for mlos_viz.plot()
# See Also: https://stackoverflow.com/questions/27948126/how-can-i-write-unit-tests-against-code-that-uses-matplotlib
from unittest.mock import patch, Mock
from mlos_bench.storage.base_experiment_data import ExperimentData
from mlos_viz import dabl
from mlos_viz.tests import SEABORN_BOXPLOT_PATCH
@patch(SEABORN_BOXPLOT_PATCH, create=True)
def test_dabl_plot(mock_boxplot: Mock, exp_data: ExperimentData) -> None:
"""Tests plotting via dabl."""
# For now, just ensure that no errors are thrown.
# TODO: Check that a plot was actually produced matching our specifications.
with warnings.catch_warnings():
warnings.simplefilter("error")
dabl.ignore_plotter_warnings()
dabl.plot(exp_data)
assert mock_boxplot.call_count >= 1


@ -0,0 +1,37 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""
Unit tests for mlos_viz.
"""
import random
import warnings
from unittest.mock import patch, Mock
from mlos_bench.storage.base_experiment_data import ExperimentData
from mlos_viz import MlosVizMethod, plot
from mlos_viz.tests import BASE_MATPLOTLIB_SHOW_PATCH, SEABORN_BOXPLOT_PATCH
def test_auto_method_type() -> None:
"""Ensure the AUTO method is what we expect."""
assert MlosVizMethod.AUTO.value == MlosVizMethod.DABL.value
@patch(BASE_MATPLOTLIB_SHOW_PATCH)
@patch(SEABORN_BOXPLOT_PATCH)
def test_plot(mock_show: Mock, mock_boxplot: Mock, exp_data: ExperimentData) -> None:
"""Tests core plot() API."""
# For now, just ensure that no errors are thrown.
# TODO: Check that a plot was actually produced matching our specifications.
with warnings.catch_warnings():
warnings.simplefilter("error")
random.seed(42)
plot(exp_data, filter_warnings=True)
assert mock_show.call_count >= 2 # from the two base plots and anything dabl did
assert mock_boxplot.call_count >= 1 # from anything dabl did

mlos_viz/mlos_viz/util.py (new file)

@ -0,0 +1,67 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""
Utility functions for manipulating experiment results data.
"""
from typing import Dict, Literal, Optional, Tuple
import pandas
from mlos_bench.storage.base_experiment_data import ExperimentData
def expand_results_data_args(
exp_data: Optional[ExperimentData] = None,
results_df: Optional[pandas.DataFrame] = None,
objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
) -> Tuple[pandas.DataFrame, Dict[str, bool]]:
"""
Expands some common arguments for working with results data.
Used by mlos_viz as well.
Parameters
----------
exp_data : Optional[ExperimentData], optional
ExperimentData to operate on.
results_df : Optional[pandas.DataFrame], optional
Optional results_df argument.
Defaults to exp_data.results_df property.
objectives : Optional[Dict[str, Literal["min", "max"]]], optional
Optional objectives set to operate on.
Defaults to exp_data.objectives property.
Returns
-------
Tuple[pandas.DataFrame, Dict[str, bool]]
The results dataframe and the objectives columns in the dataframe, plus whether or not they are in ascending order.
"""
# Prepare the orderby columns.
if results_df is None:
if exp_data is None:
raise ValueError("Must provide either exp_data or both results_df and objectives.")
results_df = exp_data.results_df
if objectives is None:
if exp_data is None:
raise ValueError("Must provide either exp_data or both results_df and objectives.")
objectives = exp_data.objectives
objs_cols: Dict[str, bool] = {}
for (opt_tgt, opt_dir) in objectives.items():
if opt_dir not in ["min", "max"]:
raise ValueError(f"Unexpected optimization direction for target {opt_tgt}: {opt_dir}")
ascending = opt_dir == "min"
if opt_tgt.startswith(ExperimentData.RESULT_COLUMN_PREFIX) and opt_tgt in results_df.columns:
objs_cols[opt_tgt] = ascending
elif ExperimentData.RESULT_COLUMN_PREFIX + opt_tgt in results_df.columns:
objs_cols[ExperimentData.RESULT_COLUMN_PREFIX + opt_tgt] = ascending
else:
raise UserWarning(f"{opt_tgt} is not a result column for experiment {exp_data}")
# Note: these copies are important to avoid issues with downstream consumers.
# It is more efficient to copy the dataframe than to go back to the original data source.
# TODO: However, it should be possible to later fixup the downstream consumers
# (which are currently still internal to mlos-viz) to make their own data
# sources if necessary. That will of course need tests.
return (results_df.copy(), objs_cols.copy())
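A short usage sketch for the helper above (hypothetical, assuming `ExperimentData.RESULT_COLUMN_PREFIX` is `"result."`), showing how it normalizes its inputs into a dataframe plus a mapping of objective columns to ascending-sort flags:

```python
import pandas

from mlos_viz.util import expand_results_data_args

# An explicit dataframe and objectives dict; no ExperimentData/storage access needed.
df = pandas.DataFrame({"trial_id": [1, 2], "result.score": [0.9, 0.7]})
(results_df, objs_cols) = expand_results_data_args(results_df=df, objectives={"score": "min"})
assert objs_cols == {"result.score": True}  # "min" objectives sort ascending
```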