Mirror of https://github.com/microsoft/MLOS.git

Improve mlos-viz for multiple repeats of a config and add tests (#633)

- Mark `mlos_viz` as `typed` for `mypy`
- Bump version
- Mock calls to matplotlib/dabl for testing
- Add plotting of top-N configs
- Improve plots for handling repeat config trials via variance error bars

Co-authored-by: Sergiy Matusevych <sergiym@microsoft.com>

Parent: 3a367972c6
Commit: a45f97dc01
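For reference, a minimal usage sketch (not part of the commit) of the reworked `mlos_viz.plot()` API changed below; `exp_data` is assumed to be an `ExperimentData` instance already loaded from the mlos_bench storage layer for an experiment with repeated config trials:

```python
from mlos_viz import MlosVizMethod, plot

# exp_data: ExperimentData previously loaded from the mlos_bench storage layer (assumed).
plot(
    exp_data,                           # or pass results_df=/objectives= explicitly instead
    plotter_method=MlosVizMethod.AUTO,  # currently defaults to dabl
    filter_warnings=True,               # suppress known-noisy plotter warnings
    top_n_configs=5,                    # forwarded to limit_top_n_configs() via **kwargs
)
```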
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.4.0
current_version = 0.4.1
commit = True
tag = True
@@ -38,6 +38,7 @@
"jupyterlab",
"keepalive",
"kwargs",
"kword",
"libmamba",
"linalg",
"llamatune",
@@ -57,6 +58,7 @@
"pylint",
"pyplot",
"pytest",
"quantile",
"Quickstart",
"refcnt",
"rexec",
@@ -82,6 +84,8 @@
"workerinput",
"xdist",
"xlabel",
"xlabels",
"xticks",
"ylabel"
]
// vim: set ft=jsonc:
@@ -114,6 +114,8 @@ if ($LASTEXITCODE -ne 0) {
}

# Run a simple mlos_viz test.
# To do that, we need the fixtures from mlos_bench, so make those available too.
$env:PYTHONPATH = "mlos_bench"
conda run -n mlos-dist-test python -m pytest mlos_viz/mlos_viz/tests/test_dabl_plot.py
if ($LASTEXITCODE -ne 0) {
Write-Error "Failed to run mlos_viz tests."
Makefile
@@ -335,7 +335,8 @@ build/dist-test.$(PYTHON_VERSION).build-stamp: $(PYTHON_FILES) build/dist-test-e
# Run a simple test that uses the mlos_bench wheel (full tests can be checked with `make test`).
conda run -n mlos-dist-test-$(PYTHON_VERSION) python3 -m pytest mlos_bench/mlos_bench/tests/environments/mock_env_test.py
# Run a simple test that uses the mlos_viz wheel (full tests can be checked with `make test`).
conda run -n mlos-dist-test-$(PYTHON_VERSION) python3 -m pytest mlos_viz/mlos_viz/tests/test_dabl_plot.py
# To do that, we need the fixtures from mlos_bench, so make those available too.
PYTHONPATH=mlos_bench conda run -n mlos-dist-test-$(PYTHON_VERSION) python3 -m pytest mlos_viz/mlos_viz/tests/test_dabl_plot.py
touch $@

dist-test-clean: dist-test-env-clean
@@ -36,7 +36,7 @@ copyright = '2024, GSL'
author = 'GSL'

# The full version, including alpha/beta/rc tags
release = '0.4.0'
release = '0.4.1'

try:
from setuptools_scm import get_version
@@ -7,4 +7,4 @@ Version number for the mlos_bench package.
"""

# NOTE: This should be managed by bumpversion.
_VERSION = '0.4.0'
_VERSION = '0.4.1'
@@ -8,7 +8,7 @@ Base interface for accessing the stored benchmark experiment data.

from abc import ABCMeta, abstractmethod
from distutils.util import strtobool  # pylint: disable=deprecated-module
from typing import Dict, Optional, Tuple, TYPE_CHECKING
from typing import Dict, Literal, Optional, Tuple, TYPE_CHECKING

import pandas

@@ -73,7 +73,7 @@ class ExperimentData(metaclass=ABCMeta):

@property
@abstractmethod
def objectives(self) -> Dict[str, str]:
def objectives(self) -> Dict[str, Literal["min", "max"]]:
"""
Retrieve the experiment's objectives data from the storage.
@@ -5,7 +5,7 @@
"""
An interface to access the experiment benchmark data stored in SQL DB.
"""
from typing import Dict, Optional
from typing import Dict, Literal, Optional

import logging

@@ -51,8 +51,8 @@ class ExperimentSqlData(ExperimentData):
self._schema = schema

@property
def objectives(self) -> Dict[str, str]:
objectives: Dict[str, str] = {}
def objectives(self) -> Dict[str, Literal["min", "max"]]:
objectives: Dict[str, Literal["min", "max"]] = {}
# First try to lookup the objectives from the experiment metadata in the storage layer.
if hasattr(self._schema, "objectives"):
with self._engine.connect() as conn:
@@ -60,6 +60,7 @@ class ExperimentSqlData(ExperimentData):
self._schema.objectives.select().where(
self._schema.objectives.c.exp_id == self._experiment_id,
).order_by(
# TODO: return weight as well
self._schema.objectives.c.weight.desc(),
self._schema.objectives.c.optimization_target.asc(),
)
@@ -98,6 +99,8 @@ class ExperimentSqlData(ExperimentData):
elif opt_direction != objectives[opt_target]:
_LOG.warning("Experiment %s has multiple trial optimization directions for optimization_target %s=%s",
self, opt_target, objectives[opt_target])
for opt_tgt, opt_dir in objectives.items():
assert opt_dir in {None, "min", "max"}, f"Unexpected opt_dir {opt_dir} for opt_tgt {opt_tgt}."
return objectives

# TODO: provide a way to get individual data to avoid repeated bulk fetches where only small amounts of data is accessed.
@@ -52,7 +52,8 @@ class TunableConfigTrialGroupSqlData(TunableConfigTrialGroupData):
with self._engine.connect() as conn:
tunable_config_trial_group = conn.execute(
self._schema.trial.select().with_only_columns(
func.min(self._schema.trial.c.trial_id).cast(Integer).label('tunable_config_trial_group_id'),
func.min(self._schema.trial.c.trial_id).cast(Integer).label(  # pylint: disable=not-callable
'tunable_config_trial_group_id'),
).where(
self._schema.trial.c.exp_id == self._experiment_id,
self._schema.trial.c.config_id == self._tunable_config_id,
@@ -3,5 +3,5 @@
# Licensed under the MIT License.
#
"""
Test for mlos_bench sql storage.
Tests for mlos_bench sql storage.
"""
@@ -63,10 +63,14 @@ def exp_storage_with_trials(exp_storage: SqlStorage.Experiment) -> SqlStorage.Ex
"""
# Add some trials to that experiment.
# Note: we're just fabricating some made up function for the ML libraries to try and learn.
base_score = 5.0
base_score = 10.0
tunable_name = "kernel_sched_latency_ns"
tunable_default = exp_storage.tunables.get_tunable(tunable_name)[0].default
tunable = exp_storage.tunables.get_tunable(tunable_name)[0]
tunable_default = tunable.default
assert isinstance(tunable_default, int)
tunable_min = tunable.range[0]
tunable_max = tunable.range[1]
tunable_range = tunable_max - tunable_min
seed = 42
rand_seed(seed)
opt = MockOptimizer(tunables=exp_storage.tunables, config={
@@ -85,14 +89,15 @@ def exp_storage_with_trials(exp_storage: SqlStorage.Experiment) -> SqlStorage.Ex
"trial_number": config_i * CONFIG_TRIAL_REPEAT_COUNT + repeat_j + 1,
})
assert trial.tunable_config_id == config_i + 1
trial.update_telemetry(status=Status.RUNNING, metrics=[
(datetime.utcnow(), "some-metric", base_score + random() / 10),
])
tunable_value = float(tunables.get_tunable(tunable_name)[0].numerical_value)
tunable_value_norm = base_score * (tunable_value - tunable_min) / tunable_range
trial.update_telemetry(status=Status.RUNNING, metrics=[
(datetime.utcnow(), "some-metric", tunable_value_norm + random() / 100),
])
trial.update(Status.SUCCEEDED, datetime.utcnow(), metrics={
# Give some variance on the score.
# And some influence from the tunable value.
"score": base_score + 10 * ((tunable_value / tunable_default) - 1) + random() / 10,
"score": tunable_value_norm + random() / 100
})
return exp_storage
@@ -26,7 +26,7 @@ def test_exp_trial_data(exp_data: ExperimentData) -> None:
assert trial.status == Status.SUCCEEDED
assert trial.metadata_dict["trial_number"] == trial_id
assert list(trial.results_dict.keys()) == ["score"]
assert trial.results_dict["score"] == pytest.approx(5.0, rel=0.1)
assert trial.results_dict["score"] == pytest.approx(0.0, abs=0.1)
assert isinstance(trial.ts_start, datetime)
assert isinstance(trial.ts_end, datetime)
# Note: tests for telemetry are in test_update_telemetry()
@@ -7,4 +7,4 @@ Version number for the mlos_core package.
"""

# NOTE: This should be managed by bumpversion.
_VERSION = '0.4.0'
_VERSION = '0.4.1'
@@ -8,13 +8,13 @@ from the mlos_bench framework for benchmarking and optimization automation.
"""

from enum import Enum
from typing import Any, Dict, Literal, Optional

import warnings

from matplotlib import pyplot as plt
import seaborn as sns
import pandas

from mlos_bench.storage.base_experiment_data import ExperimentData
from mlos_viz import base
from mlos_viz.util import expand_results_data_args


class MlosVizMethod(Enum):
@@ -22,41 +22,8 @@ class MlosVizMethod(Enum):
What method to use for visualizing the experiment results.
"""

AUTO = "dabl"   # use dabl as the current default
DABL = "dabl"


def _plot_optimizer_trends(exp_data: ExperimentData) -> None:
"""
Plots the optimizer trends for the Experiment.

Intended to be used from a Jupyter notebook.

Parameters
----------
exp_data: ExperimentData
The experiment data to plot.
"""
for objective in exp_data.objectives:
objective_column = ExperimentData.RESULT_COLUMN_PREFIX + objective
results_df = exp_data.results_df
plt.rcParams["figure.figsize"] = (10, 4)

sns.scatterplot(
x=results_df.trial_id, y=results_df[objective_column],
alpha=0.7, label="Trial")  # Result of each trial
sns.lineplot(
x=results_df.trial_id, y=results_df[objective_column].cummin(),
label="Incumbent")  # the best result so far (cummin)

plt.yscale('log')

plt.xlabel("Trial number")
plt.ylabel(objective)

plt.title("Optimizer Trends for Experiment: " + exp_data.experiment_id)
plt.grid()
plt.show()  # type: ignore[no-untyped-call]
AUTO = DABL   # use dabl as the current default


def ignore_plotter_warnings(plotter_method: MlosVizMethod = MlosVizMethod.AUTO) -> None:
@@ -69,8 +36,7 @@ def ignore_plotter_warnings(plotter_method: MlosVizMethod = MlosVizMethod.AUTO)
plotter_method: MlosVizMethod
The method to use for visualizing the experiment results.
"""
warnings.filterwarnings("ignore", category=FutureWarning)

base.ignore_plotter_warnings()
if plotter_method == MlosVizMethod.DABL:
import mlos_viz.dabl    # pylint: disable=import-outside-toplevel
mlos_viz.dabl.ignore_plotter_warnings()
@@ -78,9 +44,12 @@ def ignore_plotter_warnings(plotter_method: MlosVizMethod = MlosVizMethod.AUTO)
raise NotImplementedError(f"Unhandled method: {plotter_method}")


def plot(exp_data: ExperimentData,
def plot(exp_data: Optional[ExperimentData] = None, *,
results_df: Optional[pandas.DataFrame] = None,
objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
plotter_method: MlosVizMethod = MlosVizMethod.AUTO,
filter_warnings: bool = True) -> None:
filter_warnings: bool = True,
**kwargs: Any) -> None:
"""
Plots the results of the experiment.

@@ -90,18 +59,28 @@ def plot(exp_data: ExperimentData,
----------
exp_data: ExperimentData
The experiment data to plot.
results_df : Optional["pandas.DataFrame"]
Optional results_df to plot.
If not provided, defaults to exp_data.results_df property.
objectives : Optional[Dict[str, Literal["min", "max"]]]
Optional objectives to plot.
If not provided, defaults to exp_data.objectives property.
plotter_method: MlosVizMethod
The method to use for visualizing the experiment results.
filter_warnings: bool
Whether or not to filter some warnings from the plotter.
kwargs : dict
Remaining keyword arguments are passed along to the underlying plotter(s).
"""
_plot_optimizer_trends(exp_data)

if filter_warnings:
ignore_plotter_warnings(plotter_method)
(results_df, _obj_cols) = expand_results_data_args(exp_data, results_df, objectives)

base.plot_optimizer_trends(exp_data, results_df=results_df, objectives=objectives)
base.plot_top_n_configs(exp_data, results_df=results_df, objectives=objectives, **kwargs)

if MlosVizMethod.DABL:
import mlos_viz.dabl    # pylint: disable=import-outside-toplevel
mlos_viz.dabl.plot(exp_data)
mlos_viz.dabl.plot(exp_data, results_df=results_df, objectives=objectives)
else:
raise NotImplementedError(f"Unhandled method: {plotter_method}")
@@ -0,0 +1,439 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""
Base functions for visualizing, explain, and gain insights from results.
"""

from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Tuple, Union

import re
import warnings

from importlib.metadata import version

from matplotlib import pyplot as plt
import pandas
from pandas.api.types import is_numeric_dtype
from pandas.core.groupby.generic import SeriesGroupBy
import seaborn as sns

from mlos_bench.storage.base_experiment_data import ExperimentData
from mlos_viz.util import expand_results_data_args


_SEABORN_VERS = version('seaborn')


def _get_kwarg_defaults(target: Callable, **kwargs: Any) -> Dict[str, Any]:
"""
Assembles a smaller kwargs dict for the specified target function.

Note: this only works with non-positional kwargs (e.g., those after a * arg).
"""
target_kwargs = {}
for kword in target.__kwdefaults__:  # or {} # intentionally omitted for now
if kword in kwargs:
target_kwargs[kword] = kwargs[kword]
return target_kwargs


def ignore_plotter_warnings() -> None:
"""
Suppress some annoying warnings from third-party data visualization packages by
adding them to the warnings filter.
"""
warnings.filterwarnings("ignore", category=FutureWarning)
if _SEABORN_VERS <= '0.13.1':
warnings.filterwarnings("ignore", category=DeprecationWarning, module="seaborn",  # but actually comes from pandas
message="is_categorical_dtype is deprecated and will be removed in a future version.")


def _add_groupby_desc_column(results_df: pandas.DataFrame,
groupby_columns: Optional[List[str]] = None,
) -> Tuple[pandas.DataFrame, List[str], str]:
"""
Adds a group descriptor column to the results_df.

Parameters
----------
results_df: ExperimentData
The experiment data to add the descriptor column to.
groupby_columns: Optional[List[str]]
"""
# Compose a new groupby_column for display purposes that is the
# concatenation of the min trial_id (the first one) of each config trial
# group and the config_id.
# Note: It's need to be a string (e.g., categorical) for boxplot and lineplot to
# be on the same axis anyways.
if groupby_columns is None:
groupby_columns = ["tunable_config_trial_group_id", "tunable_config_id"]
groupby_column = ",".join(groupby_columns)
results_df[groupby_column] = results_df[groupby_columns].astype(str).apply(
lambda x: ",".join(x), axis=1)  # pylint: disable=unnecessary-lambda
groupby_columns.append(groupby_column)
return (results_df, groupby_columns, groupby_column)


def augment_results_df_with_config_trial_group_stats(exp_data: Optional[ExperimentData] = None,
*,
results_df: Optional[pandas.DataFrame] = None,
requested_result_cols: Optional[Iterable[str]] = None,
) -> pandas.DataFrame:
# pylint: disable=too-complex
"""
Add a number of useful statistical measure columns to the results dataframe.

In particular, for each numeric result, we add the following columns for each
requested result column:

- ".p50": the median of each config trial group results

- ".p75": the p75 of each config trial group results

- ".p90": the p90 of each config trial group results

- ".p95": the p95 of each config trial group results

- ".p99": the p95 of each config trial group results

- ".mean": the mean of each config trial group results

- ".stddev": the mean of each config trial group results

- ".var": the variance of each config trial group results

- ".var_zscore": the zscore of this group (i.e., variance relative to the stddev
of all group variances). This can be useful for filtering out outliers (e.g.,
configs with high variance relative to others by restricting to abs < 2 to
remove those two standard deviations from the mean variance across all config
trial groups).

Additionally, we add a "tunable_config_trial_group_size" column that indicates
the number of trials using a particular config.

Parameters
----------
exp_data : ExperimentData
The ExperimentData (e.g., obtained from the storage layer) to plot.
results_df : Optional[pandas.DataFrame]
The results dataframe to augment, by default None to use the results_df property.
requested_result_cols : Optional[Iterable[str]]
Which results columns to augment, by default None to use all results columns
that look numeric.

Returns
-------
pandas.DataFrame
The augmented results dataframe.
"""
if results_df is None:
if exp_data is None:
raise ValueError("Either exp_data or results_df must be provided.")
results_df = exp_data.results_df
results_groups = results_df.groupby("tunable_config_id")
if len(results_groups) <= 1:
raise ValueError(f"Not enough data: {len(results_groups)}")

if requested_result_cols is None:
result_cols = set(col for col in results_df.columns if col.startswith(ExperimentData.RESULT_COLUMN_PREFIX))
else:
result_cols = set(col for col in requested_result_cols
if col.startswith(ExperimentData.RESULT_COLUMN_PREFIX) and col in results_df.columns)
result_cols.update(set(ExperimentData.RESULT_COLUMN_PREFIX + col for col in requested_result_cols
if ExperimentData.RESULT_COLUMN_PREFIX in results_df.columns))

def compute_zscore_for_group_agg(
results_groups_perf: "SeriesGroupBy",
stats_df: pandas.DataFrame,
result_col: str,
agg: Union[Literal["mean"], Literal["var"], Literal["std"]]
) -> None:
results_groups_perf_aggs = results_groups_perf.agg(agg)  # TODO: avoid recalculating?
# Compute the zscore of the chosen aggregate performance of each group into each row in the dataframe.
stats_df[result_col + f".{agg}_mean"] = results_groups_perf_aggs.mean()
stats_df[result_col + f".{agg}_stddev"] = results_groups_perf_aggs.std()
stats_df[result_col + f".{agg}_zscore"] = \
(stats_df[result_col + f".{agg}"] - stats_df[result_col + f".{agg}_mean"]) \
/ stats_df[result_col + f".{agg}_stddev"]
stats_df.drop(columns=[result_col + ".var_" + agg for agg in ("mean", "stddev")], inplace=True)

augmented_results_df = results_df
augmented_results_df["tunable_config_trial_group_size"] = results_groups["trial_id"].transform("count")
for result_col in result_cols:
if not result_col.startswith(ExperimentData.RESULT_COLUMN_PREFIX):
continue
if re.search(r"(start|end).*time", result_col, flags=re.IGNORECASE):
# Ignore computing variance on things like that look like timestamps.
continue
if not is_numeric_dtype(results_df[result_col]):
continue
if results_df[result_col].unique().size == 1:
continue
results_groups_perf = results_groups[result_col]
stats_df = pandas.DataFrame()
stats_df[result_col + ".mean"] = results_groups_perf.transform("mean", numeric_only=True)
stats_df[result_col + ".var"] = results_groups_perf.transform("var")
stats_df[result_col + ".stddev"] = stats_df[result_col + ".var"].apply(lambda x: x**0.5)

compute_zscore_for_group_agg(results_groups_perf, stats_df, result_col, "var")
quantiles = [0.50, 0.75, 0.90, 0.95, 0.99]
for quantile in quantiles:  # TODO: can we do this in one pass?
quantile_col = result_col + f".p{int(quantile*100)}"
stats_df[quantile_col] = results_groups_perf.transform("quantile", quantile)
augmented_results_df = pandas.concat([augmented_results_df, stats_df], axis=1)
return augmented_results_df


def limit_top_n_configs(exp_data: Optional[ExperimentData] = None,
*,
results_df: Optional[pandas.DataFrame] = None,
objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
top_n_configs: int = 10,
method: Literal["mean", "p50", "p75", "p90", "p95", "p99"] = "mean",
) -> Tuple[pandas.DataFrame, List[int], Dict[str, bool]]:
# pylint: disable=too-many-locals
"""
Utility function to process the results and determine the best performing
configs including potential repeats to help assess variability.

Parameters
----------
exp_data : Optional[ExperimentData]
The ExperimentData (e.g., obtained from the storage layer) to operate on.
results_df : Optional[pandas.DataFrame]
The results dataframe to augment, by default None to use the results_df property.
objectives : Iterable[str], optional
Which result column(s) to use for sorting the configs, and in which direction ("min" or "max").
By default None to automatically select the experiment objectives.
top_n_configs : int, optional
How many configs to return, including the default, by default 20.
method: Literal["mean", "median", "p50", "p75", "p90", "p95", "p99"] = "mean",
Which statistical method to use when sorting the config groups before determining the cutoff, by default "mean".

Returns
-------
(top_n_config_results_df, top_n_config_ids, orderby_cols) : Tuple[pandas.DataFrame, List[int], Dict[str, bool]]
The filtered results dataframe, the config ids, and the columns used to order the configs.
"""
# Do some input checking first.
if method not in ["mean", "median", "p50", "p75", "p90", "p95", "p99"]:
raise ValueError(f"Invalid method: {method}")

# Prepare the orderby columns.
(results_df, objs_cols) = expand_results_data_args(exp_data, results_df=results_df, objectives=objectives)
assert isinstance(results_df, pandas.DataFrame)

# Augment the results dataframe with some useful stats.
results_df = augment_results_df_with_config_trial_group_stats(
exp_data=exp_data,
results_df=results_df,
requested_result_cols=objs_cols.keys(),
)
# Note: mypy seems to lose its mind for some reason and keeps forgetting that
# results_df is not None and is in fact a DataFrame, so we periodically assert
# it in this func for now.
assert results_df is not None
orderby_cols: Dict[str, bool] = {obj_col + f".{method}": ascending for (obj_col, ascending) in objs_cols.items()}

config_id_col = "tunable_config_id"
group_id_col = "tunable_config_trial_group_id"  # first trial_id per config group
trial_id_col = "trial_id"

default_config_id = results_df[trial_id_col].min() if exp_data is None else exp_data.default_tunable_config_id
assert default_config_id is not None, "Failed to determine default config id."

# Filter out configs whose variance is too large.
# But also make sure the default configs is still in the resulting dataframe
# (for comparison purposes).
for obj_col in objs_cols:
assert results_df is not None
if method == "mean":
singletons_mask = results_df["tunable_config_trial_group_size"] == 1
else:
singletons_mask = results_df["tunable_config_trial_group_size"] > 1
results_df = results_df.loc[(
(results_df[f"{obj_col}.var_zscore"].abs() < 2)
| (singletons_mask)
| (results_df[config_id_col] == default_config_id)
)]
assert results_df is not None

# Also, filter results that are worse than the default.
default_config_results_df = results_df.loc[results_df[config_id_col] == default_config_id]
for (orderby_col, ascending) in orderby_cols.items():
default_vals = default_config_results_df[orderby_col].unique()
assert len(default_vals) == 1
default_val = default_vals[0]
assert results_df is not None
if ascending:
results_df = results_df.loc[(results_df[orderby_col] <= default_val)]
else:
results_df = results_df.loc[(results_df[orderby_col] >= default_val)]

# Now regroup and filter to the top-N configs by their group performance dimensions.
assert results_df is not None
group_results_df: pandas.DataFrame = results_df.groupby(config_id_col).first()[orderby_cols.keys()]
top_n_config_ids: List[int] = group_results_df.sort_values(
by=list(orderby_cols.keys()), ascending=list(orderby_cols.values())).head(top_n_configs).index.tolist()

# Remove the default config if it's included. We'll add it back later.
if default_config_id in top_n_config_ids:
top_n_config_ids.remove(default_config_id)
# Get just the top-n config results.
# Sort by the group ids.
top_n_config_results_df = results_df.loc[(
results_df[config_id_col].isin(top_n_config_ids)
)].sort_values([group_id_col, config_id_col, trial_id_col])
# Place the default config at the top of the list.
top_n_config_ids.insert(0, default_config_id)
top_n_config_results_df = pandas.concat([default_config_results_df, top_n_config_results_df], axis=0)
return (top_n_config_results_df, top_n_config_ids, orderby_cols)


def plot_optimizer_trends(
exp_data: Optional[ExperimentData] = None,
*,
results_df: Optional[pandas.DataFrame] = None,
objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
) -> None:
"""
Plots the optimizer trends for the Experiment.

Parameters
----------
exp_data : ExperimentData
The ExperimentData (e.g., obtained from the storage layer) to plot.
results_df : Optional["pandas.DataFrame"]
Optional results_df to plot.
If not provided, defaults to exp_data.results_df property.
objectives : Optional[Dict[str, Literal["min", "max"]]]
Optional objectives to plot.
If not provided, defaults to exp_data.objectives property.
"""
(results_df, obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
(results_df, groupby_columns, groupby_column) = _add_groupby_desc_column(results_df)

for (objective_column, ascending) in obj_cols.items():
incumbent_column = objective_column + ".incumbent"

# Determine the mean of each config trial group to match the box plots.
group_results_df = results_df.groupby(groupby_columns)[objective_column].mean()\
.reset_index().sort_values(groupby_columns)
#
# Note: technically the optimizer (usually) uses the *first* result for a
# given config trial group before moving on to a new config (x-axis), so
# plotting the mean may be slightly misleading when trying to understand the
# actual path taken by the optimizer in case of high variance samples.
# Here's a way to do that, though it can also be misleading if the optimizer
# later gets a worse value for that config group as well.
#
# group_results_df = results_df.sort_values(groupby_columns + ["trial_id"]).groupby(
# groupby_columns).head(1)[groupby_columns + [objective_column]].reset_index()

# Calculate the incumbent (best seen so far)
if ascending:
group_results_df[incumbent_column] = group_results_df[objective_column].cummin()
else:
group_results_df[incumbent_column] = group_results_df[objective_column].cummax()

(_fig, axis) = plt.subplots(figsize=(15, 5))

# Result of each set of trials for a config
sns.boxplot(
data=results_df,
x=groupby_column,
y=objective_column,
ax=axis,
)

# Results of the best so far.
axis = sns.lineplot(
data=group_results_df,
x=groupby_column,
y=incumbent_column,
alpha=0.7,
label="Mean of Incumbent Config Trial Group",
ax=axis,
)

plt.yscale('log')
plt.ylabel(objective_column.replace(ExperimentData.RESULT_COLUMN_PREFIX, ""))

plt.xlabel("Config Trial Group ID, Config ID")
plt.xticks(rotation=90, fontsize=8)

plt.title("Optimizer Trends for Experiment: " + exp_data.experiment_id if exp_data is not None else "")
plt.grid()
plt.show()  # type: ignore[no-untyped-call]


def plot_top_n_configs(exp_data: Optional[ExperimentData] = None,
*,
results_df: Optional[pandas.DataFrame] = None,
objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
with_scatter_plot: bool = False,
**kwargs: Any,
) -> None:
# pylint: disable=too-many-locals
"""
Plots the top-N configs along with the default config for the given ExperimentData.

Intended to be used from a Jupyter notebook.

Parameters
----------
exp_data: ExperimentData
The experiment data to plot.
results_df : Optional["pandas.DataFrame"]
Optional results_df to plot.
If not provided, defaults to exp_data.results_df property.
objectives : Optional[Dict[str, Literal["min", "max"]]]
Optional objectives to plot.
If not provided, defaults to exp_data.objectives property.
with_scatter_plot : bool
Whether to also add scatter plot to the output figure.
kwargs : dict
Remaining keyword arguments are passed along to the limit_top_n_configs function.
"""
(results_df, _obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
top_n_config_args = _get_kwarg_defaults(limit_top_n_configs, **kwargs)
if "results_df" not in top_n_config_args:
top_n_config_args["results_df"] = results_df
if "objectives" not in top_n_config_args:
top_n_config_args["objectives"] = objectives
(top_n_config_results_df, _top_n_config_ids, orderby_cols) = limit_top_n_configs(exp_data=exp_data, **top_n_config_args)

(top_n_config_results_df, _groupby_columns, groupby_column) = _add_groupby_desc_column(top_n_config_results_df)
top_n = len(top_n_config_results_df[groupby_column].unique()) - 1

for (orderby_col, ascending) in orderby_cols.items():
opt_tgt = orderby_col.replace(ExperimentData.RESULT_COLUMN_PREFIX, "")
(_fig, axis) = plt.subplots()
sns.violinplot(
data=top_n_config_results_df,
x=groupby_column,
y=orderby_col,
ax=axis,
)
if with_scatter_plot:
sns.scatterplot(
data=top_n_config_results_df,
x=groupby_column,
y=orderby_col,
legend=None,
ax=axis,
)
plt.grid()
(xticks, xlabels) = plt.xticks()
# default should be in the first position based on top_n_configs() return
xlabels[0] = "default"  # type: ignore[call-overload]
plt.xticks(xticks, xlabels)  # type: ignore[arg-type]
plt.xlabel("Config Trial Group, Config ID")
plt.xticks(rotation=90)
plt.ylabel(opt_tgt)
plt.yscale('log')
extra_title = "(lower is better)" if ascending else "(lower is better)"
plt.title(f"Top {top_n} configs {opt_tgt} {extra_title}")
plt.show()  # type: ignore[no-untyped-call]
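The helpers added above can also be used on their own; a minimal sketch (not part of the commit) of calling them directly, again assuming `exp_data` is an `ExperimentData` instance loaded from the mlos_bench storage layer:

```python
from mlos_viz.base import (
    augment_results_df_with_config_trial_group_stats,
    limit_top_n_configs,
)

# Append per config-trial-group stats columns (e.g., "result.score.p50",
# "result.score.var_zscore") alongside the original "result.*" columns.
augmented_df = augment_results_df_with_config_trial_group_stats(exp_data=exp_data)

# Keep only the 5 best config trial groups (plus the default config), ranked by their mean score.
(top_df, top_config_ids, orderby_cols) = limit_top_n_configs(
    exp_data=exp_data,
    top_n_configs=5,
    method="mean",
)
print(top_config_ids, orderby_cols)
```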
@@ -5,14 +5,22 @@
"""
Small wrapper functions for dabl plotting functions via mlos_bench data.
"""
from typing import Dict, Optional, Literal

import warnings

import dabl
import pandas

from mlos_bench.storage.base_experiment_data import ExperimentData

from mlos_viz.util import expand_results_data_args


def plot(exp_data: ExperimentData) -> None:
def plot(exp_data: Optional[ExperimentData] = None, *,
results_df: Optional[pandas.DataFrame] = None,
objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
) -> None:
"""
Plots the Experiment results data using dabl.

@@ -20,20 +28,35 @@ def plot(exp_data: ExperimentData) -> None:
----------
exp_data : ExperimentData
The ExperimentData (e.g., obtained from the storage layer) to plot.
results_df : Optional["pandas.DataFrame"]
Optional results_df to plot.
If not provided, defaults to exp_data.results_df property.
objectives : Optional[Dict[str, Literal["min", "max"]]]
Optional objectives to plot.
If not provided, defaults to exp_data.objectives property.
"""
for objective in exp_data.objectives:
objective_column = ExperimentData.RESULT_COLUMN_PREFIX + objective
dabl.plot(exp_data.results_df, objective_column)
(results_df, obj_cols) = expand_results_data_args(exp_data, results_df, objectives)
for obj_col in obj_cols:
dabl.plot(X=results_df, target_col=obj_col)


def ignore_plotter_warnings() -> None:
"""
Add some filters to ignore warnings from the plotter.
"""
# pylint: disable=import-outside-toplevel
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", module="dabl", category=UserWarning, message="Could not infer format")
warnings.filterwarnings("ignore", module="dabl", category=UserWarning, message="(Dropped|Discarding) .* outliers")
warnings.filterwarnings("ignore", module="dabl", category=UserWarning, message="Not plotting highly correlated")
warnings.filterwarnings("ignore", module="dabl", category=UserWarning,
message="Missing values in target_col have been removed for regression")
from sklearn.exceptions import UndefinedMetricWarning  # pylint: disable=import-outside-toplevel
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", module="sklearn", category=UndefinedMetricWarning, message="Recall is ill-defined")
warnings.filterwarnings("ignore", category=DeprecationWarning,
message="is_categorical_dtype is deprecated and will be removed in a future version.")
warnings.filterwarnings("ignore", category=DeprecationWarning, module="sklearn",
message="is_sparse is deprecated and will be removed in a future version.")
from matplotlib._api.deprecation import MatplotlibDeprecationWarning
warnings.filterwarnings("ignore", category=MatplotlibDeprecationWarning, module="dabl",
message="The legendHandles attribute was deprecated in Matplotlib 3.7 and will be removed")
@@ -0,0 +1,9 @@
# `mlos-viz` tests

For now we only check plotting via running the core APIs with `DISPLAY` disabled and potentially via basic mocking of the underlying libraries.

In the future we may want to consider adding more full fledge testing and check infra for the graphs produced.

## See Also

- [How can I write unit tests against code that uses matplotlib?](https://stackoverflow.com/questions/27948126/how-can-i-write-unit-tests-against-code-that-uses-matplotlib)
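A minimal sketch (an assumption, not the commit's own test setup) of how the `DISPLAY`-disabled half of that approach can be enforced, by selecting a non-interactive matplotlib backend before the plotting code imports pyplot:

```python
# Hypothetical test bootstrap: force a headless matplotlib backend so the
# plotting APIs can run under pytest/CI without an X display.
import matplotlib

matplotlib.use("Agg")       # must run before pyplot gets imported by the code under test

from mlos_viz import plot   # noqa: E402  (imported after backend selection on purpose)
```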
@@ -0,0 +1,19 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""
Unit tests for mlos_viz.
"""

import sys

import seaborn  # pylint: disable=unused-import  # (used by patch) # noqa: unused


BASE_MATPLOTLIB_SHOW_PATCH = "mlos_viz.base.plt.show"

if sys.version_info >= (3, 11):
SEABORN_BOXPLOT_PATCH = "dabl.plot.supervised.sns.boxplot"
else:
SEABORN_BOXPLOT_PATCH = "seaborn.boxplot"
@@ -0,0 +1,20 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""
Export test fixtures for mlos_viz.
"""

from mlos_bench.tests import tunable_groups_fixtures
from mlos_bench.tests.storage.sql import fixtures as sql_storage_fixtures

# Expose some of those as local names so they can be picked up as fixtures by pytest.

storage = sql_storage_fixtures.storage
exp_storage = sql_storage_fixtures.exp_storage
exp_storage_with_trials = sql_storage_fixtures.exp_storage_with_trials
exp_data = sql_storage_fixtures.exp_data

tunable_groups_config = tunable_groups_fixtures.tunable_groups_config
tunable_groups = tunable_groups_fixtures.tunable_groups
@@ -0,0 +1,41 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""
Unit tests for mlos_viz.
"""

import warnings

from unittest.mock import patch, Mock

from mlos_bench.storage.base_experiment_data import ExperimentData

from mlos_viz.base import ignore_plotter_warnings, plot_optimizer_trends, plot_top_n_configs

from mlos_viz.tests import BASE_MATPLOTLIB_SHOW_PATCH


@patch(BASE_MATPLOTLIB_SHOW_PATCH)
def test_plot_optimizer_trends(mock_show: Mock, exp_data: ExperimentData) -> None:
"""Tests plotting optimizer trends."""
# For now, just ensure that no errors are thrown.
# TODO: Check that a plot was actually produced matching our specifications.
with warnings.catch_warnings():
warnings.simplefilter("error")
ignore_plotter_warnings()
plot_optimizer_trends(exp_data)
assert mock_show.call_count == 1


@patch(BASE_MATPLOTLIB_SHOW_PATCH)
def test_plot_top_n_configs(mock_show: Mock, exp_data: ExperimentData) -> None:
"""Tests plotting top N configs."""
# For now, just ensure that no errors are thrown.
# TODO: Check that a plot was actually produced matching our specifications.
with warnings.catch_warnings():
warnings.simplefilter("error")
ignore_plotter_warnings()
plot_top_n_configs(exp_data)
assert mock_show.call_count == 1
@@ -6,8 +6,24 @@
Unit tests for mlos_viz.dabl.plot.
"""

import warnings

def test_placeholder() -> None:
"""Placeholder test."""
# TODO: Remove this and implement real tests for mlos_viz.plot()
# See Also: https://stackoverflow.com/questions/27948126/how-can-i-write-unit-tests-against-code-that-uses-matplotlib
from unittest.mock import patch, Mock

from mlos_bench.storage.base_experiment_data import ExperimentData

from mlos_viz import dabl

from mlos_viz.tests import SEABORN_BOXPLOT_PATCH


@patch(SEABORN_BOXPLOT_PATCH, create=True)
def test_dabl_plot(mock_boxplot: Mock, exp_data: ExperimentData) -> None:
"""Tests plotting via dabl."""
# For now, just ensure that no errors are thrown.
# TODO: Check that a plot was actually produced matching our specifications.
with warnings.catch_warnings():
warnings.simplefilter("error")
dabl.ignore_plotter_warnings()
dabl.plot(exp_data)
assert mock_boxplot.call_count >= 1
@@ -0,0 +1,37 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""
Unit tests for mlos_viz.
"""

import random
import warnings

from unittest.mock import patch, Mock

from mlos_bench.storage.base_experiment_data import ExperimentData

from mlos_viz import MlosVizMethod, plot

from mlos_viz.tests import BASE_MATPLOTLIB_SHOW_PATCH, SEABORN_BOXPLOT_PATCH


def test_auto_method_type() -> None:
"""Ensure the AUTO method is what we expect."""
assert MlosVizMethod.AUTO.value == MlosVizMethod.DABL.value


@patch(BASE_MATPLOTLIB_SHOW_PATCH)
@patch(SEABORN_BOXPLOT_PATCH)
def test_plot(mock_show: Mock, mock_boxplot: Mock, exp_data: ExperimentData) -> None:
"""Tests core plot() API."""
# For now, just ensure that no errors are thrown.
# TODO: Check that a plot was actually produced matching our specifications.
with warnings.catch_warnings():
warnings.simplefilter("error")
random.seed(42)
plot(exp_data, filter_warnings=True)
assert mock_show.call_count >= 2  # from the two base plots and anything dabl did
assert mock_boxplot.call_count >= 1  # from anything dabl did
@@ -0,0 +1,67 @@
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
#
"""
Utility functions for manipulating experiment results data.
"""
from typing import Dict, Literal, Optional, Tuple

import pandas

from mlos_bench.storage.base_experiment_data import ExperimentData


def expand_results_data_args(
exp_data: Optional[ExperimentData] = None,
results_df: Optional[pandas.DataFrame] = None,
objectives: Optional[Dict[str, Literal["min", "max"]]] = None,
) -> Tuple[pandas.DataFrame, Dict[str, bool]]:
"""
Expands some common arguments for working with results data.

Used by mlos_viz as well.

Parameters
----------
exp_data : Optional[ExperimentData], optional
ExperimentData to operate on.
results_df : Optional[pandas.DataFrame], optional
Optional results_df argument.
Defaults to exp_data.results_df property.
objectives : Optional[Dict[str, Literal["min", "max"]]], optional
Optional objectives set to operate on.
Defaults to exp_data.objectives property.

Returns
-------
Tuple[pandas.DataFrame, Dict[str, bool]]
The results dataframe and the objectives columns in the dataframe, plus whether or not they are in ascending order.
"""
# Prepare the orderby columns.
if results_df is None:
if exp_data is None:
raise ValueError("Must provide either exp_data or both results_df and objectives.")
results_df = exp_data.results_df

if objectives is None:
if exp_data is None:
raise ValueError("Must provide either exp_data or both results_df and objectives.")
objectives = exp_data.objectives
objs_cols: Dict[str, bool] = {}
for (opt_tgt, opt_dir) in objectives.items():
if opt_dir not in ["min", "max"]:
raise ValueError(f"Unexpected optimization direction for target {opt_tgt}: {opt_dir}")
ascending = opt_dir == "min"
if opt_tgt.startswith(ExperimentData.RESULT_COLUMN_PREFIX) and opt_tgt in results_df.columns:
objs_cols[opt_tgt] = ascending
elif ExperimentData.RESULT_COLUMN_PREFIX + opt_tgt in results_df.columns:
objs_cols[ExperimentData.RESULT_COLUMN_PREFIX + opt_tgt] = ascending
else:
raise UserWarning(f"{opt_tgt} is not a result column for experiment {exp_data}")
# Note: these copies are important to avoid issues with downstream consumers.
# It is more efficient to copy the dataframe than to go back to the original data source.
# TODO: However, it should be possible to later fixup the downstream consumers
# (which are currently still internal to mlos-viz) to make their own data
# sources if necessary. That will of course need tests.
return (results_df.copy(), objs_cols.copy())