Support selecting configurations on which groups should run

Also switch from using CP-SAT to MIP, as it's faster.

Part of #1117
Marco Castelluccio 2020-06-09 01:57:13 +02:00
Parent 479ec905c4
Commit d1dae08fec
3 changed files with 294 additions and 79 deletions
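To make the CP-SAT-to-MIP switch mentioned above concrete, here is a minimal, self-contained sketch (not the project's code; the task names, costs, and equivalence sets below are made up) of the OR-Tools pywraplp formulation the commit moves to: a binary variable per task, one covering constraint per equivalence set, and a cost-minimizing objective.

from ortools.linear_solver import pywraplp

# Hypothetical costs and equivalence sets, for illustration only.
task_costs = {"linux1804-64/opt": 2, "linux1804-64/debug": 3, "windows10/opt": 4}
equivalence_sets = [{"linux1804-64/opt", "linux1804-64/debug"}, {"windows10/opt"}]

solver = pywraplp.Solver(
    "select_tasks", pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING
)
task_vars = {task: solver.IntVar(0, 1, task) for task in task_costs}

# At least one task from each equivalence set must be scheduled.
for equivalence_set in equivalence_sets:
    solver.Add(sum(task_vars[task] for task in equivalence_set) >= 1)

# Among the selections satisfying the constraints, pick the cheapest one.
solver.Minimize(sum(cost * task_vars[task] for task, cost in task_costs.items()))

solver.SetTimeLimit(10000)  # milliseconds; accept a feasible solution if slow
assert solver.Solve() == pywraplp.Solver.OPTIMAL
print({task for task, var in task_vars.items() if var.solution_value() == 1})
# -> {'linux1804-64/opt', 'windows10/opt'}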

View file

@@ -8,12 +8,12 @@ import math
import pickle
import statistics
from functools import reduce
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple
import numpy as np
import xgboost
from imblearn.under_sampling import RandomUnderSampler
from ortools.sat.python import cp_model
from ortools.linear_solver import pywraplp
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
@@ -61,6 +61,7 @@ class TestSelectModel(Model):
self.eval_dbs[test_scheduling.TEST_GROUP_SCHEDULING_DB] = (
test_scheduling.PAST_FAILURES_GROUP_DB,
test_scheduling.TOUCHED_TOGETHER_DB,
test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB,
)
elif granularity == "config_group":
self.training_dbs.append(test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB)
@@ -236,47 +237,49 @@ class TestSelectModel(Model):
for i in selected_indexes
}
def reduce(self, tasks: Set[str], min_redundancy_confidence: float) -> Set[str]:
failing_together = test_scheduling.get_failing_together_db(self.granularity)
def _get_cost(self, config: str) -> int:
costs = [
(("linux1804-64", "opt"), 1),
(("linux1804-64", "debug"), 2),
(("linux1804-64", "opt"), 2),
(("linux1804-64", "debug"), 3),
(("windows10", "opt"), 4),
(("windows10", "debug"), 4),
(("android-em", "opt"), 5),
(("android-em", "debug"), 6),
(("windows7", "opt"), 7),
(("windows7", "debug"), 8),
(("mac", "opt"), 9),
(("mac", "debug"), 10),
(("asan", "opt"), 11),
(("asan", "debug"), 12),
(("linux1804-32", "opt"), 13),
(("linux1804-32", "debug"), 14),
(("android-hw", "opt"), 15),
(("android-hw", "debug"), 16),
(("tsan", "opt"), 17),
(("tsan", "debug"), 18),
(("windows10", "debug"), 5),
(("android-em", "opt"), 6),
(("android-em", "debug"), 7),
(("windows7", "opt"), 8),
(("windows7", "debug"), 9),
(("mac", "opt"), 10),
(("mac", "debug"), 11),
(("asan", "opt"), 12),
(("asan", "debug"), 13),
(("linux1804-32", "opt"), 14),
(("linux1804-32", "debug"), 15),
(("android-hw", "opt"), 16),
(("android-hw", "debug"), 17),
(("tsan", "opt"), 18),
(("tsan", "debug"), 19),
(("test-linux1804-64-shippable/opt-*-e10s",), 1),
]
def get_cost(task):
for substrings, cost in reversed(costs):
if all(s in task for s in substrings):
return cost
for substrings, cost in reversed(costs):
if all(s in config for s in substrings):
return cost
raise Exception(f"Couldn't find cost for {task}")
raise Exception(f"Couldn't find cost for {config}")
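A quick illustration of the lookup above, using a truncated, made-up cost table rather than the full one: the table is scanned with reversed(), so the more specific entries appended at the end win over the generic platform/build-type pairs.

costs = [
    (("linux1804-64", "opt"), 2),
    (("linux1804-64", "debug"), 3),
    (("test-linux1804-64-shippable/opt-*-e10s",), 1),
]

def get_cost(config: str) -> int:
    # Later (more specific) entries are tried first thanks to reversed().
    for substrings, cost in reversed(costs):
        if all(s in config for s in substrings):
            return cost
    raise Exception(f"Couldn't find cost for {config}")

print(get_cost("test-linux1804-64-shippable/opt-*-e10s"))    # 1, not 2
print(get_cost("test-linux1804-64-qr/debug-mochitest-e10s"))  # 3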
model = cp_model.CpModel()
task_vars = {task: model.NewIntVar(0, 1, task) for task in tasks}
# Generate 'equivalence groups', containing all tasks that are redundant with each other.
def _generate_equivalence_sets(
self,
tasks: Set[str],
min_redundancy_confidence: float,
load_failing_together: Callable[[str], Dict[str, Tuple[float, float]]],
assume_redundant: bool,
) -> List[Set[str]]:
# Generate 'equivalence sets', containing all tasks that are redundant with
# each other.
groups: List[Set[str]] = []
task_to_groups: Dict[str, Set[int]] = collections.defaultdict(set)
incompatible_groups: Dict[str, Set[int]] = collections.defaultdict(set)
def create_group(task):
def create_group(task: str) -> None:
if task in task_to_groups:
return
@@ -285,7 +288,7 @@ class TestSelectModel(Model):
# Add task1 to all equivalence groups where task2 is present, and likewise for task2.
# Skip groups which contain tasks that are not redundant with task1.
def add_to_groups(task1, task2):
def add_to_groups(task1: str, task2: str) -> None:
found = False
if task1 in task_to_groups:
@@ -315,7 +318,7 @@ class TestSelectModel(Model):
task_to_groups[task1].add(len(groups) - 1)
task_to_groups[task2].add(len(groups) - 1)
def mark_incompatible(task1, task2):
def mark_incompatible(task1: str, task2: str) -> None:
if task1 in task_to_groups:
incompatible_groups[task2].update(task_to_groups[task1])
@@ -324,18 +327,23 @@ class TestSelectModel(Model):
sorted_tasks = sorted(tasks)
for i, task1 in enumerate(sorted_tasks):
key = test_scheduling.failing_together_key(task1)
if key not in failing_together:
create_group(task1)
continue
failing_together_stats = pickle.loads(failing_together[key])
try:
failing_together_stats = load_failing_together(task1)
except KeyError:
if not assume_redundant:
create_group(task1)
continue
else:
failing_together_stats = {}
for task2 in sorted_tasks[i + 1 :]:
try:
support, confidence = failing_together_stats[task2]
except KeyError:
continue
if not assume_redundant:
continue
else:
confidence = 1.0
if confidence >= min_redundancy_confidence:
add_to_groups(task1, task2)
@@ -346,31 +354,135 @@ class TestSelectModel(Model):
# with it.
create_group(task1)
# Create constraints to ensure at least one task from each set of equivalent
# groups is selected.
for group in groups:
model.Add(sum(task_vars[task] for task in group) >= 1)
return groups
# Choose the best set of tasks that satisfy the constraints with the lowest cost.
model.Minimize(
sum(get_cost(task) * task_vars[task] for task in task_vars.keys())
def _solve_optimization(self, solver: pywraplp.Solver) -> None:
# The MIP solver is usually fast (milliseconds). If we hit a weird problem,
# accept a suboptimal solution after 10 seconds.
solver.SetTimeLimit(10000)
status = solver.Solve()
if status == pywraplp.Solver.INFEASIBLE:
raise Exception("Infeasible problem")
elif status == pywraplp.Solver.NOT_SOLVED:
raise Exception("Problem unsolved")
def reduce(self, tasks: Set[str], min_redundancy_confidence: float) -> Set[str]:
failing_together = test_scheduling.get_failing_together_db(self.granularity)
def load_failing_together(task: str) -> Dict[str, Tuple[float, float]]:
key = test_scheduling.failing_together_key(task)
return pickle.loads(failing_together[key])
solver = pywraplp.Solver(
"select_configs", pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING
)
solver = cp_model.CpSolver()
# The CP-SAT solver is usually fast (milliseconds). If we hit a weird problem,
# accept a suboptimal solution after 10 seconds.
solver.parameters.max_time_in_seconds = 10.0
# Presolving considerably slows down the CP-SAT solver.
solver.parameters.cp_model_presolve = 0
status = solver.Solve(model)
task_vars = {task: solver.IntVar(0, 1, task) for task in tasks}
if status == cp_model.INFEASIBLE:
raise Exception("Infeasible problem")
equivalence_sets = self._generate_equivalence_sets(
tasks, min_redundancy_confidence, load_failing_together, False
)
# Create constraints to ensure at least one task from each equivalence set
# is selected.
mutually_exclusive = True
seen = set()
for equivalence_set in equivalence_sets:
if any(config in seen for config in equivalence_set):
mutually_exclusive = False
break
seen |= equivalence_set
for equivalence_set in equivalence_sets:
sum_constraint = sum(task_vars[task] for task in equivalence_set)
if mutually_exclusive:
solver.Add(sum_constraint == 1)
else:
solver.Add(sum_constraint >= 1)
# Choose the best set of tasks that satisfy the constraints with the lowest cost.
solver.Minimize(
sum(self._get_cost(task) * task_vars[task] for task in task_vars.keys())
)
self._solve_optimization(solver)
return {
task
for task, task_var in task_vars.items()
if solver.Value(task_vars[task]) == 1
if task_var.solution_value() == 1
}
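The mutually_exclusive handling above is a small tightening of the model: when no task appears in more than one equivalence set, requiring exactly one selection per set (== 1) rather than at least one (>= 1) does not change the optimum, since costs are positive and nothing forces extra picks. A standalone sketch of that disjointness check (the helper name is hypothetical):

from typing import List, Set

def sets_are_mutually_exclusive(equivalence_sets: List[Set[str]]) -> bool:
    # True when no task belongs to two different equivalence sets.
    seen: Set[str] = set()
    for equivalence_set in equivalence_sets:
        if any(task in seen for task in equivalence_set):
            return False
        seen |= equivalence_set
    return True

print(sets_are_mutually_exclusive([{"a", "b"}, {"c"}]))       # True  -> use == 1
print(sets_are_mutually_exclusive([{"a", "b"}, {"b", "c"}]))  # False -> use >= 1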
def select_configs(
self, groups: Set[str], min_redundancy_confidence: float
) -> Set[Tuple[str, str]]:
failing_together = test_scheduling.get_failing_together_db("config_group")
all_configs = pickle.loads(failing_together[b"$ALL_CONFIGS$"])
config_costs = {config: self._get_cost(config) for config in all_configs}
solver = pywraplp.Solver(
"select_configs", pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING
)
config_group_vars = {
(config, group): solver.BoolVar(f"{group}@{config}")
for group in groups
for config in all_configs
}
for group in groups:
key = test_scheduling.failing_together_key(group)
if key in failing_together:
failing_together_stats = pickle.loads(failing_together[key])
else:
failing_together_stats = {}
def load_failing_together(config: str) -> Dict[str, Tuple[float, float]]:
return failing_together_stats[config]
equivalence_sets = self._generate_equivalence_sets(
all_configs, min_redundancy_confidence, load_failing_together, True
)
# Create constraints to ensure at least one config from each equivalence set
# is selected for this group.
mutually_exclusive = True
seen = set()
for equivalence_set in equivalence_sets:
if any(config in seen for config in equivalence_set):
mutually_exclusive = False
break
seen |= equivalence_set
for equivalence_set in equivalence_sets:
sum_constraint = sum(
config_group_vars[(config, group)] for config in equivalence_set
)
if mutually_exclusive:
solver.Add(sum_constraint == 1)
else:
solver.Add(sum_constraint >= 1)
# Choose the best set of tasks that satisfy the constraints with the lowest cost.
solver.Minimize(
sum(
config_costs[config] * config_group_vars[(config, group)]
for config, group in config_group_vars.keys()
)
)
self._solve_optimization(solver)
return {
(config, group)
for (config, group), config_group_var in config_group_vars.items()
if config_group_var.solution_value() == 1
}
def evaluation(self) -> None:
@@ -380,15 +492,16 @@ class TestSelectModel(Model):
# To evaluate the model with reductions enabled, we need to regenerate the failing together DB, using
# only failure data from the training pushes (otherwise, we'd leak training information into the test
# set).
if self.granularity == "label":
print("Generate failing together DB (restricted to training pushes)")
push_data_iter, push_data_count, _ = test_scheduling.get_push_data("label")
test_scheduling.generate_failing_together_probabilities(
self.granularity,
push_data_iter(),
push_data_count,
pushes[train_push_len - 1]["revs"][0],
)
print("Generate failing together DB (restricted to training pushes)")
push_data_iter, push_data_count, _ = test_scheduling.get_push_data(
"label" if self.granularity == "label" else "config_group"
)
test_scheduling.generate_failing_together_probabilities(
"label" if self.granularity == "label" else "config_group",
push_data_iter(),
push_data_count,
pushes[train_push_len - 1]["revs"][0],
)
test_pushes_list = pushes[train_push_len:]
@@ -437,9 +550,7 @@ class TestSelectModel(Model):
commits, 0.3, push_num - 100
)
reductions: List[Optional[float]] = [None]
if self.granularity == "label":
reductions += [0.9, 1.0]
reductions: List[Optional[float]] = [None, 0.9, 1.0]
def do_eval(confidence_threshold, reduction, cap, minimum):
for rev, push in test_pushes.items():
@@ -463,7 +574,17 @@ class TestSelectModel(Model):
)
if reduction is not None:
selected = self.reduce(selected, reduction)
if self.granularity == "label":
selected = self.reduce(selected, reduction)
elif self.granularity == "group":
push["number_configs"] = len(
set(
config
for config, group in self.select_configs(
selected, reduction
)
)
)
if cap is not None and len(selected) > cap:
selected = set(
@@ -532,9 +653,15 @@ class TestSelectModel(Model):
else "disabled"
)
print(
f"For confidence threshold {confidence_threshold}, with reduction {reduction_str}, cap at {cap}, and minimum at {minimum}: scheduled {average_scheduled} tasks on average (min {min_scheduled}, max {max_scheduled}). In {percentage_caught_one}% of pushes we caught at least one failure ({percentage_caught_one_or_some_didnt_run}% ignoring misses when some of our selected tasks didn't run). On average, we caught {average_caught_percentage}% of all seen failures."
)
message = f"For confidence threshold {confidence_threshold}, with reduction {reduction_str}, cap at {cap}, and minimum at {minimum}: scheduled {average_scheduled} tasks on average (min {min_scheduled}, max {max_scheduled}). In {percentage_caught_one}% of pushes we caught at least one failure ({percentage_caught_one_or_some_didnt_run}% ignoring misses when some of our selected tasks didn't run). On average, we caught {average_caught_percentage}% of all seen failures."
if reduction is not None and self.granularity == "group":
average_configs = statistics.mean(
result["number_configs"] for result in test_pushes.values()
)
message += f" On average, we selected {average_configs} configs."
print(message)
for minimum in [None, 10]:
for cap in [None, 300, 500]:

View file

@@ -342,21 +342,27 @@ def generate_failing_together_probabilities(
elif task2 in failures:
count_single_failures[(task1, task2)] += 1
all_available_configs: Set[str] = set()
for revisions, tasks, likely_regressions, candidate_regressions in tqdm(
push_data, total=push_data_count
):
failures = set(likely_regressions + candidate_regressions)
all_tasks = list(set(tasks) | failures)
all_tasks_set = set(tasks) | failures
all_tasks = list(all_tasks_set)
# At config/group granularity, only consider redundancy between different
# configurations of the same manifest, not between different manifests.
if granularity == "config_group":
all_available_configs.update(config for config, group in all_tasks)
groups = itertools.groupby(
sorted(all_tasks, key=lambda x: x[1]), key=lambda x: x[1]
)
for manifest, group_tasks in groups:
count_runs_and_failures(group_tasks)
else:
all_available_configs |= all_tasks_set
count_runs_and_failures(all_tasks)
if up_to is not None and revisions[0] == up_to:
@@ -409,7 +415,7 @@ def generate_failing_together_probabilities(
f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
)
failing_together: dict = collections.defaultdict(dict)
failing_together: dict = {}
count_redundancies: collections.Counter = collections.Counter()
for couple, (support, confidence) in stats.items():
if confidence == 1.0:
@@ -438,19 +444,32 @@ def generate_failing_together_probabilities(
count_redundancies["0%"] += 1
if granularity == "config_group":
failing_together[couple[0][1]][(couple[0][0], couple[1][0])] = (
if couple[0][1] not in failing_together:
failing_together[couple[0][1]] = {}
if couple[0][0] not in failing_together[couple[0][1]]:
failing_together[couple[0][1]][couple[0][0]] = {}
failing_together[couple[0][1]][couple[0][0]][couple[1][0]] = (
support,
confidence,
)
else:
if couple[0] not in failing_together:
failing_together[couple[0]] = {}
failing_together[couple[0]][couple[1]] = (support, confidence)
for percentage, count in count_redundancies.most_common():
logger.info(f"{count} with {percentage} confidence")
failing_together_db = get_failing_together_db(granularity)
failing_together_db[b"$ALL_CONFIGS$"] = pickle.dumps(list(all_available_configs))
for key, value in failing_together.items():
failing_together_db[failing_together_key(key)] = pickle.dumps(value)
close_failing_together_db(granularity)
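For reference, a toy picture (hypothetical manifest name, invented support/confidence numbers, and a plain dict standing in for the real on-disk store) of what the "config_group" DB contains after this change: one nested per-manifest dict keyed by pairs of configurations, plus the special $ALL_CONFIGS$ entry that select_configs uses to enumerate candidate configurations.

import pickle

failing_together_db = {}  # stand-in for the real failing-together store

failing_together_db[b"$ALL_CONFIGS$"] = pickle.dumps(
    ["linux1804-64/opt", "linux1804-64/debug", "windows10/debug"]
)

# Hypothetical manifest key; values are (support, confidence) pairs.
failing_together_db[b"dom/indexedDB"] = pickle.dumps(
    {
        "linux1804-64/debug": {
            "linux1804-64/opt": (1.0, 1.0),  # always failed together
            "windows10/debug": (1.0, 0.0),   # failed independently
        },
        "linux1804-64/opt": {"windows10/debug": (1.0, 0.0)},
    }
)

stats = pickle.loads(failing_together_db[b"dom/indexedDB"])
print(stats["linux1804-64/debug"]["linux1804-64/opt"])  # (1.0, 1.0)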

View file

@@ -13,7 +13,7 @@ import pytest
from igraph import Graph
from bugbug import test_scheduling
from bugbug.models.testselect import TestLabelSelectModel
from bugbug.models.testselect import TestGroupSelectModel, TestLabelSelectModel
@pytest.fixture
@@ -22,6 +22,12 @@ def failing_together():
test_scheduling.close_failing_together_db("label")
@pytest.fixture
def failing_together_config_group():
yield test_scheduling.get_failing_together_db("config_group")
test_scheduling.close_failing_together_db("config_group")
def test_reduce1(failing_together):
failing_together[b"test-linux1804-64/debug"] = pickle.dumps(
{
@@ -120,6 +126,7 @@ def test_reduce3(failing_together):
result == {"windows10/opt-a", "windows10/opt-c",}
or result == {"windows10/opt-d", "windows10/opt-c",}
or result == {"windows10/opt-b", "windows10/opt-c",}
or result == {"windows10/opt-b", "windows10/opt-d",}
)
@@ -253,3 +260,65 @@ def test_all(g):
result = model.reduce(tasks, 1.0)
hypothesis.note(f"Result: {sorted(result)}")
assert len(result) == len(g.components())
def test_select_configs(failing_together_config_group):
failing_together_config_group[b"group1"] = pickle.dumps(
{
"linux1804-64-asan/debug": {
"linux1804-64/debug": (1.0, 0.0),
"linux1804-64/opt": (1.0, 0.0),
"mac/debug": (1.0, 0.0),
"windows10/debug": (1.0, 0.0),
},
"linux1804-64/debug": {
"linux1804-64/opt": (1.0, 1.0),
"mac/debug": (1.0, 1.0),
"windows10/debug": (1.0, 1.0),
},
"linux1804-64/opt": {
"mac/debug": (1.0, 1.0),
"windows10/debug": (1.0, 1.0),
},
"mac/debug": {"windows10/debug": (1.0, 1.0)},
}
)
failing_together_config_group[b"group2"] = pickle.dumps(
{
"linux1804-64-asan/debug": {
"linux1804-64/debug": (1.0, 1.0),
"linux1804-64/opt": (1.0, 0.0),
"mac/debug": (1.0, 0.0),
"windows10/debug": (1.0, 0.0),
},
"linux1804-64/debug": {
"linux1804-64/opt": (1.0, 0.0),
"mac/debug": (1.0, 0.0),
"windows10/debug": (1.0, 1.0),
},
"linux1804-64/opt": {
"mac/debug": (1.0, 0.0),
"windows10/debug": (1.0, 0.0),
},
"mac/debug": {"windows10/debug": (1.0, 0.0)},
}
)
failing_together_config_group[b"$ALL_CONFIGS$"] = pickle.dumps(
[
"linux1804-64-asan/debug",
"linux1804-64/debug",
"linux1804-64/opt",
"mac/debug",
"windows10/debug",
]
)
model = TestGroupSelectModel()
result = model.select_configs({"group1", "group2",}, 1.0,)
assert result == {
("linux1804-64-asan/debug", "group1"),
("linux1804-64/opt", "group2"),
("mac/debug", "group2"),
("linux1804-64/opt", "group1"),
("linux1804-64/debug", "group2"),
}