Support selecting configurations on which groups should run

Also switch from using CP-SAT to MIP, as it's faster.

Part of #1117
Marco Castelluccio 2020-06-09 01:57:13 +02:00
Parent 479ec905c4
Commit d1dae08fec
3 changed files with 294 additions and 79 deletions
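To make the CP-SAT-to-MIP switch mentioned above concrete, here is a minimal, self-contained sketch (not the project's code; the task names, costs, and equivalence sets below are made up) of the OR-Tools pywraplp formulation the commit moves to: a binary variable per task, one covering constraint per equivalence set, and a cost-minimizing objective.

from ortools.linear_solver import pywraplp

# Hypothetical costs and equivalence sets, for illustration only.
task_costs = {"linux1804-64/opt": 2, "linux1804-64/debug": 3, "windows10/opt": 4}
equivalence_sets = [{"linux1804-64/opt", "linux1804-64/debug"}, {"windows10/opt"}]

solver = pywraplp.Solver(
    "select_tasks", pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING
)
task_vars = {task: solver.IntVar(0, 1, task) for task in task_costs}

# At least one task from each equivalence set must be scheduled.
for equivalence_set in equivalence_sets:
    solver.Add(sum(task_vars[task] for task in equivalence_set) >= 1)

# Among the selections satisfying the constraints, pick the cheapest one.
solver.Minimize(sum(cost * task_vars[task] for task, cost in task_costs.items()))

solver.SetTimeLimit(10000)  # milliseconds; accept a feasible solution if slow
assert solver.Solve() == pywraplp.Solver.OPTIMAL
print({task for task, var in task_vars.items() if var.solution_value() == 1})
# -> {'linux1804-64/opt', 'windows10/opt'}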

View file

@@ -8,12 +8,12 @@ import math
import pickle
import statistics
from functools import reduce
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple
import numpy as np
import xgboost
from imblearn.under_sampling import RandomUnderSampler
from ortools.sat.python import cp_model
from ortools.linear_solver import pywraplp
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
@@ -61,6 +61,7 @@ class TestSelectModel(Model):
self.eval_dbs[test_scheduling.TEST_GROUP_SCHEDULING_DB] = (
test_scheduling.PAST_FAILURES_GROUP_DB,
test_scheduling.TOUCHED_TOGETHER_DB,
test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB,
)
elif granularity == "config_group":
self.training_dbs.append(test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB)
@@ -236,47 +237,49 @@ class TestSelectModel(Model):
for i in selected_indexes
}
def reduce(self, tasks: Set[str], min_redundancy_confidence: float) -> Set[str]:
failing_together = test_scheduling.get_failing_together_db(self.granularity)
def _get_cost(self, config: str) -> int:
costs = [
(("linux1804-64", "opt"), 1),
(("linux1804-64", "debug"), 2),
(("linux1804-64", "opt"), 2),
(("linux1804-64", "debug"), 3),
(("windows10", "opt"), 4),
(("windows10", "debug"), 4),
(("android-em", "opt"), 5),
(("android-em", "debug"), 6),
(("windows7", "opt"), 7),
(("windows7", "debug"), 8),
(("mac", "opt"), 9),
(("mac", "debug"), 10),
(("asan", "opt"), 11),
(("asan", "debug"), 12),
(("linux1804-32", "opt"), 13),
(("linux1804-32", "debug"), 14),
(("android-hw", "opt"), 15),
(("android-hw", "debug"), 16),
(("tsan", "opt"), 17),
(("tsan", "debug"), 18),
(("windows10", "debug"), 5),
(("android-em", "opt"), 6),
(("android-em", "debug"), 7),
(("windows7", "opt"), 8),
(("windows7", "debug"), 9),
(("mac", "opt"), 10),
(("mac", "debug"), 11),
(("asan", "opt"), 12),
(("asan", "debug"), 13),
(("linux1804-32", "opt"), 14),
(("linux1804-32", "debug"), 15),
(("android-hw", "opt"), 16),
(("android-hw", "debug"), 17),
(("tsan", "opt"), 18),
(("tsan", "debug"), 19),
(("test-linux1804-64-shippable/opt-*-e10s",), 1),
]
def get_cost(task):
for substrings, cost in reversed(costs):
if all(s in task for s in substrings):
return cost
for substrings, cost in reversed(costs):
if all(s in config for s in substrings):
return cost
raise Exception(f"Couldn't find cost for {task}")
raise Exception(f"Couldn't find cost for {config}")
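A quick illustration of the lookup above, using a truncated, made-up cost table rather than the full one: the table is scanned with reversed(), so the more specific entries appended at the end win over the generic platform/build-type pairs.

costs = [
    (("linux1804-64", "opt"), 2),
    (("linux1804-64", "debug"), 3),
    (("test-linux1804-64-shippable/opt-*-e10s",), 1),
]

def get_cost(config: str) -> int:
    # Later (more specific) entries are tried first thanks to reversed().
    for substrings, cost in reversed(costs):
        if all(s in config for s in substrings):
            return cost
    raise Exception(f"Couldn't find cost for {config}")

print(get_cost("test-linux1804-64-shippable/opt-*-e10s"))    # 1, not 2
print(get_cost("test-linux1804-64-qr/debug-mochitest-e10s"))  # 3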
model = cp_model.CpModel()
task_vars = {task: model.NewIntVar(0, 1, task) for task in tasks}
# Generate 'equivalence groups', containing all tasks that are redundant with each other.
def _generate_equivalence_sets(
self,
tasks: Set[str],
min_redundancy_confidence: float,
load_failing_together: Callable[[str], Dict[str, Tuple[float, float]]],
assume_redundant: bool,
) -> List[Set[str]]:
# Generate 'equivalence sets', containing all tasks that are redundant with
# each other.
groups: List[Set[str]] = []
task_to_groups: Dict[str, Set[int]] = collections.defaultdict(set)
incompatible_groups: Dict[str, Set[int]] = collections.defaultdict(set)
def create_group(task):
def create_group(task: str) -> None:
if task in task_to_groups:
return
@@ -285,7 +288,7 @@ class TestSelectModel(Model):
# Add task1 to all equivalence groups where task2 is present, and likewise for task2.
# Skip groups which contain tasks that are not redundant with task1.
def add_to_groups(task1, task2):
def add_to_groups(task1: str, task2: str) -> None:
found = False
if task1 in task_to_groups:
@@ -315,7 +318,7 @@ class TestSelectModel(Model):
task_to_groups[task1].add(len(groups) - 1)
task_to_groups[task2].add(len(groups) - 1)
def mark_incompatible(task1, task2):
def mark_incompatible(task1: str, task2: str) -> None:
if task1 in task_to_groups:
incompatible_groups[task2].update(task_to_groups[task1])
@@ -324,18 +327,23 @@ class TestSelectModel(Model):
sorted_tasks = sorted(tasks)
for i, task1 in enumerate(sorted_tasks):
key = test_scheduling.failing_together_key(task1)
if key not in failing_together:
create_group(task1)
continue
failing_together_stats = pickle.loads(failing_together[key])
try:
failing_together_stats = load_failing_together(task1)
except KeyError:
if not assume_redundant:
create_group(task1)
continue
else:
failing_together_stats = {}
for task2 in sorted_tasks[i + 1 :]:
try:
support, confidence = failing_together_stats[task2]
except KeyError:
continue
if not assume_redundant:
continue
else:
confidence = 1.0
if confidence >= min_redundancy_confidence:
add_to_groups(task1, task2)
@@ -346,31 +354,135 @@ class TestSelectModel(Model):
# with it.
create_group(task1)
# Create constraints to ensure at least one task from each set of equivalent
# groups is selected.
for group in groups:
model.Add(sum(task_vars[task] for task in group) >= 1)
return groups
# Choose the best set of tasks that satisfy the constraints with the lowest cost.
model.Minimize(
sum(get_cost(task) * task_vars[task] for task in task_vars.keys())
def _solve_optimization(self, solver: pywraplp.Solver) -> None:
# The MIP solver is usually fast (milliseconds). If we hit a weird problem,
# accept a suboptimal solution after 10 seconds.
solver.SetTimeLimit(10000)
status = solver.Solve()
if status == pywraplp.Solver.INFEASIBLE:
raise Exception("Infeasible problem")
elif status == pywraplp.Solver.NOT_SOLVED:
raise Exception("Problem unsolved")
def reduce(self, tasks: Set[str], min_redundancy_confidence: float) -> Set[str]:
failing_together = test_scheduling.get_failing_together_db(self.granularity)
def load_failing_together(task: str) -> Dict[str, Tuple[float, float]]:
key = test_scheduling.failing_together_key(task)
return pickle.loads(failing_together[key])
solver = pywraplp.Solver(
"select_configs", pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING
)
solver = cp_model.CpSolver()
# The CP-SAT solver is usually fast (milliseconds). If we hit a weird problem,
# accept a suboptimal solution after 10 seconds.
solver.parameters.max_time_in_seconds = 10.0
# Presolving considerably slows down the CP-SAT solver.
solver.parameters.cp_model_presolve = 0
status = solver.Solve(model)
task_vars = {task: solver.IntVar(0, 1, task) for task in tasks}
if status == cp_model.INFEASIBLE:
raise Exception("Infeasible problem")
equivalence_sets = self._generate_equivalence_sets(
tasks, min_redundancy_confidence, load_failing_together, False
)
# Create constraints to ensure at least one task from each equivalence set
# is selected.
mutually_exclusive = True
seen = set()
for equivalence_set in equivalence_sets:
if any(config in seen for config in equivalence_set):
mutually_exclusive = False
break
seen |= equivalence_set
for equivalence_set in equivalence_sets:
sum_constraint = sum(task_vars[task] for task in equivalence_set)
if mutually_exclusive:
solver.Add(sum_constraint == 1)
else:
solver.Add(sum_constraint >= 1)
# Choose the best set of tasks that satisfy the constraints with the lowest cost.
solver.Minimize(
sum(self._get_cost(task) * task_vars[task] for task in task_vars.keys())
)
self._solve_optimization(solver)
return {
task
for task, task_var in task_vars.items()
if solver.Value(task_vars[task]) == 1
if task_var.solution_value() == 1
}
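The mutually_exclusive handling above is a small tightening of the model: when no task appears in more than one equivalence set, requiring exactly one selection per set (== 1) rather than at least one (>= 1) does not change the optimum, since costs are positive and nothing forces extra picks. A standalone sketch of that disjointness check (the helper name is hypothetical):

from typing import List, Set

def sets_are_mutually_exclusive(equivalence_sets: List[Set[str]]) -> bool:
    # True when no task belongs to two different equivalence sets.
    seen: Set[str] = set()
    for equivalence_set in equivalence_sets:
        if any(task in seen for task in equivalence_set):
            return False
        seen |= equivalence_set
    return True

print(sets_are_mutually_exclusive([{"a", "b"}, {"c"}]))       # True  -> use == 1
print(sets_are_mutually_exclusive([{"a", "b"}, {"b", "c"}]))  # False -> use >= 1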
def select_configs(
self, groups: Set[str], min_redundancy_confidence: float
) -> Set[Tuple[str, str]]:
failing_together = test_scheduling.get_failing_together_db("config_group")
all_configs = pickle.loads(failing_together[b"$ALL_CONFIGS$"])
config_costs = {config: self._get_cost(config) for config in all_configs}
solver = pywraplp.Solver(
"select_configs", pywraplp.Solver.CBC_MIXED_INTEGER_PROGRAMMING
)
config_group_vars = {
(config, group): solver.BoolVar(f"{group}@{config}")
for group in groups
for config in all_configs
}
for group in groups:
key = test_scheduling.failing_together_key(group)
if key in failing_together:
failing_together_stats = pickle.loads(failing_together[key])
else:
failing_together_stats = {}
def load_failing_together(config: str) -> Dict[str, Tuple[float, float]]:
return failing_together_stats[config]
equivalence_sets = self._generate_equivalence_sets(
all_configs, min_redundancy_confidence, load_failing_together, True
)
# Create constraints to ensure at least one config from each equivalence set
# is selected for this group.
mutually_exclusive = True
seen = set()
for equivalence_set in equivalence_sets:
if any(config in seen for config in equivalence_set):
mutually_exclusive = False
break
seen |= equivalence_set
for equivalence_set in equivalence_sets:
sum_constraint = sum(
config_group_vars[(config, group)] for config in equivalence_set
)
if mutually_exclusive:
solver.Add(sum_constraint == 1)
else:
solver.Add(sum_constraint >= 1)
# Choose the best set of tasks that satisfy the constraints with the lowest cost.
solver.Minimize(
sum(
config_costs[config] * config_group_vars[(config, group)]
for config, group in config_group_vars.keys()
)
)
self._solve_optimization(solver)
return {
(config, group)
for (config, group), config_group_var in config_group_vars.items()
if config_group_var.solution_value() == 1
}
def evaluation(self) -> None:
@@ -380,15 +492,16 @@ class TestSelectModel(Model):
# To evaluate the model with reductions enabled, we need to regenerate the failing together DB, using
# only failure data from the training pushes (otherwise, we'd leak training information into the test
# set).
if self.granularity == "label":
print("Generate failing together DB (restricted to training pushes)")
push_data_iter, push_data_count, _ = test_scheduling.get_push_data("label")
test_scheduling.generate_failing_together_probabilities(
self.granularity,
push_data_iter(),
push_data_count,
pushes[train_push_len - 1]["revs"][0],
)
print("Generate failing together DB (restricted to training pushes)")
push_data_iter, push_data_count, _ = test_scheduling.get_push_data(
"label" if self.granularity == "label" else "config_group"
)
test_scheduling.generate_failing_together_probabilities(
"label" if self.granularity == "label" else "config_group",
push_data_iter(),
push_data_count,
pushes[train_push_len - 1]["revs"][0],
)
test_pushes_list = pushes[train_push_len:]
@@ -437,9 +550,7 @@ class TestSelectModel(Model):
commits, 0.3, push_num - 100
)
reductions: List[Optional[float]] = [None]
if self.granularity == "label":
reductions += [0.9, 1.0]
reductions: List[Optional[float]] = [None, 0.9, 1.0]
def do_eval(confidence_threshold, reduction, cap, minimum):
for rev, push in test_pushes.items():
@@ -463,7 +574,17 @@ class TestSelectModel(Model):
)
if reduction is not None:
selected = self.reduce(selected, reduction)
if self.granularity == "label":
selected = self.reduce(selected, reduction)
elif self.granularity == "group":
push["number_configs"] = len(
set(
config
for config, group in self.select_configs(
selected, reduction
)
)
)
if cap is not None and len(selected) > cap:
selected = set(
@@ -532,9 +653,15 @@ class TestSelectModel(Model):
else "disabled"
)
print(
f"For confidence threshold {confidence_threshold}, with reduction {reduction_str}, cap at {cap}, and minimum at {minimum}: scheduled {average_scheduled} tasks on average (min {min_scheduled}, max {max_scheduled}). In {percentage_caught_one}% of pushes we caught at least one failure ({percentage_caught_one_or_some_didnt_run}% ignoring misses when some of our selected tasks didn't run). On average, we caught {average_caught_percentage}% of all seen failures."
)
message = f"For confidence threshold {confidence_threshold}, with reduction {reduction_str}, cap at {cap}, and minimum at {minimum}: scheduled {average_scheduled} tasks on average (min {min_scheduled}, max {max_scheduled}). In {percentage_caught_one}% of pushes we caught at least one failure ({percentage_caught_one_or_some_didnt_run}% ignoring misses when some of our selected tasks didn't run). On average, we caught {average_caught_percentage}% of all seen failures."
if reduction is not None and self.granularity == "group":
average_configs = statistics.mean(
result["number_configs"] for result in test_pushes.values()
)
message += f" On average, we selected {average_configs} configs."
print(message)
for minimum in [None, 10]:
for cap in [None, 300, 500]:

View file

@@ -342,21 +342,27 @@ def generate_failing_together_probabilities(
elif task2 in failures:
count_single_failures[(task1, task2)] += 1
all_available_configs: Set[str] = set()
for revisions, tasks, likely_regressions, candidate_regressions in tqdm(
push_data, total=push_data_count
):
failures = set(likely_regressions + candidate_regressions)
all_tasks = list(set(tasks) | failures)
all_tasks_set = set(tasks) | failures
all_tasks = list(all_tasks_set)
# At config/group granularity, only consider redundancy between different
# configurations of the same manifest, not between different manifests.
if granularity == "config_group":
all_available_configs.update(config for config, group in all_tasks)
groups = itertools.groupby(
sorted(all_tasks, key=lambda x: x[1]), key=lambda x: x[1]
)
for manifest, group_tasks in groups:
count_runs_and_failures(group_tasks)
else:
all_available_configs |= all_tasks_set
count_runs_and_failures(all_tasks)
if up_to is not None and revisions[0] == up_to:
@@ -409,7 +415,7 @@ def generate_failing_together_probabilities(
f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
)
failing_together: dict = collections.defaultdict(dict)
failing_together: dict = {}
count_redundancies: collections.Counter = collections.Counter()
for couple, (support, confidence) in stats.items():
if confidence == 1.0:
@@ -438,19 +444,32 @@ def generate_failing_together_probabilities(
count_redundancies["0%"] += 1
if granularity == "config_group":
failing_together[couple[0][1]][(couple[0][0], couple[1][0])] = (
if couple[0][1] not in failing_together:
failing_together[couple[0][1]] = {}
if couple[0][0] not in failing_together[couple[0][1]]:
failing_together[couple[0][1]][couple[0][0]] = {}
failing_together[couple[0][1]][couple[0][0]][couple[1][0]] = (
support,
confidence,
)
else:
if couple[0] not in failing_together:
failing_together[couple[0]] = {}
failing_together[couple[0]][couple[1]] = (support, confidence)
for percentage, count in count_redundancies.most_common():
logger.info(f"{count} with {percentage} confidence")
failing_together_db = get_failing_together_db(granularity)
failing_together_db[b"$ALL_CONFIGS$"] = pickle.dumps(list(all_available_configs))
for key, value in failing_together.items():
failing_together_db[failing_together_key(key)] = pickle.dumps(value)
close_failing_together_db(granularity)
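For reference, a toy picture (hypothetical manifest name, invented support/confidence numbers, and a plain dict standing in for the real on-disk store) of what the "config_group" DB contains after this change: one nested per-manifest dict keyed by pairs of configurations, plus the special $ALL_CONFIGS$ entry that select_configs uses to enumerate candidate configurations.

import pickle

failing_together_db = {}  # stand-in for the real failing-together store

failing_together_db[b"$ALL_CONFIGS$"] = pickle.dumps(
    ["linux1804-64/opt", "linux1804-64/debug", "windows10/debug"]
)

# Hypothetical manifest key; values are (support, confidence) pairs.
failing_together_db[b"dom/indexedDB"] = pickle.dumps(
    {
        "linux1804-64/debug": {
            "linux1804-64/opt": (1.0, 1.0),  # always failed together
            "windows10/debug": (1.0, 0.0),   # failed independently
        },
        "linux1804-64/opt": {"windows10/debug": (1.0, 0.0)},
    }
)

stats = pickle.loads(failing_together_db[b"dom/indexedDB"])
print(stats["linux1804-64/debug"]["linux1804-64/opt"])  # (1.0, 1.0)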

View file

@@ -13,7 +13,7 @@ import pytest
from igraph import Graph
from bugbug import test_scheduling
from bugbug.models.testselect import TestLabelSelectModel
from bugbug.models.testselect import TestGroupSelectModel, TestLabelSelectModel
@pytest.fixture
@@ -22,6 +22,12 @@ def failing_together():
test_scheduling.close_failing_together_db("label")
@pytest.fixture
def failing_together_config_group():
yield test_scheduling.get_failing_together_db("config_group")
test_scheduling.close_failing_together_db("config_group")
def test_reduce1(failing_together):
failing_together[b"test-linux1804-64/debug"] = pickle.dumps(
{
@@ -120,6 +126,7 @@ def test_reduce3(failing_together):
result == {"windows10/opt-a", "windows10/opt-c",}
or result == {"windows10/opt-d", "windows10/opt-c",}
or result == {"windows10/opt-b", "windows10/opt-c",}
or result == {"windows10/opt-b", "windows10/opt-d",}
)
@@ -253,3 +260,65 @@ def test_all(g):
result = model.reduce(tasks, 1.0)
hypothesis.note(f"Result: {sorted(result)}")
assert len(result) == len(g.components())
def test_select_configs(failing_together_config_group):
failing_together_config_group[b"group1"] = pickle.dumps(
{
"linux1804-64-asan/debug": {
"linux1804-64/debug": (1.0, 0.0),
"linux1804-64/opt": (1.0, 0.0),
"mac/debug": (1.0, 0.0),
"windows10/debug": (1.0, 0.0),
},
"linux1804-64/debug": {
"linux1804-64/opt": (1.0, 1.0),
"mac/debug": (1.0, 1.0),
"windows10/debug": (1.0, 1.0),
},
"linux1804-64/opt": {
"mac/debug": (1.0, 1.0),
"windows10/debug": (1.0, 1.0),
},
"mac/debug": {"windows10/debug": (1.0, 1.0)},
}
)
failing_together_config_group[b"group2"] = pickle.dumps(
{
"linux1804-64-asan/debug": {
"linux1804-64/debug": (1.0, 1.0),
"linux1804-64/opt": (1.0, 0.0),
"mac/debug": (1.0, 0.0),
"windows10/debug": (1.0, 0.0),
},
"linux1804-64/debug": {
"linux1804-64/opt": (1.0, 0.0),
"mac/debug": (1.0, 0.0),
"windows10/debug": (1.0, 1.0),
},
"linux1804-64/opt": {
"mac/debug": (1.0, 0.0),
"windows10/debug": (1.0, 0.0),
},
"mac/debug": {"windows10/debug": (1.0, 0.0)},
}
)
failing_together_config_group[b"$ALL_CONFIGS$"] = pickle.dumps(
[
"linux1804-64-asan/debug",
"linux1804-64/debug",
"linux1804-64/opt",
"mac/debug",
"windows10/debug",
]
)
model = TestGroupSelectModel()
result = model.select_configs({"group1", "group2",}, 1.0,)
assert result == {
("linux1804-64-asan/debug", "group1"),
("linux1804-64/opt", "group2"),
("mac/debug", "group2"),
("linux1804-64/opt", "group1"),
("linux1804-64/debug", "group2"),
}