Extract the logic to identify bug types into individual feature classes (#3907)

This commit is contained in:
Promise Fru 2024-01-03 16:04:12 +01:00 коммит произвёл GitHub
Родитель c598ee7ef2
Коммит ca1bebafe3
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
5 изменённых файлов: 233 добавлений и 88 удалений

Просмотреть файл

@ -16,7 +16,7 @@ from libmozdata import versions
from libmozdata.bugzilla import Bugzilla
from sklearn.base import BaseEstimator, TransformerMixin
from bugbug import bug_snapshot, repository
from bugbug import bug_snapshot, bugzilla, repository
def field(bug, field):
@ -687,3 +687,192 @@ class BugExtractor(BaseEstimator, TransformerMixin):
bugs_iter = apply_rollback(bugs_iter)
return pd.DataFrame(apply_transform(bug) for bug in bugs_iter)
class IsPerformanceBug(SingleBugFeature):
    """Flag bugs that carry a performance signal.

    A bug qualifies when its ``cf_performance_impact`` field is rated low,
    medium or high, when one of its keywords starts with a known prefix, or
    when its lower-cased whiteboard contains one of the known tags.
    """

    name = "Is Performance Bug"
    type_name = "performance"
    keyword_prefixes = ("perf", "topperf", "main-thread-io")
    # Matched as substrings of the lower-cased whiteboard, not as strict
    # prefixes of the whole field.
    whiteboard_prefixes = (
        "[fxperf",
        "[fxperfsize",
        "[snappy",
        "[pdfjs-c-performance",
        "[pdfjs-performance",
        "[sp3",
    )

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Return True when any performance indicator is present on ``bug``.

        ``bug_map`` is accepted for signature parity with the other type
        extractors and is not consulted here.
        """
        impact = bug.get("cf_performance_impact")
        if impact in ("low", "medium", "high"):
            return True

        # ``str.startswith`` accepts a tuple, so one call covers all prefixes.
        if any(kw.startswith(self.keyword_prefixes) for kw in bug["keywords"]):
            return True

        whiteboard = bug["whiteboard"].lower()
        return any(tag in whiteboard for tag in self.whiteboard_prefixes)
class IsMemoryBug(SingleBugFeature):
    """Flag bugs that carry a memory signal.

    A bug qualifies when it blocks a bug whose alias starts with
    ``memshrink`` (only checked when a bug map is supplied), when one of its
    keywords starts with a known prefix, or when its lower-cased whiteboard
    contains one of the known tags.
    """

    name = "Is Memory Bug"
    type_name = "memory"
    keyword_prefixes = ("memory-",)
    # Matched as substrings of the lower-cased whiteboard.
    whiteboard_prefixes = ("[overhead", "[memshrink")

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Return True when any memory indicator is present on ``bug``."""
        if bug_map is not None:
            # Lazily walk the aliases of the blocked bugs we know about;
            # blocked IDs missing from the map are skipped.
            blocked_aliases = (
                bug_map[blocked_id]["alias"]
                for blocked_id in bug["blocks"]
                if blocked_id in bug_map
            )
            if any(
                alias and alias.startswith("memshrink") for alias in blocked_aliases
            ):
                return True

        if any(kw.startswith(self.keyword_prefixes) for kw in bug["keywords"]):
            return True

        whiteboard = bug["whiteboard"].lower()
        return any(tag in whiteboard for tag in self.whiteboard_prefixes)
class IsPowerBug(SingleBugFeature):
    """Flag bugs that carry a power signal.

    A bug qualifies when one of its keywords starts with a known prefix or
    when its lower-cased whiteboard contains one of the known tags.
    """

    name = "Is Power Bug"
    type_name = "power"
    keyword_prefixes = ("power",)
    # Matched as substrings of the lower-cased whiteboard.
    whiteboard_prefixes = ("[power",)

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Return True when any power indicator is present on ``bug``.

        ``bug_map`` is accepted for signature parity with the other type
        extractors and is not consulted here.
        """
        if any(kw.startswith(self.keyword_prefixes) for kw in bug["keywords"]):
            return True

        whiteboard = bug["whiteboard"].lower()
        return any(tag in whiteboard for tag in self.whiteboard_prefixes)
class IsSecurityBug(SingleBugFeature):
    """Flag bugs that carry a security signal.

    A bug qualifies when one of its keywords starts with a known prefix or
    when its lower-cased whiteboard contains one of the known tags.
    """

    name = "Is Security Bug"
    type_name = "security"
    keyword_prefixes = ("sec-", "csectype-")
    # Matched as substrings of the lower-cased whiteboard.
    whiteboard_prefixes = ("[client-bounty-form", "[sec-survey")

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Return True when any security indicator is present on ``bug``.

        ``bug_map`` is accepted for signature parity with the other type
        extractors and is not consulted here.
        """
        if any(kw.startswith(self.keyword_prefixes) for kw in bug["keywords"]):
            return True

        whiteboard = bug["whiteboard"].lower()
        return any(tag in whiteboard for tag in self.whiteboard_prefixes)
class IsCrashBug(SingleBugFeature):
    """Flag bugs that carry a crash signal.

    A bug qualifies when its crash-signature field contains ``[@`` or when
    one of its keywords starts with a known prefix.
    """

    name = "Is Crash Bug"
    type_name = "crash"
    keyword_prefixes = ("crash", "crashreportid")

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> bool:
        """Return True when any crash indicator is present on ``bug``.

        ``bug_map`` is accepted for signature parity with the other type
        extractors and is not consulted here.
        """
        # Checking for `[@` will exclude some bugs that do not have valid
        # signatures: https://mzl.la/46XAqRF
        signature = bug.get("cf_crash_signature")
        if signature and "[@" in signature:
            return True

        return any(kw.startswith(self.keyword_prefixes) for kw in bug["keywords"])
class BugTypes(SingleBugFeature):
    """Collect the type names of every type extractor that matches a bug."""

    name = "Infer Bug Type"

    # One shared instance per bug type; the list order fixes the order of
    # the returned type names.
    bug_type_extractors: list = [
        IsCrashBug(),
        IsMemoryBug(),
        IsPerformanceBug(),
        IsPowerBug(),
        IsSecurityBug(),
    ]

    def __call__(
        self,
        bug: bugzilla.BugDict,
        bug_map: dict[int, bugzilla.BugDict] | None = None,
    ) -> list[str]:
        """Infer bug types based on various bug characteristics.

        Args:
            bug: A dictionary containing bug data.
            bug_map: An optional mapping of bug IDs to bug dictionaries,
                forwarded unchanged to every extractor. Default is None.

        Returns:
            The ``type_name`` of each extractor that returned a truthy value
            (e.g., "memory", "power", "performance", "security", "crash"),
            in the order of ``bug_type_extractors``.
        """
        inferred_types = []
        for extractor in self.bug_type_extractors:
            if extractor(bug, bug_map):
                inferred_types.append(extractor.type_name)
        return inferred_types

Просмотреть файл

@ -18,79 +18,6 @@ from bugbug.model import BugModel
logger = logging.getLogger(__name__)
# Maps a keyword prefix to the bug type it implies; a keyword matches an
# entry when it starts with the entry's key.
KEYWORD_DICT = {
"sec-": "security",
"csectype-": "security",
"memory-": "memory",
"crash": "crash",
"crashreportid": "crash",
"perf": "performance",
"topperf": "performance",
"main-thread-io": "performance",
"power": "power",
}
# Alphabetically sorted, de-duplicated list of the type names used above.
TYPE_LIST = sorted(set(KEYWORD_DICT.values()))
def bug_to_types(
    bug: bugzilla.BugDict, bug_map: dict[int, bugzilla.BugDict] | None = None
) -> list[str]:
    """Infer the types of a bug from its whiteboard, custom fields and keywords.

    Args:
        bug: A dictionary containing bug data.
        bug_map: An optional mapping of bug IDs to bug dictionaries, used to
            inspect the aliases of the bugs that ``bug`` blocks.

    Returns:
        The distinct inferred type names, in unspecified order.
    """
    found = set()
    whiteboard = bug["whiteboard"].lower()

    # The memory tags are matched without a leading bracket, unlike the
    # other whiteboard tags below.
    if any(tag in whiteboard for tag in ("overhead", "memshrink")):
        found.add("memory")

    if "[power" in whiteboard:
        found.add("power")

    performance_tags = (
        "[fxperf",
        "[fxperfsize",
        "[snappy",
        "[pdfjs-c-performance",
        "[pdfjs-performance",
        "[sp3",
    )
    if any(tag in whiteboard for tag in performance_tags):
        found.add("performance")

    security_tags = ("[client-bounty-form", "[sec-survey")
    if any(tag in whiteboard for tag in security_tags):
        found.add("security")

    # Any value other than the two placeholders counts as a rated impact.
    if "cf_performance_impact" in bug:
        if bug["cf_performance_impact"] not in ("---", "?"):
            found.add("performance")

    # Any non-placeholder crash signature counts as a crash.
    if "cf_crash_signature" in bug:
        if bug["cf_crash_signature"] not in ("", "---"):
            found.add("crash")

    if bug_map is not None:
        for blocked_id in bug["blocks"]:
            if blocked_id in bug_map:
                alias = bug_map[blocked_id]["alias"]
                if alias and alias.startswith("memshrink"):
                    found.add("memory")

    for prefix, type_ in KEYWORD_DICT.items():
        if any(keyword.startswith(prefix) for keyword in bug["keywords"]):
            found.add(type_)

    return list(found)
class BugTypeModel(BugModel):
def __init__(self, lemmatization=False, historical=False):
@ -98,12 +25,20 @@ class BugTypeModel(BugModel):
self.calculate_importance = False
self.bug_type_extractors = bug_features.BugTypes.bug_type_extractors
label_keyword_prefixes = {
keyword
for extractor in self.bug_type_extractors
for keyword in extractor.keyword_prefixes
}
feature_extractors = [
bug_features.HasSTR(),
bug_features.Severity(),
# Ignore keywords that would make the ML completely skewed
# (we are going to use them as 100% rules in the evaluation phase).
bug_features.Keywords(set(KEYWORD_DICT.keys())),
bug_features.Keywords(label_keyword_prefixes),
bug_features.IsCoverityIssue(),
bug_features.HasCrashSignature(),
bug_features.HasURL(),
@ -170,20 +105,23 @@ class BugTypeModel(BugModel):
bug_map = {bug["id"]: bug for bug in bugzilla.get_bugs()}
for bug_data in bug_map.values():
target = np.zeros(len(TYPE_LIST))
for type_ in bug_to_types(bug_data, bug_map):
target[TYPE_LIST.index(type_)] = 1
target = np.zeros(len(self.bug_type_extractors))
for i, is_type in enumerate(self.bug_type_extractors):
if is_type(bug_data, bug_map):
target[i] = 1
classes[int(bug_data["id"])] = target
for type_ in TYPE_LIST:
bug_types = [extractor.type_name for extractor in self.bug_type_extractors]
for i, bug_type in enumerate(bug_types):
logger.info(
"%d %s bugs",
sum(target[TYPE_LIST.index(type_)] == 1 for target in classes.values()),
type_,
sum(target[i] for target in classes.values()),
bug_type,
)
return classes, TYPE_LIST
return classes, bug_types
def get_feature_names(self):
return self.clf.named_steps["union"].get_feature_names_out()
@ -194,11 +132,14 @@ class BugTypeModel(BugModel):
classes: dict[int, np.ndarray],
probabilities: bool,
):
bug_map = {bug["id"]: bug for bug in bugs}
for i, bug in enumerate(bugs):
for type_ in bug_to_types(bug):
if probabilities:
classes[i][TYPE_LIST.index(type_)] = 1.0
else:
classes[i][TYPE_LIST.index(type_)] = 1
for j, is_type_applicable in enumerate(self.bug_type_extractors):
if is_type_applicable(bug, bug_map):
if probabilities:
classes[i][j] = 1.0
else:
classes[i][j] = 1
return classes

Просмотреть файл

@ -30,7 +30,6 @@ from dateutil.relativedelta import relativedelta
from tqdm import tqdm
from bugbug import bug_features, bugzilla, db, phabricator, repository, test_scheduling
from bugbug.models.bugtype import bug_to_types
from bugbug.models.regressor import BUG_FIXING_COMMITS_DB, RegressorModel
from bugbug.utils import (
download_check_etag,
@ -555,6 +554,8 @@ class LandingsRiskReportGenerator(object):
component_team_mapping = get_component_team_mapping()
bug_to_types = bug_features.BugTypes()
bug_summaries = []
for bug_id in bugs:
if bug_id not in bug_map:

5
tests/fixtures/bug_features/bug_types.json поставляемый Normal file
Просмотреть файл

@ -0,0 +1,5 @@
{"keywords": ["meta", "perf"], "whiteboard": "", "cf_crash_signature": ""}
{"keywords": ["memory-leak", "regression"], "whiteboard": "[MemShrink:P1]", "cf_crash_signature": ""}
{"whiteboard": "", "keywords": ["power"]}
{"keywords": ["sec-want"], "whiteboard": "[sg:want][psm-padlock]"}
{"keywords": ["crash", "regression"], "whiteboard": "", "cf_crash_signature": "[@ audiounit_property_listener_callback]"}

Просмотреть файл

@ -12,6 +12,7 @@ from bugbug.bug_features import (
BlockedBugsNumber,
BugExtractor,
BugReporter,
BugTypes,
CommentCount,
CommentLength,
Component,
@ -178,3 +179,11 @@ def test_BugExtractor():
BugExtractor([HasSTR(), HasSTR()], [fileref(), url()])
with pytest.raises(AssertionError):
BugExtractor([HasSTR(), HasURL()], [fileref(), fileref()])
def test_BugTypes(read) -> None:
    """Run ``BugTypes`` over the ``bug_types.json`` fixture.

    NOTE(review): ``read`` is a pytest fixture defined outside this view;
    presumably it applies the feature to each fixture line and compares the
    results with the expected list. Confirm against its definition.
    """
    expected_types = [
        ["performance"],
        ["memory"],
        ["power"],
        ["security"],
        ["crash"],
    ]
    read("bug_types.json", BugTypes, expected_types)