зеркало из https://github.com/mozilla/bugbug.git
Add a model to identify the root cause of a bug (#1283)
This commit is contained in:
Родитель
7e43bb7ac5
Коммит
91cd2114e7
|
@ -17,6 +17,7 @@ MODELS = {
|
|||
"devdocneeded": "bugbug.models.devdocneeded.DevDocNeededModel",
|
||||
"duplicate": "bugbug.models.duplicate.DuplicateModel",
|
||||
"qaneeded": "bugbug.models.qaneeded.QANeededModel",
|
||||
"rcatype": "bugbug.models.rcatype.RCATypeModel",
|
||||
"regression": "bugbug.models.regression.RegressionModel",
|
||||
"regressionrange": "bugbug.models.regressionrange.RegressionRangeModel",
|
||||
"regressor": "bugbug.models.regressor.RegressorModel",
|
||||
|
|
|
@ -0,0 +1,175 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import xgboost
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.feature_extraction import DictVectorizer
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.pipeline import Pipeline
|
||||
|
||||
from bugbug import bug_features, bugzilla, feature_cleanup
|
||||
from bugbug.model import BugModel
|
||||
|
||||
# For the moment, rca - XYZ is treated of bugtype XYZ,
|
||||
# so we don't need to store it in a dictionary.
|
||||
RCA_CATEGORIES = [
|
||||
"requirementerror",
|
||||
"poorarchitecture",
|
||||
"designerror",
|
||||
"codingerror",
|
||||
"testingerror",
|
||||
"externalsoftwareaffectingfirefox",
|
||||
"performanceerror",
|
||||
"standards",
|
||||
"systemerror",
|
||||
"localizationerror",
|
||||
"memory",
|
||||
"infrastructure/builderror",
|
||||
"communicationissues",
|
||||
"productdecision",
|
||||
"undocumentedchange",
|
||||
"cornercase",
|
||||
]
|
||||
|
||||
RCA_SUBCATEGORIES = [
|
||||
"codingerror-syntaxerror",
|
||||
"codingerror-logicalerror",
|
||||
"codingerror-semanticerror",
|
||||
"codingerror-runtimeerror",
|
||||
"codingerror-unhandledexceptions",
|
||||
"codingerror-internalapiissue",
|
||||
"codingerror-networkissue",
|
||||
"codingerror-compatibilityissue",
|
||||
"codingerror-other",
|
||||
]
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RCATypeModel(BugModel):
|
||||
def __init__(
|
||||
self, lemmatization=False, historical=False, rca_subcategories_enabled=False
|
||||
):
|
||||
BugModel.__init__(self, lemmatization)
|
||||
|
||||
self.calculate_importance = False
|
||||
self.rca_subcategories_enabled = rca_subcategories_enabled
|
||||
|
||||
# should we consider only the main category or all sub categories
|
||||
self.RCA_TYPES = (
|
||||
RCA_SUBCATEGORIES + RCA_CATEGORIES
|
||||
if rca_subcategories_enabled
|
||||
else RCA_CATEGORIES
|
||||
)
|
||||
|
||||
self.RCA_LIST = sorted(set(self.RCA_TYPES))
|
||||
|
||||
feature_extractors = [
|
||||
bug_features.has_str(),
|
||||
bug_features.severity(),
|
||||
bug_features.is_coverity_issue(),
|
||||
bug_features.has_crash_signature(),
|
||||
bug_features.has_url(),
|
||||
bug_features.has_w3c_url(),
|
||||
bug_features.has_github_url(),
|
||||
# Ignore whiteboards that would make the ML completely skewed
|
||||
# bug_features.whiteboard(),
|
||||
bug_features.patches(),
|
||||
bug_features.landings(),
|
||||
bug_features.blocked_bugs_number(),
|
||||
bug_features.ever_affected(),
|
||||
bug_features.affected_then_unaffected(),
|
||||
bug_features.product(),
|
||||
bug_features.component(),
|
||||
]
|
||||
|
||||
cleanup_functions = [
|
||||
feature_cleanup.url(),
|
||||
feature_cleanup.fileref(),
|
||||
feature_cleanup.synonyms(),
|
||||
]
|
||||
|
||||
self.extraction_pipeline = Pipeline(
|
||||
[
|
||||
(
|
||||
"bug_extractor",
|
||||
bug_features.BugExtractor(feature_extractors, cleanup_functions),
|
||||
),
|
||||
(
|
||||
"union",
|
||||
ColumnTransformer(
|
||||
[
|
||||
("data", DictVectorizer(), "data"),
|
||||
("title", self.text_vectorizer(min_df=0.001), "title"),
|
||||
(
|
||||
"first_comment",
|
||||
self.text_vectorizer(min_df=0.001),
|
||||
"first_comment",
|
||||
),
|
||||
(
|
||||
"comments",
|
||||
self.text_vectorizer(min_df=0.001),
|
||||
"comments",
|
||||
),
|
||||
]
|
||||
),
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
self.clf = OneVsRestClassifier(xgboost.XGBClassifier(n_jobs=16))
|
||||
|
||||
# return rca from a whiteboard string
|
||||
def get_rca_from_whiteboard(self, whiteboard_data):
|
||||
rca = []
|
||||
whiteboard_data = re.sub(" ", "", whiteboard_data).lower()
|
||||
for whiteboard in whiteboard_data.split("["):
|
||||
if not any(whiteboard.startswith(s) for s in ("rca-", "rca:")):
|
||||
continue
|
||||
|
||||
rca_whiteboard = re.sub("]", "", whiteboard)
|
||||
|
||||
# Hybrid cases: rca:X-Y
|
||||
rca_whiteboard = re.sub(":", "-", rca_whiteboard)
|
||||
|
||||
rca_whiteboard_split = (
|
||||
rca_whiteboard.split("-", 1)
|
||||
if self.rca_subcategories_enabled
|
||||
else rca_whiteboard.split("-")
|
||||
)
|
||||
|
||||
if rca_whiteboard_split[1] not in self.RCA_LIST:
|
||||
logger.warning(rca_whiteboard_split[1] + " not in RCA_LIST")
|
||||
else:
|
||||
rca.append(rca_whiteboard_split[1])
|
||||
return rca
|
||||
|
||||
def get_labels(self):
|
||||
classes = {}
|
||||
for bug in bugzilla.get_bugs():
|
||||
target = np.zeros(len(self.RCA_LIST))
|
||||
for rca in self.get_rca_from_whiteboard(bug["whiteboard"]):
|
||||
target[self.RCA_LIST.index(rca)] = 1
|
||||
classes[bug["id"]] = target
|
||||
return classes, self.RCA_LIST
|
||||
|
||||
def get_feature_names(self):
|
||||
return self.extraction_pipeline.named_steps["union"].get_feature_names()
|
||||
|
||||
def overwrite_classes(self, bugs, classes, probabilities):
|
||||
rca_values = self.get_rca(bugs)
|
||||
for i in len(classes):
|
||||
for rca in rca_values[i]:
|
||||
if rca in self.RCA_LIST:
|
||||
if probabilities:
|
||||
classes[i][self.RCA_LIST.index(rca)] = 1.0
|
||||
else:
|
||||
classes[i][self.RCA_LIST.index(rca)] = 1
|
||||
|
||||
return classes
|
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -0,0 +1,71 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
from bugbug.models.rcatype import RCATypeModel
|
||||
|
||||
|
||||
def test_get_rca_from_whiteboard():
|
||||
model = RCATypeModel()
|
||||
# Case 1: No rca
|
||||
assert model.get_rca_from_whiteboard("[Whiteboard1][Not RCA type]") == []
|
||||
# Case 2: RCA : A and RCA - A
|
||||
assert model.get_rca_from_whiteboard("[RCA: cornercase]") == ["cornercase"]
|
||||
assert model.get_rca_from_whiteboard("[rca - codingerror]") == ["codingerror"]
|
||||
# Case 3: Multiple rca types
|
||||
assert model.get_rca_from_whiteboard("[rca - cornercase][rca - codingerror]") == [
|
||||
"cornercase",
|
||||
"codingerror",
|
||||
]
|
||||
assert model.get_rca_from_whiteboard("[rca : systemerror][rca - codingerror]") == [
|
||||
"systemerror",
|
||||
"codingerror",
|
||||
]
|
||||
assert model.get_rca_from_whiteboard("[rca - cornercase][rca : testingerror]") == [
|
||||
"cornercase",
|
||||
"testingerror",
|
||||
]
|
||||
assert model.get_rca_from_whiteboard("[rca : cornercase][rca : codingerror]") == [
|
||||
"cornercase",
|
||||
"codingerror",
|
||||
]
|
||||
assert model.get_rca_from_whiteboard("[RCA: codingerror - syntaxerror]") == [
|
||||
"codingerror"
|
||||
]
|
||||
# Case 4: subcategories enabled, with rca already present in the list
|
||||
model = RCATypeModel(rca_subcategories_enabled=True)
|
||||
assert model.get_rca_from_whiteboard("[RCA: codingerror - syntaxerror]") == [
|
||||
"codingerror-syntaxerror"
|
||||
]
|
||||
assert model.get_rca_from_whiteboard(
|
||||
"[RCA: codingerror - syntaxerror][rca: codingerror:logicalerror]"
|
||||
) == ["codingerror-syntaxerror", "codingerror-logicalerror"]
|
||||
# Case 5: subcategories enabled, with rca not present in list
|
||||
assert model.get_rca_from_whiteboard("[RCA: codingerror - semanticerror]") == [
|
||||
"codingerror-semanticerror"
|
||||
]
|
||||
|
||||
|
||||
def test_get_labels():
|
||||
model = RCATypeModel()
|
||||
classes, _ = model.get_labels()
|
||||
|
||||
assert classes[1556846].tolist() == [
|
||||
1.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
1.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
0.0,
|
||||
]
|
Загрузка…
Ссылка в новой задаче