зеркало из https://github.com/mozilla/bugbug.git
Multilabel classifier for detecting type of bug (#395)
This commit is contained in:
Родитель
97d514e1db
Коммит
add9a937b3
|
@ -62,8 +62,8 @@ class Model:
|
|||
return top_features
|
||||
|
||||
def train(self, importance_cutoff=0.15):
|
||||
classes = self.get_labels()
|
||||
class_names = sorted(list(set(classes.values())), reverse=True)
|
||||
classes, class_names = self.get_labels()
|
||||
class_names = sorted(list(class_names), reverse=True)
|
||||
|
||||
# Get items, filtering out those for which we have no labels.
|
||||
def trainable_items_gen():
|
||||
|
|
|
@ -105,12 +105,14 @@ class AssigneeModel(BugModel):
|
|||
for assignee, count in assignee_counts:
|
||||
print(f"{assignee}: {count}")
|
||||
|
||||
return {
|
||||
classes = {
|
||||
bug_id: assignee
|
||||
for bug_id, assignee in classes.items()
|
||||
if assignee in top_assignees
|
||||
}
|
||||
|
||||
return classes, set(classes.values())
|
||||
|
||||
def get_feature_names(self):
|
||||
return self.extraction_pipeline.named_steps["union"].get_feature_names()
|
||||
|
||||
|
|
|
@ -75,7 +75,7 @@ class BackoutModel(CommitModel):
|
|||
for commit_data in repository.get_commits():
|
||||
classes[commit_data["node"]] = 1 if commit_data["ever_backedout"] else 0
|
||||
|
||||
return classes
|
||||
return classes, [0, 1]
|
||||
|
||||
def get_feature_names(self):
|
||||
return self.extraction_pipeline.named_steps["union"].get_feature_names()
|
||||
|
|
|
@ -0,0 +1,112 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
import numpy as np
|
||||
import xgboost
|
||||
from imblearn.over_sampling import BorderlineSMOTE
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.feature_extraction import DictVectorizer
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.pipeline import Pipeline
|
||||
|
||||
from bugbug import bug_features, bugzilla, feature_cleanup
|
||||
from bugbug.model import BugModel
|
||||
|
||||
# Maps a bug keyword to the coarse bug-type label it stands for.
# Built from (label, keywords) groups; insertion order follows the groups below.
keyword_dict = {
    keyword: bug_type
    for bug_type, keywords in (
        (
            "security",
            (
                "sec-critical",
                "sec-high",
                "sec-moderate",
                "sec-low",
                "sec-other",
                "sec-audit",
                "sec-vector",
                "sec-want",
            ),
        ),
        ("memory", ("memory-footprint", "memory-leak")),
        ("crash", ("crash", "crashreportid")),
        ("performance", ("perf",)),
    )
    for keyword in keywords
}
|
||||
|
||||
|
||||
class BugTypeModel(BugModel):
    """Multilabel classifier predicting which bug types apply to a bug.

    The labels are the distinct values of the module-level ``keyword_dict``;
    each bug's target is a multi-hot vector with one slot per label.
    """

    def __init__(self, lemmatization=False, historical=False):
        """Set up the sampler, the feature extraction pipeline and the classifier.

        Args:
            lemmatization: Forwarded unchanged to ``BugModel.__init__``.
            historical: Accepted for signature compatibility; not used by
                this class.
        """
        BugModel.__init__(self, lemmatization)

        self.sampler = BorderlineSMOTE(random_state=0)

        feature_extractors = [
            bug_features.has_str(),
            bug_features.severity(),
            # Ignore keywords that would make the ML completely skewed
            # (we are going to use them as 100% rules in the evaluation phase).
            bug_features.keywords(set(keyword_dict.keys())),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
            bug_features.blocked_bugs_number(),
            bug_features.ever_affected(),
            bug_features.affected_then_unaffected(),
            bug_features.product(),
            bug_features.component(),
        ]

        cleanup_functions = [
            feature_cleanup.url(),
            feature_cleanup.fileref(),
            feature_cleanup.synonyms(),
        ]

        self.extraction_pipeline = Pipeline(
            [
                (
                    "bug_extractor",
                    bug_features.BugExtractor(feature_extractors, cleanup_functions),
                ),
                (
                    "union",
                    ColumnTransformer(
                        [
                            ("data", DictVectorizer(), "data"),
                            ("title", self.text_vectorizer(min_df=0.001), "title"),
                            (
                                "first_comment",
                                self.text_vectorizer(min_df=0.001),
                                "first_comment",
                            ),
                            (
                                "comments",
                                self.text_vectorizer(min_df=0.001),
                                "comments",
                            ),
                        ]
                    ),
                ),
            ]
        )

        # One binary XGBoost classifier per label.
        self.clf = OneVsRestClassifier(xgboost.XGBClassifier(n_jobs=16))

    def get_labels(self):
        """Build the multi-hot target vector for every bug.

        Returns:
            A ``(classes, keyword_list)`` tuple. ``classes`` maps each integer
            bug id to a NumPy array holding 1 in the slot of every label the
            bug's keywords map to and 0 elsewhere. ``keyword_list`` is the list
            of label names; position ``i`` names slot ``i`` of the arrays.
        """
        # Sort the labels so the slot order is the same on every run: iterating
        # a set of strings depends on hash randomization and would otherwise
        # reorder the target columns between processes.
        # NOTE(review): confirm that consumers rely on this returned order
        # rather than re-sorting the names themselves.
        keyword_list = sorted(set(keyword_dict.values()))
        label_index = {label: index for index, label in enumerate(keyword_list)}

        classes = {}
        for bug_data in bugzilla.get_bugs():
            target = np.zeros(len(keyword_list))
            for keyword in bug_data["keywords"]:
                # Keywords without an entry in keyword_dict carry no bug-type
                # information; skip them instead of failing with KeyError.
                if keyword in keyword_dict:
                    target[label_index[keyword_dict[keyword]]] = 1

            classes[int(bug_data["id"])] = target

        return classes, keyword_list

    def get_feature_names(self):
        """Return the feature names reported by the pipeline's "union" step."""
        return self.extraction_pipeline.named_steps["union"].get_feature_names()
|
|
@ -181,12 +181,14 @@ class ComponentModel(BugModel):
|
|||
for product, component in product_components.values()
|
||||
), f"It should be possible to map {conflated_component}"
|
||||
|
||||
return {
|
||||
classes = {
|
||||
bug_id: component
|
||||
for bug_id, component in classes.items()
|
||||
if component in top_components
|
||||
}
|
||||
|
||||
return classes, set(classes.values())
|
||||
|
||||
def is_meaningful(self, product, component):
|
||||
return product in self.PRODUCTS and component not in ["General", "Untriaged"]
|
||||
|
||||
|
|
|
@ -243,7 +243,7 @@ class DefectModel(BugModel):
|
|||
return {bug_id: label for bug_id, label in classes.items() if bug_id in bug_ids}
|
||||
|
||||
def get_labels(self):
|
||||
return self.get_bugbug_labels("bug")
|
||||
return (self.get_bugbug_labels("bug"), [0, 1])
|
||||
|
||||
def get_feature_names(self):
|
||||
return self.extraction_pipeline.named_steps["union"].get_feature_names()
|
||||
|
|
|
@ -27,7 +27,7 @@ class DefectEnhancementTaskModel(DefectModel):
|
|||
"{} tasks".format(sum(1 for label in classes.values() if label == "task"))
|
||||
)
|
||||
|
||||
return classes
|
||||
return classes, ["defect", "enhancement", "task"]
|
||||
|
||||
def overwrite_classes(self, bugs, classes, probabilities):
|
||||
for i, bug in enumerate(bugs):
|
||||
|
|
|
@ -108,7 +108,7 @@ class DevDocNeededModel(BugModel):
|
|||
if bug_id not in classes:
|
||||
classes[bug_id] = 0
|
||||
|
||||
return classes
|
||||
return classes, [0, 1]
|
||||
|
||||
def get_feature_names(self):
|
||||
return self.extraction_pipeline.named_steps["union"].get_feature_names()
|
||||
|
|
|
@ -90,7 +90,7 @@ class QANeededModel(BugModel):
|
|||
if bug_id not in classes:
|
||||
classes[bug_id] = 0
|
||||
|
||||
return classes
|
||||
return classes, [0, 1]
|
||||
|
||||
def get_feature_names(self):
|
||||
return self.extraction_pipeline.named_steps["union"].get_feature_names()
|
||||
|
|
|
@ -11,4 +11,4 @@ class RegressionModel(DefectModel):
|
|||
DefectModel.__init__(self, lemmatization)
|
||||
|
||||
def get_labels(self):
|
||||
return self.get_bugbug_labels("regression")
|
||||
return self.get_bugbug_labels("regression"), [0, 1]
|
||||
|
|
|
@ -113,7 +113,7 @@ class TrackingModel(BugModel):
|
|||
if bug_id not in classes:
|
||||
classes[bug_id] = 0
|
||||
|
||||
return classes
|
||||
return classes, [0, 1]
|
||||
|
||||
def get_feature_names(self):
|
||||
return self.extraction_pipeline.named_steps["union"].get_feature_names()
|
||||
|
|
|
@ -93,7 +93,7 @@ class UpliftModel(BugModel):
|
|||
elif flag["status"] == "-":
|
||||
classes[bug_id] = 0
|
||||
|
||||
return classes
|
||||
return classes, [0, 1]
|
||||
|
||||
def get_feature_names(self):
|
||||
return self.extraction_pipeline.named_steps["union"].get_feature_names()
|
||||
|
|
5
run.py
5
run.py
|
@ -35,6 +35,7 @@ if __name__ == "__main__":
|
|||
"devdocneeded",
|
||||
"defectenhancementtask",
|
||||
"assignee",
|
||||
"bugtype",
|
||||
# commit classifiers
|
||||
"backout",
|
||||
],
|
||||
|
@ -107,6 +108,10 @@ if __name__ == "__main__":
|
|||
from bugbug.models.backout import BackoutModel
|
||||
|
||||
model_class = BackoutModel
|
||||
elif args.goal == "bugtype":
|
||||
from bugbug.models.bugtype import BugTypeModel
|
||||
|
||||
model_class = BugTypeModel
|
||||
|
||||
if args.train:
|
||||
db.download()
|
||||
|
|
|
@ -8,7 +8,7 @@ from bugbug.models.defect import DefectModel
|
|||
|
||||
def test_get_bug_labels():
|
||||
model = DefectModel()
|
||||
classes = model.get_labels()
|
||||
classes, _ = model.get_labels()
|
||||
# labels from bug_nobug.csv
|
||||
assert classes[1087488]
|
||||
assert not classes[1101825]
|
||||
|
|
|
@ -9,7 +9,7 @@ from bugbug.models.qaneeded import QANeededModel
|
|||
|
||||
def test_get_qaneeded_labels():
|
||||
model = QANeededModel()
|
||||
classes = model.get_labels()
|
||||
classes, _ = model.get_labels()
|
||||
assert not classes[1389220]
|
||||
assert classes[1389223], "Bug should contain qawanted in a field"
|
||||
assert classes[1390433], "Bug should contain qe-verify in a field"
|
||||
|
|
|
@ -8,6 +8,6 @@ from bugbug.models.tracking import TrackingModel
|
|||
|
||||
def test_get_tracking_labels():
|
||||
model = TrackingModel()
|
||||
classes = model.get_labels()
|
||||
classes, _ = model.get_labels()
|
||||
assert not classes[1101825]
|
||||
assert classes[1042096]
|
||||
|
|
Загрузка…
Ссылка в новой задаче