Multilabel classifier for detecting type of bug (#395)

This commit is contained in:
Ayush Shridhar 2019-05-14 03:17:53 -07:00 коммит произвёл Marco
Родитель 97d514e1db
Коммит add9a937b3
16 изменённых файлов: 136 добавлений и 15 удалений

Просмотреть файл

@ -62,8 +62,8 @@ class Model:
return top_features
def train(self, importance_cutoff=0.15):
classes = self.get_labels()
class_names = sorted(list(set(classes.values())), reverse=True)
classes, class_names = self.get_labels()
class_names = sorted(list(class_names), reverse=True)
# Get items, filtering out those for which we have no labels.
def trainable_items_gen():

Просмотреть файл

@ -105,12 +105,14 @@ class AssigneeModel(BugModel):
for assignee, count in assignee_counts:
print(f"{assignee}: {count}")
return {
classes = {
bug_id: assignee
for bug_id, assignee in classes.items()
if assignee in top_assignees
}
return classes, set(classes.values())
def get_feature_names(self):
return self.extraction_pipeline.named_steps["union"].get_feature_names()

Просмотреть файл

@ -75,7 +75,7 @@ class BackoutModel(CommitModel):
for commit_data in repository.get_commits():
classes[commit_data["node"]] = 1 if commit_data["ever_backedout"] else 0
return classes
return classes, [0, 1]
def get_feature_names(self):
return self.extraction_pipeline.named_steps["union"].get_feature_names()

112
bugbug/models/bugtype.py Normal file
Просмотреть файл

@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import numpy as np
import xgboost
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from bugbug import bug_features, bugzilla, feature_cleanup
from bugbug.model import BugModel
# Maps each Bugzilla keyword of interest to the bug type it stands for.
# Built from a per-type grouping so that all keywords of a type sit together;
# the resulting insertion order is: security, memory, crash, performance.
keyword_dict = {
    keyword: bug_type
    for bug_type, keywords in (
        (
            "security",
            (
                "sec-critical",
                "sec-high",
                "sec-moderate",
                "sec-low",
                "sec-other",
                "sec-audit",
                "sec-vector",
                "sec-want",
            ),
        ),
        ("memory", ("memory-footprint", "memory-leak")),
        ("crash", ("crash", "crashreportid")),
        ("performance", ("perf",)),
    )
    for keyword in keywords
}
class BugTypeModel(BugModel):
    """Multilabel model predicting the type(s) of a bug.

    Labels are derived from the bug's keywords through ``keyword_dict``; a bug
    may carry several types at once, so each label is a multi-hot vector and
    the classifier is a one-vs-rest ensemble.
    """

    def __init__(self, lemmatization=False, historical=False):
        """Set up the sampler, the feature extraction pipeline and the classifier.

        Args:
            lemmatization: forwarded to ``BugModel.__init__``.
            historical: accepted for interface compatibility; not used here.
        """
        BugModel.__init__(self, lemmatization)

        # NOTE(review): get_labels() yields multi-hot targets; confirm that
        # BorderlineSMOTE accepts bugs carrying more than one type.
        self.sampler = BorderlineSMOTE(random_state=0)

        feature_extractors = [
            bug_features.has_str(),
            bug_features.severity(),
            # Ignore keywords that would make the ML completely skewed
            # (we are going to use them as 100% rules in the evaluation phase).
            bug_features.keywords(set(keyword_dict)),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
            bug_features.blocked_bugs_number(),
            bug_features.ever_affected(),
            bug_features.affected_then_unaffected(),
            bug_features.product(),
            bug_features.component(),
        ]

        cleanup_functions = [
            feature_cleanup.url(),
            feature_cleanup.fileref(),
            feature_cleanup.synonyms(),
        ]

        self.extraction_pipeline = Pipeline(
            [
                (
                    "bug_extractor",
                    bug_features.BugExtractor(feature_extractors, cleanup_functions),
                ),
                (
                    "union",
                    ColumnTransformer(
                        [
                            ("data", DictVectorizer(), "data"),
                            ("title", self.text_vectorizer(min_df=0.001), "title"),
                            (
                                "first_comment",
                                self.text_vectorizer(min_df=0.001),
                                "first_comment",
                            ),
                            (
                                "comments",
                                self.text_vectorizer(min_df=0.001),
                                "comments",
                            ),
                        ]
                    ),
                ),
            ]
        )

        self.clf = OneVsRestClassifier(xgboost.XGBClassifier(n_jobs=16))

    def get_labels(self):
        """Build one multi-hot label vector per bug from its keywords.

        Returns:
            A ``(classes, keyword_list)`` tuple. ``classes`` maps each integer
            bug id to a numpy array with one slot per bug type, set to 1 when
            any of the bug's keywords maps to that type. ``keyword_list`` holds
            the bug type names in the same order as the array slots.
        """
        # Sorted so the column order is identical on every run; iterating a
        # set of strings directly depends on per-process hash randomization.
        keyword_list = sorted(set(keyword_dict.values()))
        type_index = {bug_type: i for i, bug_type in enumerate(keyword_list)}

        classes = {}
        for bug_data in bugzilla.get_bugs():
            target = np.zeros(len(keyword_list))
            for keyword in bug_data["keywords"]:
                # Bugs carry many keywords unrelated to bug types; those are
                # skipped instead of raising KeyError on the lookup.
                bug_type = keyword_dict.get(keyword)
                if bug_type is not None:
                    target[type_index[bug_type]] = 1

            classes[int(bug_data["id"])] = target

        return classes, keyword_list

    def get_feature_names(self):
        """Return the feature names reported by the pipeline's "union" step."""
        return self.extraction_pipeline.named_steps["union"].get_feature_names()

Просмотреть файл

@ -181,12 +181,14 @@ class ComponentModel(BugModel):
for product, component in product_components.values()
), f"It should be possible to map {conflated_component}"
return {
classes = {
bug_id: component
for bug_id, component in classes.items()
if component in top_components
}
return classes, set(classes.values())
def is_meaningful(self, product, component):
return product in self.PRODUCTS and component not in ["General", "Untriaged"]

Просмотреть файл

@ -243,7 +243,7 @@ class DefectModel(BugModel):
return {bug_id: label for bug_id, label in classes.items() if bug_id in bug_ids}
def get_labels(self):
return self.get_bugbug_labels("bug")
return (self.get_bugbug_labels("bug"), [0, 1])
def get_feature_names(self):
return self.extraction_pipeline.named_steps["union"].get_feature_names()

Просмотреть файл

@ -27,7 +27,7 @@ class DefectEnhancementTaskModel(DefectModel):
"{} tasks".format(sum(1 for label in classes.values() if label == "task"))
)
return classes
return classes, ["defect", "enhancement", "task"]
def overwrite_classes(self, bugs, classes, probabilities):
for i, bug in enumerate(bugs):

Просмотреть файл

@ -108,7 +108,7 @@ class DevDocNeededModel(BugModel):
if bug_id not in classes:
classes[bug_id] = 0
return classes
return classes, [0, 1]
def get_feature_names(self):
return self.extraction_pipeline.named_steps["union"].get_feature_names()

Просмотреть файл

@ -90,7 +90,7 @@ class QANeededModel(BugModel):
if bug_id not in classes:
classes[bug_id] = 0
return classes
return classes, [0, 1]
def get_feature_names(self):
return self.extraction_pipeline.named_steps["union"].get_feature_names()

Просмотреть файл

@ -11,4 +11,4 @@ class RegressionModel(DefectModel):
DefectModel.__init__(self, lemmatization)
def get_labels(self):
return self.get_bugbug_labels("regression")
return self.get_bugbug_labels("regression"), [0, 1]

Просмотреть файл

@ -113,7 +113,7 @@ class TrackingModel(BugModel):
if bug_id not in classes:
classes[bug_id] = 0
return classes
return classes, [0, 1]
def get_feature_names(self):
return self.extraction_pipeline.named_steps["union"].get_feature_names()

Просмотреть файл

@ -93,7 +93,7 @@ class UpliftModel(BugModel):
elif flag["status"] == "-":
classes[bug_id] = 0
return classes
return classes, [0, 1]
def get_feature_names(self):
return self.extraction_pipeline.named_steps["union"].get_feature_names()

5
run.py
Просмотреть файл

@ -35,6 +35,7 @@ if __name__ == "__main__":
"devdocneeded",
"defectenhancementtask",
"assignee",
"bugtype",
# commit classifiers
"backout",
],
@ -107,6 +108,10 @@ if __name__ == "__main__":
from bugbug.models.backout import BackoutModel
model_class = BackoutModel
elif args.goal == "bugtype":
from bugbug.models.bugtype import BugTypeModel
model_class = BugTypeModel
if args.train:
db.download()

Просмотреть файл

@ -8,7 +8,7 @@ from bugbug.models.defect import DefectModel
def test_get_bug_labels():
model = DefectModel()
classes = model.get_labels()
classes, _ = model.get_labels()
# labels from bug_nobug.csv
assert classes[1087488]
assert not classes[1101825]

Просмотреть файл

@ -9,7 +9,7 @@ from bugbug.models.qaneeded import QANeededModel
def test_get_qaneeded_labels():
model = QANeededModel()
classes = model.get_labels()
classes, _ = model.get_labels()
assert not classes[1389220]
assert classes[1389223], "Bug should contain qawanted in a field"
assert classes[1390433], "Bug should contain qe-verify in a field"

Просмотреть файл

@ -8,6 +8,6 @@ from bugbug.models.tracking import TrackingModel
def test_get_tracking_labels():
model = TrackingModel()
classes = model.get_labels()
classes, _ = model.get_labels()
assert not classes[1101825]
assert classes[1042096]