[New Model] Performance Bug Model (#3895)

This commit is contained in:
Promise Fru 2024-01-17 14:58:47 +01:00 коммит произвёл GitHub
Родитель 799994add8
Коммит e31823744e
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
4 изменённых файлов: 145 добавлений и 1 удалений

Просмотреть файл

@@ -22,6 +22,7 @@ MODELS = {
"fixtime": "bugbug.models.fixtime.FixTimeModel",
"invalidcompatibilityreport": "bugbug.models.invalid_compatibility_report.InvalidCompatibilityReportModel",
"needsdiagnosis": "bugbug.models.needsdiagnosis.NeedsDiagnosisModel",
"performancebug": "bugbug.models.performancebug.PerformanceBugModel",
"qaneeded": "bugbug.models.qaneeded.QANeededModel",
"rcatype": "bugbug.models.rcatype.RCATypeModel",
"regression": "bugbug.models.regression.RegressionModel",

Просмотреть файл

@@ -0,0 +1,124 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import logging
import xgboost
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImblearnPipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from bugbug import bug_features, bugzilla, feature_cleanup, utils
from bugbug.model import BugModel
# Request INFO-level logging configuration when this module is imported, and
# create the module-level logger used to report label counts in get_labels().
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class PerformanceBugModel(BugModel):
    """Binary classifier that predicts whether a bug is a performance bug.

    Labels are 1 (performance bug) and 0 (non-performance bug), as decided by
    ``bug_features.IsPerformanceBug``.
    """

    def __init__(self, lemmatization=False):
        """Build the feature-extraction pipeline and the classifier pipeline.

        Args:
            lemmatization: Forwarded unchanged to ``BugModel.__init__``.
        """
        BugModel.__init__(self, lemmatization)

        self.calculate_importance = False

        feature_extractors = [
            bug_features.HasSTR(),
            # Keywords with the prefixes IsPerformanceBug looks at are ignored,
            # presumably so the labelling signal does not leak into the
            # features -- TODO confirm against bug_features.
            bug_features.Keywords(
                prefixes_to_ignore=bug_features.IsPerformanceBug.keyword_prefixes
            ),
            bug_features.IsCoverityIssue(),
            bug_features.HasCrashSignature(),
            bug_features.HasURL(),
            bug_features.HasW3CURL(),
            bug_features.HasGithubURL(),
            bug_features.Product(),
            bug_features.HasRegressionRange(),
            bug_features.HasCVEInAlias(),
            bug_features.HasAttachment(),
            bug_features.FiledVia(),
        ]

        cleanup_functions = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
            feature_cleanup.hex(),
            feature_cleanup.dll(),
            feature_cleanup.crash(),
        ]

        self.extraction_pipeline = Pipeline(
            [
                (
                    "bug_extractor",
                    bug_features.BugExtractor(
                        feature_extractors, cleanup_functions, rollback=True
                    ),
                ),
            ]
        )

        self.clf = ImblearnPipeline(
            [
                (
                    # Vectorize the "data", "title" and "first_comment"
                    # columns independently and concatenate the results.
                    "union",
                    ColumnTransformer(
                        [
                            ("data", DictVectorizer(), "data"),
                            ("title", self.text_vectorizer(min_df=0.0001), "title"),
                            (
                                "first_comment",
                                self.text_vectorizer(min_df=0.0001),
                                "first_comment",
                            ),
                        ]
                    ),
                ),
                # Fixed random_state keeps the oversampling reproducible.
                ("sampler", BorderlineSMOTE(random_state=0)),
                (
                    "estimator",
                    xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()),
                ),
            ]
        )

    def get_labels(self):
        """Compute the training labels from the bugs in the Bugzilla database.

        Bugs that have no ``cf_performance_impact`` field, or whose value is
        ``"?"`` or ``"none"``, are skipped and receive no label.

        Returns:
            A tuple ``(classes, [0, 1])`` where ``classes`` maps each bug ID
            (int) to 1 for a performance bug or 0 otherwise, and ``[0, 1]`` is
            the list of class names.
        """
        classes = {}
        is_performance_bug = bug_features.IsPerformanceBug()

        for bug_data in bugzilla.get_bugs():
            bug_id = int(bug_data["id"])

            if "cf_performance_impact" not in bug_data or bug_data[
                "cf_performance_impact"
            ] in ("?", "none"):
                continue

            classes[bug_id] = 1 if is_performance_bug(bug_data) else 0

        # Every label is either 0 or 1, so one pass over the values is enough
        # to derive both counts.
        performance_count = sum(label == 1 for label in classes.values())
        logger.info("%d performance bugs", performance_count)
        logger.info("%d non-performance bugs", len(classes) - performance_count)

        return classes, [0, 1]

    def get_feature_names(self):
        """Return the output feature names of the "union" column transformer."""
        return self.clf.named_steps["union"].get_feature_names_out()

    def overwrite_classes(self, bugs, classes, probabilities):
        """Force the prediction for bugs already known to be performance bugs.

        Args:
            bugs: Sequence of bug dicts, aligned index-by-index with ``classes``.
            classes: Mutable sequence of predictions; modified in place.
            probabilities: When truthy, ``classes`` holds per-class probability
                vectors; otherwise it holds class labels.

        Returns:
            The same ``classes`` object, with entries overwritten for every bug
            that ``bug_features.IsPerformanceBug`` recognises.
        """
        is_performance_bug = bug_features.IsPerformanceBug()

        for i, bug in enumerate(bugs):
            if is_performance_bug(bug):
                # The probability vector follows the class order [0, 1]
                # returned by get_labels(), so full certainty for class 1 is
                # [0.0, 1.0]; [1.0, 0.0] would assert class 0 and contradict
                # the label form below.
                classes[i] = [0.0, 1.0] if probabilities else 1

        return classes

6
tests/fixtures/bugs.json поставляемый

Различия файлов скрыты, потому что одна или несколько строк слишком длинны

Просмотреть файл

@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
from bugbug.models.performancebug import PerformanceBugModel
def test_get_performancebug_labels():
    """Check the labels that get_labels() computes for four known bug IDs."""
    labels, _class_names = PerformanceBugModel().get_labels()

    # (bug ID, expected label) pairs: 1 = performance bug, 0 = not.
    expected_labels = (
        (1461247, 1),
        (1457988, 1),
        (446261, 0),
        (452258, 0),
    )
    for bug_id, expected in expected_labels:
        assert labels[bug_id] == expected