Add a 'types' field to the changes artifact which specifies the bug types

This commit is contained in:
Marco Castelluccio 2020-12-21 21:54:46 +01:00
Родитель 419cadf90f
Коммит 773cdcb7c6
2 изменённых файлов: 31 добавлений и 14 удалений

Просмотреть файл

@ -3,6 +3,8 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file, # License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/. # You can obtain one at http://mozilla.org/MPL/2.0/.
from typing import Dict, Iterable, List, Tuple
import numpy as np import numpy as np
import xgboost import xgboost
from sklearn.compose import ColumnTransformer from sklearn.compose import ColumnTransformer
@ -28,7 +30,17 @@ KEYWORD_DICT = {
"crashreportid": "crash", "crashreportid": "crash",
"perf": "performance", "perf": "performance",
} }
# Distinct bug types, i.e. the values of KEYWORD_DICT, de-duplicated and
# sorted so that each type has a stable position in the list.
TYPE_LIST = sorted(set(KEYWORD_DICT.values()))
def bug_to_types(bug: bugzilla.BugDict) -> List[str]:
    """Return the distinct bug types implied by a bug's keywords.

    Every entry of ``bug["keywords"]`` that is a key of ``KEYWORD_DICT`` is
    mapped to its type; keywords that are not in ``KEYWORD_DICT`` are ignored.

    The result is sorted so that the output order is deterministic: iterating
    a plain ``set`` of strings can yield a different order on every
    interpreter run because of string hash randomization.
    """
    return sorted(
        {
            KEYWORD_DICT[keyword]
            for keyword in bug["keywords"]
            if keyword in KEYWORD_DICT
        }
    )
class BugTypeModel(BugModel): class BugTypeModel(BugModel):
@ -96,29 +108,32 @@ class BugTypeModel(BugModel):
xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()) xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
) )
def get_labels(self) -> Tuple[Dict[int, np.ndarray], List[str]]:
    """Build one label vector per bug returned by ``bugzilla.get_bugs()``.

    Returns a tuple ``(classes, TYPE_LIST)`` where ``classes`` maps the
    integer bug id to an array of ``len(TYPE_LIST)`` values: the entry at
    a type's position in ``TYPE_LIST`` is 1 when ``bug_to_types`` reports
    that type for the bug, and 0 otherwise.
    """
    classes = {}

    for bug_data in bugzilla.get_bugs():
        # Start from "no type" and switch on each type the bug has.
        target = np.zeros(len(TYPE_LIST))
        for type_ in bug_to_types(bug_data):
            target[TYPE_LIST.index(type_)] = 1

        classes[int(bug_data["id"])] = target

    return classes, TYPE_LIST
def get_feature_names(self):
    """Return the feature names reported by the pipeline's "union" step."""
    return self.extraction_pipeline.named_steps["union"].get_feature_names()
def overwrite_classes(
    self,
    bugs: Iterable[bugzilla.BugDict],
    classes: Dict[int, np.ndarray],
    probabilities: bool,
):
    """Force the entries for types already implied by each bug's keywords.

    For the bug at position ``i`` in ``bugs``, every type returned by
    ``bug_to_types`` has its slot in ``classes[i]`` set to ``1.0`` when
    ``probabilities`` is true and to ``1`` otherwise.  ``classes`` is
    modified in place and also returned.

    NOTE(review): ``classes`` is annotated as a dict keyed by int but is
    indexed here by enumeration position, not by bug id -- confirm the
    annotation against the callers.
    """
    # The written value depends only on the mode, so pick it once.
    value = 1.0 if probabilities else 1

    for i, bug in enumerate(bugs):
        for type_ in bug_to_types(bug):
            classes[i][TYPE_LIST.index(type_)] = value

    return classes

Просмотреть файл

@ -17,6 +17,7 @@ import requests
from tqdm import tqdm from tqdm import tqdm
from bugbug import bugzilla, db, phabricator, repository, test_scheduling from bugbug import bugzilla, db, phabricator, repository, test_scheduling
from bugbug.models.bugtype import bug_to_types
from bugbug.models.regressor import BUG_FIXING_COMMITS_DB, RegressorModel from bugbug.models.regressor import BUG_FIXING_COMMITS_DB, RegressorModel
from bugbug.utils import ( from bugbug.utils import (
download_check_etag, download_check_etag,
@ -469,6 +470,7 @@ class LandingsRiskReportGenerator(object):
component_team_mapping, bug["product"], bug["component"] component_team_mapping, bug["product"], bug["component"]
), ),
"summary": bug["summary"], "summary": bug["summary"],
"types": bug_to_types(bug),
"creation_date": dateutil.parser.parse(bug["creation_time"]).strftime( "creation_date": dateutil.parser.parse(bug["creation_time"]).strftime(
"%Y-%m-%d" "%Y-%m-%d"
), ),