def get_uplift_labels():
    """Map bug ids to a boolean derived from their approval flags.

    Walks every bug yielded by ``bugzilla.get_bugs()`` and inspects the
    flags of each of its attachments. Only flags whose name starts with
    ``approval-mozilla-`` and whose status is ``'+'`` or ``'-'`` are
    considered: ``'+'`` yields ``True`` and ``'-'`` yields ``False``.

    When a bug carries several matching flags, the last one encountered
    (in attachment order, then flag order) determines the label. Bugs
    without any matching flag do not appear in the result.

    Returns:
        dict mapping ``int`` bug id to ``bool``.
    """
    decided_statuses = ('+', '-')
    uplift_by_bug = {}

    for bug in bugzilla.get_bugs():
        key = int(bug['id'])

        # Flatten attachment -> flag nesting while keeping iteration order.
        flags = (
            flag
            for attachment in bug['attachments']
            for flag in attachment['flags']
        )

        for flag in flags:
            is_approval = flag['name'].startswith('approval-mozilla-')
            if is_approval and flag['status'] in decided_statuses:
                # Later matching flags overwrite earlier ones on purpose.
                uplift_by_bug[key] = flag['status'] == '+'

    return uplift_by_bug
class UpliftModel(Model):
    """Model trained on the labels from ``labels.get_uplift_labels()``.

    The constructor sets three attributes:

    * ``classes`` -- the mapping returned by ``labels.get_uplift_labels()``.
    * ``extraction_pipeline`` -- a ``BugExtractor`` step followed by a
      ``FeatureUnion`` with a ``data`` branch (``DictVectorizer``) and
      ``title`` / ``comments`` branches built with ``self.text_vectorizer``.
    * ``clf`` -- an ``xgboost.XGBClassifier`` created with ``n_jobs=16``.
    """

    def __init__(self, lemmatization=False):
        """Build the label set, the feature pipeline and the classifier.

        Args:
            lemmatization: forwarded unchanged to ``Model.__init__``.
        """
        Model.__init__(self, lemmatization)

        self.classes = labels.get_uplift_labels()

        # Per-bug feature extractors handed to BugExtractor, in order.
        extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.keywords(),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
            bug_features.comments(),
        ]
        bug_extractor = bug_features.BugExtractor(extractors)

        # The 'data' branch is vectorized straight from dicts.
        branches = [
            ('data', Pipeline([
                ('selector', DictSelector(key='data')),
                ('vect', DictVectorizer()),
            ])),
        ]

        # The two text branches share the same shape; only the key differs.
        for text_key in ('title', 'comments'):
            branches.append((text_key, Pipeline([
                ('selector', DictSelector(key=text_key)),
                ('tfidf', self.text_vectorizer(stop_words='english')),
            ])))

        self.extraction_pipeline = Pipeline([
            ('bug_extractor', bug_extractor),
            ('union', FeatureUnion(transformer_list=branches)),
        ])

        self.clf = xgboost.XGBClassifier(n_jobs=16)
'tracking', 'qaneeded'], default='bug') + parser.add_argument('--goal', help='Goal of the classifier', choices=['bug', 'regression', 'tracking', 'qaneeded', 'uplift'], default='bug') args = parser.parse_args() if args.download: @@ -34,6 +34,9 @@ if __name__ == '__main__': elif args.goal == 'qaneeded': from bugbug.models.qaneeded import QANeededModel model_class = QANeededModel + elif args.goal == 'uplift': + from bugbug.models.uplift import UpliftModel + model_class = UpliftModel if args.train: model = model_class(args.lemmatization)