Add an 'uplift' model

2018-12-13 12:01:44 +01:00 · 2018-12-13 12:01:44 +01:00 · 289ff7bf92
--- a/bugbug/labels.py
+++ b/bugbug/labels.py
@ -107,6 +107,25 @@ def get_bugbug_labels(kind='bug', augmentation=False):
    return {bug_id: label for bug_id, label in classes.items() if bug_id in bug_ids}
 def get_uplift_labels():
    """Build a ``{bug_id: bool}`` mapping from ``approval-mozilla-*`` attachment flags.

    Every bug yielded by ``bugzilla.get_bugs()`` is scanned, attachment by
    attachment and flag by flag. Only flags whose name starts with
    ``'approval-mozilla-'`` are considered: a ``'+'`` status labels the bug
    ``True`` and a ``'-'`` status labels it ``False``; any other status is
    ignored.

    A bug carrying several matching flags keeps the label of the last one
    visited. Bugs with no matching flag are absent from the result.

    Returns:
        dict mapping the integer bug id to its boolean label.
    """
    uplift_by_bug = {}
    for bug in bugzilla.get_bugs():
        bug_id = int(bug['id'])
        # Lazily flatten attachments -> flags; visiting order is unchanged.
        bug_flags = (
            flag
            for attachment in bug['attachments']
            for flag in attachment['flags']
        )
        for flag in bug_flags:
            if not flag['name'].startswith('approval-mozilla-'):
                continue
            # The status is only looked up once the flag name has matched.
            status = flag['status']
            if status == '+':
                uplift_by_bug[bug_id] = True
            elif status == '-':
                uplift_by_bug[bug_id] = False
    return uplift_by_bug
 def get_all_bug_ids():
    bug_ids = set()
--- a/bugbug/models/uplift.py
+++ b/bugbug/models/uplift.py
@ -0,0 +1,62 @@
 # -*- coding: utf-8 -*-
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 import xgboost
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.pipeline import FeatureUnion
 from sklearn.pipeline import Pipeline
 from bugbug import bug_features
 from bugbug import labels
 from bugbug.model import Model
 from bugbug.utils import DictSelector
 class UpliftModel(Model):
    """``Model`` subclass wired up with uplift labels, bug features and XGBoost.

    The constructor sets three attributes:

    * ``classes`` -- the result of ``labels.get_uplift_labels()``.
    * ``extraction_pipeline`` -- a scikit-learn ``Pipeline`` that runs a
      ``BugExtractor`` and then a ``FeatureUnion`` over the ``'data'``,
      ``'title'`` and ``'comments'`` entries it selects with ``DictSelector``.
    * ``clf`` -- an ``xgboost.XGBClassifier`` created with ``n_jobs=16``.
    """
    def __init__(self, lemmatization=False):
        """Initialise the base model, then build labels, pipeline and classifier.

        Args:
            lemmatization: forwarded unchanged to ``Model.__init__``.
        """
        Model.__init__(self, lemmatization)
        self.classes = labels.get_uplift_labels()
        # Per-bug feature extractors handed to BugExtractor, in this order.
        feature_extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.keywords(),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
            bug_features.comments(),
        ]

        def text_branch(key):
            # One FeatureUnion entry: select `key`, then apply the vectorizer
            # returned by self.text_vectorizer (presumably provided by Model
            # -- confirm against bugbug.model).
            selector = DictSelector(key=key)
            vectorizer = self.text_vectorizer(stop_words='english')
            return (key, Pipeline([
                ('selector', selector),
                ('tfidf', vectorizer),
            ]))

        # Stages are created in the order they appear in the final pipeline.
        bug_extractor = bug_features.BugExtractor(feature_extractors)
        data_branch = ('data', Pipeline([
            ('selector', DictSelector(key='data')),
            ('vect', DictVectorizer()),
        ]))
        union = FeatureUnion(
            transformer_list=[
                data_branch,
                text_branch('title'),
                text_branch('comments'),
            ],
        )
        self.extraction_pipeline = Pipeline([
            ('bug_extractor', bug_extractor),
            ('union', union),
        ])
        self.clf = xgboost.XGBClassifier(n_jobs=16)
--- a/run.py
+++ b/run.py
@ -13,7 +13,7 @@ if __name__ == '__main__':
    parser.add_argument('--lemmatization', help='Perform lemmatization (using spaCy)', action='store_true')
    parser.add_argument('--download', help='Download data required for training', action='store_true')
    parser.add_argument('--train', help='Perform training', action='store_true')
-    parser.add_argument('--goal', help='Goal of the classifier', choices=['bug', 'regression', 'tracking', 'qaneeded'], default='bug')
+    parser.add_argument('--goal', help='Goal of the classifier', choices=['bug', 'regression', 'tracking', 'qaneeded', 'uplift'], default='bug')
    args = parser.parse_args()
    if args.download:
@ -34,6 +34,9 @@ if __name__ == '__main__':
    elif args.goal == 'qaneeded':
        from bugbug.models.qaneeded import QANeededModel
        model_class = QANeededModel
    elif args.goal == 'uplift':
        from bugbug.models.uplift import UpliftModel
        model_class = UpliftModel
    if args.train:
        model = model_class(args.lemmatization)