From 289ff7bf929644c501eaf62c6738060954c36c1b Mon Sep 17 00:00:00 2001
From: Marco Castelluccio <mcastelluccio@mozilla.com>
Date: Thu, 13 Dec 2018 12:01:44 +0100
Subject: [PATCH] Add an 'uplift' model

---
 bugbug/labels.py        | 19 +++++++++++++
 bugbug/models/uplift.py | 62 +++++++++++++++++++++++++++++++++++++++++
 run.py                  |  5 +++-
 3 files changed, 85 insertions(+), 1 deletion(-)
 create mode 100644 bugbug/models/uplift.py

diff --git a/bugbug/labels.py b/bugbug/labels.py
index 4fcef5f4..c9308f7d 100644
--- a/bugbug/labels.py
+++ b/bugbug/labels.py
@@ -107,6 +107,25 @@ def get_bugbug_labels(kind='bug', augmentation=False):
     return {bug_id: label for bug_id, label in classes.items() if bug_id in bug_ids}
 
 
+def get_uplift_labels():
+    """Return a {bug id: bool} dict built from 'approval-mozilla-*' attachment flags.
+
+    A '+' status maps to True and a '-' status to False; other statuses are
+    ignored. When a bug has several such flags, the last one visited wins.
+    """
+    outcomes = {}
+    for bug in bugzilla.get_bugs():
+        key = int(bug['id'])
+        for patch in bug['attachments']:
+            for flag in patch['flags']:
+                # Name is checked first, so status is only read for approval flags.
+                if flag['name'].startswith('approval-mozilla-') and flag['status'] in ('+', '-'):
+                    # Only '+' or '-' reach this point, so one comparison decides.
+                    outcomes[key] = flag['status'] == '+'
+
+    return outcomes
+
+
 def get_all_bug_ids():
     bug_ids = set()
 
diff --git a/bugbug/models/uplift.py b/bugbug/models/uplift.py
new file mode 100644
index 00000000..aa621459
--- /dev/null
+++ b/bugbug/models/uplift.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import xgboost
+from sklearn.feature_extraction import DictVectorizer
+from sklearn.pipeline import FeatureUnion
+from sklearn.pipeline import Pipeline
+
+from bugbug import bug_features
+from bugbug import labels
+from bugbug.model import Model
+from bugbug.utils import DictSelector
+
+
+class UpliftModel(Model):
+    """Model whose classes are the labels from labels.get_uplift_labels().
+
+    Features: a DictVectorizer branch plus title and comments text branches.
+    """
+
+    def __init__(self, lemmatization=False):
+        Model.__init__(self, lemmatization)
+        self.classes = labels.get_uplift_labels()
+
+        extractors = [
+            bug_features.has_str(),
+            bug_features.has_regression_range(),
+            bug_features.severity(),
+            bug_features.keywords(),
+            bug_features.is_coverity_issue(),
+            bug_features.has_crash_signature(),
+            bug_features.has_url(),
+            bug_features.has_w3c_url(),
+            bug_features.has_github_url(),
+            bug_features.whiteboard(),
+            bug_features.patches(),
+            bug_features.landings(),
+            bug_features.title(),
+            bug_features.comments(),
+        ]
+        bug_step = ('bug_extractor', bug_features.BugExtractor(extractors))
+        # Each branch starts with a DictSelector for one key, followed by a vectorizer.
+        data_branch = Pipeline([
+            ('selector', DictSelector(key='data')),
+            ('vect', DictVectorizer()),
+        ])
+        title_branch = Pipeline([
+            ('selector', DictSelector(key='title')),
+            ('tfidf', self.text_vectorizer(stop_words='english')),
+        ])
+        comments_branch = Pipeline([
+            ('selector', DictSelector(key='comments')),
+            ('tfidf', self.text_vectorizer(stop_words='english')),
+        ])
+        union = FeatureUnion(transformer_list=[
+            ('data', data_branch), ('title', title_branch), ('comments', comments_branch),
+        ])
+        self.extraction_pipeline = Pipeline([bug_step, ('union', union)])
+
+        self.clf = xgboost.XGBClassifier(n_jobs=16)
diff --git a/run.py b/run.py
index 85c8a5da..57b64966 100644
--- a/run.py
+++ b/run.py
@@ -13,7 +13,7 @@ if __name__ == '__main__':
     parser.add_argument('--lemmatization', help='Perform lemmatization (using spaCy)', action='store_true')
     parser.add_argument('--download', help='Download data required for training', action='store_true')
     parser.add_argument('--train', help='Perform training', action='store_true')
-    parser.add_argument('--goal', help='Goal of the classifier', choices=['bug', 'regression', 'tracking', 'qaneeded'], default='bug')
+    parser.add_argument('--goal', help='Goal of the classifier', choices=['bug', 'regression', 'tracking', 'qaneeded', 'uplift'], default='bug')
     args = parser.parse_args()
 
     if args.download:
@@ -34,6 +34,9 @@ if __name__ == '__main__':
     elif args.goal == 'qaneeded':
         from bugbug.models.qaneeded import QANeededModel
         model_class = QANeededModel
+    elif args.goal == 'uplift':
+        from bugbug.models.uplift import UpliftModel
+        model_class = UpliftModel
 
     if args.train:
         model = model_class(args.lemmatization)