Add a module to perform classification

2018-11-21 01:06:30 +01:00 · 2018-11-21 01:06:30 +01:00 · 30ee98533b
--- a/bugbug/bug_features.py
+++ b/bugbug/bug_features.py
@ -5,6 +5,9 @@

 import re

+from sklearn.base import BaseEstimator
+from sklearn.base import TransformerMixin
+

 def field(bug, field):
    if field in bug and bug[field] != '---':
@ -169,3 +172,64 @@ feature_extractors = [
    title,
    comments,
 ]
+
+
+class BugExtractor(BaseEstimator, TransformerMixin):
+    """Stateless scikit-learn transformer turning raw bugs into feature records.
+
+    Each bug becomes a dict with the keys 'data' (the values produced by the
+    functions listed in feature_extractors), 'title', 'comments' and, when a
+    commit messages map was supplied, 'commits'.
+    """
+
+    def __init__(self, commit_messages_map=None):
+        # Optional mapping from a bug id to the text of its commit messages.
+        self.commit_messages_map = commit_messages_map
+
+    def fit(self, x, y=None):
+        # Nothing is learned from the data; fit only exists for the transformer API.
+        return self
+
+    def transform(self, bugs):
+        """Return a list with one feature record per bug, in input order."""
+        records = []
+
+        for bug in bugs:
+            bug_id = bug['id']
+
+            features = {}
+            for extractor in feature_extractors:
+                value = extractor(bug)
+                if value is None:
+                    # This extractor yields nothing for the bug: no entry at all.
+                    continue
+
+                name = extractor.__name__
+                if isinstance(value, list):
+                    # Multi-valued feature: one '<name>-<item>' flag per element.
+                    for item in value:
+                        features[name + '-' + item] = 'True'
+                elif isinstance(value, bool):
+                    # Booleans are stored as the strings 'True' / 'False'.
+                    features[name] = str(value)
+                else:
+                    features[name] = value
+
+            # TODO: Alternative features, to integrate in bug_features.py
+            # for f in bugbug.feature_rules + bugbug.bug_rules:
+            #     data[f.__name__] = f(bug)
+
+            # TODO: Try simply using all possible fields instead of extracting features manually.
+
+            record = {
+                'data': features,
+                'title': bug['summary'],
+                'comments': ' '.join(comment['text'] for comment in bug['comments']),
+            }
+
+            if self.commit_messages_map is not None:
+                record['commits'] = self.commit_messages_map.get(bug_id, '')
+
+            records.append(record)
+
+        return records
--- a/bugbug/bugzilla.py
+++ b/bugbug/bugzilla.py
@ -3,6 +3,7 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.

+import itertools
 import json
 import os

@ -41,15 +42,18 @@ def get_bugs():


 def download_bugs(bug_ids):
-    old_bug_ids = set()
+    old_bug_count = 0
+    old_bugs = []
+    new_bug_ids = set(bug_ids)
    for bug in get_bugs():
-        old_bug_ids.add(bug['id'])
+        old_bug_count += 1
+        if bug['id'] in new_bug_ids:
+            old_bugs.append(bug)
+            new_bug_ids.remove(bug['id'])

-    bug_ids = [bug_id for bug_id in bug_ids if bug_id not in old_bug_ids]
+    print('Loaded {} bugs.'.format(old_bug_count))

-    print('Loaded ' + str(len(old_bug_ids)) + ' bugs.')
-
-    print('To download ' + str(len(bug_ids)) + ' bugs.')
+    print('To download {} bugs.'.format(len(new_bug_ids)))

    new_bugs = {}

@ -86,9 +90,12 @@ def download_bugs(bug_ids):

        new_bugs[bug_id]['history'] = bug['history']

-    bugzilla.Bugzilla(bug_ids, bughandler=bughandler, commenthandler=commenthandler, comment_include_fields=COMMENT_INCLUDE_FIELDS, attachmenthandler=attachmenthandler, attachment_include_fields=ATTACHMENT_INCLUDE_FIELDS, historyhandler=historyhandler).get_data().wait()
+    bugzilla.Bugzilla(new_bug_ids, bughandler=bughandler, commenthandler=commenthandler, comment_include_fields=COMMENT_INCLUDE_FIELDS, attachmenthandler=attachmenthandler, attachment_include_fields=ATTACHMENT_INCLUDE_FIELDS, historyhandler=historyhandler).get_data().wait()

-    print('Total number of bugs: {}'.format(len(old_bug_ids) + len(new_bugs)))
+    print('Total number of bugs: {}'.format(old_bug_count + len(new_bugs)))

    if len(new_bugs):
        db.append(BUGS_DB, new_bugs.values())
+
+    # new_bugs is keyed by bug id: chain its values (not its items) so that every element returned is a bug dict, like the entries of old_bugs.
+    return itertools.chain(old_bugs, new_bugs.values())
--- a/bugbug/classify.py
+++ b/bugbug/classify.py
@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from sklearn.externals import joblib
+
+from bugbug import bugzilla
+
+# Populated by init(); both stay None until a model file has been loaded.
+extraction_pipeline = None
+clf = None
+
+
+def init(model):
+    """Load the feature extraction pipeline and the classifier stored at path `model`.
+
+    The file is expected to hold an (extraction_pipeline, clf) pair dumped with joblib.
+    """
+    global extraction_pipeline, clf
+    # NOTE: joblib.load unpickles the file, which can run arbitrary code; only load trusted models.
+    extraction_pipeline, clf = joblib.load(model)
+
+
+def classify(bugs):
+    """Predict the class of one or more bugs.
+
+    `bugs` is either a single bug or a list of bugs. Bugs are given as bug
+    dicts or as Bugzilla IDs (int or str); IDs are resolved to bug dicts with
+    bugzilla.download_bugs. Returns the result of clf.predict, or an empty
+    list when `bugs` is an empty list.
+    """
+    assert bugs is not None
+    assert extraction_pipeline is not None and clf is not None, 'The module needs to be initialized first'
+
+    if not isinstance(bugs, list):
+        bugs = [bugs]
+
+    # Nothing to classify: return before peeking at the first element.
+    if not bugs:
+        return []
+
+    # The type of the first element decides whether the list holds IDs or bug dicts.
+    if isinstance(bugs[0], (int, str)):
+        bugs = list(bugzilla.download_bugs([int(i) for i in bugs]))
+
+    X = extraction_pipeline.transform(bugs)
+    return clf.predict(X)
--- a/bugbug/labels.py
+++ b/bugbug/labels.py
@ -73,4 +73,5 @@ def get_bugbug_labels(augmentation=False):

 if __name__ == '__main__':
    classes = get_bugbug_labels(augmentation=False)
-    bugzilla.download_bugs([bug_id for bug_id in classes.keys()])
+    # download_bugs fetches the bugs missing from the local DB and appends them to it.
+    bugzilla.download_bugs(list(classes))
--- a/bugbug/nlp.py
+++ b/bugbug/nlp.py
@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import spacy
+from sklearn.feature_extraction.text import TfidfVectorizer
+
+# The spaCy 'en' model is loaded once, when this module is imported.
+nlp = spacy.load('en')
+
+
+def spacy_token_lemmatizer(text):
+    """Return the lemma of every token spaCy finds in `text`.
+
+    Text longer than nlp.max_length is cut to nlp.max_length - 1 characters
+    before being parsed.
+    """
+    too_long = len(text) > nlp.max_length
+    doc = nlp(text[:nlp.max_length - 1] if too_long else text)
+    return [token.lemma_ for token in doc]
+
+
+class SpacyVectorizer(TfidfVectorizer):
+    """TfidfVectorizer whose tokenizer is spacy_token_lemmatizer."""
+
+    def __init__(self, *args, **kwargs):
+        # Every other argument is forwarded untouched to TfidfVectorizer.
+        super().__init__(*args, tokenizer=spacy_token_lemmatizer, **kwargs)
--- a/bugbug/train.py
+++ b/bugbug/train.py
@ -3,10 +3,7 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.

-from typing import Dict
-
 import numpy as np
-import spacy
 import xgboost
 from imblearn.under_sampling import RandomUnderSampler
 from sklearn import metrics
@ -22,86 +19,34 @@ from bugbug import bug_features
 from bugbug import bugzilla
 from bugbug import repository
 from bugbug.labels import get_bugbug_labels
-from bugbug.utils import ItemSelector
-
-nlp = spacy.load('en')
-
-
-def spacy_token_lemmatizer(text):
-    if len(text) > nlp.max_length:
-        text = text[:nlp.max_length - 1]
-    doc = nlp(text)
-    return [token.lemma_ for token in doc]
-
-
-class SpacyVectorizer(TfidfVectorizer):
-    def __init__(self, *args, **kwargs):
-        super().__init__(tokenizer=spacy_token_lemmatizer, *args, **kwargs)
+from bugbug.nlp import SpacyVectorizer
+from bugbug.utils import DictSelector


 def train(model=None, lemmatization=False):
    # Get labels.
    classes = get_bugbug_labels(augmentation=True)

-    labels = []
-    bugs: Dict = {
-        'data': [],
-        'title': [],
-        'comments': [],
-        'commits': [],
-    }
-
-    bug_id_to_commit_messages = {}
+    commit_messages_map = {}
    for commit in repository.get_commits():
        bug_id = commit['bug_id']

        if not bug_id:
            continue

-        if bug_id not in bug_id_to_commit_messages:
-            bug_id_to_commit_messages[bug_id] = ' '
+        if bug_id not in commit_messages_map:
+            commit_messages_map[bug_id] = ' '

-        bug_id_to_commit_messages[bug_id] += commit['desc']
+        commit_messages_map[bug_id] += commit['desc']

-    for bug in bugzilla.get_bugs():
-        bug_id = bug['id']
+    # Get bugs.
+    bugs = bugzilla.get_bugs()

-        if bug_id not in classes:
-            continue
+    # Filter out bugs for which we have no labels.
+    bugs = [bug for bug in bugs if bug['id'] in classes]

-        labels.append(1 if classes[bug_id] else 0)
-
-        data = {}
-
-        for f in bug_features.feature_extractors:
-            res = f(bug)
-
-            if res is None:
-                continue
-
-            if isinstance(res, list):
-                for item in res:
-                    data[f.__name__ + '-' + item] = 'True'
-                continue
-
-            if isinstance(res, bool):
-                res = str(res)
-
-            data[f.__name__] = res
-
-        # TODO: Alternative features, to integreate in bug_features.py
-        # for f in bugbug.feature_rules + bugbug.bug_rules:
-        #     data[f.__name__] = f(bug)
-
-        # TODO: Try simply using all possible fields instead of extracting features manually.
-
-        bugs['data'].append(data)
-        bugs['title'].append(bug['summary'])
-        bugs['comments'].append(' '.join([c['text'] for c in bug['comments']]))
-        bugs['commits'].append(bug_id_to_commit_messages[bug_id] if bug_id in bug_id_to_commit_messages else '')
-
-    # Turn the labels array into a numpy array for scikit-learn consumption.
-    y = np.array(labels)
+    # Calculate labels.
+    y = np.array([1 if classes[bug['id']] else 0 for bug in bugs])

    if lemmatization:
        text_vectorizer = SpacyVectorizer
@ -114,25 +59,26 @@ def train(model=None, lemmatization=False):

    # Extract features from the bugs.
    extraction_pipeline = Pipeline([
+        ('bug_extractor', bug_features.BugExtractor(commit_messages_map)),
        ('union', FeatureUnion(
            transformer_list=[
                ('data', Pipeline([
-                    ('selector', ItemSelector(key='data')),
+                    ('selector', DictSelector(key='data')),
                    ('vect', DictVectorizer()),
                ])),

                ('title', Pipeline([
-                    ('selector', ItemSelector(key='title')),
+                    ('selector', DictSelector(key='title')),
                    ('tfidf', text_vectorizer(stop_words='english')),
                ])),

                ('comments', Pipeline([
-                    ('selector', ItemSelector(key='comments')),
+                    ('selector', DictSelector(key='comments')),
                    ('tfidf', text_vectorizer(stop_words='english')),
                ])),

                # ('commits', Pipeline([
-                #     ('selector', ItemSelector(key='commits')),
+                #     ('selector', DictSelector(key='commits')),
                #     ('tfidf', text_vectorizer(stop_words='english')),
                # ])),
            ],
@ -168,4 +114,4 @@ def train(model=None, lemmatization=False):
    print(metrics.confusion_matrix(y_test, y_pred))

    if model is not None:
-        joblib.dump(clf, model)
+        joblib.dump((extraction_pipeline, clf), model)
--- a/bugbug/utils.py
+++ b/bugbug/utils.py
@ -3,47 +3,21 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.

-from typing import Dict
-
 from sklearn.base import BaseEstimator
 from sklearn.base import TransformerMixin


-# From http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html.
-class ItemSelector(BaseEstimator, TransformerMixin):
-    """For data grouped by feature, select subset of data at a provided key.
-
-    The data is expected to be stored in a 2D data structure, where the first
-    index is over features and the second is over samples.  i.e.
-
-    >> len(data[key]) == n_samples
-
-    Please note that this is the opposite convention to scikit-learn feature
-    matrixes (where the first index corresponds to sample).
-
-    ItemSelector only requires that the collection implement getitem
-    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
-    DataFrame, numpy record array, etc.
-
-    >> data = {'a': [1, 5, 2, 5, 2, 8],
-               'b': [9, 4, 1, 4, 1, 3]}
-    >> ds = ItemSelector(key='a')
-    >> data['a'] == ds.transform(data)
-
-    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
-    list of dicts).  If your data is structured this way, consider a
-    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.
-
-    Parameters
-    ----------
-    key : hashable, required
-        The key corresponding to the desired value in a mappable.
-    """
+class DictSelector(BaseEstimator, TransformerMixin):
+    """Transformer selecting the value stored under `key` in each element of the input.
+
+    The input is expected to be an iterable of dict-like samples (e.g. a list of dicts).
+    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

-    def transform(self, data_dict: Dict):
-        return data_dict[self.key]
+    def transform(self, data):
+        # Build a list rather than a generator: the result has a length and can be traversed more than once.
+        return [elem[self.key] for elem in data]
--- a/run.py
+++ b/run.py
@ -5,12 +5,34 @@

 import argparse

+from bugbug import bugzilla
+from bugbug import classify
 from bugbug import train

 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--lemmatization', help='Perform lemmatization (using spaCy)', action='store_true')
    parser.add_argument('--model', nargs='?', help='Path where to store the model file')
+    parser.add_argument('--train', help='Perform training', action='store_true')
    args = parser.parse_args()

-    train.train(model=args.model, lemmatization=args.lemmatization)
+    # The classification step below always loads the model from disk, so a path
+    # is needed even when training; fail early instead of crashing in joblib.load.
+    if args.model is None:
+        parser.error('--model is required')
+
+    if args.train:
+        train.train(model=args.model, lemmatization=args.lemmatization)
+
+    classify.init(args.model)
+
+    # Walk through the stored bugs interactively, waiting for Enter after each one.
+    for bug in bugzilla.get_bugs():
+        print('https://bugzilla.mozilla.org/show_bug.cgi?id={} - {}'.format(bug['id'], bug['summary']))
+        # classify() predicts for a list of bugs; a single bug was passed, so take its only prediction.
+        c = classify.classify(bug)[0]
+        if c == 1:
+            print('It\'s a bug!')
+        else:
+            print('It\'s not a bug!')
+        input()