Add a module to perform classification

This commit is contained in:
Marco Castelluccio 2018-11-21 01:06:30 +01:00
Родитель dc267ffb3d
Коммит 30ee98533b
8 изменённых файлов: 157 добавлений и 116 удалений

Просмотреть файл

@ -5,6 +5,9 @@
import re
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
def field(bug, field):
if field in bug and bug[field] != '---':
@ -169,3 +172,54 @@ feature_extractors = [
title,
comments,
]
class BugExtractor(BaseEstimator, TransformerMixin):
    """Transformer that turns raw bug dicts into per-bug feature records.

    Every record produced by ``transform`` is a dict with the keys ``data``
    (feature name -> value, computed by the functions in
    ``feature_extractors``), ``title`` and ``comments``. A ``commits`` key is
    added only when a commit message map was given to the constructor.

    Parameters
    ----------
    commit_messages_map : dict, optional
        Maps a bug id to the text of its commit messages. Bugs that are
        missing from the map get an empty string.
    """

    def __init__(self, commit_messages_map=None):
        self.commit_messages_map = commit_messages_map

    def fit(self, x, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, bugs):
        """Build one record per bug, in input order.

        Parameters
        ----------
        bugs : iterable of dict
            Each bug must provide the ``id``, ``summary`` and ``comments``
            keys (every comment needs a ``text`` key), plus whatever the
            feature extractors read.

        Returns
        -------
        list of dict
        """
        results = []

        for bug in bugs:
            bug_id = bug['id']

            result = {
                'data': self._extract_data(bug),
                'title': bug['summary'],
                'comments': ' '.join([c['text'] for c in bug['comments']]),
            }

            if self.commit_messages_map is not None:
                # Single lookup with a default instead of "in" followed by indexing.
                result['commits'] = self.commit_messages_map.get(bug_id, '')

            results.append(result)

        return results

    @staticmethod
    def _extract_data(bug):
        """Run every feature extractor on ``bug`` and collect the results by extractor name."""
        data = {}

        for f in feature_extractors:
            res = f(bug)

            if res is None:
                # The extractor produced nothing for this bug.
                continue

            if isinstance(res, list):
                # A list result becomes one '<extractor>-<item>' entry per item.
                for item in res:
                    data[f.__name__ + '-' + item] = 'True'
                continue

            if isinstance(res, bool):
                # Booleans are stored as the strings 'True' / 'False'.
                res = str(res)

            data[f.__name__] = res

        # TODO: Alternative features, to integrate in bug_features.py
        # for f in bugbug.feature_rules + bugbug.bug_rules:
        #     data[f.__name__] = f(bug)

        # TODO: Try simply using all possible fields instead of extracting features manually.

        return data

Просмотреть файл

@ -3,6 +3,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import itertools
import json
import os
@ -41,15 +42,18 @@ def get_bugs():
def download_bugs(bug_ids):
old_bug_ids = set()
old_bug_count = 0
old_bugs = []
new_bug_ids = set(bug_ids)
for bug in get_bugs():
old_bug_ids.add(bug['id'])
old_bug_count += 1
if bug['id'] in new_bug_ids:
old_bugs.append(bug)
new_bug_ids.remove(bug['id'])
bug_ids = [bug_id for bug_id in bug_ids if bug_id not in old_bug_ids]
print('Loaded {} bugs.'.format(old_bug_count))
print('Loaded ' + str(len(old_bug_ids)) + ' bugs.')
print('To download ' + str(len(bug_ids)) + ' bugs.')
print('To download {} bugs.'.format(len(new_bug_ids)))
new_bugs = {}
@ -86,9 +90,11 @@ def download_bugs(bug_ids):
new_bugs[bug_id]['history'] = bug['history']
bugzilla.Bugzilla(bug_ids, bughandler=bughandler, commenthandler=commenthandler, comment_include_fields=COMMENT_INCLUDE_FIELDS, attachmenthandler=attachmenthandler, attachment_include_fields=ATTACHMENT_INCLUDE_FIELDS, historyhandler=historyhandler).get_data().wait()
bugzilla.Bugzilla(new_bug_ids, bughandler=bughandler, commenthandler=commenthandler, comment_include_fields=COMMENT_INCLUDE_FIELDS, attachmenthandler=attachmenthandler, attachment_include_fields=ATTACHMENT_INCLUDE_FIELDS, historyhandler=historyhandler).get_data().wait()
print('Total number of bugs: {}'.format(len(old_bug_ids) + len(new_bugs)))
print('Total number of bugs: {}'.format(old_bug_count + len(new_bugs)))
if len(new_bugs):
db.append(BUGS_DB, new_bugs.values())
return itertools.chain(old_bugs, new_bugs.items())

30
bugbug/classify.py Normal file
Просмотреть файл

@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
from sklearn.externals import joblib
from bugbug import bugzilla
# Module-level state filled in by init(); classify() asserts that neither is still None.
extraction_pipeline = None
clf = None
def init(model):
    """Load a saved ``(extraction_pipeline, clf)`` pair into the module globals.

    ``model`` is handed straight to ``joblib.load``; the loaded object must
    unpack into exactly two values.

    NOTE(review): joblib.load unpickles its input, so only trusted model
    files should be passed here.
    """
    global extraction_pipeline, clf

    loaded = joblib.load(model)
    extraction_pipeline, clf = loaded
def classify(bugs):
    """Predict a class for one bug or for a list of bugs.

    ``bugs`` may be a single item or a list of items. When the first item is
    an ``int`` or ``str`` the items are treated as bug ids and the bugs are
    fetched through ``bugzilla.download_bugs``; anything else is passed to the
    extraction pipeline unchanged. ``init`` must have been called beforehand.

    Returns whatever ``clf.predict`` returns for the extracted features.
    """
    assert bugs is not None
    assert extraction_pipeline is not None and clf is not None, 'The module needs to be initialized first'

    # Normalise a single item to a one-element batch.
    batch = bugs if isinstance(bugs, list) else [bugs]

    # Only the first element decides whether the batch holds ids or bugs.
    if isinstance(batch[0], (int, str)):
        ids = [int(i) for i in batch]
        batch = list(bugzilla.download_bugs(ids))

    features = extraction_pipeline.transform(batch)
    return clf.predict(features)

Просмотреть файл

@ -73,4 +73,4 @@ def get_bugbug_labels(augmentation=False):
# Script entry point: download the bugs for every bug id that has a label.
if __name__ == '__main__':
classes = get_bugbug_labels(augmentation=False)
bugzilla.download_bugs([bug_id for bug_id in classes.keys()])
# NOTE(review): no definition of bugzilla.download_and_store_bugs is visible in this change — confirm it exists.
bugzilla.download_and_store_bugs([bug_id for bug_id in classes.keys()])

21
bugbug/nlp.py Normal file
Просмотреть файл

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
# Shared spaCy pipeline, loaded once when this module is imported.
nlp = spacy.load('en')
def spacy_token_lemmatizer(text):
    """Return the lemma of every token spaCy finds in ``text``.

    Text longer than ``nlp.max_length`` is cut to ``nlp.max_length - 1``
    characters before it is parsed.
    """
    if len(text) > nlp.max_length:
        limit = nlp.max_length - 1
        text = text[:limit]

    return [token.lemma_ for token in nlp(text)]
class SpacyVectorizer(TfidfVectorizer):
    """``TfidfVectorizer`` that tokenizes with ``spacy_token_lemmatizer``.

    All constructor arguments are forwarded to ``TfidfVectorizer``; passing
    ``tokenizer`` explicitly raises ``TypeError`` because it is already set.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, tokenizer=spacy_token_lemmatizer, **kwargs)

Просмотреть файл

@ -3,10 +3,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
from typing import Dict
import numpy as np
import spacy
import xgboost
from imblearn.under_sampling import RandomUnderSampler
from sklearn import metrics
@ -22,86 +19,34 @@ from bugbug import bug_features
from bugbug import bugzilla
from bugbug import repository
from bugbug.labels import get_bugbug_labels
from bugbug.utils import ItemSelector
# Shared spaCy pipeline, loaded once when this module is imported.
nlp = spacy.load('en')
def spacy_token_lemmatizer(text):
    """Return the lemma of every token spaCy finds in ``text``.

    Text longer than ``nlp.max_length`` is cut to ``nlp.max_length - 1``
    characters before it is parsed.
    """
    if len(text) > nlp.max_length:
        limit = nlp.max_length - 1
        text = text[:limit]

    return [token.lemma_ for token in nlp(text)]
class SpacyVectorizer(TfidfVectorizer):
    """``TfidfVectorizer`` that tokenizes with ``spacy_token_lemmatizer``.

    All constructor arguments are forwarded to ``TfidfVectorizer``; passing
    ``tokenizer`` explicitly raises ``TypeError`` because it is already set.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, tokenizer=spacy_token_lemmatizer, **kwargs)
from bugbug.nlp import SpacyVectorizer
from bugbug.utils import DictSelector
def train(model=None, lemmatization=False):
# Get labels.
classes = get_bugbug_labels(augmentation=True)
labels = []
bugs: Dict = {
'data': [],
'title': [],
'comments': [],
'commits': [],
}
bug_id_to_commit_messages = {}
commit_messages_map = {}
for commit in repository.get_commits():
bug_id = commit['bug_id']
if not bug_id:
continue
if bug_id not in bug_id_to_commit_messages:
bug_id_to_commit_messages[bug_id] = ' '
if bug_id not in commit_messages_map:
commit_messages_map[bug_id] = ' '
bug_id_to_commit_messages[bug_id] += commit['desc']
commit_messages_map[bug_id] += commit['desc']
for bug in bugzilla.get_bugs():
bug_id = bug['id']
# Get bugs.
bugs = bugzilla.get_bugs()
if bug_id not in classes:
continue
# Filter out bugs for which we have no labels.
bugs = [bug for bug in bugs if bug['id'] in classes]
labels.append(1 if classes[bug_id] else 0)
data = {}
for f in bug_features.feature_extractors:
res = f(bug)
if res is None:
continue
if isinstance(res, list):
for item in res:
data[f.__name__ + '-' + item] = 'True'
continue
if isinstance(res, bool):
res = str(res)
data[f.__name__] = res
# TODO: Alternative features, to integrate in bug_features.py
# for f in bugbug.feature_rules + bugbug.bug_rules:
# data[f.__name__] = f(bug)
# TODO: Try simply using all possible fields instead of extracting features manually.
bugs['data'].append(data)
bugs['title'].append(bug['summary'])
bugs['comments'].append(' '.join([c['text'] for c in bug['comments']]))
bugs['commits'].append(bug_id_to_commit_messages[bug_id] if bug_id in bug_id_to_commit_messages else '')
# Turn the labels array into a numpy array for scikit-learn consumption.
y = np.array(labels)
# Calculate labels.
y = np.array([1 if classes[bug['id']] else 0 for bug in bugs])
if lemmatization:
text_vectorizer = SpacyVectorizer
@ -114,25 +59,26 @@ def train(model=None, lemmatization=False):
# Extract features from the bugs.
extraction_pipeline = Pipeline([
('bug_extractor', bug_features.BugExtractor(commit_messages_map)),
('union', FeatureUnion(
transformer_list=[
('data', Pipeline([
('selector', ItemSelector(key='data')),
('selector', DictSelector(key='data')),
('vect', DictVectorizer()),
])),
('title', Pipeline([
('selector', ItemSelector(key='title')),
('selector', DictSelector(key='title')),
('tfidf', text_vectorizer(stop_words='english')),
])),
('comments', Pipeline([
('selector', ItemSelector(key='comments')),
('selector', DictSelector(key='comments')),
('tfidf', text_vectorizer(stop_words='english')),
])),
# ('commits', Pipeline([
# ('selector', ItemSelector(key='commits')),
# ('selector', DictSelector(key='commits')),
# ('tfidf', text_vectorizer(stop_words='english')),
# ])),
],
@ -168,4 +114,4 @@ def train(model=None, lemmatization=False):
print(metrics.confusion_matrix(y_test, y_pred))
if model is not None:
joblib.dump(clf, model)
joblib.dump((extraction_pipeline, clf), model)

Просмотреть файл

@ -3,47 +3,16 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
from typing import Dict
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
# From http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html.
class ItemSelector(BaseEstimator, TransformerMixin):
"""For data grouped by feature, select subset of data at a provided key.
The data is expected to be stored in a 2D data structure, where the first
index is over features and the second is over samples. i.e.
>> len(data[key]) == n_samples
Please note that this is the opposite convention to scikit-learn feature
matrixes (where the first index corresponds to sample).
ItemSelector only requires that the collection implement getitem
(data[key]). Examples include: a dict of lists, 2D numpy array, Pandas
DataFrame, numpy record array, etc.
>> data = {'a': [1, 5, 2, 5, 2, 8],
'b': [9, 4, 1, 4, 1, 3]}
>> ds = ItemSelector(key='a')
>> data['a'] == ds.transform(data)
ItemSelector is not designed to handle data grouped by sample. (e.g. a
list of dicts). If your data is structured this way, consider a
transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.
Parameters
----------
key : hashable, required
The key corresponding to the desired value in a mappable.
"""
class DictSelector(BaseEstimator, TransformerMixin):
    """Select the value stored under one key from every element of the input.

    Parameters
    ----------
    key : hashable
        Key looked up in each element passed to ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, data):
        """Return a generator yielding ``elem[self.key]`` for each element of ``data``.

        The result is lazy and can be iterated only once. The earlier
        ``transform(data_dict)`` definition was shadowed by this one and has
        been dropped.
        """
        return (elem[self.key] for elem in data)

17
run.py
Просмотреть файл

@ -5,12 +5,27 @@
import argparse
from bugbug import bugzilla
from bugbug import classify
from bugbug import train
if __name__ == '__main__':
    # Command-line entry point: optionally train a model, then classify the
    # bugs returned by bugzilla.get_bugs() one at a time.
    parser = argparse.ArgumentParser()
    parser.add_argument('--lemmatization', help='Perform lemmatization (using spaCy)', action='store_true')
    parser.add_argument('--model', nargs='?', help='Path where to store the model file')
    parser.add_argument('--train', help='Perform training', action='store_true')
    args = parser.parse_args()

    # Train only on request; an unconditional call here would retrain on every
    # run and make the --train flag meaningless.
    if args.train:
        train.train(model=args.model, lemmatization=args.lemmatization)

    # classify.init() loads the model from this path, so it is mandatory from here on.
    if args.model is None:
        parser.error('--model is required to classify bugs')

    classify.init(args.model)

    for bug in bugzilla.get_bugs():
        print('https://bugzilla.mozilla.org/show_bug.cgi?id={} - {}'.format(bug['id'], bug['summary']))

        # A single bug was passed in, so take the single prediction.
        prediction = classify.classify(bug)[0]
        if prediction == 1:
            print('It\'s a bug!')
        else:
            print('It\'s not a bug!')

        # Wait for Enter before moving on to the next bug.
        input()