зеркало из https://github.com/mozilla/bugbug.git
Add a module to perform classification
This commit is contained in:
Родитель
dc267ffb3d
Коммит
30ee98533b
|
@ -5,6 +5,9 @@
|
|||
|
||||
import re
|
||||
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.base import TransformerMixin
|
||||
|
||||
|
||||
def field(bug, field):
|
||||
if field in bug and bug[field] != '---':
|
||||
|
@ -169,3 +172,54 @@ feature_extractors = [
|
|||
title,
|
||||
comments,
|
||||
]
|
||||
|
||||
|
||||
class BugExtractor(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that turns raw bug dicts into feature records.

    For every input bug, ``transform`` produces a dict with the keys:

    * ``'data'``     -- values returned by the module-level
      ``feature_extractors``, keyed by extractor function name;
    * ``'title'``    -- the bug's ``'summary'`` field;
    * ``'comments'`` -- the ``'text'`` of every comment, joined by spaces;
    * ``'commits'``  -- only when ``commit_messages_map`` is not ``None``:
      the entry stored for the bug id, or ``''`` when there is none.

    Parameters
    ----------
    commit_messages_map : dict or None
        Maps a bug id to the text of the commit messages for that bug.
    """

    def __init__(self, commit_messages_map=None):
        self.commit_messages_map = commit_messages_map

    def fit(self, x, y=None):
        """Nothing is learned from the data; return the instance itself."""
        return self

    def transform(self, bugs):
        """Return one feature record (a dict) per bug in ``bugs``."""
        records = []

        for bug in bugs:
            bug_id = bug['id']

            features = {}
            for extractor in feature_extractors:
                value = extractor(bug)

                if value is None:
                    # The extractor has nothing to report for this bug.
                    continue
                elif isinstance(value, list):
                    # A list result becomes one flag per item:
                    # "<extractor>-<item>" -> 'True'.
                    for entry in value:
                        features[extractor.__name__ + '-' + entry] = 'True'
                else:
                    # Booleans are stored as their string form; any other
                    # value is stored unchanged.
                    features[extractor.__name__] = str(value) if isinstance(value, bool) else value

            # TODO: Alternative features, to integrate in bug_features.py
            # for f in bugbug.feature_rules + bugbug.bug_rules:
            #     data[f.__name__] = f(bug)

            # TODO: Try simply using all possible fields instead of extracting features manually.

            record = {
                'data': features,
                'title': bug['summary'],
                'comments': ' '.join(comment['text'] for comment in bug['comments']),
            }

            if self.commit_messages_map is not None:
                if bug_id in self.commit_messages_map:
                    record['commits'] = self.commit_messages_map[bug_id]
                else:
                    record['commits'] = ''

            records.append(record)

        return records
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
import itertools
|
||||
import json
|
||||
import os
|
||||
|
||||
|
@ -41,15 +42,18 @@ def get_bugs():
|
|||
|
||||
|
||||
def download_bugs(bug_ids):
|
||||
old_bug_ids = set()
|
||||
old_bug_count = 0
|
||||
old_bugs = []
|
||||
new_bug_ids = set(bug_ids)
|
||||
for bug in get_bugs():
|
||||
old_bug_ids.add(bug['id'])
|
||||
old_bug_count += 1
|
||||
if bug['id'] in new_bug_ids:
|
||||
old_bugs.append(bug)
|
||||
new_bug_ids.remove(bug['id'])
|
||||
|
||||
bug_ids = [bug_id for bug_id in bug_ids if bug_id not in old_bug_ids]
|
||||
print('Loaded {} bugs.'.format(old_bug_count))
|
||||
|
||||
print('Loaded ' + str(len(old_bug_ids)) + ' bugs.')
|
||||
|
||||
print('To download ' + str(len(bug_ids)) + ' bugs.')
|
||||
print('To download {} bugs.'.format(len(new_bug_ids)))
|
||||
|
||||
new_bugs = {}
|
||||
|
||||
|
@ -86,9 +90,11 @@ def download_bugs(bug_ids):
|
|||
|
||||
new_bugs[bug_id]['history'] = bug['history']
|
||||
|
||||
bugzilla.Bugzilla(bug_ids, bughandler=bughandler, commenthandler=commenthandler, comment_include_fields=COMMENT_INCLUDE_FIELDS, attachmenthandler=attachmenthandler, attachment_include_fields=ATTACHMENT_INCLUDE_FIELDS, historyhandler=historyhandler).get_data().wait()
|
||||
bugzilla.Bugzilla(new_bug_ids, bughandler=bughandler, commenthandler=commenthandler, comment_include_fields=COMMENT_INCLUDE_FIELDS, attachmenthandler=attachmenthandler, attachment_include_fields=ATTACHMENT_INCLUDE_FIELDS, historyhandler=historyhandler).get_data().wait()
|
||||
|
||||
print('Total number of bugs: {}'.format(len(old_bug_ids) + len(new_bugs)))
|
||||
print('Total number of bugs: {}'.format(old_bug_count + len(new_bugs)))
|
||||
|
||||
if len(new_bugs):
|
||||
db.append(BUGS_DB, new_bugs.values())
|
||||
|
||||
return itertools.chain(old_bugs, new_bugs.items())
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
from sklearn.externals import joblib
|
||||
|
||||
from bugbug import bugzilla
|
||||
|
||||
extraction_pipeline = None
|
||||
clf = None
|
||||
|
||||
|
||||
def init(model):
    """Load a trained model and install it as this module's active model.

    ``model`` is handed to ``joblib.load``. The loaded object must unpack
    into two values, which are stored in the module-level globals
    ``extraction_pipeline`` and ``clf`` (in that order) for ``classify``
    to use.

    NOTE: joblib files are pickle-based, so loading one can execute
    arbitrary code -- only load model files from a trusted source.
    """
    global extraction_pipeline, clf

    loaded = joblib.load(model)
    extraction_pipeline, clf = loaded
|
||||
|
||||
|
||||
def classify(bugs):
    """Classify one or more bugs with the model loaded by ``init``.

    ``bugs`` may be a single item or a list of items. If the first item is
    an ``int`` or ``str``, all items are treated as bug ids: they are
    converted with ``int`` and fetched via ``bugzilla.download_bugs``.
    Otherwise the items are passed to the extraction pipeline as they are.

    Returns whatever ``clf.predict`` returns for the extracted features.

    Raises ``AssertionError`` when ``bugs`` is ``None`` or the module has not
    been initialized, and ``IndexError`` when ``bugs`` is an empty list
    (the first element is inspected to decide how to treat the input).
    """
    assert bugs is not None
    assert extraction_pipeline is not None and clf is not None, 'The module needs to be initialized first'

    # Normalize a single item to a one-element list.
    bug_list = bugs if isinstance(bugs, list) else [bugs]

    # Only the first element is checked to detect a list of bug ids.
    if isinstance(bug_list[0], (int, str)):
        bug_ids = [int(i) for i in bug_list]
        bug_list = list(bugzilla.download_bugs(bug_ids))

    features = extraction_pipeline.transform(bug_list)
    return clf.predict(features)
|
|
@ -73,4 +73,4 @@ def get_bugbug_labels(augmentation=False):
|
|||
|
||||
if __name__ == '__main__':
|
||||
classes = get_bugbug_labels(augmentation=False)
|
||||
bugzilla.download_bugs([bug_id for bug_id in classes.keys()])
|
||||
bugzilla.download_and_store_bugs([bug_id for bug_id in classes.keys()])
|
||||
|
|
|
@ -0,0 +1,21 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
import spacy
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
|
||||
nlp = spacy.load('en')
|
||||
|
||||
|
||||
def spacy_token_lemmatizer(text):
    """Return the lemma of every token the module-level ``nlp`` finds in ``text``.

    Text longer than ``nlp.max_length`` is first cut down to
    ``nlp.max_length - 1`` characters, so over-long input is truncated rather
    than passed to the pipeline at full length.
    """
    limit = nlp.max_length
    if len(text) > limit:
        text = text[:limit - 1]

    return [token.lemma_ for token in nlp(text)]
|
||||
|
||||
|
||||
class SpacyVectorizer(TfidfVectorizer):
    """``TfidfVectorizer`` that always tokenizes with ``spacy_token_lemmatizer``.

    Every positional and keyword argument is forwarded unchanged to
    ``TfidfVectorizer.__init__``. Passing ``tokenizer`` as a keyword raises
    ``TypeError``, because that argument is already supplied here.
    """

    def __init__(self, *args, **kwargs):
        # NOTE(review): scikit-learn expects estimator parameters to be listed
        # explicitly in __init__ (no *args); get_params()/clone() may not work
        # on this class as written -- confirm before using it in a grid search.
        super().__init__(*args, tokenizer=spacy_token_lemmatizer, **kwargs)
|
|
@ -3,10 +3,7 @@
|
|||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
from typing import Dict
|
||||
|
||||
import numpy as np
|
||||
import spacy
|
||||
import xgboost
|
||||
from imblearn.under_sampling import RandomUnderSampler
|
||||
from sklearn import metrics
|
||||
|
@ -22,86 +19,34 @@ from bugbug import bug_features
|
|||
from bugbug import bugzilla
|
||||
from bugbug import repository
|
||||
from bugbug.labels import get_bugbug_labels
|
||||
from bugbug.utils import ItemSelector
|
||||
|
||||
nlp = spacy.load('en')
|
||||
|
||||
|
||||
def spacy_token_lemmatizer(text):
|
||||
if len(text) > nlp.max_length:
|
||||
text = text[:nlp.max_length - 1]
|
||||
doc = nlp(text)
|
||||
return [token.lemma_ for token in doc]
|
||||
|
||||
|
||||
class SpacyVectorizer(TfidfVectorizer):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(tokenizer=spacy_token_lemmatizer, *args, **kwargs)
|
||||
from bugbug.nlp import SpacyVectorizer
|
||||
from bugbug.utils import DictSelector
|
||||
|
||||
|
||||
def train(model=None, lemmatization=False):
|
||||
# Get labels.
|
||||
classes = get_bugbug_labels(augmentation=True)
|
||||
|
||||
labels = []
|
||||
bugs: Dict = {
|
||||
'data': [],
|
||||
'title': [],
|
||||
'comments': [],
|
||||
'commits': [],
|
||||
}
|
||||
|
||||
bug_id_to_commit_messages = {}
|
||||
commit_messages_map = {}
|
||||
for commit in repository.get_commits():
|
||||
bug_id = commit['bug_id']
|
||||
|
||||
if not bug_id:
|
||||
continue
|
||||
|
||||
if bug_id not in bug_id_to_commit_messages:
|
||||
bug_id_to_commit_messages[bug_id] = ' '
|
||||
if bug_id not in commit_messages_map:
|
||||
commit_messages_map[bug_id] = ' '
|
||||
|
||||
bug_id_to_commit_messages[bug_id] += commit['desc']
|
||||
commit_messages_map[bug_id] += commit['desc']
|
||||
|
||||
for bug in bugzilla.get_bugs():
|
||||
bug_id = bug['id']
|
||||
# Get bugs.
|
||||
bugs = bugzilla.get_bugs()
|
||||
|
||||
if bug_id not in classes:
|
||||
continue
|
||||
# Filter out bugs for which we have no labels.
|
||||
bugs = [bug for bug in bugs if bug['id'] in classes]
|
||||
|
||||
labels.append(1 if classes[bug_id] else 0)
|
||||
|
||||
data = {}
|
||||
|
||||
for f in bug_features.feature_extractors:
|
||||
res = f(bug)
|
||||
|
||||
if res is None:
|
||||
continue
|
||||
|
||||
if isinstance(res, list):
|
||||
for item in res:
|
||||
data[f.__name__ + '-' + item] = 'True'
|
||||
continue
|
||||
|
||||
if isinstance(res, bool):
|
||||
res = str(res)
|
||||
|
||||
data[f.__name__] = res
|
||||
|
||||
# TODO: Alternative features, to integrate in bug_features.py
|
||||
# for f in bugbug.feature_rules + bugbug.bug_rules:
|
||||
# data[f.__name__] = f(bug)
|
||||
|
||||
# TODO: Try simply using all possible fields instead of extracting features manually.
|
||||
|
||||
bugs['data'].append(data)
|
||||
bugs['title'].append(bug['summary'])
|
||||
bugs['comments'].append(' '.join([c['text'] for c in bug['comments']]))
|
||||
bugs['commits'].append(bug_id_to_commit_messages[bug_id] if bug_id in bug_id_to_commit_messages else '')
|
||||
|
||||
# Turn the labels array into a numpy array for scikit-learn consumption.
|
||||
y = np.array(labels)
|
||||
# Calculate labels.
|
||||
y = np.array([1 if classes[bug['id']] else 0 for bug in bugs])
|
||||
|
||||
if lemmatization:
|
||||
text_vectorizer = SpacyVectorizer
|
||||
|
@ -114,25 +59,26 @@ def train(model=None, lemmatization=False):
|
|||
|
||||
# Extract features from the bugs.
|
||||
extraction_pipeline = Pipeline([
|
||||
('bug_extractor', bug_features.BugExtractor(commit_messages_map)),
|
||||
('union', FeatureUnion(
|
||||
transformer_list=[
|
||||
('data', Pipeline([
|
||||
('selector', ItemSelector(key='data')),
|
||||
('selector', DictSelector(key='data')),
|
||||
('vect', DictVectorizer()),
|
||||
])),
|
||||
|
||||
('title', Pipeline([
|
||||
('selector', ItemSelector(key='title')),
|
||||
('selector', DictSelector(key='title')),
|
||||
('tfidf', text_vectorizer(stop_words='english')),
|
||||
])),
|
||||
|
||||
('comments', Pipeline([
|
||||
('selector', ItemSelector(key='comments')),
|
||||
('selector', DictSelector(key='comments')),
|
||||
('tfidf', text_vectorizer(stop_words='english')),
|
||||
])),
|
||||
|
||||
# ('commits', Pipeline([
|
||||
# ('selector', ItemSelector(key='commits')),
|
||||
# ('selector', DictSelector(key='commits')),
|
||||
# ('tfidf', text_vectorizer(stop_words='english')),
|
||||
# ])),
|
||||
],
|
||||
|
@ -168,4 +114,4 @@ def train(model=None, lemmatization=False):
|
|||
print(metrics.confusion_matrix(y_test, y_pred))
|
||||
|
||||
if model is not None:
|
||||
joblib.dump(clf, model)
|
||||
joblib.dump((extraction_pipeline, clf), model)
|
||||
|
|
|
@ -3,47 +3,16 @@
|
|||
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
# You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
from typing import Dict
|
||||
|
||||
from sklearn.base import BaseEstimator
|
||||
from sklearn.base import TransformerMixin
|
||||
|
||||
|
||||
# From http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html.
|
||||
class ItemSelector(BaseEstimator, TransformerMixin):
|
||||
"""For data grouped by feature, select subset of data at a provided key.
|
||||
|
||||
The data is expected to be stored in a 2D data structure, where the first
|
||||
index is over features and the second is over samples. i.e.
|
||||
|
||||
>> len(data[key]) == n_samples
|
||||
|
||||
Please note that this is the opposite convention to scikit-learn feature
|
||||
matrixes (where the first index corresponds to sample).
|
||||
|
||||
ItemSelector only requires that the collection implement getitem
|
||||
(data[key]). Examples include: a dict of lists, 2D numpy array, Pandas
|
||||
DataFrame, numpy record array, etc.
|
||||
|
||||
>> data = {'a': [1, 5, 2, 5, 2, 8],
|
||||
'b': [9, 4, 1, 4, 1, 3]}
|
||||
>> ds = ItemSelector(key='a')
|
||||
>> data['a'] == ds.transform(data)
|
||||
|
||||
ItemSelector is not designed to handle data grouped by sample. (e.g. a
|
||||
list of dicts). If your data is structured this way, consider a
|
||||
transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
key : hashable, required
|
||||
The key corresponding to the desired value in a mappable.
|
||||
"""
|
||||
class DictSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks the value stored under one key in each element.

    The input to ``transform`` is an iterable of mappings (one per sample);
    the output yields ``element[key]`` for each of them, in order.

    Parameters
    ----------
    key : hashable
        The key to look up in every element.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned from the data; return the instance itself."""
        return self

    def transform(self, data):
        """Return a generator over ``item[self.key]`` for each item of ``data``.

        The result is lazy and can be consumed only once.
        """
        return (item[self.key] for item in data)
|
||||
|
|
17
run.py
17
run.py
|
@ -5,12 +5,27 @@
|
|||
|
||||
import argparse
|
||||
|
||||
from bugbug import bugzilla
|
||||
from bugbug import classify
|
||||
from bugbug import train
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--lemmatization', help='Perform lemmatization (using spaCy)', action='store_true')
|
||||
parser.add_argument('--model', nargs='?', help='Path where to store the model file')
|
||||
parser.add_argument('--train', help='Perform training', action='store_true')
|
||||
args = parser.parse_args()
|
||||
|
||||
train.train(model=args.model, lemmatization=args.lemmatization)
|
||||
if args.train:
|
||||
train.train(model=args.model, lemmatization=args.lemmatization)
|
||||
|
||||
classify.init(args.model)
|
||||
|
||||
for bug in bugzilla.get_bugs():
|
||||
print('https://bugzilla.mozilla.org/show_bug.cgi?id={} - {}'.format(bug['id'], bug['summary']))
|
||||
c = classify.classify(bug)
|
||||
if c == 1:
|
||||
print('It\'s a bug!')
|
||||
else:
|
||||
print('It\'s not a bug!')
|
||||
input()
|
||||
|
|
Загрузка…
Ссылка в новой задаче