Make training code more generic to make it possible to train on other kinds of objects (e.g. commits) (#335)

* Move feature cleanup functions in a separate module

As they can be shared for different objectives, e.g. both training on bugs and on commits.

* Make Model more generic to make it possible to train on different objects

Introduce BugModel and CommitModel, as base classes for models training on bugs and on commits.

Update all models to use BugModel and to use the new feature_cleanup module.

Fixes #306.

* Update ID and description of the defect/enhancement/task Taskcluster task definition

* Add a module to extract features from commit data

* Add an example model training on commits to predict commits which will be backed out

* Update defect model name, and add possibility to train backout model
This commit is contained in:
Marco 2019-05-03 11:57:48 +02:00 коммит произвёл GitHub
Родитель 4b9b9fb234
Коммит 9995b8c236
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
19 изменённых файлов: 511 добавлений и 272 удалений

Просмотреть файл

@ -392,177 +392,6 @@ class had_severity_enhancement(object):
return False
def cleanup_url(text):
    """Replace URLs in *text* with placeholder tokens.

    Links whose host starts with ``hg.mozilla``, ``searchfox`` or
    ``dxr.mozilla`` become ``__CODE_REFERENCE_URL__``; every other token
    starting with ``http`` becomes ``__URL__``.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # The dots of the host names are escaped so they only match a literal ".";
    # unescaped, "hg.mozilla" would also accept e.g. "hgxmozilla".
    text = re.sub(
        r"https?://(?:hg\.mozilla|searchfox|dxr\.mozilla)\S+",
        "__CODE_REFERENCE_URL__",
        text,
    )
    # Code-reference links were rewritten above, so anything still starting
    # with "http" is treated as a generic URL.
    return re.sub(r"http\S+", "__URL__", text)
def cleanup_fileref(text):
    """Replace file names with a known source extension by ``__FILE_REFERENCE__``.

    The recognised extensions are py, json, js, jsm, html, css, c, cpp and h.
    """
    file_name_pattern = r"\w+\.py\b|\w+\.json\b|\w+\.js\b|\w+\.jsm\b|\w+\.html\b|\w+\.css\b|\w+\.c\b|\w+\.cpp\b|\w+\.h\b"
    placeholder = "__FILE_REFERENCE__"
    return re.sub(file_name_pattern, placeholder, text)
def cleanup_responses(text):
    """Replace every ``>`` and the rest of its line with a single space."""
    quoted_remainder = re.compile(">[^\n]+")
    return quoted_remainder.sub(" ", text)
def cleanup_hex(text):
    """Replace ``0x``/``0X``-prefixed hexadecimal literals with ``__HEX_NUMBER__``."""
    hex_literal = re.compile(r"\b0[xX][0-9a-fA-F]+\b")
    return hex_literal.sub("__HEX_NUMBER__", text)
# Regex alternation ("a|b|c") of library file names, with every "." escaped so
# the string can be embedded directly in a regular expression.
FIREFOX_DLLS_MATCH = "|".join(
    library_name.replace(".", r"\.")
    for library_name in (
        "libmozwayland.so",
        "libssl3.so",
        "libnssdbm3.so",
        "liblgpllibs.so",
        "libmozavutil.so",
        "libxul.so",
        "libmozgtk.so",
        "libnssckbi.so",
        "libclearkey.dylib",
        "libmozsqlite3.so",
        "libplc4.so",
        "libsmime3.so",
        "libclearkey.so",
        "libnssutil3.so",
        "libnss3.so",
        "libplds4.so",
        "libfreeblpriv3.so",
        "libsoftokn3.so",
        "libmozgtk.so",
        "libmozavcodec.so",
        "libnspr4.so",
        "IA2Marshal.dll",
        "lgpllibs.dll",
        "libEGL.dll",
        "libGLESv2.dll",
        "libmozsandbox.so",
        "AccessibleHandler.dll",
        "AccessibleMarshal.dll",
        "api-ms-win-core-console-l1-1-0.dll",
        "api-ms-win-core-datetime-l1-1-0.dll",
        "api-ms-win-core-debug-l1-1-0.dll",
        "api-ms-win-core-errorhandling-l1-1-0.dll",
        "api-ms-win-core-file-l1-1-0.dll",
        "api-ms-win-core-file-l1-2-0.dll",
        "api-ms-win-core-file-l2-1-0.dll",
        "api-ms-win-core-handle-l1-1-0.dll",
        "api-ms-win-core-heap-l1-1-0.dll",
        "api-ms-win-core-interlocked-l1-1-0.dll",
        "api-ms-win-core-libraryloader-l1-1-0.dll",
        "api-ms-win-core-localization-l1-2-0.dll",
        "api-ms-win-core-memory-l1-1-0.dll",
        "api-ms-win-core-namedpipe-l1-1-0.dll",
        "api-ms-win-core-processenvironment-l1-1-0.dll",
        "api-ms-win-core-processthreads-l1-1-0.dll",
        "api-ms-win-core-processthreads-l1-1-1.dll",
        "api-ms-win-core-profile-l1-1-0.dll",
        "api-ms-win-core-rtlsupport-l1-1-0.dll",
        "api-ms-win-core-string-l1-1-0.dll",
        "api-ms-win-core-synch-l1-1-0.dll",
        "api-ms-win-core-synch-l1-2-0.dll",
        "api-ms-win-core-sysinfo-l1-1-0.dll",
        "api-ms-win-core-timezone-l1-1-0.dll",
        "api-ms-win-core-util-l1-1-0.dll",
        "api-ms-win-crt-conio-l1-1-0.dll",
        "api-ms-win-crt-convert-l1-1-0.dll",
        "api-ms-win-crt-environment-l1-1-0.dll",
        "api-ms-win-crt-filesystem-l1-1-0.dll",
        "api-ms-win-crt-heap-l1-1-0.dll",
        "api-ms-win-crt-locale-l1-1-0.dll",
        "api-ms-win-crt-math-l1-1-0.dll",
        "api-ms-win-crt-multibyte-l1-1-0.dll",
        "api-ms-win-crt-private-l1-1-0.dll",
        "api-ms-win-crt-process-l1-1-0.dll",
        "api-ms-win-crt-runtime-l1-1-0.dll",
        "api-ms-win-crt-stdio-l1-1-0.dll",
        "api-ms-win-crt-string-l1-1-0.dll",
        "api-ms-win-crt-time-l1-1-0.dll",
        "api-ms-win-crt-utility-l1-1-0.dll",
        "d3dcompiler_47.dll",
        "freebl3.dll",
        "mozavcodec.dll",
        "mozavutil.dll",
        "mozglue.dll",
        "msvcp140.dll",
        "nss3.dll",
        "nssckbi.dll",
        "nssdbm3.dll",
        "qipcap64.dll",
        "softokn3.dll",
        "ucrtbase.dll",
        "vcruntime140.dll",
        "xul.dll",
        "clearkey.dll",
        "libfreebl3.dylib",
        "liblgpllibs.dylib",
        "libmozavcodec.dylib",
        "libmozavutil.dylib",
        "libmozglue.dylib",
        "libnss3.dylib",
        "libnssckbi.dylib",
        "libnssdbm3.dylib",
        "libplugin_child_interpose.dylib",
        "libsoftokn3.dylib",
    )
)
def cleanup_dll(text):
    """Replace library names not listed in ``FIREFOX_DLLS_MATCH`` with ``__DLL_NAME__``.

    A library name is a run of word characters followed by ``.dll``, ``.so``
    or ``.dylib``. Names listed in ``FIREFOX_DLLS_MATCH`` are left untouched.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Listed names are matched (and consumed) as a whole by the first
    # alternative. A negative lookahead alone is not enough for names that
    # contain "-": scanning would resume at a word boundary inside the name
    # and turn "api-ms-win-...-0.dll" into "api-ms-win-...-__DLL_NAME__".
    regex = fr"\b(?:(?P<firefox>{FIREFOX_DLLS_MATCH})|\w+(?:\.dll|\.so|\.dylib)\b)"

    def _replace(match):
        if match.group("firefox") is not None:
            return match.group(0)
        return "__DLL_NAME__"

    return re.sub(regex, _replace, text)
def cleanup_synonyms(text):
    """Collapse known spelling variants of a term into one canonical word.

    Matching is case-insensitive and restricted to whole words/phrases.
    """
    canonical_groups = (
        ("safemode", ("safemode", "safe mode")),
        ("str", ("str", "steps to reproduce", "repro steps")),
        ("uaf", ("uaf", "use after free", "use-after-free")),
        ("asan", ("asan", "address sanitizer", "addresssanitizer")),
        (
            "permafailure",
            (
                "permafailure",
                "permafailing",
                "permafail",
                "perma failure",
                "perma failing",
                "perma fail",
                "perma-failure",
                "perma-failing",
                "perma-fail",
            ),
        ),
        ("spec", ("spec", "specification")),
    )

    for canonical, variants in canonical_groups:
        variant_pattern = "|".join([fr"\b{variant}\b" for variant in variants])
        text = re.sub(variant_pattern, canonical, text, flags=re.IGNORECASE)

    return text
def cleanup_crash(text):
    """Replace ``bp-<uuid-like id>`` crash report identifiers with ``__CRASH_STATS_LINK__``."""
    crash_id_pattern = (
        r"bp-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{6}[0-9]{6}\b"
    )
    return re.sub(crash_id_pattern, "__CRASH_STATS_LINK__", text)
def get_author_ids():
author_ids = set()
for commit in repository.get_commits():

129
bugbug/commit_features.py Normal file
Просмотреть файл

@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
class files_modified_num(object):
    """Extractor that reads the ``files_modified_num`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["files_modified_num"]
        return value
class added(object):
    """Extractor that reads the ``added`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["added"]
        return value
class test_added(object):
    """Extractor that reads the ``test_added`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["test_added"]
        return value
class deleted(object):
    """Extractor that reads the ``deleted`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["deleted"]
        return value
class test_deleted(object):
    """Extractor that reads the ``test_deleted`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["test_deleted"]
        return value
class author_experience(object):
    """Extractor that reads the ``author_experience`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["author_experience"]
        return value
class author_experience_90_days(object):
    """Extractor that reads the ``author_experience_90_days`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["author_experience_90_days"]
        return value
class reviewer_experience(object):
    """Extractor that reads the ``reviewer_experience`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["reviewer_experience"]
        return value
class reviewer_experience_90_days(object):
    """Extractor that reads the ``reviewer_experience_90_days`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["reviewer_experience_90_days"]
        return value
class components_touched_prev(object):
    """Extractor that reads the ``components_touched_prev`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["components_touched_prev"]
        return value
class components_touched_prev_90_days(object):
    """Extractor that reads the ``components_touched_prev_90_days`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["components_touched_prev_90_days"]
        return value
class files_touched_prev(object):
    """Extractor that reads the ``files_touched_prev`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["files_touched_prev"]
        return value
class files_touched_prev_90_days(object):
    """Extractor that reads the ``files_touched_prev_90_days`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["files_touched_prev_90_days"]
        return value
class types(object):
    """Extractor that reads the ``types`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["types"]
        return value
class components(object):
    """Extractor that reads the ``components`` entry of a commit dict."""

    def __call__(self, commit, **kwargs):
        value = commit["components"]
        return value
class number_of_reviewers(object):
    """Extractor that returns the length of a commit's ``reviewers`` entry."""

    def __call__(self, commit, **kwargs):
        reviewers = commit["reviewers"]
        return len(reviewers)
class CommitExtractor(BaseEstimator, TransformerMixin):
    """Transformer turning commit dicts into a two-column DataFrame.

    Args:
        feature_extractors: callables invoked as ``f(commit)``. The class name
            of each callable is used as the feature name.
        cleanup_functions: callables applied, in order, to each commit's
            ``desc`` text.
    """

    def __init__(self, feature_extractors, cleanup_functions):
        self.feature_extractors = feature_extractors
        self.cleanup_functions = cleanup_functions

    def fit(self, x, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, commits):
        """Build one row per commit.

        Args:
            commits: iterable of commit dicts; each must have a ``desc`` key
                plus whatever keys the feature extractors read.

        Returns:
            A ``pandas.DataFrame`` with a ``data`` column (dict of extracted
            features) and a ``desc`` column (cleaned description).
        """
        results = []

        for commit in commits:
            data = {}

            for f in self.feature_extractors:
                res = f(commit)

                # Extractors signal "no value" with None.
                if res is None:
                    continue

                feature_name = f.__class__.__name__

                # A list result becomes one boolean-like feature per item.
                # Formatting (rather than "+") keeps non-string items working.
                if isinstance(res, list):
                    for item in res:
                        data[f"{feature_name}-{item}"] = "True"
                    continue

                if isinstance(res, bool):
                    res = str(res)

                data[feature_name] = res

            # TODO: Try simply using all possible fields instead of extracting features manually.

            # Clean a local copy of the description: writing the result back
            # into the commit would mutate the caller's data and re-apply the
            # cleanup if the same commit is transformed again.
            desc = commit["desc"]
            for cleanup_function in self.cleanup_functions:
                desc = cleanup_function(desc)

            results.append({"data": data, "desc": desc})

        # Naming the columns keeps them present even when there are no commits.
        return pd.DataFrame(results, columns=["data", "desc"])

177
bugbug/feature_cleanup.py Normal file
Просмотреть файл

@ -0,0 +1,177 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import re
def url(text):
    """Replace URLs in *text* with placeholder tokens.

    Links whose host starts with ``hg.mozilla``, ``searchfox`` or
    ``dxr.mozilla`` become ``__CODE_REFERENCE_URL__``; every other token
    starting with ``http`` becomes ``__URL__``.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Escape the dots in the host names: an unescaped "." matches any
    # character, so "hg.mozilla" would also accept e.g. "hgxmozilla".
    code_reference_pattern = r"https?://(?:hg\.mozilla|searchfox|dxr\.mozilla)\S+"
    text = re.sub(code_reference_pattern, "__CODE_REFERENCE_URL__", text)

    # Whatever still starts with "http" after the pass above is a generic URL.
    return re.sub(r"http\S+", "__URL__", text)
def fileref(text):
    """Replace file names with a known source extension by ``__FILE_REFERENCE__``.

    The recognised extensions are py, json, js, jsm, html, css, c, cpp and h.
    """
    pattern = re.compile(
        r"\w+\.py\b|\w+\.json\b|\w+\.js\b|\w+\.jsm\b|\w+\.html\b|\w+\.css\b|\w+\.c\b|\w+\.cpp\b|\w+\.h\b"
    )
    return pattern.sub("__FILE_REFERENCE__", text)
def responses(text):
    """Replace every ``>`` and the rest of its line with a single space."""
    quoted_pattern = ">[^\n]+"
    replacement = " "
    return re.sub(quoted_pattern, replacement, text)
def hex(text):
    """Replace ``0x``/``0X``-prefixed hexadecimal literals with ``__HEX_NUMBER__``."""
    hex_pattern = r"\b0[xX][0-9a-fA-F]+\b"
    placeholder = "__HEX_NUMBER__"
    return re.sub(hex_pattern, placeholder, text)
# Regex alternation of library file names that `dll` must leave untouched.
# Every "." is escaped so the string can be embedded in a regular expression.
# (A duplicated "libmozgtk.so" entry was dropped; the matched set is the same.)
FIREFOX_DLLS_MATCH = "|".join(
    [
        "libmozwayland.so",
        "libssl3.so",
        "libnssdbm3.so",
        "liblgpllibs.so",
        "libmozavutil.so",
        "libxul.so",
        "libmozgtk.so",
        "libnssckbi.so",
        "libclearkey.dylib",
        "libmozsqlite3.so",
        "libplc4.so",
        "libsmime3.so",
        "libclearkey.so",
        "libnssutil3.so",
        "libnss3.so",
        "libplds4.so",
        "libfreeblpriv3.so",
        "libsoftokn3.so",
        "libmozavcodec.so",
        "libnspr4.so",
        "IA2Marshal.dll",
        "lgpllibs.dll",
        "libEGL.dll",
        "libGLESv2.dll",
        "libmozsandbox.so",
        "AccessibleHandler.dll",
        "AccessibleMarshal.dll",
        "api-ms-win-core-console-l1-1-0.dll",
        "api-ms-win-core-datetime-l1-1-0.dll",
        "api-ms-win-core-debug-l1-1-0.dll",
        "api-ms-win-core-errorhandling-l1-1-0.dll",
        "api-ms-win-core-file-l1-1-0.dll",
        "api-ms-win-core-file-l1-2-0.dll",
        "api-ms-win-core-file-l2-1-0.dll",
        "api-ms-win-core-handle-l1-1-0.dll",
        "api-ms-win-core-heap-l1-1-0.dll",
        "api-ms-win-core-interlocked-l1-1-0.dll",
        "api-ms-win-core-libraryloader-l1-1-0.dll",
        "api-ms-win-core-localization-l1-2-0.dll",
        "api-ms-win-core-memory-l1-1-0.dll",
        "api-ms-win-core-namedpipe-l1-1-0.dll",
        "api-ms-win-core-processenvironment-l1-1-0.dll",
        "api-ms-win-core-processthreads-l1-1-0.dll",
        "api-ms-win-core-processthreads-l1-1-1.dll",
        "api-ms-win-core-profile-l1-1-0.dll",
        "api-ms-win-core-rtlsupport-l1-1-0.dll",
        "api-ms-win-core-string-l1-1-0.dll",
        "api-ms-win-core-synch-l1-1-0.dll",
        "api-ms-win-core-synch-l1-2-0.dll",
        "api-ms-win-core-sysinfo-l1-1-0.dll",
        "api-ms-win-core-timezone-l1-1-0.dll",
        "api-ms-win-core-util-l1-1-0.dll",
        "api-ms-win-crt-conio-l1-1-0.dll",
        "api-ms-win-crt-convert-l1-1-0.dll",
        "api-ms-win-crt-environment-l1-1-0.dll",
        "api-ms-win-crt-filesystem-l1-1-0.dll",
        "api-ms-win-crt-heap-l1-1-0.dll",
        "api-ms-win-crt-locale-l1-1-0.dll",
        "api-ms-win-crt-math-l1-1-0.dll",
        "api-ms-win-crt-multibyte-l1-1-0.dll",
        "api-ms-win-crt-private-l1-1-0.dll",
        "api-ms-win-crt-process-l1-1-0.dll",
        "api-ms-win-crt-runtime-l1-1-0.dll",
        "api-ms-win-crt-stdio-l1-1-0.dll",
        "api-ms-win-crt-string-l1-1-0.dll",
        "api-ms-win-crt-time-l1-1-0.dll",
        "api-ms-win-crt-utility-l1-1-0.dll",
        "d3dcompiler_47.dll",
        "freebl3.dll",
        "mozavcodec.dll",
        "mozavutil.dll",
        "mozglue.dll",
        "msvcp140.dll",
        "nss3.dll",
        "nssckbi.dll",
        "nssdbm3.dll",
        "qipcap64.dll",
        "softokn3.dll",
        "ucrtbase.dll",
        "vcruntime140.dll",
        "xul.dll",
        "clearkey.dll",
        "libfreebl3.dylib",
        "liblgpllibs.dylib",
        "libmozavcodec.dylib",
        "libmozavutil.dylib",
        "libmozglue.dylib",
        "libnss3.dylib",
        "libnssckbi.dylib",
        "libnssdbm3.dylib",
        "libplugin_child_interpose.dylib",
        "libsoftokn3.dylib",
    ]
).replace(".", r"\.")


def dll(text):
    """Replace library names not listed in ``FIREFOX_DLLS_MATCH`` with ``__DLL_NAME__``.

    A library name is a run of word characters followed by ``.dll``, ``.so``
    or ``.dylib``. Names listed in ``FIREFOX_DLLS_MATCH`` are left untouched.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # The first alternative matches (and consumes) a listed name as a whole.
    # A negative lookahead alone is not enough for names containing "-":
    # scanning would resume at a word boundary inside the name and rewrite
    # "api-ms-win-...-0.dll" into "api-ms-win-...-__DLL_NAME__".
    regex = fr"\b(?:(?P<firefox>{FIREFOX_DLLS_MATCH})|\w+(?:\.dll|\.so|\.dylib)\b)"

    def _replace(match):
        if match.group("firefox") is not None:
            return match.group(0)
        return "__DLL_NAME__"

    return re.sub(regex, _replace, text)
def synonyms(text):
    """Collapse known spelling variants of a term into one canonical word.

    Matching is case-insensitive and restricted to whole words/phrases.
    """
    permafailure_variants = [
        "permafailure",
        "permafailing",
        "permafail",
        "perma failure",
        "perma failing",
        "perma fail",
        "perma-failure",
        "perma-failing",
        "perma-fail",
    ]
    replacement_table = [
        ("safemode", ["safemode", "safe mode"]),
        ("str", ["str", "steps to reproduce", "repro steps"]),
        ("uaf", ["uaf", "use after free", "use-after-free"]),
        ("asan", ["asan", "address sanitizer", "addresssanitizer"]),
        ("permafailure", permafailure_variants),
        ("spec", ["spec", "specification"]),
    ]

    cleaned = text
    for canonical, variants in replacement_table:
        alternatives = [fr"\b{variant}\b" for variant in variants]
        cleaned = re.sub(
            "|".join(alternatives), canonical, cleaned, flags=re.IGNORECASE
        )

    return cleaned
def crash(text):
    """Replace ``bp-<uuid-like id>`` crash report identifiers with ``__CRASH_STATS_LINK__``."""
    crash_id = re.compile(
        r"bp-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{6}[0-9]{6}\b"
    )
    return crash_id.sub("__CRASH_STATS_LINK__", text)

Просмотреть файл

@ -12,7 +12,7 @@ from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, train_test_split
from bugbug import bugzilla
from bugbug import bugzilla, repository
from bugbug.nlp import SpacyVectorizer
@ -65,15 +65,15 @@ class Model:
classes = self.get_labels()
class_names = sorted(list(set(classes.values())), reverse=True)
# Get bugs, filtering out those for which we have no labels.
def bugs():
return (bug for bug in bugzilla.get_bugs() if bug["id"] in classes)
# Get items, filtering out those for which we have no labels.
def trainable_items_gen():
return (item for item in self.items_gen() if self.get_id(item) in classes)
# Calculate labels.
y = np.array([classes[bug["id"]] for bug in bugs()])
y = np.array([classes[self.get_id(item)] for item in trainable_items_gen()])
# Extract features from the bugs.
X = self.extraction_pipeline.fit_transform(bugs())
# Extract features from the items.
X = self.extraction_pipeline.fit_transform(trainable_items_gen())
print(f"X: {X.shape}, y: {y.shape}")
@ -172,29 +172,29 @@ class Model:
def load(model_file_name):
return joblib.load(model_file_name)
def overwrite_classes(self, bugs, classes, probabilities):
def overwrite_classes(self, items, classes, probabilities):
return classes
def classify(
self, bugs, probabilities=False, importances=False, importance_cutoff=0.15
self, items, probabilities=False, importances=False, importance_cutoff=0.15
):
assert bugs is not None
assert items is not None
assert (
self.extraction_pipeline is not None and self.clf is not None
), "The module needs to be initialized first"
if not isinstance(bugs, list):
bugs = [bugs]
if not isinstance(items, list):
items = [items]
assert isinstance(bugs[0], dict)
assert isinstance(items[0], dict)
X = self.extraction_pipeline.transform(bugs)
X = self.extraction_pipeline.transform(items)
if probabilities:
classes = self.clf.predict_proba(X)
else:
classes = self.clf.predict(X)
classes = self.overwrite_classes(bugs, classes, probabilities)
classes = self.overwrite_classes(items, classes, probabilities)
if importances:
explainer = shap.TreeExplainer(self.clf)
@ -209,3 +209,19 @@ class Model:
return classes, importances
return classes
class BugModel(Model):
    """Model whose training items come from ``bugzilla.get_bugs()``, keyed by ``id``."""

    def get_id(self, bug):
        """Return the ``id`` entry of *bug*."""
        bug_id = bug["id"]
        return bug_id

    def items_gen(self):
        """Return a generator over the items yielded by ``bugzilla.get_bugs()``."""
        all_bugs = bugzilla.get_bugs()
        return (bug for bug in all_bugs)
class CommitModel(Model):
    """Model whose training items come from ``repository.get_commits()``, keyed by ``node``."""

    def get_id(self, commit):
        """Return the ``node`` entry of *commit*."""
        node = commit["node"]
        return node

    def items_gen(self):
        """Return a generator over the items yielded by ``repository.get_commits()``."""
        all_commits = repository.get_commits()
        return (commit for commit in all_commits)

Просмотреть файл

@ -10,8 +10,8 @@ from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from bugbug import bug_features, bugzilla
from bugbug.model import Model
from bugbug import bug_features, bugzilla, feature_cleanup
from bugbug.model import BugModel
MINIMUM_ASSIGNMENTS = 5
ADDRESSES_TO_EXCLUDE = [
@ -25,9 +25,9 @@ ADDRESSES_TO_EXCLUDE = [
]
class AssigneeModel(Model):
class AssigneeModel(BugModel):
def __init__(self, lemmatization=False):
Model.__init__(self, lemmatization)
BugModel.__init__(self, lemmatization)
self.cross_validation_enabled = False
self.calculate_importance = False
@ -48,9 +48,9 @@ class AssigneeModel(Model):
]
cleanup_functions = [
bug_features.cleanup_fileref,
bug_features.cleanup_url,
bug_features.cleanup_synonyms,
feature_cleanup.fileref,
feature_cleanup.url,
feature_cleanup.synonyms,
]
self.extraction_pipeline = Pipeline(

81
bugbug/models/backout.py Normal file
Просмотреть файл

@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import xgboost
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from bugbug import commit_features, feature_cleanup, repository
from bugbug.model import CommitModel
class BackoutModel(CommitModel):
    """Commit model labelled from each commit's ``ever_backedout`` entry."""

    def __init__(self, lemmatization=False):
        CommitModel.__init__(self, lemmatization)

        self.calculate_importance = False

        self.sampler = RandomUnderSampler(random_state=0)

        # One extractor instance per feature read from the commit data.
        feature_extractors = [
            commit_features.files_modified_num(),
            commit_features.test_added(),
            commit_features.added(),
            commit_features.deleted(),
            commit_features.test_deleted(),
            commit_features.author_experience(),
            commit_features.author_experience_90_days(),
            commit_features.reviewer_experience(),
            commit_features.reviewer_experience_90_days(),
            commit_features.components_touched_prev(),
            commit_features.components_touched_prev_90_days(),
            commit_features.files_touched_prev(),
            commit_features.files_touched_prev_90_days(),
            commit_features.types(),
            commit_features.components(),
            commit_features.number_of_reviewers(),
        ]

        # Applied, in this order, to the commit description.
        cleanup_functions = [
            feature_cleanup.fileref,
            feature_cleanup.url,
            feature_cleanup.synonyms,
        ]

        commit_extractor = commit_features.CommitExtractor(
            feature_extractors, cleanup_functions
        )
        # "data" holds the extracted feature dicts, "desc" the cleaned text.
        column_union = ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                ("desc", self.text_vectorizer(), "desc"),
            ]
        )
        self.extraction_pipeline = Pipeline(
            [("commit_extractor", commit_extractor), ("union", column_union)]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")

    def get_labels(self):
        """Map each commit's ``node`` to 1 if ``ever_backedout`` is truthy, else 0."""
        return {
            commit_data["node"]: 1 if commit_data["ever_backedout"] else 0
            for commit_data in repository.get_commits()
        }

    def get_feature_names(self):
        """Return the feature names reported by the pipeline's ``union`` step."""
        union_step = self.extraction_pipeline.named_steps["union"]
        return union_step.get_feature_names()

Просмотреть файл

@ -10,11 +10,11 @@ from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from bugbug import bug_features, bugzilla
from bugbug.model import Model
from bugbug import bug_features, bugzilla, feature_cleanup
from bugbug.model import BugModel
class ComponentModel(Model):
class ComponentModel(BugModel):
PRODUCTS = {
"Core",
"External Software Affecting Firefox",
@ -56,7 +56,7 @@ class ComponentModel(Model):
}
def __init__(self, lemmatization=False):
Model.__init__(self, lemmatization)
BugModel.__init__(self, lemmatization)
self.cross_validation_enabled = False
self.calculate_importance = False
@ -77,9 +77,9 @@ class ComponentModel(Model):
]
cleanup_functions = [
bug_features.cleanup_fileref,
bug_features.cleanup_url,
bug_features.cleanup_synonyms,
feature_cleanup.fileref,
feature_cleanup.url,
feature_cleanup.synonyms,
]
self.extraction_pipeline = Pipeline(

Просмотреть файл

@ -11,13 +11,13 @@ from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from bugbug import bug_features, bugzilla, labels
from bugbug.model import Model
from bugbug import bug_features, bugzilla, feature_cleanup, labels
from bugbug.model import BugModel
class BugModel(Model):
class DefectModel(BugModel):
def __init__(self, lemmatization=False, historical=False):
Model.__init__(self, lemmatization)
BugModel.__init__(self, lemmatization)
self.sampler = BorderlineSMOTE(random_state=0)
@ -47,9 +47,9 @@ class BugModel(Model):
feature_extractors.append(bug_features.had_severity_enhancement())
cleanup_functions = [
bug_features.cleanup_url,
bug_features.cleanup_fileref,
bug_features.cleanup_synonyms,
feature_cleanup.url,
feature_cleanup.fileref,
feature_cleanup.synonyms,
]
self.extraction_pipeline = Pipeline(

Просмотреть файл

@ -3,12 +3,12 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
from bugbug.models.bug import BugModel
from bugbug.models.defect import DefectModel
class DefectEnhancementTaskModel(BugModel):
class DefectEnhancementTaskModel(DefectModel):
def __init__(self, lemmatization=False):
BugModel.__init__(self, lemmatization)
DefectModel.__init__(self, lemmatization)
def get_labels(self):
classes = self.get_bugbug_labels("defect_enhancement_task")

Просмотреть файл

@ -9,13 +9,13 @@ from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from bugbug import bug_features, bugzilla
from bugbug.model import Model
from bugbug import bug_features, bugzilla, feature_cleanup
from bugbug.model import BugModel
class DevDocNeededModel(Model):
class DevDocNeededModel(BugModel):
def __init__(self, lemmatization=False):
Model.__init__(self, lemmatization)
BugModel.__init__(self, lemmatization)
self.sampler = RandomUnderSampler(random_state=0)
@ -41,9 +41,9 @@ class DevDocNeededModel(Model):
]
cleanup_functions = [
bug_features.cleanup_fileref,
bug_features.cleanup_url,
bug_features.cleanup_synonyms,
feature_cleanup.fileref,
feature_cleanup.url,
feature_cleanup.synonyms,
]
self.extraction_pipeline = Pipeline(

Просмотреть файл

@ -9,13 +9,13 @@ from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from bugbug import bug_features, bugzilla
from bugbug.model import Model
from bugbug import bug_features, bugzilla, feature_cleanup
from bugbug.model import BugModel
class QANeededModel(Model):
class QANeededModel(BugModel):
def __init__(self, lemmatization=False):
Model.__init__(self, lemmatization)
BugModel.__init__(self, lemmatization)
self.sampler = RandomUnderSampler(random_state=0)
@ -36,9 +36,9 @@ class QANeededModel(Model):
]
cleanup_functions = [
bug_features.cleanup_fileref,
bug_features.cleanup_url,
bug_features.cleanup_synonyms,
feature_cleanup.fileref,
feature_cleanup.url,
feature_cleanup.synonyms,
]
self.extraction_pipeline = Pipeline(

Просмотреть файл

@ -3,12 +3,12 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
from bugbug.models.bug import BugModel
from bugbug.models.defect import DefectModel
class RegressionModel(BugModel):
class RegressionModel(DefectModel):
def __init__(self, lemmatization=False):
BugModel.__init__(self, lemmatization)
DefectModel.__init__(self, lemmatization)
def get_labels(self):
return self.get_bugbug_labels("regression")

Просмотреть файл

@ -9,13 +9,13 @@ from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from bugbug import bug_features, bugzilla, labels
from bugbug.model import Model
from bugbug import bug_features, bugzilla, feature_cleanup, labels
from bugbug.model import BugModel
class TrackingModel(Model):
class TrackingModel(BugModel):
def __init__(self, lemmatization=False):
Model.__init__(self, lemmatization)
BugModel.__init__(self, lemmatization)
self.sampler = InstanceHardnessThreshold(random_state=0)
@ -47,12 +47,12 @@ class TrackingModel(Model):
]
cleanup_functions = [
bug_features.cleanup_url,
bug_features.cleanup_fileref,
bug_features.cleanup_hex,
bug_features.cleanup_dll,
bug_features.cleanup_synonyms,
bug_features.cleanup_crash,
feature_cleanup.url,
feature_cleanup.fileref,
feature_cleanup.hex,
feature_cleanup.dll,
feature_cleanup.synonyms,
feature_cleanup.crash,
]
self.extraction_pipeline = Pipeline(

Просмотреть файл

@ -9,13 +9,13 @@ from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from bugbug import bug_features, bugzilla
from bugbug.model import Model
from bugbug import bug_features, bugzilla, feature_cleanup
from bugbug.model import BugModel
class UpliftModel(Model):
class UpliftModel(BugModel):
def __init__(self, lemmatization=False):
Model.__init__(self, lemmatization)
BugModel.__init__(self, lemmatization)
self.sampler = RandomUnderSampler(random_state=0)
@ -36,9 +36,9 @@ class UpliftModel(Model):
]
cleanup_functions = [
bug_features.cleanup_fileref,
bug_features.cleanup_url,
bug_features.cleanup_synonyms,
feature_cleanup.fileref,
feature_cleanup.url,
feature_cleanup.synonyms,
]
self.extraction_pipeline = Pipeline(

Просмотреть файл

@ -83,7 +83,7 @@ tasks:
owner: release-mgmt-analysis@mozilla.com
source: https://github.com/mozilla/bugbug/raw/master/data-pipeline.yml
- ID: train-defect
- ID: train-defectenhancementtask
created: {$fromNow: ''}
deadline: {$fromNow: '48 hours'}
provisionerId: aws-provisioner-v1
@ -105,8 +105,8 @@ tasks:
- notify.irc-channel.#bugbug.on-failed
- index.project.relman.bugbug.train_defectenhancementtask.latest
metadata:
name: bugbug train defect model
description: bugbug train defect model
name: bugbug train defect/enhancement/task model
description: bugbug train defect/enhancement/task model
owner: release-mgmt-analysis@mozilla.com
source: https://github.com/mozilla/bugbug/raw/master/data-pipeline.yml

17
run.py
Просмотреть файл

@ -25,7 +25,8 @@ if __name__ == "__main__":
"--goal",
help="Goal of the classifier",
choices=[
"bug",
# bug classifiers
"defect",
"regression",
"tracking",
"qaneeded",
@ -34,8 +35,10 @@ if __name__ == "__main__":
"devdocneeded",
"defectenhancementtask",
"assignee",
# commit classifiers
"backout",
],
default="bug",
default="defect",
)
parser.add_argument(
"--classifier",
@ -59,10 +62,10 @@ if __name__ == "__main__":
args.goal, "" if args.classifier == "default" else args.classifier
)
if args.goal == "bug":
from bugbug.models.bug import BugModel
if args.goal == "defect":
from bugbug.models.defect import DefectModel
model_class = BugModel
model_class = DefectModel
elif args.goal == "defectenhancementtask":
from bugbug.models.defect_enhancement_task import DefectEnhancementTaskModel
@ -100,6 +103,10 @@ if __name__ == "__main__":
from bugbug.models.assignee import AssigneeModel
model_class = AssigneeModel
elif args.goal == "backout":
from bugbug.models.backout import BackoutModel
model_class = BackoutModel
if args.train:
db.download()

Просмотреть файл

@ -3,11 +3,11 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
from bugbug.models.bug import BugModel
from bugbug.models.defect import DefectModel
def test_get_bug_labels():
model = BugModel()
model = DefectModel()
classes = model.get_labels()
# labels from bug_nobug.csv
assert classes[1087488]

Просмотреть файл

@ -3,10 +3,10 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
from bugbug import bug_features
from bugbug import feature_cleanup
def test_cleanup_url():
def test_url():
tests = [
(
"This code lies in https://github.com/marco-c/bugbug",
@ -26,10 +26,10 @@ def test_cleanup_url():
),
]
for orig_text, cleaned_text in tests:
assert bug_features.cleanup_url(orig_text) == cleaned_text
assert feature_cleanup.url(orig_text) == cleaned_text
def test_cleanup_fileref():
def test_fileref():
tests = [
(
"Some random filenames are file1.py , file2.cpp and file3.json",
@ -37,10 +37,10 @@ def test_cleanup_fileref():
)
]
for orig_text, cleaned_text in tests:
assert bug_features.cleanup_fileref(orig_text) == cleaned_text
assert feature_cleanup.fileref(orig_text) == cleaned_text
def test_cleanup_responses():
def test_responses():
tests = [
(
"A response can be of the form>This is the comment\n",
@ -57,10 +57,10 @@ def test_cleanup_responses():
),
]
for orig_text, cleaned_text in tests:
assert bug_features.cleanup_responses(orig_text) == cleaned_text
assert feature_cleanup.responses(orig_text) == cleaned_text
def test_cleanup_hex():
def test_hex():
tests = [
(
"0 scdetour.dll scdetour.dll@0x2dd77",
@ -72,10 +72,10 @@ def test_cleanup_hex():
),
]
for orig_text, cleaned_text in tests:
assert bug_features.cleanup_hex(orig_text) == cleaned_text
assert feature_cleanup.hex(orig_text) == cleaned_text
def test_cleanup_dll():
def test_dll():
tests = [
(
"Crashing thread: 0 scdetour.dll scdetour.dll@0x2dd77",
@ -100,10 +100,10 @@ def test_cleanup_dll():
),
]
for orig_text, cleaned_text in tests:
assert bug_features.cleanup_dll(orig_text) == cleaned_text
assert feature_cleanup.dll(orig_text) == cleaned_text
def test_cleanup_synonyms():
def test_synonyms():
tests = [
(
"I was in safemode, but the problem occurred in safe mode too",
@ -118,10 +118,10 @@ def test_cleanup_synonyms():
("found via address sanitizer or asan", "found via asan or asan"),
]
for orig_text, cleaned_text in tests:
assert bug_features.cleanup_synonyms(orig_text) == cleaned_text
assert feature_cleanup.synonyms(orig_text) == cleaned_text
def test_cleanup_crash():
def test_crash():
tests = [
(
"This bug was filed from the Socorro interface and is report bp-ba7ff893-687f-4381-b430-ba66b0170628.",
@ -133,4 +133,4 @@ def test_cleanup_crash():
),
]
for orig_text, cleaned_text in tests:
assert bug_features.cleanup_crash(orig_text) == cleaned_text
assert feature_cleanup.crash(orig_text) == cleaned_text

Просмотреть файл

@ -12,8 +12,8 @@ RUNPY = os.path.join(os.path.dirname(os.path.dirname(__file__)), "run.py")
def test_run():
# Test running the training for the defect model.
print([sys.executable, RUNPY, "--train", "--goal", "bug"])
subprocess.run([sys.executable, RUNPY, "--train", "--goal", "bug"], check=True)
print([sys.executable, RUNPY, "--train", "--goal", "defect"])
subprocess.run([sys.executable, RUNPY, "--train", "--goal", "defect"], check=True)
# Test loading the trained model.
subprocess.run([sys.executable, RUNPY, "--goal", "bug"], check=True)
subprocess.run([sys.executable, RUNPY, "--goal", "defect"], check=True)