Make training code more generic to make it possible to train on other kinds of objects (e.g. commits) (#335)

* Move feature cleanup functions into a separate module, as they can be shared across different objectives, e.g. training on both bugs and commits. * Make Model more generic to make it possible to train on different objects: introduce BugModel and CommitModel as base classes for models training on bugs and on commits, and update all models to use BugModel and the new feature_cleanup module. Fixes #306. * Update ID and description of the defect/enhancement/task Taskcluster task definition. * Add a module to extract features from commit data. * Add an example model training on commits to predict commits which will be backed out. * Update defect model name, and add possibility to train backout model.
2019-05-03 11:57:48 +02:00 · 2019-05-03 11:57:48 +02:00 · 9995b8c236
--- a/bugbug/bug_features.py
+++ b/bugbug/bug_features.py
@ -392,177 +392,6 @@ class had_severity_enhancement(object):
        return False


-def cleanup_url(text):
-    text = re.sub(
-        r"http[s]?://(hg.mozilla|searchfox|dxr.mozilla)\S+",
-        "__CODE_REFERENCE_URL__",
-        text,
-    )
-    return re.sub(r"http\S+", "__URL__", text)
-
-
-def cleanup_fileref(text):
-    return re.sub(
-        r"\w+\.py\b|\w+\.json\b|\w+\.js\b|\w+\.jsm\b|\w+\.html\b|\w+\.css\b|\w+\.c\b|\w+\.cpp\b|\w+\.h\b",
-        "__FILE_REFERENCE__",
-        text,
-    )
-
-
-def cleanup_responses(text):
-    return re.sub(">[^\n]+", " ", text)
-
-
-def cleanup_hex(text):
-    return re.sub(r"\b0[xX][0-9a-fA-F]+\b", "__HEX_NUMBER__", text)
-
-
-FIREFOX_DLLS_MATCH = "|".join(
-    [
-        "libmozwayland.so",
-        "libssl3.so",
-        "libnssdbm3.so",
-        "liblgpllibs.so",
-        "libmozavutil.so",
-        "libxul.so",
-        "libmozgtk.so",
-        "libnssckbi.so",
-        "libclearkey.dylib",
-        "libmozsqlite3.so",
-        "libplc4.so",
-        "libsmime3.so",
-        "libclearkey.so",
-        "libnssutil3.so",
-        "libnss3.so",
-        "libplds4.so",
-        "libfreeblpriv3.so",
-        "libsoftokn3.so",
-        "libmozgtk.so",
-        "libmozavcodec.so",
-        "libnspr4.so",
-        "IA2Marshal.dll",
-        "lgpllibs.dll",
-        "libEGL.dll",
-        "libGLESv2.dll",
-        "libmozsandbox.so",
-        "AccessibleHandler.dll",
-        "AccessibleMarshal.dll",
-        "api-ms-win-core-console-l1-1-0.dll",
-        "api-ms-win-core-datetime-l1-1-0.dll",
-        "api-ms-win-core-debug-l1-1-0.dll",
-        "api-ms-win-core-errorhandling-l1-1-0.dll",
-        "api-ms-win-core-file-l1-1-0.dll",
-        "api-ms-win-core-file-l1-2-0.dll",
-        "api-ms-win-core-file-l2-1-0.dll",
-        "api-ms-win-core-handle-l1-1-0.dll",
-        "api-ms-win-core-heap-l1-1-0.dll",
-        "api-ms-win-core-interlocked-l1-1-0.dll",
-        "api-ms-win-core-libraryloader-l1-1-0.dll",
-        "api-ms-win-core-localization-l1-2-0.dll",
-        "api-ms-win-core-memory-l1-1-0.dll",
-        "api-ms-win-core-namedpipe-l1-1-0.dll",
-        "api-ms-win-core-processenvironment-l1-1-0.dll",
-        "api-ms-win-core-processthreads-l1-1-0.dll",
-        "api-ms-win-core-processthreads-l1-1-1.dll",
-        "api-ms-win-core-profile-l1-1-0.dll",
-        "api-ms-win-core-rtlsupport-l1-1-0.dll",
-        "api-ms-win-core-string-l1-1-0.dll",
-        "api-ms-win-core-synch-l1-1-0.dll",
-        "api-ms-win-core-synch-l1-2-0.dll",
-        "api-ms-win-core-sysinfo-l1-1-0.dll",
-        "api-ms-win-core-timezone-l1-1-0.dll",
-        "api-ms-win-core-util-l1-1-0.dll",
-        "api-ms-win-crt-conio-l1-1-0.dll",
-        "api-ms-win-crt-convert-l1-1-0.dll",
-        "api-ms-win-crt-environment-l1-1-0.dll",
-        "api-ms-win-crt-filesystem-l1-1-0.dll",
-        "api-ms-win-crt-heap-l1-1-0.dll",
-        "api-ms-win-crt-locale-l1-1-0.dll",
-        "api-ms-win-crt-math-l1-1-0.dll",
-        "api-ms-win-crt-multibyte-l1-1-0.dll",
-        "api-ms-win-crt-private-l1-1-0.dll",
-        "api-ms-win-crt-process-l1-1-0.dll",
-        "api-ms-win-crt-runtime-l1-1-0.dll",
-        "api-ms-win-crt-stdio-l1-1-0.dll",
-        "api-ms-win-crt-string-l1-1-0.dll",
-        "api-ms-win-crt-time-l1-1-0.dll",
-        "api-ms-win-crt-utility-l1-1-0.dll",
-        "d3dcompiler_47.dll",
-        "freebl3.dll",
-        "mozavcodec.dll",
-        "mozavutil.dll",
-        "mozglue.dll",
-        "msvcp140.dll",
-        "nss3.dll",
-        "nssckbi.dll",
-        "nssdbm3.dll",
-        "qipcap64.dll",
-        "softokn3.dll",
-        "ucrtbase.dll",
-        "vcruntime140.dll",
-        "xul.dll",
-        "clearkey.dll",
-        "libfreebl3.dylib",
-        "liblgpllibs.dylib",
-        "libmozavcodec.dylib",
-        "libmozavutil.dylib",
-        "libmozglue.dylib",
-        "libnss3.dylib",
-        "libnssckbi.dylib",
-        "libnssdbm3.dylib",
-        "libplugin_child_interpose.dylib",
-        "libsoftokn3.dylib",
-    ]
-).replace(".", r"\.")
-
-
-def cleanup_dll(text):
-    regex = fr"\b(?!{FIREFOX_DLLS_MATCH})\w+(\.dll|\.so|\.dylib)\b"
-    return re.sub(regex, "__DLL_NAME__", text)
-
-
-def cleanup_synonyms(text):
-    synonyms = [
-        ("safemode", ["safemode", "safe mode"]),
-        ("str", ["str", "steps to reproduce", "repro steps"]),
-        ("uaf", ["uaf", "use after free", "use-after-free"]),
-        ("asan", ["asan", "address sanitizer", "addresssanitizer"]),
-        (
-            "permafailure",
-            [
-                "permafailure",
-                "permafailing",
-                "permafail",
-                "perma failure",
-                "perma failing",
-                "perma fail",
-                "perma-failure",
-                "perma-failing",
-                "perma-fail",
-            ],
-        ),
-        ("spec", ["spec", "specification"]),
-    ]
-
-    for synonym_group, synonym_list in synonyms:
-        text = re.sub(
-            "|".join(fr"\b{synonym}\b" for synonym in synonym_list),
-            synonym_group,
-            text,
-            flags=re.IGNORECASE,
-        )
-
-    return text
-
-
-def cleanup_crash(text):
-    return re.sub(
-        r"bp-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{6}[0-9]{6}\b",
-        "__CRASH_STATS_LINK__",
-        text,
-    )
-
-
 def get_author_ids():
    author_ids = set()
    for commit in repository.get_commits():
--- a/bugbug/commit_features.py
+++ b/bugbug/commit_features.py
@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
+class files_modified_num(object):
+    def __call__(self, commit, **kwargs):
+        return commit["files_modified_num"]
+
+
+class added(object):
+    def __call__(self, commit, **kwargs):
+        return commit["added"]
+
+
+class test_added(object):
+    def __call__(self, commit, **kwargs):
+        return commit["test_added"]
+
+
+class deleted(object):
+    def __call__(self, commit, **kwargs):
+        return commit["deleted"]
+
+
+class test_deleted(object):
+    def __call__(self, commit, **kwargs):
+        return commit["test_deleted"]
+
+
+class author_experience(object):
+    def __call__(self, commit, **kwargs):
+        return commit["author_experience"]
+
+
+class author_experience_90_days(object):
+    def __call__(self, commit, **kwargs):
+        return commit["author_experience_90_days"]
+
+
+class reviewer_experience(object):
+    def __call__(self, commit, **kwargs):
+        return commit["reviewer_experience"]
+
+
+class reviewer_experience_90_days(object):
+    def __call__(self, commit, **kwargs):
+        return commit["reviewer_experience_90_days"]
+
+
+class components_touched_prev(object):
+    def __call__(self, commit, **kwargs):
+        return commit["components_touched_prev"]
+
+
+class components_touched_prev_90_days(object):
+    def __call__(self, commit, **kwargs):
+        return commit["components_touched_prev_90_days"]
+
+
+class files_touched_prev(object):
+    def __call__(self, commit, **kwargs):
+        return commit["files_touched_prev"]
+
+
+class files_touched_prev_90_days(object):
+    def __call__(self, commit, **kwargs):
+        return commit["files_touched_prev_90_days"]
+
+
+class types(object):
+    def __call__(self, commit, **kwargs):
+        return commit["types"]
+
+
+class components(object):
+    def __call__(self, commit, **kwargs):
+        return commit["components"]
+
+
+class number_of_reviewers(object):
+    def __call__(self, commit, **kwargs):
+        return len(commit["reviewers"])
+
+
+class CommitExtractor(BaseEstimator, TransformerMixin):
+    def __init__(self, feature_extractors, cleanup_functions):
+        self.feature_extractors = feature_extractors
+        self.cleanup_functions = cleanup_functions
+
+    def fit(self, x, y=None):
+        return self
+
+    def transform(self, commits):
+        results = []
+
+        for commit in commits:
+            data = {}
+
+            for f in self.feature_extractors:
+                res = f(commit)
+
+                if res is None:
+                    continue
+
+                if isinstance(res, list):
+                    for item in res:
+                        data[f.__class__.__name__ + "-" + item] = "True"
+                    continue
+
+                if isinstance(res, bool):
+                    res = str(res)
+
+                data[f.__class__.__name__] = res
+
+            # TODO: Try simply using all possible fields instead of extracting features manually.
+
+            for cleanup_function in self.cleanup_functions:
+                commit["desc"] = cleanup_function(commit["desc"])
+
+            result = {"data": data, "desc": commit["desc"]}
+
+            results.append(result)
+
+        return pd.DataFrame(results)
--- a/bugbug/feature_cleanup.py
+++ b/bugbug/feature_cleanup.py
@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import re
+
+
+def url(text):
+    text = re.sub(
+        r"http[s]?://(hg.mozilla|searchfox|dxr.mozilla)\S+",
+        "__CODE_REFERENCE_URL__",
+        text,
+    )
+    return re.sub(r"http\S+", "__URL__", text)
+
+
+def fileref(text):
+    return re.sub(
+        r"\w+\.py\b|\w+\.json\b|\w+\.js\b|\w+\.jsm\b|\w+\.html\b|\w+\.css\b|\w+\.c\b|\w+\.cpp\b|\w+\.h\b",
+        "__FILE_REFERENCE__",
+        text,
+    )
+
+
+def responses(text):
+    return re.sub(">[^\n]+", " ", text)
+
+
+def hex(text):
+    return re.sub(r"\b0[xX][0-9a-fA-F]+\b", "__HEX_NUMBER__", text)
+
+
+FIREFOX_DLLS_MATCH = "|".join(
+    [
+        "libmozwayland.so",
+        "libssl3.so",
+        "libnssdbm3.so",
+        "liblgpllibs.so",
+        "libmozavutil.so",
+        "libxul.so",
+        "libmozgtk.so",
+        "libnssckbi.so",
+        "libclearkey.dylib",
+        "libmozsqlite3.so",
+        "libplc4.so",
+        "libsmime3.so",
+        "libclearkey.so",
+        "libnssutil3.so",
+        "libnss3.so",
+        "libplds4.so",
+        "libfreeblpriv3.so",
+        "libsoftokn3.so",
+        "libmozgtk.so",
+        "libmozavcodec.so",
+        "libnspr4.so",
+        "IA2Marshal.dll",
+        "lgpllibs.dll",
+        "libEGL.dll",
+        "libGLESv2.dll",
+        "libmozsandbox.so",
+        "AccessibleHandler.dll",
+        "AccessibleMarshal.dll",
+        "api-ms-win-core-console-l1-1-0.dll",
+        "api-ms-win-core-datetime-l1-1-0.dll",
+        "api-ms-win-core-debug-l1-1-0.dll",
+        "api-ms-win-core-errorhandling-l1-1-0.dll",
+        "api-ms-win-core-file-l1-1-0.dll",
+        "api-ms-win-core-file-l1-2-0.dll",
+        "api-ms-win-core-file-l2-1-0.dll",
+        "api-ms-win-core-handle-l1-1-0.dll",
+        "api-ms-win-core-heap-l1-1-0.dll",
+        "api-ms-win-core-interlocked-l1-1-0.dll",
+        "api-ms-win-core-libraryloader-l1-1-0.dll",
+        "api-ms-win-core-localization-l1-2-0.dll",
+        "api-ms-win-core-memory-l1-1-0.dll",
+        "api-ms-win-core-namedpipe-l1-1-0.dll",
+        "api-ms-win-core-processenvironment-l1-1-0.dll",
+        "api-ms-win-core-processthreads-l1-1-0.dll",
+        "api-ms-win-core-processthreads-l1-1-1.dll",
+        "api-ms-win-core-profile-l1-1-0.dll",
+        "api-ms-win-core-rtlsupport-l1-1-0.dll",
+        "api-ms-win-core-string-l1-1-0.dll",
+        "api-ms-win-core-synch-l1-1-0.dll",
+        "api-ms-win-core-synch-l1-2-0.dll",
+        "api-ms-win-core-sysinfo-l1-1-0.dll",
+        "api-ms-win-core-timezone-l1-1-0.dll",
+        "api-ms-win-core-util-l1-1-0.dll",
+        "api-ms-win-crt-conio-l1-1-0.dll",
+        "api-ms-win-crt-convert-l1-1-0.dll",
+        "api-ms-win-crt-environment-l1-1-0.dll",
+        "api-ms-win-crt-filesystem-l1-1-0.dll",
+        "api-ms-win-crt-heap-l1-1-0.dll",
+        "api-ms-win-crt-locale-l1-1-0.dll",
+        "api-ms-win-crt-math-l1-1-0.dll",
+        "api-ms-win-crt-multibyte-l1-1-0.dll",
+        "api-ms-win-crt-private-l1-1-0.dll",
+        "api-ms-win-crt-process-l1-1-0.dll",
+        "api-ms-win-crt-runtime-l1-1-0.dll",
+        "api-ms-win-crt-stdio-l1-1-0.dll",
+        "api-ms-win-crt-string-l1-1-0.dll",
+        "api-ms-win-crt-time-l1-1-0.dll",
+        "api-ms-win-crt-utility-l1-1-0.dll",
+        "d3dcompiler_47.dll",
+        "freebl3.dll",
+        "mozavcodec.dll",
+        "mozavutil.dll",
+        "mozglue.dll",
+        "msvcp140.dll",
+        "nss3.dll",
+        "nssckbi.dll",
+        "nssdbm3.dll",
+        "qipcap64.dll",
+        "softokn3.dll",
+        "ucrtbase.dll",
+        "vcruntime140.dll",
+        "xul.dll",
+        "clearkey.dll",
+        "libfreebl3.dylib",
+        "liblgpllibs.dylib",
+        "libmozavcodec.dylib",
+        "libmozavutil.dylib",
+        "libmozglue.dylib",
+        "libnss3.dylib",
+        "libnssckbi.dylib",
+        "libnssdbm3.dylib",
+        "libplugin_child_interpose.dylib",
+        "libsoftokn3.dylib",
+    ]
+).replace(".", r"\.")
+
+
+def dll(text):
+    regex = fr"\b(?!{FIREFOX_DLLS_MATCH})\w+(\.dll|\.so|\.dylib)\b"
+    return re.sub(regex, "__DLL_NAME__", text)
+
+
+def synonyms(text):
+    synonyms = [
+        ("safemode", ["safemode", "safe mode"]),
+        ("str", ["str", "steps to reproduce", "repro steps"]),
+        ("uaf", ["uaf", "use after free", "use-after-free"]),
+        ("asan", ["asan", "address sanitizer", "addresssanitizer"]),
+        (
+            "permafailure",
+            [
+                "permafailure",
+                "permafailing",
+                "permafail",
+                "perma failure",
+                "perma failing",
+                "perma fail",
+                "perma-failure",
+                "perma-failing",
+                "perma-fail",
+            ],
+        ),
+        ("spec", ["spec", "specification"]),
+    ]
+
+    for synonym_group, synonym_list in synonyms:
+        text = re.sub(
+            "|".join(fr"\b{synonym}\b" for synonym in synonym_list),
+            synonym_group,
+            text,
+            flags=re.IGNORECASE,
+        )
+
+    return text
+
+
+def crash(text):
+    return re.sub(
+        r"bp-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{6}[0-9]{6}\b",
+        "__CRASH_STATS_LINK__",
+        text,
+    )
--- a/bugbug/model.py
+++ b/bugbug/model.py
@ -12,7 +12,7 @@ from sklearn.externals import joblib
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import cross_validate, train_test_split

-from bugbug import bugzilla
+from bugbug import bugzilla, repository
 from bugbug.nlp import SpacyVectorizer


@ -65,15 +65,15 @@ class Model:
        classes = self.get_labels()
        class_names = sorted(list(set(classes.values())), reverse=True)

-        # Get bugs, filtering out those for which we have no labels.
-        def bugs():
-            return (bug for bug in bugzilla.get_bugs() if bug["id"] in classes)
+        # Get items, filtering out those for which we have no labels.
+        def trainable_items_gen():
+            return (item for item in self.items_gen() if self.get_id(item) in classes)

        # Calculate labels.
-        y = np.array([classes[bug["id"]] for bug in bugs()])
+        y = np.array([classes[self.get_id(item)] for item in trainable_items_gen()])

-        # Extract features from the bugs.
-        X = self.extraction_pipeline.fit_transform(bugs())
+        # Extract features from the items.
+        X = self.extraction_pipeline.fit_transform(trainable_items_gen())

        print(f"X: {X.shape}, y: {y.shape}")

@ -172,29 +172,29 @@ class Model:
    def load(model_file_name):
        return joblib.load(model_file_name)

-    def overwrite_classes(self, bugs, classes, probabilities):
+    def overwrite_classes(self, items, classes, probabilities):
        return classes

    def classify(
-        self, bugs, probabilities=False, importances=False, importance_cutoff=0.15
+        self, items, probabilities=False, importances=False, importance_cutoff=0.15
    ):
-        assert bugs is not None
+        assert items is not None
        assert (
            self.extraction_pipeline is not None and self.clf is not None
        ), "The module needs to be initialized first"

-        if not isinstance(bugs, list):
-            bugs = [bugs]
+        if not isinstance(items, list):
+            items = [items]

-        assert isinstance(bugs[0], dict)
+        assert isinstance(items[0], dict)

-        X = self.extraction_pipeline.transform(bugs)
+        X = self.extraction_pipeline.transform(items)
        if probabilities:
            classes = self.clf.predict_proba(X)
        else:
            classes = self.clf.predict(X)

-        classes = self.overwrite_classes(bugs, classes, probabilities)
+        classes = self.overwrite_classes(items, classes, probabilities)

        if importances:
            explainer = shap.TreeExplainer(self.clf)
@ -209,3 +209,19 @@ class Model:
            return classes, importances

        return classes
+
+
+class BugModel(Model):
+    def get_id(self, bug):
+        return bug["id"]
+
+    def items_gen(self):
+        return (bug for bug in bugzilla.get_bugs())
+
+
+class CommitModel(Model):
+    def get_id(self, commit):
+        return commit["node"]
+
+    def items_gen(self):
+        return (commit for commit in repository.get_commits())
--- a/bugbug/models/assignee.py
+++ b/bugbug/models/assignee.py
@ -10,8 +10,8 @@ from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.pipeline import Pipeline

-from bugbug import bug_features, bugzilla
-from bugbug.model import Model
+from bugbug import bug_features, bugzilla, feature_cleanup
+from bugbug.model import BugModel

 MINIMUM_ASSIGNMENTS = 5
 ADDRESSES_TO_EXCLUDE = [
@ -25,9 +25,9 @@ ADDRESSES_TO_EXCLUDE = [
 ]


-class AssigneeModel(Model):
+class AssigneeModel(BugModel):
    def __init__(self, lemmatization=False):
-        Model.__init__(self, lemmatization)
+        BugModel.__init__(self, lemmatization)

        self.cross_validation_enabled = False
        self.calculate_importance = False
@ -48,9 +48,9 @@ class AssigneeModel(Model):
        ]

        cleanup_functions = [
-            bug_features.cleanup_fileref,
-            bug_features.cleanup_url,
-            bug_features.cleanup_synonyms,
+            feature_cleanup.fileref,
+            feature_cleanup.url,
+            feature_cleanup.synonyms,
        ]

        self.extraction_pipeline = Pipeline(
--- a/bugbug/models/backout.py
+++ b/bugbug/models/backout.py
@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import xgboost
+from imblearn.under_sampling import RandomUnderSampler
+from sklearn.compose import ColumnTransformer
+from sklearn.feature_extraction import DictVectorizer
+from sklearn.pipeline import Pipeline
+
+from bugbug import commit_features, feature_cleanup, repository
+from bugbug.model import CommitModel
+
+
+class BackoutModel(CommitModel):
+    def __init__(self, lemmatization=False):
+        CommitModel.__init__(self, lemmatization)
+
+        self.calculate_importance = False
+
+        self.sampler = RandomUnderSampler(random_state=0)
+
+        feature_extractors = [
+            commit_features.files_modified_num(),
+            commit_features.test_added(),
+            commit_features.added(),
+            commit_features.deleted(),
+            commit_features.test_deleted(),
+            commit_features.author_experience(),
+            commit_features.author_experience_90_days(),
+            commit_features.reviewer_experience(),
+            commit_features.reviewer_experience_90_days(),
+            commit_features.components_touched_prev(),
+            commit_features.components_touched_prev_90_days(),
+            commit_features.files_touched_prev(),
+            commit_features.files_touched_prev_90_days(),
+            commit_features.types(),
+            commit_features.components(),
+            commit_features.number_of_reviewers(),
+        ]
+
+        cleanup_functions = [
+            feature_cleanup.fileref,
+            feature_cleanup.url,
+            feature_cleanup.synonyms,
+        ]
+
+        self.extraction_pipeline = Pipeline(
+            [
+                (
+                    "commit_extractor",
+                    commit_features.CommitExtractor(
+                        feature_extractors, cleanup_functions
+                    ),
+                ),
+                (
+                    "union",
+                    ColumnTransformer(
+                        [
+                            ("data", DictVectorizer(), "data"),
+                            ("desc", self.text_vectorizer(), "desc"),
+                        ]
+                    ),
+                ),
+            ]
+        )
+
+        self.clf = xgboost.XGBClassifier(n_jobs=16)
+        self.clf.set_params(predictor="cpu_predictor")
+
+    def get_labels(self):
+        classes = {}
+
+        for commit_data in repository.get_commits():
+            classes[commit_data["node"]] = 1 if commit_data["ever_backedout"] else 0
+
+        return classes
+
+    def get_feature_names(self):
+        return self.extraction_pipeline.named_steps["union"].get_feature_names()
--- a/bugbug/models/component.py
+++ b/bugbug/models/component.py
@ -10,11 +10,11 @@ from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.pipeline import Pipeline

-from bugbug import bug_features, bugzilla
-from bugbug.model import Model
+from bugbug import bug_features, bugzilla, feature_cleanup
+from bugbug.model import BugModel


-class ComponentModel(Model):
+class ComponentModel(BugModel):
    PRODUCTS = {
        "Core",
        "External Software Affecting Firefox",
@ -56,7 +56,7 @@ class ComponentModel(Model):
    }

    def __init__(self, lemmatization=False):
-        Model.__init__(self, lemmatization)
+        BugModel.__init__(self, lemmatization)

        self.cross_validation_enabled = False
        self.calculate_importance = False
@ -77,9 +77,9 @@ class ComponentModel(Model):
        ]

        cleanup_functions = [
-            bug_features.cleanup_fileref,
-            bug_features.cleanup_url,
-            bug_features.cleanup_synonyms,
+            feature_cleanup.fileref,
+            feature_cleanup.url,
+            feature_cleanup.synonyms,
        ]

        self.extraction_pipeline = Pipeline(
--- a/bugbug/models/defect.py
+++ b/bugbug/models/defect.py
@ -11,13 +11,13 @@ from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.pipeline import Pipeline

-from bugbug import bug_features, bugzilla, labels
-from bugbug.model import Model
+from bugbug import bug_features, bugzilla, feature_cleanup, labels
+from bugbug.model import BugModel


-class BugModel(Model):
+class DefectModel(BugModel):
    def __init__(self, lemmatization=False, historical=False):
-        Model.__init__(self, lemmatization)
+        BugModel.__init__(self, lemmatization)

        self.sampler = BorderlineSMOTE(random_state=0)

@ -47,9 +47,9 @@ class BugModel(Model):
            feature_extractors.append(bug_features.had_severity_enhancement())

        cleanup_functions = [
-            bug_features.cleanup_url,
-            bug_features.cleanup_fileref,
-            bug_features.cleanup_synonyms,
+            feature_cleanup.url,
+            feature_cleanup.fileref,
+            feature_cleanup.synonyms,
        ]

        self.extraction_pipeline = Pipeline(
--- a/bugbug/models/defect_enhancement_task.py
+++ b/bugbug/models/defect_enhancement_task.py
@ -3,12 +3,12 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.

-from bugbug.models.bug import BugModel
+from bugbug.models.defect import DefectModel


-class DefectEnhancementTaskModel(BugModel):
+class DefectEnhancementTaskModel(DefectModel):
    def __init__(self, lemmatization=False):
-        BugModel.__init__(self, lemmatization)
+        DefectModel.__init__(self, lemmatization)

    def get_labels(self):
        classes = self.get_bugbug_labels("defect_enhancement_task")
--- a/bugbug/models/devdocneeded.py
+++ b/bugbug/models/devdocneeded.py
@ -9,13 +9,13 @@ from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.pipeline import Pipeline

-from bugbug import bug_features, bugzilla
-from bugbug.model import Model
+from bugbug import bug_features, bugzilla, feature_cleanup
+from bugbug.model import BugModel


-class DevDocNeededModel(Model):
+class DevDocNeededModel(BugModel):
    def __init__(self, lemmatization=False):
-        Model.__init__(self, lemmatization)
+        BugModel.__init__(self, lemmatization)

        self.sampler = RandomUnderSampler(random_state=0)

@ -41,9 +41,9 @@ class DevDocNeededModel(Model):
        ]

        cleanup_functions = [
-            bug_features.cleanup_fileref,
-            bug_features.cleanup_url,
-            bug_features.cleanup_synonyms,
+            feature_cleanup.fileref,
+            feature_cleanup.url,
+            feature_cleanup.synonyms,
        ]

        self.extraction_pipeline = Pipeline(
--- a/bugbug/models/qaneeded.py
+++ b/bugbug/models/qaneeded.py
@ -9,13 +9,13 @@ from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.pipeline import Pipeline

-from bugbug import bug_features, bugzilla
-from bugbug.model import Model
+from bugbug import bug_features, bugzilla, feature_cleanup
+from bugbug.model import BugModel


-class QANeededModel(Model):
+class QANeededModel(BugModel):
    def __init__(self, lemmatization=False):
-        Model.__init__(self, lemmatization)
+        BugModel.__init__(self, lemmatization)

        self.sampler = RandomUnderSampler(random_state=0)

@ -36,9 +36,9 @@ class QANeededModel(Model):
        ]

        cleanup_functions = [
-            bug_features.cleanup_fileref,
-            bug_features.cleanup_url,
-            bug_features.cleanup_synonyms,
+            feature_cleanup.fileref,
+            feature_cleanup.url,
+            feature_cleanup.synonyms,
        ]

        self.extraction_pipeline = Pipeline(
--- a/bugbug/models/regression.py
+++ b/bugbug/models/regression.py
@ -3,12 +3,12 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.

-from bugbug.models.bug import BugModel
+from bugbug.models.defect import DefectModel


-class RegressionModel(BugModel):
+class RegressionModel(DefectModel):
    def __init__(self, lemmatization=False):
-        BugModel.__init__(self, lemmatization)
+        DefectModel.__init__(self, lemmatization)

    def get_labels(self):
        return self.get_bugbug_labels("regression")
--- a/bugbug/models/tracking.py
+++ b/bugbug/models/tracking.py
@ -9,13 +9,13 @@ from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.pipeline import Pipeline

-from bugbug import bug_features, bugzilla, labels
-from bugbug.model import Model
+from bugbug import bug_features, bugzilla, feature_cleanup, labels
+from bugbug.model import BugModel


-class TrackingModel(Model):
+class TrackingModel(BugModel):
    def __init__(self, lemmatization=False):
-        Model.__init__(self, lemmatization)
+        BugModel.__init__(self, lemmatization)

        self.sampler = InstanceHardnessThreshold(random_state=0)

@ -47,12 +47,12 @@ class TrackingModel(Model):
        ]

        cleanup_functions = [
-            bug_features.cleanup_url,
-            bug_features.cleanup_fileref,
-            bug_features.cleanup_hex,
-            bug_features.cleanup_dll,
-            bug_features.cleanup_synonyms,
-            bug_features.cleanup_crash,
+            feature_cleanup.url,
+            feature_cleanup.fileref,
+            feature_cleanup.hex,
+            feature_cleanup.dll,
+            feature_cleanup.synonyms,
+            feature_cleanup.crash,
        ]

        self.extraction_pipeline = Pipeline(
--- a/bugbug/models/uplift.py
+++ b/bugbug/models/uplift.py
@ -9,13 +9,13 @@ from sklearn.compose import ColumnTransformer
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.pipeline import Pipeline

-from bugbug import bug_features, bugzilla
-from bugbug.model import Model
+from bugbug import bug_features, bugzilla, feature_cleanup
+from bugbug.model import BugModel


-class UpliftModel(Model):
+class UpliftModel(BugModel):
    def __init__(self, lemmatization=False):
-        Model.__init__(self, lemmatization)
+        BugModel.__init__(self, lemmatization)

        self.sampler = RandomUnderSampler(random_state=0)

@ -36,9 +36,9 @@ class UpliftModel(Model):
        ]

        cleanup_functions = [
-            bug_features.cleanup_fileref,
-            bug_features.cleanup_url,
-            bug_features.cleanup_synonyms,
+            feature_cleanup.fileref,
+            feature_cleanup.url,
+            feature_cleanup.synonyms,
        ]

        self.extraction_pipeline = Pipeline(
--- a/infra/data-pipeline.yml
+++ b/infra/data-pipeline.yml
@ -83,7 +83,7 @@ tasks:
      owner: release-mgmt-analysis@mozilla.com
      source: https://github.com/mozilla/bugbug/raw/master/data-pipeline.yml

-  - ID: train-defect
+  - ID: train-defectenhancementtask
    created: {$fromNow: ''}
    deadline: {$fromNow: '48 hours'}
    provisionerId: aws-provisioner-v1
@ -105,8 +105,8 @@ tasks:
      - notify.irc-channel.#bugbug.on-failed
      - index.project.relman.bugbug.train_defectenhancementtask.latest
    metadata:
-      name: bugbug train defect model
-      description: bugbug train defect model
+      name: bugbug train defect/enhancement/task model
+      description: bugbug train defect/enhancement/task model
      owner: release-mgmt-analysis@mozilla.com
      source: https://github.com/mozilla/bugbug/raw/master/data-pipeline.yml

--- a/run.py
+++ b/run.py
@ -25,7 +25,8 @@ if __name__ == "__main__":
        "--goal",
        help="Goal of the classifier",
        choices=[
-            "bug",
+            # bug classifiers
+            "defect",
            "regression",
            "tracking",
            "qaneeded",
@ -34,8 +35,10 @@ if __name__ == "__main__":
            "devdocneeded",
            "defectenhancementtask",
            "assignee",
+            # commit classifiers
+            "backout",
        ],
-        default="bug",
+        default="defect",
    )
    parser.add_argument(
        "--classifier",
@ -59,10 +62,10 @@ if __name__ == "__main__":
        args.goal, "" if args.classifier == "default" else args.classifier
    )

-    if args.goal == "bug":
-        from bugbug.models.bug import BugModel
+    if args.goal == "defect":
+        from bugbug.models.defect import DefectModel

-        model_class = BugModel
+        model_class = DefectModel
    elif args.goal == "defectenhancementtask":
        from bugbug.models.defect_enhancement_task import DefectEnhancementTaskModel

@ -100,6 +103,10 @@ if __name__ == "__main__":
        from bugbug.models.assignee import AssigneeModel

        model_class = AssigneeModel
+    elif args.goal == "backout":
+        from bugbug.models.backout import BackoutModel
+
+        model_class = BackoutModel

    if args.train:
        db.download()
--- a/tests/test_bug.py
+++ b/tests/test_bug.py
@ -3,11 +3,11 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.

-from bugbug.models.bug import BugModel
+from bugbug.models.defect import DefectModel


 def test_get_bug_labels():
-    model = BugModel()
+    model = DefectModel()
    classes = model.get_labels()
    # labels from bug_nobug.csv
    assert classes[1087488]
--- a/tests/test_cleanup_functions.py
+++ b/tests/test_cleanup_functions.py
@ -3,10 +3,10 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.

-from bugbug import bug_features
+from bugbug import feature_cleanup


-def test_cleanup_url():
+def test_url():
    tests = [
        (
            "This code lies in https://github.com/marco-c/bugbug",
@ -26,10 +26,10 @@ def test_cleanup_url():
        ),
    ]
    for orig_text, cleaned_text in tests:
-        assert bug_features.cleanup_url(orig_text) == cleaned_text
+        assert feature_cleanup.url(orig_text) == cleaned_text


-def test_cleanup_fileref():
+def test_fileref():
    tests = [
        (
            "Some random filenames are file1.py , file2.cpp and file3.json",
@ -37,10 +37,10 @@ def test_cleanup_fileref():
        )
    ]
    for orig_text, cleaned_text in tests:
-        assert bug_features.cleanup_fileref(orig_text) == cleaned_text
+        assert feature_cleanup.fileref(orig_text) == cleaned_text


-def test_cleanup_responses():
+def test_responses():
    tests = [
        (
            "A response can be of the form>This is the comment\n",
@ -57,10 +57,10 @@ def test_cleanup_responses():
        ),
    ]
    for orig_text, cleaned_text in tests:
-        assert bug_features.cleanup_responses(orig_text) == cleaned_text
+        assert feature_cleanup.responses(orig_text) == cleaned_text


-def test_cleanup_hex():
+def test_hex():
    tests = [
        (
            "0 scdetour.dll scdetour.dll@0x2dd77",
@ -72,10 +72,10 @@ def test_cleanup_hex():
        ),
    ]
    for orig_text, cleaned_text in tests:
-        assert bug_features.cleanup_hex(orig_text) == cleaned_text
+        assert feature_cleanup.hex(orig_text) == cleaned_text


-def test_cleanup_dll():
+def test_dll():
    tests = [
        (
            "Crashing thread: 0 scdetour.dll scdetour.dll@0x2dd77",
@ -100,10 +100,10 @@ def test_cleanup_dll():
        ),
    ]
    for orig_text, cleaned_text in tests:
-        assert bug_features.cleanup_dll(orig_text) == cleaned_text
+        assert feature_cleanup.dll(orig_text) == cleaned_text


-def test_cleanup_synonyms():
+def test_synonyms():
    tests = [
        (
            "I was in safemode, but the problem occurred in safe mode too",
@ -118,10 +118,10 @@ def test_cleanup_synonyms():
        ("found via address sanitizer or asan", "found via asan or asan"),
    ]
    for orig_text, cleaned_text in tests:
-        assert bug_features.cleanup_synonyms(orig_text) == cleaned_text
+        assert feature_cleanup.synonyms(orig_text) == cleaned_text


-def test_cleanup_crash():
+def test_crash():
    tests = [
        (
            "This bug was filed from the Socorro interface and is report bp-ba7ff893-687f-4381-b430-ba66b0170628.",
@ -133,4 +133,4 @@ def test_cleanup_crash():
        ),
    ]
    for orig_text, cleaned_text in tests:
-        assert bug_features.cleanup_crash(orig_text) == cleaned_text
+        assert feature_cleanup.crash(orig_text) == cleaned_text
--- a/tests/test_run.py
+++ b/tests/test_run.py
@ -12,8 +12,8 @@ RUNPY = os.path.join(os.path.dirname(os.path.dirname(__file__)), "run.py")

 def test_run():
    # Test running the training for the bug model.
-    print([sys.executable, RUNPY, "--train", "--goal", "bug"])
-    subprocess.run([sys.executable, RUNPY, "--train", "--goal", "bug"], check=True)
+    print([sys.executable, RUNPY, "--train", "--goal", "defect"])
+    subprocess.run([sys.executable, RUNPY, "--train", "--goal", "defect"], check=True)

    # Test loading the trained model.
-    subprocess.run([sys.executable, RUNPY, "--goal", "bug"], check=True)
+    subprocess.run([sys.executable, RUNPY, "--goal", "defect"], check=True)