Add extra functionality from run.py to the trainer script (#856)

Fixes #339
2019-08-05 19:19:59 -04:00 · 2019-08-05 19:19:59 -04:00 · 81c17ecbaf
--- a/README.md
+++ b/README.md
@@ -54,7 +54,7 @@ Every time you will try to commit, pre-commit will run checks on your files to m

 ## Usage

-Run the `run.py` script to perform training / classification. The first time `run.py` is executed, the `--train` argument should be used to automatically download databases containing bugs and commits data (they will be downloaded in the data/ directory).
+Run the `trainer.py` script with the command `python3 -c 'from scripts import trainer; trainer.main()' MODEL` to perform training, where `MODEL` is the required name of the model to train (for example, `defect`).

 ### Running the repository mining script

--- a/run.py
+++ b/run.py
@@ -1,87 +0,0 @@
-# -*- coding: utf-8 -*-
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this file,
-# You can obtain one at http://mozilla.org/MPL/2.0/.
-
-import argparse
-import sys
-
-from bugbug import bugzilla, db, repository
-from bugbug.models import MODELS, get_model_class
-
-
-def parse_args(args):
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--lemmatization",
-        help="Perform lemmatization (using spaCy)",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--training-set-size",
-        nargs="?",
-        default=14000,
-        type=int,
-        help="The size of the training set for the duplicate model",
-    )
-    parser.add_argument(
-        "--disable-url-cleanup",
-        help="Don't cleanup urls when training the duplicate model",
-        dest="cleanup_urls",
-        default=True,
-        action="store_false",
-    )
-    parser.add_argument("--train", help="Perform training", action="store_true")
-    parser.add_argument(
-        "--goal", help="Goal of the classifier", choices=MODELS.keys(), default="defect"
-    )
-    parser.add_argument(
-        "--classifier",
-        help="Type of the classifier. Only used for component classification.",
-        choices=["default", "nn"],
-        default="default",
-    )
-    parser.add_argument(
-        "--historical",
-        help="""Analyze historical bugs. Only used for defect, bugtype,
-                defectenhancementtask and regression tasks.""",
-        action="store_true",
-    )
-    return parser.parse_args(args)
-
-
-def main(args):
-    if args.goal == "component":
-        if args.classifier == "default":
-            model_class_name = "component"
-        else:
-            model_class_name = "component_nn"
-    else:
-        model_class_name = args.goal
-
-    model_class = get_model_class(model_class_name)
-
-    if args.train:
-        db.download(bugzilla.BUGS_DB)
-        db.download(repository.COMMITS_DB)
-
-        historical_supported_tasks = [
-            "defect",
-            "bugtype",
-            "defectenhancementtask",
-            "regression",
-        ]
-
-        if args.goal in historical_supported_tasks:
-            model = model_class(args.lemmatization, args.historical)
-        elif args.goal == "duplicate":
-            model = model_class(
-                args.training_set_size, args.lemmatization, args.cleanup_urls
-            )
-        else:
-            model = model_class(args.lemmatization)
-        model.train()
-
-
-if __name__ == "__main__":
-    main(parse_args(sys.argv[1:]))
--- a/scripts/trainer.py
+++ b/scripts/trainer.py
@@ -3,33 +3,58 @@
 import argparse
 import json
 import os
+import sys
 from logging import INFO, basicConfig, getLogger

 from bugbug import bugzilla, db, model, repository
 from bugbug.models import get_model_class
 from bugbug.utils import CustomJsonEncoder, zstd_compress

+MODELS_WITH_TYPE = ("component",)
+HISTORICAL_SUPPORTED_TASKS = (
+    "defect",
+    "bugtype",
+    "defectenhancementtask",
+    "regression",
+)
+
 basicConfig(level=INFO)
 logger = getLogger(__name__)


 class Trainer(object):
-    def go(self, model_name):
+    def go(self, args):
        # Download datasets that were built by bugbug_data.
        os.makedirs("data", exist_ok=True)

+        if args.classifier != "default":
+            assert (
+                args.model in MODELS_WITH_TYPE
+            ), f"{args.classifier} is not a valid classifier type for {args.model}"
+
+            model_name = f"{args.model}_{args.classifier}"
+        else:
+            model_name = args.model
+
        model_class = get_model_class(model_name)
-        model_obj = model_class()
+        if args.model in HISTORICAL_SUPPORTED_TASKS:
+            model_obj = model_class(args.lemmatization, args.historical)
+        elif args.model == "duplicate":
+            model_obj = model_class(
+                args.training_set_size, args.lemmatization, args.cleanup_urls
+            )
+        else:
+            model_obj = model_class(args.lemmatization)

        if (
            isinstance(model_obj, model.BugModel)
            or isinstance(model_obj, model.BugCoupleModel)
            or (hasattr(model_obj, "bug_data") and model_obj.bug_data)
        ):
-            db.download(bugzilla.BUGS_DB, force=True)
+            db.download(bugzilla.BUGS_DB)

        if isinstance(model_obj, model.CommitModel):
-            db.download(repository.COMMITS_DB, force=True)
+            db.download(repository.COMMITS_DB)

        logger.info(f"Training *{model_name}* model")
        metrics = model_obj.train()
@@ -48,13 +73,47 @@ class Trainer(object):
        logger.info(f"Model compressed")


-def main():
+def parse_args(args):
    description = "Train the models"
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument("model", help="Which model to train.")
+    parser.add_argument(
+        "--lemmatization",
+        help="Perform lemmatization (using spaCy)",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--training-set-size",
+        nargs="?",
+        default=14000,
+        type=int,
+        help="The size of the training set for the duplicate model",
+    )
+    parser.add_argument(
+        "--disable-url-cleanup",
+        help="Don't cleanup urls when training the duplicate model",
+        dest="cleanup_urls",
+        default=True,
+        action="store_false",
+    )
+    parser.add_argument(
+        "--classifier",
+        help="Type of the classifier. Only used for component classification.",
+        choices=["default", "nn"],
+        default="default",
+    )
+    parser.add_argument(
+        "--historical",
+        help="""Analyze historical bugs. Only used for defect, bugtype,
+                defectenhancementtask and regression tasks.""",
+        action="store_true",
+    )
+    return parser.parse_args(args)

-    args = parser.parse_args()
+
+def main():
+    args = parse_args(sys.argv[1:])

    retriever = Trainer()
-    retriever.go(args.model)
+    retriever.go(args)
--- a/tests/test_run.py
+++ b/tests/test_run.py
@@ -1,14 +0,0 @@
-# -*- coding: utf-8 -*-
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this file,
-# You can obtain one at http://mozilla.org/MPL/2.0/.
-
-import run
-
-
-def test_run():
-    # Test running the training for the bug model.
-    run.main(run.parse_args(["--train", "--goal", "defect"]))
-
-    # Test loading the trained model.
-    run.main(run.parse_args(["--goal", "defect"]))
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from scripts import trainer
+
+
+def test_trainer():
+    trainer.Trainer().go(trainer.parse_args(["defect"]))