Boris Feld 2019-09-13 15:18:46 +02:00 committed by Marco
Parent a786f82b8c
Commit c30295f870
14 changed files with 241 additions and 26 deletions

View file

@@ -406,3 +406,38 @@ tasks:
description: bugbug update classify patch hook
owner: mcastelluccio@mozilla.com
source: ${repository}/raw/${head_rev}/.taskcluster.yml
- $if: 'tasks_for == "github-push" && head_branch[:10] == "refs/tags/"'
then:
taskId: {$eval: as_slugid("integration_test")}
created: {$fromNow: ''}
deadline: {$fromNow: '1 hour'}
provisionerId: aws-provisioner-v1
workerType: relman-svc
scopes:
- secrets:get:project/relman/bugbug/integration
payload:
features:
taskclusterProxy: true
maxRunTime: 10800
image: mozilla/bugbug-commit-retrieval:latest
env:
TC_SECRET_ID: project/relman/bugbug/integration
command:
- "/bin/bash"
- "-lcx"
- "git clone --quiet ${repository} &&
cd bugbug &&
git -c advice.detachedHead=false checkout ${head_rev} &&
pip install --quiet . &&
pip install --quiet -r test-requirements.txt &&
apt-get update &&
apt-get install -y redis-server &&
python -c 'import os; print(os.environ.keys())' &&
bash ./scripts/integration_test.sh || true"
metadata:
name: bugbug integration test
description: bugbug integration test
owner: ${user}@users.noreply.github.com
source: ${repository}/raw/${head_rev}/.taskcluster.yml
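
The $if guard above means the task only runs for tag pushes. A minimal Python rendering of that JSON-e condition (the function name is illustrative, not part of the commit):

def is_tag_push(tasks_for, head_branch):
    # Mirrors the guard: a "github-push" event whose ref starts with
    # "refs/tags/" (note that len("refs/tags/") == 10).
    return tasks_for == "github-push" and head_branch[:10] == "refs/tags/"

assert is_tag_push("github-push", "refs/tags/v0.0.100")
assert not is_tag_push("github-push", "refs/heads/master")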

View file

@@ -4,7 +4,9 @@ import logging
import pkg_resources
logging.basicConfig(level=logging.INFO)
logging.basicConfig(
level=logging.INFO, format="%(asctime)s:%(levelname)s:%(name)s:%(message)s"
)
def get_bugbug_version():
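
For context, the change above replaces the bare basicConfig call with one that adds a timestamp, level, and logger name to every record. A minimal sketch of the resulting output (the timestamp is illustrative):

import logging

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s:%(levelname)s:%(name)s:%(message)s"
)
logging.getLogger("bugbug.db").info("Downloading ...")
# Emits something like:
# 2019-09-13 15:18:46,123:INFO:bugbug.db:Downloading ...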

View file

@@ -6,6 +6,7 @@
import gzip
import io
import json
import logging
import os
import pickle
from contextlib import contextmanager
@@ -19,6 +20,8 @@ from bugbug.utils import zstd_decompress
DATABASES = {}
logger = logging.getLogger(__name__)
def register(path, url, version, support_files=[]):
DATABASES[path] = {"url": url, "version": version, "support_files": support_files}
@@ -62,13 +65,15 @@ def download_support_file(path, file_name):
url = urljoin(DATABASES[path]["url"], file_name)
path = os.path.join(os.path.dirname(path), file_name)
print(f"Downloading {url} to {path}")
logger.info(f"Downloading {url} to {path}")
utils.download_check_etag(url, path)
if path.endswith(".zst"):
extract_file(path)
except requests.exceptions.HTTPError:
print(f"{file_name} is not yet available to download for {path}")
logger.info(
f"{file_name} is not yet available to download for {path}", exc_info=True
)
# Download and extract databases.
@@ -82,11 +87,11 @@ def download(path, force=False, support_files_too=False):
if not os.path.exists(zst_path) or force:
url = DATABASES[path]["url"]
try:
print(f"Downloading {url} to {zst_path}")
logger.info(f"Downloading {url} to {zst_path}")
utils.download_check_etag(url, zst_path)
except requests.exceptions.HTTPError:
print(f"{url} is not yet available to download")
logger.info(f"{url} is not yet available to download", exc_info=True)
return
extract_file(zst_path)
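
Besides swapping print() for logger.info(), the new code passes exc_info=True, so the traceback of the swallowed HTTPError still ends up in the log. A minimal sketch, assuming any URL that returns a 404:

import logging

import requests

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    response = requests.get("https://example.com/missing.zst")
    response.raise_for_status()
except requests.exceptions.HTTPError:
    # exc_info=True attaches the current exception's traceback to the
    # log record, which a plain print() could not do.
    logger.info("missing.zst is not yet available to download", exc_info=True)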

View file

@@ -313,7 +313,7 @@ class Model:
return feature_report
def train(self, importance_cutoff=0.15):
def train(self, importance_cutoff=0.15, limit=None):
classes, self.class_names = self.get_labels()
self.class_names = sort_class_names(self.class_names)
@@ -326,6 +326,10 @@
# Calculate labels.
y = np.array(y_iter)
if limit:
X = X[:limit]
y = y[:limit]
print(f"X: {X.shape}, y: {y.shape}")
is_multilabel = isinstance(y[0], np.ndarray)
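
The new limit parameter just truncates the feature matrix and labels before training, keeping the two aligned. A toy sketch:

import numpy as np

X = np.arange(10).reshape(10, 1)
y = np.arange(10)

limit = 3
if limit:  # None (the default) leaves the full dataset untouched
    X = X[:limit]
    y = y[:limit]

print(f"X: {X.shape}, y: {y.shape}")  # X: (3, 1), y: (3,)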

View file

@@ -95,7 +95,10 @@ venv/
**/.pytest_cache/
# Project-specific stuff
cache/
**/cache/
data/
http_service/
*/data
# Integrations tests cache
cache/
*/cache/
*/*/cache/

View file

@@ -264,7 +264,7 @@ def model_prediction(model_name, bug_id):
LOGGER.info("Request with API TOKEN %r", auth)
# Get the latest change from Bugzilla for the bug
bug = get_bugs_last_change_time([bug_id], auth)
bug = get_bugs_last_change_time([bug_id])
if is_prediction_invalidated(model_name, bug_id, bug[bug_id]):
clean_prediction_cache(model_name, bug_id)

View file

@@ -24,6 +24,7 @@ services:
environment:
- BUGBUG_BUGZILLA_TOKEN
- REDIS_URL=redis://redis:6379/0
- BUGBUG_ALLOW_MISSING_MODELS
depends_on:
- redis

View file

@@ -26,6 +26,8 @@ DEFAULT_EXPIRATION_TTL = 7 * 24 * 3600 # A week
MODEL_CACHE = {}
ALLOW_MISSING_MODELS = bool(int(os.environ.get("BUGBUG_ALLOW_MISSING_MODELS", "0")))
def result_key(model_name, bug_id):
return f"result_{model_name}_{bug_id}"
@@ -37,8 +39,18 @@ def change_time_key(model_name, bug_id):
def get_model(model_name):
if model_name not in MODEL_CACHE:
print("Recreating the model in cache")
model = load_model(model_name, MODELS_DIR)
print("Recreating the %r model in cache" % model_name)
try:
model = load_model(model_name, MODELS_DIR)
except FileNotFoundError:
if ALLOW_MISSING_MODELS:
print(
"Missing %r model, skipping because ALLOW_MISSING_MODELS is set"
% model_name
)
return None
else:
raise
MODEL_CACHE[model_name] = model
return model
@@ -114,6 +126,10 @@ def classify_bug(
model = get_model(model_name)
if not model:
print("Missing model %r, aborting" % model_name)
return "NOK"
model_extra_data = model.get_extra_data()
# TODO: Classify could choke on a single bug which could make the whole
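
The bool(int(...)) parse accepts only integer-like values for the environment variable. A quick sketch of the behavior:

import os

os.environ["BUGBUG_ALLOW_MISSING_MODELS"] = "1"
flag = bool(int(os.environ.get("BUGBUG_ALLOW_MISSING_MODELS", "0")))
assert flag is True

# Unset or "0" yields False; a value like "true" would raise ValueError,
# so the docker-compose entry should pass 0 or 1.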

View file

@@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import os
import time
import requests
BUGBUG_HTTP_SERVER = os.environ.get("BUGBUG_HTTP_SERVER", "http://localhost:8000/")
def integration_test():
# First try to classify a single bug
single_bug_url = f"{BUGBUG_HTTP_SERVER}/defectenhancementtask/predict/1376406"
response = None
for i in range(100):
response = requests.get(single_bug_url, headers={"X-Api-Key": "Test"})
if response.status_code == 200:
break
time.sleep(1)
if not response:
raise Exception("Couldn't get an answer in 100 seconds")
response_json = response.json()
assert response_json["class"] == "defect", response_json
# Then try to classify a batch
batch_url = f"{BUGBUG_HTTP_SERVER}/defectenhancementtask/predict/batch"
bug_ids = [1_376_535, 1_376_412]
response = None
for i in range(100):
response = requests.post(
batch_url, headers={"X-Api-Key": "Test"}, json={"bugs": bug_ids}
)
if response.status_code == 200:
break
time.sleep(1)
if not response:
raise Exception("Couldn't get an answer in 100 seconds")
response_json = response.json()
assert response_json["bugs"]["1376535"]["class"] == "enhancement", response_json[
"bugs"
]["1376535"]
assert response_json["bugs"]["1376412"]["class"] == "task", response_json["bugs"][
"1376412"
]
if __name__ == "__main__":
integration_test()
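
Note that the `if not response` checks above rely on requests.Response truthiness: bool(response) is response.ok, which is False for 4xx/5xx statuses, so a run that only ever got error responses also hits the timeout exception. A minimal illustration:

import requests

response = requests.Response()
response.status_code = 502
# Response.__bool__ delegates to Response.ok (roughly status_code < 400),
# so "not response" is True even though response is not None.
assert not response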

View file

@@ -2,7 +2,7 @@
import argparse
from datetime import datetime
from logging import INFO, basicConfig, getLogger
from logging import getLogger
import dateutil.parser
from dateutil.relativedelta import relativedelta
@@ -10,12 +10,11 @@ from dateutil.relativedelta import relativedelta
from bugbug import bug_snapshot, bugzilla, db, labels, repository
from bugbug.utils import get_secret, zstd_compress
basicConfig(level=INFO)
logger = getLogger(__name__)
class Retriever(object):
def retrieve_bugs(self):
def retrieve_bugs(self, limit=None):
bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
if not db.is_old_version(bugzilla.BUGS_DB):
@@ -40,10 +39,14 @@ class Retriever(object):
timespan_ids = bugzilla.get_ids_between(
two_years_and_six_months_ago, six_months_ago
)
if limit:
timespan_ids = timespan_ids[:limit]
logger.info(f"Retrieved {len(timespan_ids)} IDs.")
# Get IDs of labelled bugs.
labelled_bug_ids = labels.get_all_bug_ids()
if limit:
labelled_bug_ids = labelled_bug_ids[:limit]
logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")
# Get the commits DB, as we need it to get the bug IDs linked to recent commits.
@@ -60,6 +63,8 @@ class Retriever(object):
if commit["bug_id"]
and dateutil.parser.parse(commit["pushdate"]) >= start_date
]
if limit:
commit_bug_ids = commit_bug_ids[:limit]
logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")
# Get IDs of bugs which caused regressions fixed by commits (useful for the regressor model).
@@ -128,12 +133,17 @@ class Retriever(object):
def main():
description = "Retrieve and extract the information from Bugzilla instance"
parser = argparse.ArgumentParser(description=description)
parser.add_argument(
"--limit",
type=int,
help="Only download the N oldest bugs, used mainly for integration tests",
)
# Parse args to show the help if `--help` is passed
parser.parse_args()
args = parser.parse_args()
retriever = Retriever()
retriever.retrieve_bugs()
retriever.retrieve_bugs(args.limit)
if __name__ == "__main__":
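
The retriever only truncates when the flag is given; because each guard is `if limit:`, passing nothing (limit=None) keeps the full ID lists. A sketch of the pattern (the helper name is illustrative):

def truncate(ids, limit=None):
    # None (flag omitted) skips truncation; a positive int caps the list.
    return ids[:limit] if limit else ids

assert len(truncate(list(range(1000)), 100)) == 100
assert len(truncate(list(range(1000)))) == 1000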

View file

@@ -43,6 +43,11 @@ def main():
description = "Retrieve and extract the information from Mozilla-Central repository"
parser = argparse.ArgumentParser(description=description)
parser.add_argument(
"--limit",
type=int,
help="Only download the N oldest commits, used mainly for integration tests",
) # TODO: Use limit
parser.add_argument("cache-root", help="Cache for repository clones.")
args = parser.parse_args()
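
Side note on the positional above: argparse keeps the dash in the destination name for positionals, so the value has to be read with getattr(args, "cache-root") or vars(args) rather than args.cache_root. A quick sketch:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("cache-root", help="Cache for repository clones.")
args = parser.parse_args(["/tmp/cache"])

# Positionals keep their name verbatim as dest, dash included.
assert getattr(args, "cache-root") == "/tmp/cache"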

scripts/integration_test.sh Executable file (61 lines added)

View file

@@ -0,0 +1,61 @@
#!/bin/bash
set -euox pipefail
# Script that runs the whole data pipeline as fast as possible to validate
# that every part is working with the others
# Supposed to be run from the repository root directory
cd http_service/models/;
# Remove the models
rm defectenhancementtaskmodel* || true;
rm backout* || true;
# First retrieve a subset of bugs data
# TODO: Let the script download the previous DB as it should be pretty fast?
bugbug-data-bugzilla --limit 100
# Then retrieve a subset of commit data
mkdir -p cache
bugbug-data-commits --limit 100 cache
# Then train a bug model
bugbug-train --limit 500 --no-download defectenhancementtask
# Then train a commit model
bugbug-train --limit 30000 --no-download backout
# Then spin the http service up
# This part duplicates the http service Dockerfiles because we cannot easily spin Docker containers
# up on Taskcluster
cd ../
pip install -r requirements.txt
cd ../
pwd
ls http_service/models/
export REDIS_URL=redis://localhost:6379/4
# Start Redis
redis-server >/dev/null 2>&1 &
redis_pid=$!
sleep 1
# Uncomment following line to clean up the redis-server
# redis-cli -u $REDIS_URL FLUSHDB
# Start the http server
gunicorn -b 127.0.0.1:8000 http_service.app --preload --timeout 30 -w 3 &
gunicorn_pid=$!
# Start the background worker
env BUGBUG_ALLOW_MISSING_MODELS=1 python http_service/worker.py high default low &
worker_pid=$!
# Ensure we take down the containers at the end
trap 'kill $gunicorn_pid && kill $worker_pid && kill $redis_pid' EXIT
# Then check that we can correctly classify a bug
sleep 10 && python http_service/tests/test_integration.py

View file

@@ -51,13 +51,19 @@ class Trainer(object):
or isinstance(model_obj, model.BugCoupleModel)
or (hasattr(model_obj, "bug_data") and model_obj.bug_data)
):
db.download(bugzilla.BUGS_DB)
if args.download_db:
db.download(bugzilla.BUGS_DB)
else:
logger.info("Skipping download of the bug database")
if isinstance(model_obj, model.CommitModel):
db.download(repository.COMMITS_DB)
if args.download_db:
db.download(repository.COMMITS_DB)
else:
logger.info("Skipping download of the commit database")
logger.info(f"Training *{model_name}* model")
metrics = model_obj.train()
metrics = model_obj.train(limit=args.limit)
# Save the metrics as a file that can be uploaded as an artifact.
metric_file_path = "metrics.json"
@@ -78,6 +84,17 @@ def parse_args(args):
parser = argparse.ArgumentParser(description=description)
parser.add_argument("model", help="Which model to train.")
parser.add_argument(
"--limit",
type=int,
help="Only train on a subset of the data, used mainly for integrations tests",
)
parser.add_argument(
"--no-download",
action="store_false",
dest="download_db",
help="Do not download databases, uses whatever is on disk",
)
parser.add_argument(
"--lemmatization",
help="Perform lemmatization (using spaCy)",

View file

@@ -273,7 +273,7 @@ def test_is_old_version(tmp_path):
assert db.is_old_version(db_path)
def test_download_support_file_missing(tmp_path, capfd):
def test_download_support_file_missing(tmp_path, caplog):
url = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst"
support_filename = "support_mock.zst"
url_support = urljoin(url, support_filename)
@@ -297,11 +297,8 @@ def test_download_support_file_missing(tmp_path, capfd):
db.download_support_file(db_path, support_filename)
out, err = capfd.readouterr()
path = os.path.join(
os.path.dirname(db_path), f"{os.path.splitext(support_filename)[0]}.zst"
)
assert (
out.split("\n")[-2]
== f"{support_filename} is not yet available to download for {path}"
)
expected_message = f"{support_filename} is not yet available to download for {path}"
assert expected_message in caplog.text
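
Since the message now goes through the logging module instead of stdout, the test switches from pytest's capfd fixture to caplog, which captures formatted log records. A minimal sketch of the caplog idiom, assuming pytest:

import logging


def test_logs_message(caplog):
    caplog.set_level(logging.INFO)
    logging.getLogger("bugbug.db").info("support_mock.zst is not yet available")
    # caplog.text holds the formatted output of every captured record.
    assert "not yet available" in caplog.text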