Mirror of https://github.com/mozilla/bugbug.git

Add WIP integration test (#823)

Parent: a786f82b8c
Commit: c30295f870
.taskcluster.yml

@@ -406,3 +406,38 @@ tasks:
         description: bugbug update classify patch hook
         owner: mcastelluccio@mozilla.com
         source: ${repository}/raw/${head_rev}/.taskcluster.yml
+
+  - $if: 'tasks_for == "github-push" && head_branch[:10] == "refs/tags/"'
+    then:
+      taskId: {$eval: as_slugid("integration_test")}
+      created: {$fromNow: ''}
+      deadline: {$fromNow: '1 hour'}
+      provisionerId: aws-provisioner-v1
+      workerType: relman-svc
+      scopes:
+        - secrets:get:project/relman/bugbug/integration
+      payload:
+        features:
+          taskclusterProxy:
+            true
+        maxRunTime: 10800
+        image: mozilla/bugbug-commit-retrieval:latest
+        env:
+          TC_SECRET_ID: project/relman/bugbug/integration
+        command:
+          - "/bin/bash"
+          - "-lcx"
+          - "git clone --quiet ${repository} &&
+             cd bugbug &&
+             git -c advice.detachedHead=false checkout ${head_rev} &&
+             pip install --quiet . &&
+             pip install --quiet -r test-requirements.txt &&
+             apt-get update &&
+             apt-get install -y redis-server &&
+             python -c 'import os; print(os.environ.keys())' &&
+             bash ./scripts/integration_test.sh || true"
+      metadata:
+        name: bugbug integration test
+        description: bugbug integration test
+        owner: ${user}@users.noreply.github.com
+        source: ${repository}/raw/${head_rev}/.taskcluster.yml
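Note that the task command ends with `|| true`, so while the integration test is still WIP a failing run does not turn the Taskcluster task red, and the `head_branch[:10] == "refs/tags/"` condition restricts the task to tag pushes.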
@@ -4,7 +4,9 @@ import logging
 import pkg_resources
 
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s:%(levelname)s:%(name)s:%(message)s"
+)
 
 
 def get_bugbug_version():
bugbug/db.py (13 changed lines)
@@ -6,6 +6,7 @@
 import gzip
 import io
 import json
+import logging
 import os
 import pickle
 from contextlib import contextmanager
@@ -19,6 +20,8 @@ from bugbug.utils import zstd_decompress
 
 DATABASES = {}
 
+logger = logging.getLogger(__name__)
+
 
 def register(path, url, version, support_files=[]):
     DATABASES[path] = {"url": url, "version": version, "support_files": support_files}
@@ -62,13 +65,15 @@ def download_support_file(path, file_name):
         url = urljoin(DATABASES[path]["url"], file_name)
         path = os.path.join(os.path.dirname(path), file_name)
 
-        print(f"Downloading {url} to {path}")
+        logger.info(f"Downloading {url} to {path}")
         utils.download_check_etag(url, path)
 
         if path.endswith(".zst"):
             extract_file(path)
     except requests.exceptions.HTTPError:
-        print(f"{file_name} is not yet available to download for {path}")
+        logger.info(
+            f"{file_name} is not yet available to download for {path}", exc_info=True
+        )
 
 
 # Download and extract databases.
@@ -82,11 +87,11 @@ def download(path, force=False, support_files_too=False):
     if not os.path.exists(zst_path) or force:
         url = DATABASES[path]["url"]
         try:
-            print(f"Downloading {url} to {zst_path}")
+            logger.info(f"Downloading {url} to {zst_path}")
             utils.download_check_etag(url, zst_path)
 
         except requests.exceptions.HTTPError:
-            print(f"{url} is not yet available to download")
+            logger.info(f"{url} is not yet available to download", exc_info=True)
             return
 
         extract_file(zst_path)
@@ -313,7 +313,7 @@ class Model:
 
         return feature_report
 
-    def train(self, importance_cutoff=0.15):
+    def train(self, importance_cutoff=0.15, limit=None):
         classes, self.class_names = self.get_labels()
         self.class_names = sort_class_names(self.class_names)
 
@@ -326,6 +326,10 @@ class Model:
         # Calculate labels.
         y = np.array(y_iter)
 
+        if limit:
+            X = X[:limit]
+            y = y[:limit]
+
         print(f"X: {X.shape}, y: {y.shape}")
 
         is_multilabel = isinstance(y[0], np.ndarray)
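A minimal standalone sketch (made-up shapes, not repository code) of why X and y are sliced with the same limit, so every remaining feature row keeps its label:

    import numpy as np

    X = np.random.rand(1000, 10)       # feature matrix, one row per bug
    y = np.random.randint(0, 2, 1000)  # one label per row of X
    limit = 500

    if limit:
        X = X[:limit]
        y = y[:limit]

    assert X.shape[0] == y.shape[0] == 500  # rows and labels stay paired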
@@ -95,7 +95,10 @@ venv/
 **/.pytest_cache/
 
 # Project-specific stuff
-cache/
+**/cache/
 data/
-http_service/
-*/data
+
+# Integrations tests cache
+cache/
+*/cache/
+*/*/cache/
@@ -264,7 +264,7 @@ def model_prediction(model_name, bug_id):
     LOGGER.info("Request with API TOKEN %r", auth)
 
     # Get the latest change from Bugzilla for the bug
-    bug = get_bugs_last_change_time([bug_id], auth)
+    bug = get_bugs_last_change_time([bug_id])
 
     if is_prediction_invalidated(model_name, bug_id, bug[bug_id]):
         clean_prediction_cache(model_name, bug_id)
@@ -24,6 +24,7 @@ services:
     environment:
       - BUGBUG_BUGZILLA_TOKEN
      - REDIS_URL=redis://redis:6379/0
+      - BUGBUG_ALLOW_MISSING_MODELS
     depends_on:
       - redis
 
@@ -26,6 +26,8 @@ DEFAULT_EXPIRATION_TTL = 7 * 24 * 3600  # A week
 
 MODEL_CACHE = {}
 
+ALLOW_MISSING_MODELS = bool(int(os.environ.get("BUGBUG_ALLOW_MISSING_MODELS", "0")))
+
 
 def result_key(model_name, bug_id):
     return f"result_{model_name}_{bug_id}"
@@ -37,8 +39,18 @@ def change_time_key(model_name, bug_id):
 
 def get_model(model_name):
     if model_name not in MODEL_CACHE:
-        print("Recreating the model in cache")
-        model = load_model(model_name, MODELS_DIR)
+        print("Recreating the %r model in cache" % model_name)
+        try:
+            model = load_model(model_name, MODELS_DIR)
+        except FileNotFoundError:
+            if ALLOW_MISSING_MODELS:
+                print(
+                    "Missing %r model, skipping because ALLOW_MISSING_MODELS is set"
+                    % model_name
+                )
+                return None
+            else:
+                raise
 
         MODEL_CACHE[model_name] = model
     return model
@@ -114,6 +126,10 @@ def classify_bug(
 
     model = get_model(model_name)
 
+    if not model:
+        print("Missing model %r, aborting" % model_name)
+        return "NOK"
+
     model_extra_data = model.get_extra_data()
 
     # TODO: Classify could choke on a single bug which could make the whole
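A minimal standalone sketch (hypothetical stand-ins, not the repository code) of the behaviour these two hunks combine to give: with BUGBUG_ALLOW_MISSING_MODELS=1, a model whose file was never trained makes get_model return None and the worker answer "NOK" instead of crashing:

    import os

    os.environ["BUGBUG_ALLOW_MISSING_MODELS"] = "1"
    ALLOW_MISSING_MODELS = bool(int(os.environ.get("BUGBUG_ALLOW_MISSING_MODELS", "0")))

    MODEL_CACHE = {}


    def load_model(name, models_dir="models"):
        # Stand-in for the real loader: in this sketch the model file never exists.
        raise FileNotFoundError(os.path.join(models_dir, name))


    def get_model(name):
        if name not in MODEL_CACHE:
            try:
                MODEL_CACHE[name] = load_model(name)
            except FileNotFoundError:
                if ALLOW_MISSING_MODELS:
                    return None
                raise
        return MODEL_CACHE[name]


    def classify_bug(name, bug_id):
        model = get_model(name)
        if not model:
            print("Missing model %r, aborting" % name)
            return "NOK"
        return "OK"


    assert classify_bug("defectenhancementtask", 1376406) == "NOK"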
http_service/tests/test_integration.py (new file)

@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import os
+import time
+
+import requests
+
+BUGBUG_HTTP_SERVER = os.environ.get("BUGBUG_HTTP_SERVER", "http://localhost:8000/")
+
+
+def integration_test():
+    # First try to classify a single bug
+    single_bug_url = f"{BUGBUG_HTTP_SERVER}/defectenhancementtask/predict/1376406"
+    response = None
+    for i in range(100):
+        response = requests.get(single_bug_url, headers={"X-Api-Key": "Test"})
+
+        if response.status_code == 200:
+            break
+
+        time.sleep(1)
+
+    if not response:
+        raise Exception("Couldn't get an answer in 100 seconds")
+
+    response_json = response.json()
+    assert response_json["class"] == "defect", response_json
+
+    # Then try to classify a batch
+    batch_url = f"{BUGBUG_HTTP_SERVER}/defectenhancementtask/predict/batch"
+    bug_ids = [1_376_535, 1_376_412]
+    response = None
+    for i in range(100):
+        response = requests.post(
+            batch_url, headers={"X-Api-Key": "Test"}, json={"bugs": bug_ids}
+        )
+
+        if response.status_code == 200:
+            break
+
+        time.sleep(1)
+
+    if not response:
+        raise Exception("Couldn't get an answer in 100 seconds")
+
+    response_json = response.json()
+    assert response_json["bugs"]["1376535"]["class"] == "enhancement", response_json[
+        "bugs"
+    ]["1376535"]
+    assert response_json["bugs"]["1376412"]["class"] == "task", response_json["bugs"][
+        "1376412"
+    ]
+
+
+if __name__ == "__main__":
+    integration_test()
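This file is invoked by scripts/integration_test.sh as `python http_service/tests/test_integration.py`; setting BUGBUG_HTTP_SERVER points it at another deployment. Note that the `if not response:` checks also fire when every attempt returned an error status, because a requests.Response is falsy for 4xx/5xx responses (its __bool__ returns response.ok).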
@@ -2,7 +2,7 @@
 
 import argparse
 from datetime import datetime
-from logging import INFO, basicConfig, getLogger
+from logging import getLogger
 
 import dateutil.parser
 from dateutil.relativedelta import relativedelta
@@ -10,12 +10,11 @@ from dateutil.relativedelta import relativedelta
 from bugbug import bug_snapshot, bugzilla, db, labels, repository
 from bugbug.utils import get_secret, zstd_compress
 
-basicConfig(level=INFO)
 logger = getLogger(__name__)
 
 
 class Retriever(object):
-    def retrieve_bugs(self):
+    def retrieve_bugs(self, limit=None):
         bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
 
         if not db.is_old_version(bugzilla.BUGS_DB):
@@ -40,10 +39,14 @@ class Retriever(object):
         timespan_ids = bugzilla.get_ids_between(
             two_years_and_six_months_ago, six_months_ago
         )
+        if limit:
+            timespan_ids = timespan_ids[:limit]
         logger.info(f"Retrieved {len(timespan_ids)} IDs.")
 
         # Get IDs of labelled bugs.
         labelled_bug_ids = labels.get_all_bug_ids()
+        if limit:
+            labelled_bug_ids = labelled_bug_ids[:limit]
         logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")
 
         # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
@@ -60,6 +63,8 @@ class Retriever(object):
             if commit["bug_id"]
             and dateutil.parser.parse(commit["pushdate"]) >= start_date
         ]
+        if limit:
+            commit_bug_ids = commit_bug_ids[:limit]
         logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")
 
         # Get IDs of bugs which caused regressions fixed by commits (useful for the regressor model).
@@ -128,12 +133,17 @@ class Retriever(object):
 def main():
     description = "Retrieve and extract the information from Bugzilla instance"
     parser = argparse.ArgumentParser(description=description)
+    parser.add_argument(
+        "--limit",
+        type=int,
+        help="Only download the N oldest bugs, used mainly for integration tests",
+    )
 
     # Parse args to show the help if `--help` is passed
-    parser.parse_args()
+    args = parser.parse_args()
 
     retriever = Retriever()
-    retriever.retrieve_bugs()
+    retriever.retrieve_bugs(args.limit)
 
 
 if __name__ == "__main__":
@@ -43,6 +43,11 @@ def main():
     description = "Retrieve and extract the information from Mozilla-Central repository"
     parser = argparse.ArgumentParser(description=description)
 
+    parser.add_argument(
+        "--limit",
+        type=int,
+        help="Only download the N oldest commits, used mainly for integration tests",
+    )  # TODO: Use limit
     parser.add_argument("cache-root", help="Cache for repository clones.")
 
     args = parser.parse_args()
scripts/integration_test.sh (new file)

@@ -0,0 +1,61 @@
+#!/bin/bash
+set -euox pipefail
+
+# Script that runs the whole data pipeline as fast as possible to validate
+# that every part is working with the others
+
+# Supposed to be run from the repository root directory
+
+cd http_service/models/;
+
+# Remove the models
+rm defectenhancementtaskmodel* || true;
+rm backout* || true;
+
+# First retrieve a subset of bugs data
+# TODO: Let the script download the previous DB as it should be pretty fast?
+bugbug-data-bugzilla --limit 100
+
+# Then retrieve a subset of commit data
+mkdir -p cache
+bugbug-data-commits --limit 100 cache
+
+# Then train a bug model
+bugbug-train --limit 500 --no-download defectenhancementtask
+
+# Then train a commit model
+bugbug-train --limit 30000 --no-download backout
+
+# Then spin the http service up
+# This part duplicates the http service Dockerfiles because we cannot easily spin Docker containers
+# up on Taskcluster
+cd ../
+pip install -r requirements.txt
+cd ../
+pwd
+ls http_service/models/
+
+export REDIS_URL=redis://localhost:6379/4
+
+# Start Redis
+redis-server >/dev/null 2>&1 &
+redis_pid=$!
+
+sleep 1
+
+# Uncomment following line to clean up the redis-server
+# redis-cli -u $REDIS_URL FLUSHDB
+
+# Start the http server
+gunicorn -b 127.0.0.1:8000 http_service.app --preload --timeout 30 -w 3 &
+gunicorn_pid=$!
+
+# Start the background worker
+env BUGBUG_ALLOW_MISSING_MODELS=1 python http_service/worker.py high default low &
+worker_pid=$!
+
+# Ensure we take down the containers at the end
+trap 'kill $gunicorn_pid && kill $worker_pid && kill $redis_pid' EXIT
+
+# Then check that we can correctly classify a bug
+sleep 10 && python http_service/tests/test_integration.py
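The trap above cleans up the three background processes (Redis, gunicorn and the background worker) when the script exits, whatever the test outcome, and the `sleep 10` gives the service a head start before the Python test begins polling.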
@@ -51,13 +51,19 @@ class Trainer(object):
             or isinstance(model_obj, model.BugCoupleModel)
             or (hasattr(model_obj, "bug_data") and model_obj.bug_data)
         ):
-            db.download(bugzilla.BUGS_DB)
+            if args.download_db:
+                db.download(bugzilla.BUGS_DB)
+            else:
+                logger.info("Skipping download of the bug database")
 
         if isinstance(model_obj, model.CommitModel):
-            db.download(repository.COMMITS_DB)
+            if args.download_db:
+                db.download(repository.COMMITS_DB)
+            else:
+                logger.info("Skipping download of the commit database")
 
         logger.info(f"Training *{model_name}* model")
-        metrics = model_obj.train()
+        metrics = model_obj.train(limit=args.limit)
 
         # Save the metrics as a file that can be uploaded as an artifact.
         metric_file_path = "metrics.json"
@@ -78,6 +84,17 @@ def parse_args(args):
     parser = argparse.ArgumentParser(description=description)
 
     parser.add_argument("model", help="Which model to train.")
+    parser.add_argument(
+        "--limit",
+        type=int,
+        help="Only train on a subset of the data, used mainly for integrations tests",
+    )
+    parser.add_argument(
+        "--no-download",
+        action="store_false",
+        dest="download_db",
+        help="Do not download databases, uses whatever is on disk",
+    )
     parser.add_argument(
         "--lemmatization",
         help="Perform lemmatization (using spaCy)",
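A minimal standalone sketch (not repository code) of how the --no-download flag resolves, since action="store_false" makes the parsed default True:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--no-download", action="store_false", dest="download_db")

    assert parser.parse_args([]).download_db is True                  # download by default
    assert parser.parse_args(["--no-download"]).download_db is False  # skip the download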
@@ -273,7 +273,7 @@ def test_is_old_version(tmp_path):
     assert db.is_old_version(db_path)
 
 
-def test_download_support_file_missing(tmp_path, capfd):
+def test_download_support_file_missing(tmp_path, caplog):
     url = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst"
     support_filename = "support_mock.zst"
     url_support = urljoin(url, support_filename)
@@ -297,11 +297,8 @@ def test_download_support_file_missing(tmp_path, capfd):
 
     db.download_support_file(db_path, support_filename)
 
-    out, err = capfd.readouterr()
     path = os.path.join(
         os.path.dirname(db_path), f"{os.path.splitext(support_filename)[0]}.zst"
     )
-    assert (
-        out.split("\n")[-2]
-        == f"{support_filename} is not yet available to download for {path}"
-    )
+    expected_message = f"{support_filename} is not yet available to download for {path}"
+    assert expected_message in caplog.text
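Since db.py now logs through the logging module instead of printing, the test asserts on log records rather than captured stdout. A minimal standalone pytest sketch (hypothetical logger and message) of the caplog pattern used above:

    import logging


    def emit():
        # Stand-in for db.download_support_file() logging a missing support file
        logging.getLogger("bugbug.db").info(
            "support_mock.zst is not yet available to download"
        )


    def test_emit(caplog):
        with caplog.at_level(logging.INFO):
            emit()
        assert "not yet available to download" in caplog.text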