Boris Feld 2019-09-13 15:18:46 +02:00 committed by Marco
Parent a786f82b8c
Commit c30295f870
14 changed files with 241 additions and 26 deletions

View file

@@ -406,3 +406,38 @@ tasks:
description: bugbug update classify patch hook
owner: mcastelluccio@mozilla.com
source: ${repository}/raw/${head_rev}/.taskcluster.yml
- $if: 'tasks_for == "github-push" && head_branch[:10] == "refs/tags/"'
then:
taskId: {$eval: as_slugid("integration_test")}
created: {$fromNow: ''}
deadline: {$fromNow: '1 hour'}
provisionerId: aws-provisioner-v1
workerType: relman-svc
scopes:
- secrets:get:project/relman/bugbug/integration
payload:
features:
taskclusterProxy: true
maxRunTime: 10800
image: mozilla/bugbug-commit-retrieval:latest
env:
TC_SECRET_ID: project/relman/bugbug/integration
command:
- "/bin/bash"
- "-lcx"
- "git clone --quiet ${repository} &&
cd bugbug &&
git -c advice.detachedHead=false checkout ${head_rev} &&
pip install --quiet . &&
pip install --quiet -r test-requirements.txt &&
apt-get update &&
apt-get install -y redis-server &&
python -c 'import os; print(os.environ.keys())' &&
bash ./scripts/integration_test.sh || true"
metadata:
name: bugbug integration test
description: bugbug integration test
owner: ${user}@users.noreply.github.com
source: ${repository}/raw/${head_rev}/.taskcluster.yml
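
The $if guard above means the task only runs for tag pushes. A minimal Python rendering of that JSON-e condition (the function name is illustrative, not part of the commit):

def is_tag_push(tasks_for, head_branch):
    # Mirrors the guard: a "github-push" event whose ref starts with
    # "refs/tags/" (note that len("refs/tags/") == 10).
    return tasks_for == "github-push" and head_branch[:10] == "refs/tags/"

assert is_tag_push("github-push", "refs/tags/v0.0.100")
assert not is_tag_push("github-push", "refs/heads/master")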

View file

@@ -4,7 +4,9 @@ import logging
import pkg_resources
logging.basicConfig(level=logging.INFO)
logging.basicConfig(
level=logging.INFO, format="%(asctime)s:%(levelname)s:%(name)s:%(message)s"
)
def get_bugbug_version():
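
For context, the change above replaces the bare basicConfig call with one that adds a timestamp, level, and logger name to every record. A minimal sketch of the resulting output (the timestamp is illustrative):

import logging

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s:%(levelname)s:%(name)s:%(message)s"
)
logging.getLogger("bugbug.db").info("Downloading ...")
# Emits something like:
# 2019-09-13 15:18:46,123:INFO:bugbug.db:Downloading ...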

View file

@@ -6,6 +6,7 @@
import gzip
import io
import json
import logging
import os
import pickle
from contextlib import contextmanager
@@ -19,6 +20,8 @@ from bugbug.utils import zstd_decompress
DATABASES = {}
logger = logging.getLogger(__name__)
def register(path, url, version, support_files=[]):
DATABASES[path] = {"url": url, "version": version, "support_files": support_files}
@@ -62,13 +65,15 @@ def download_support_file(path, file_name):
url = urljoin(DATABASES[path]["url"], file_name)
path = os.path.join(os.path.dirname(path), file_name)
print(f"Downloading {url} to {path}")
logger.info(f"Downloading {url} to {path}")
utils.download_check_etag(url, path)
if path.endswith(".zst"):
extract_file(path)
except requests.exceptions.HTTPError:
print(f"{file_name} is not yet available to download for {path}")
logger.info(
f"{file_name} is not yet available to download for {path}", exc_info=True
)
# Download and extract databases.
@@ -82,11 +87,11 @@ def download(path, force=False, support_files_too=False):
if not os.path.exists(zst_path) or force:
url = DATABASES[path]["url"]
try:
print(f"Downloading {url} to {zst_path}")
logger.info(f"Downloading {url} to {zst_path}")
utils.download_check_etag(url, zst_path)
except requests.exceptions.HTTPError:
print(f"{url} is not yet available to download")
logger.info(f"{url} is not yet available to download", exc_info=True)
return
extract_file(zst_path)
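
Besides swapping print() for logger.info(), the new code passes exc_info=True, so the traceback of the swallowed HTTPError still ends up in the log. A minimal sketch, assuming any URL that returns a 404:

import logging

import requests

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

try:
    response = requests.get("https://example.com/missing.zst")
    response.raise_for_status()
except requests.exceptions.HTTPError:
    # exc_info=True attaches the current exception's traceback to the
    # log record, which a plain print() could not do.
    logger.info("missing.zst is not yet available to download", exc_info=True)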

View file

@@ -313,7 +313,7 @@ class Model:
return feature_report
def train(self, importance_cutoff=0.15):
def train(self, importance_cutoff=0.15, limit=None):
classes, self.class_names = self.get_labels()
self.class_names = sort_class_names(self.class_names)
@@ -326,6 +326,10 @@
# Calculate labels.
y = np.array(y_iter)
if limit:
X = X[:limit]
y = y[:limit]
print(f"X: {X.shape}, y: {y.shape}")
is_multilabel = isinstance(y[0], np.ndarray)
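
The new limit parameter just truncates the feature matrix and labels before training, keeping the two aligned. A toy sketch:

import numpy as np

X = np.arange(10).reshape(10, 1)
y = np.arange(10)

limit = 3
if limit:  # None (the default) leaves the full dataset untouched
    X = X[:limit]
    y = y[:limit]

print(f"X: {X.shape}, y: {y.shape}")  # X: (3, 1), y: (3,)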

View file

@@ -95,7 +95,10 @@ venv/
**/.pytest_cache/
# Project-specific stuff
cache/
**/cache/
data/
http_service/
*/data
# Integrations tests cache
cache/
*/cache/
*/*/cache/

View file

@@ -264,7 +264,7 @@ def model_prediction(model_name, bug_id):
LOGGER.info("Request with API TOKEN %r", auth)
# Get the latest change from Bugzilla for the bug
bug = get_bugs_last_change_time([bug_id], auth)
bug = get_bugs_last_change_time([bug_id])
if is_prediction_invalidated(model_name, bug_id, bug[bug_id]):
clean_prediction_cache(model_name, bug_id)

View file

@@ -24,6 +24,7 @@ services:
environment:
- BUGBUG_BUGZILLA_TOKEN
- REDIS_URL=redis://redis:6379/0
- BUGBUG_ALLOW_MISSING_MODELS
depends_on:
- redis

View file

@@ -26,6 +26,8 @@ DEFAULT_EXPIRATION_TTL = 7 * 24 * 3600 # A week
MODEL_CACHE = {}
ALLOW_MISSING_MODELS = bool(int(os.environ.get("BUGBUG_ALLOW_MISSING_MODELS", "0")))
def result_key(model_name, bug_id):
return f"result_{model_name}_{bug_id}"
@@ -37,8 +39,18 @@ def change_time_key(model_name, bug_id):
def get_model(model_name):
if model_name not in MODEL_CACHE:
print("Recreating the model in cache")
model = load_model(model_name, MODELS_DIR)
print("Recreating the %r model in cache" % model_name)
try:
model = load_model(model_name, MODELS_DIR)
except FileNotFoundError:
if ALLOW_MISSING_MODELS:
print(
"Missing %r model, skipping because ALLOW_MISSING_MODELS is set"
% model_name
)
return None
else:
raise
MODEL_CACHE[model_name] = model
return model
@@ -114,6 +126,10 @@ def classify_bug(
model = get_model(model_name)
if not model:
print("Missing model %r, aborting" % model_name)
return "NOK"
model_extra_data = model.get_extra_data()
# TODO: Classify could choke on a single bug which could make the whole
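
The bool(int(...)) parse accepts only integer-like values for the environment variable. A quick sketch of the behavior:

import os

os.environ["BUGBUG_ALLOW_MISSING_MODELS"] = "1"
flag = bool(int(os.environ.get("BUGBUG_ALLOW_MISSING_MODELS", "0")))
assert flag is True

# Unset or "0" yields False; a value like "true" would raise ValueError,
# so the docker-compose entry should pass 0 or 1.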

View file

@@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import os
import time
import requests
BUGBUG_HTTP_SERVER = os.environ.get("BUGBUG_HTTP_SERVER", "http://localhost:8000/")
def integration_test():
# First try to classify a single bug
single_bug_url = f"{BUGBUG_HTTP_SERVER}/defectenhancementtask/predict/1376406"
response = None
for i in range(100):
response = requests.get(single_bug_url, headers={"X-Api-Key": "Test"})
if response.status_code == 200:
break
time.sleep(1)
if not response:
raise Exception("Couldn't get an answer in 100 seconds")
response_json = response.json()
assert response_json["class"] == "defect", response_json
# Then try to classify a batch
batch_url = f"{BUGBUG_HTTP_SERVER}/defectenhancementtask/predict/batch"
bug_ids = [1_376_535, 1_376_412]
response = None
for i in range(100):
response = requests.post(
batch_url, headers={"X-Api-Key": "Test"}, json={"bugs": bug_ids}
)
if response.status_code == 200:
break
time.sleep(1)
if not response:
raise Exception("Couldn't get an answer in 100 seconds")
response_json = response.json()
assert response_json["bugs"]["1376535"]["class"] == "enhancement", response_json[
"bugs"
]["1376535"]
assert response_json["bugs"]["1376412"]["class"] == "task", response_json["bugs"][
"1376412"
]
if __name__ == "__main__":
integration_test()
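
Note that the `if not response` checks above rely on requests.Response truthiness: bool(response) is response.ok, which is False for 4xx/5xx statuses, so a run that only ever got error responses also hits the timeout exception. A minimal illustration:

import requests

response = requests.Response()
response.status_code = 502
# Response.__bool__ delegates to Response.ok (roughly status_code < 400),
# so "not response" is True even though response is not None.
assert not response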

View file

@@ -2,7 +2,7 @@
import argparse
from datetime import datetime
from logging import INFO, basicConfig, getLogger
from logging import getLogger
import dateutil.parser
from dateutil.relativedelta import relativedelta
@@ -10,12 +10,11 @@ from dateutil.relativedelta import relativedelta
from bugbug import bug_snapshot, bugzilla, db, labels, repository
from bugbug.utils import get_secret, zstd_compress
basicConfig(level=INFO)
logger = getLogger(__name__)
class Retriever(object):
def retrieve_bugs(self):
def retrieve_bugs(self, limit=None):
bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
if not db.is_old_version(bugzilla.BUGS_DB):
@@ -40,10 +39,14 @@ class Retriever(object):
timespan_ids = bugzilla.get_ids_between(
two_years_and_six_months_ago, six_months_ago
)
if limit:
timespan_ids = timespan_ids[:limit]
logger.info(f"Retrieved {len(timespan_ids)} IDs.")
# Get IDs of labelled bugs.
labelled_bug_ids = labels.get_all_bug_ids()
if limit:
labelled_bug_ids = labelled_bug_ids[:limit]
logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")
# Get the commits DB, as we need it to get the bug IDs linked to recent commits.
@@ -60,6 +63,8 @@ class Retriever(object):
if commit["bug_id"]
and dateutil.parser.parse(commit["pushdate"]) >= start_date
]
if limit:
commit_bug_ids = commit_bug_ids[:limit]
logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")
# Get IDs of bugs which caused regressions fixed by commits (useful for the regressor model).
@@ -128,12 +133,17 @@ class Retriever(object):
def main():
description = "Retrieve and extract the information from Bugzilla instance"
parser = argparse.ArgumentParser(description=description)
parser.add_argument(
"--limit",
type=int,
help="Only download the N oldest bugs, used mainly for integration tests",
)
# Parse args to show the help if `--help` is passed
parser.parse_args()
args = parser.parse_args()
retriever = Retriever()
retriever.retrieve_bugs()
retriever.retrieve_bugs(args.limit)
if __name__ == "__main__":
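
The retriever only truncates when the flag is given; because each guard is `if limit:`, passing nothing (limit=None) keeps the full ID lists. A sketch of the pattern (the helper name is illustrative):

def truncate(ids, limit=None):
    # None (flag omitted) skips truncation; a positive int caps the list.
    return ids[:limit] if limit else ids

assert len(truncate(list(range(1000)), 100)) == 100
assert len(truncate(list(range(1000)))) == 1000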

View file

@@ -43,6 +43,11 @@ def main():
description = "Retrieve and extract the information from Mozilla-Central repository"
parser = argparse.ArgumentParser(description=description)
parser.add_argument(
"--limit",
type=int,
help="Only download the N oldest commits, used mainly for integration tests",
) # TODO: Use limit
parser.add_argument("cache-root", help="Cache for repository clones.")
args = parser.parse_args()
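
Side note on the positional above: argparse keeps the dash in the destination name for positionals, so the value has to be read with getattr(args, "cache-root") or vars(args) rather than args.cache_root. A quick sketch:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("cache-root", help="Cache for repository clones.")
args = parser.parse_args(["/tmp/cache"])

# Positionals keep their name verbatim as dest, dash included.
assert getattr(args, "cache-root") == "/tmp/cache"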

scripts/integration_test.sh Executable file (61 lines added)

View file

@@ -0,0 +1,61 @@
#!/bin/bash
set -euox pipefail
# Script that runs the whole data pipeline as fast as possible to validate
# that every part is working with the others
# Supposed to be run from the repository root directory
cd http_service/models/;
# Remove the models
rm defectenhancementtaskmodel* || true;
rm backout* || true;
# First retrieve a subset of bugs data
# TODO: Let the script download the previous DB as it should be pretty fast?
bugbug-data-bugzilla --limit 100
# Then retrieve a subset of commit data
mkdir -p cache
bugbug-data-commits --limit 100 cache
# Then train a bug model
bugbug-train --limit 500 --no-download defectenhancementtask
# Then train a commit model
bugbug-train --limit 30000 --no-download backout
# Then spin the http service up
# This part duplicates the http service Dockerfiles because we cannot easily spin Docker containers
# up on Taskcluster
cd ../
pip install -r requirements.txt
cd ../
pwd
ls http_service/models/
export REDIS_URL=redis://localhost:6379/4
# Start Redis
redis-server >/dev/null 2>&1 &
redis_pid=$!
sleep 1
# Uncomment following line to clean up the redis-server
# redis-cli -u $REDIS_URL FLUSHDB
# Start the http server
gunicorn -b 127.0.0.1:8000 http_service.app --preload --timeout 30 -w 3 &
gunicorn_pid=$!
# Start the background worker
env BUGBUG_ALLOW_MISSING_MODELS=1 python http_service/worker.py high default low &
worker_pid=$!
# Ensure we take down the containers at the end
trap 'kill $gunicorn_pid && kill $worker_pid && kill $redis_pid' EXIT
# Then check that we can correctly classify a bug
sleep 10 && python http_service/tests/test_integration.py

View file

@@ -51,13 +51,19 @@ class Trainer(object):
or isinstance(model_obj, model.BugCoupleModel)
or (hasattr(model_obj, "bug_data") and model_obj.bug_data)
):
db.download(bugzilla.BUGS_DB)
if args.download_db:
db.download(bugzilla.BUGS_DB)
else:
logger.info("Skipping download of the bug database")
if isinstance(model_obj, model.CommitModel):
db.download(repository.COMMITS_DB)
if args.download_db:
db.download(repository.COMMITS_DB)
else:
logger.info("Skipping download of the commit database")
logger.info(f"Training *{model_name}* model")
metrics = model_obj.train()
metrics = model_obj.train(limit=args.limit)
# Save the metrics as a file that can be uploaded as an artifact.
metric_file_path = "metrics.json"
@@ -78,6 +84,17 @@ def parse_args(args):
parser = argparse.ArgumentParser(description=description)
parser.add_argument("model", help="Which model to train.")
parser.add_argument(
"--limit",
type=int,
help="Only train on a subset of the data, used mainly for integrations tests",
)
parser.add_argument(
"--no-download",
action="store_false",
dest="download_db",
help="Do not download databases, uses whatever is on disk",
)
parser.add_argument(
"--lemmatization",
help="Perform lemmatization (using spaCy)",

View file

@@ -273,7 +273,7 @@ def test_is_old_version(tmp_path):
assert db.is_old_version(db_path)
def test_download_support_file_missing(tmp_path, capfd):
def test_download_support_file_missing(tmp_path, caplog):
url = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.zst"
support_filename = "support_mock.zst"
url_support = urljoin(url, support_filename)
@@ -297,11 +297,8 @@ def test_download_support_file_missing(tmp_path, capfd):
db.download_support_file(db_path, support_filename)
out, err = capfd.readouterr()
path = os.path.join(
os.path.dirname(db_path), f"{os.path.splitext(support_filename)[0]}.zst"
)
assert (
out.split("\n")[-2]
== f"{support_filename} is not yet available to download for {path}"
)
expected_message = f"{support_filename} is not yet available to download for {path}"
assert expected_message in caplog.text
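
Since the message now goes through the logging module instead of stdout, the test switches from pytest's capfd fixture to caplog, which captures formatted log records. A minimal sketch of the caplog idiom, assuming pytest:

import logging


def test_logs_message(caplog):
    caplog.set_level(logging.INFO)
    logging.getLogger("bugbug.db").info("support_mock.zst is not yet available")
    # caplog.text holds the formatted output of every captured record.
    assert "not yet available" in caplog.text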