Add support for GitHub issue classification to the HTTP service (#2330)

This commit is contained in:
Ksenia 2021-05-11 02:48:07 -04:00, committed by GitHub
Parent 7e58f6d1db
Commit fe6f06c704
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
6 changed files: 247 additions and 1 deletion

View file

@@ -30,6 +30,7 @@ services:
image: mozilla/bugbug-http-service
environment:
- BUGBUG_BUGZILLA_TOKEN
- BUGBUG_GITHUB_TOKEN
- PORT=8000
ports:
- target: 8000
@@ -44,6 +45,7 @@ services:
image: mozilla/bugbug-http-service-bg-worker
environment:
- BUGBUG_BUGZILLA_TOKEN
- BUGBUG_GITHUB_TOKEN
bugbug-spawn-pipeline:
build:

http_service/README.md (new file, 23 additions)
View file

@@ -0,0 +1,23 @@
### Local development

**To start the service locally, run the following commands.**

Start Redis:

    docker-compose up redis

Build the HTTP service image:

    docker build -t mozilla/bugbug-http-service -f Dockerfile .

Start the HTTP service:

    docker-compose up bugbug-http-service

Build the background worker image:

    docker build -t mozilla/bugbug-http-service-bg-worker --build-arg TAG=latest -f Dockerfile.bg_worker .

Run the background worker:

    docker-compose up bugbug-http-service-bg-worker
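Once Redis, the HTTP service, and the worker are up, the new GitHub endpoint can be smoke-tested with a short script. This is a minimal sketch, not part of this change: the host/port, the `X-Api-Key` header name, and the example model/owner/repo/issue values are assumptions taken from the compose files, the route docstring, and the tests in this PR.

    import requests

    # Hypothetical smoke test for the new endpoint; adjust the URL and
    # API key to match your local deployment.
    url = "http://localhost:8000/needsdiagnosis/predict/github/webcompat/web-bugs/12345"
    response = requests.get(url, headers={"X-Api-Key": "test"})

    if response.status_code == 202:
        # The classification job was scheduled; poll again later.
        print("Not ready yet:", response.json())
    else:
        # Note: the service may compress bodies (see compress_response in
        # app.py); requests transparently decodes gzip when the server sets
        # Content-Encoding, so .json() is expected to work here.
        response.raise_for_status()
        print("Prediction:", response.json())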

View file

@@ -32,6 +32,7 @@ from bugbug import get_bugbug_version, utils
from bugbug_http.models import (
MODELS_NAMES,
classify_bug,
classify_issue,
get_config_specific_groups,
schedule_tests,
)
@@ -80,6 +81,8 @@ q = Queue(
VALIDATOR = Validator()
BUGZILLA_TOKEN = os.environ.get("BUGBUG_BUGZILLA_TOKEN")
GITHUB_TOKEN = os.environ.get("BUGBUG_GITHUB_TOKEN")
BUGZILLA_API_URL = (
libmozdata.config.get("Bugzilla", "URL", "https://bugzilla.mozilla.org")
+ "/rest/bug"
@@ -212,6 +215,27 @@ def schedule_bug_classification(model_name: str, bug_ids: Sequence[int]) -> None
)
def schedule_issue_classification(
model_name: str, owner: str, repo: str, issue_nums: Sequence[int]
) -> None:
"""Schedule the classification of a issue_id list"""
job_id = get_job_id()
# Set the mapping before queuing to avoid some race conditions
job_id_mapping = {}
for issue_num in issue_nums:
key = JobInfo(classify_issue, model_name, owner, repo, issue_num).mapping_key
job_id_mapping[key] = job_id
redis_conn.mset(job_id_mapping)
schedule_job(
JobInfo(classify_issue, model_name, owner, repo, issue_nums),
job_id=job_id,
timeout=BUGZILLA_JOB_TIMEOUT,
)
def is_pending(job):
# Check if there is a job
job_id = redis_conn.get(job.mapping_key)
@@ -275,6 +299,23 @@ def get_bugs_last_change_time(bug_ids):
return bugs
def get_github_issues_update_time(
owner: str, repo: str, issue_nums: Sequence[int]
) -> dict:
header = {"Authorization": "token {}".format(GITHUB_TOKEN)}
repo_url = f"https://api.github.com/repos/{owner}/{repo}/issues/"
issues = {}
for issue_num in issue_nums:
issue_url = repo_url + str(issue_num)
response = utils.get_session("github").get(issue_url, headers=header)
response.raise_for_status()
raw_issue = response.json()
issues[raw_issue["number"]] = raw_issue["updated_at"]
return issues
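A sketch of the helper's return shape (not part of this diff; assumes a valid BUGBUG_GITHUB_TOKEN and reuses the webcompat/web-bugs example from the route docstring):

    # Hypothetical call; the timestamp value is illustrative.
    update_times = get_github_issues_update_time("webcompat", "web-bugs", [12345])
    # e.g. {12345: "2021-05-11T06:48:07Z"}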
def is_prediction_invalidated(job, change_time):
# First get the saved change time
saved_change_time = redis_conn.get(job.change_time_key)
@@ -407,6 +448,84 @@ def model_prediction(model_name, bug_id):
return compress_response(data, status_code)
@application.route(
"/<model_name>/predict/github/<string:owner>/<string:repo>/<int:issue_num>"
)
@cross_origin()
def model_prediction_github(model_name, owner, repo, issue_num):
"""
---
get:
description: Classify a single issue using the given model; returns 200 if the issue has been processed or 202 if it is still being processed
summary: Classify a single issue
parameters:
- name: model_name
in: path
schema: ModelName
- name: owner
in: path
schema:
type: string
example: webcompat
- name: repo
in: path
schema:
type: string
example: web-bugs
- name: issue_num
in: path
schema:
type: integer
example: 123456
responses:
200:
description: A single issue prediction
content:
application/json:
schema: BugPrediction
202:
description: A temporary answer for the issue being processed
content:
application/json:
schema: NotAvailableYet
401:
description: API key is missing
content:
application/json:
schema: UnauthorizedError
"""
headers = request.headers
auth = headers.get(API_TOKEN)
if not auth:
return jsonify(UnauthorizedError().dump({})), 401
else:
LOGGER.info("Request with API TOKEN %r", auth)
if model_name not in MODELS_NAMES:
return jsonify({"error": f"Model {model_name} doesn't exist"}), 404
# Get the latest change date from GitHub for the issue
update_time = get_github_issues_update_time(owner, repo, [issue_num])
job = JobInfo(classify_issue, model_name, owner, repo, issue_num)
issue_change_time = update_time.get(issue_num)
if issue_change_time and is_prediction_invalidated(job, issue_change_time):
clean_prediction_cache(job)
status_code = 200
data = get_result(job)
if not data:
if not is_pending(job):
schedule_issue_classification(model_name, owner, repo, [issue_num])
status_code = 202
data = {"ready": False}
return compress_response(data, status_code)
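For reference, a 200 response body has the shape asserted in the tests below; a sketch with illustrative values, not part of this diff:

    # Example prediction payload for a two-class model such as needsdiagnosis:
    example_response = {
        "prob": [0.118, 0.881],  # per-class probabilities
        "index": 1,              # argmax over prob
        "class": 1,              # label decoded via the model's label encoder
        "extra_data": {},        # model-specific extra data
    }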
@application.route("/<model_name>/predict/batch", methods=["POST"])
@cross_origin()
def batch_prediction(model_name):

View file

@@ -15,7 +15,7 @@ import zstandard
from libmozdata.bugzilla import Bugzilla
from redis import Redis
-from bugbug import bugzilla, repository, test_scheduling
+from bugbug import bugzilla, github, repository, test_scheduling
from bugbug.model import Model
from bugbug.utils import get_hgmo_stack
from bugbug_http.readthrough_cache import ReadthroughTTLCache
@@ -26,6 +26,7 @@ LOGGER = logging.getLogger()
MODELS_NAMES = [
"defectenhancementtask",
"component",
"needsdiagnosis",
"regression",
"stepstoreproduce",
"spambug",
@@ -110,6 +111,64 @@ def classify_bug(model_name: str, bug_ids: Sequence[int], bugzilla_token: str) -
return "OK"
def classify_issue(
model_name: str, owner: str, repo: str, issue_nums: Sequence[int]
) -> str:
from bugbug_http.app import JobInfo
issue_ids_set = set(map(int, issue_nums))
issues = {
issue_num: github.fetch_issue_by_number(owner, repo, issue_num, True)
for issue_num in issue_nums
}
missing_issues = issue_ids_set.difference(issues.keys())
for issue_id in missing_issues:
job = JobInfo(classify_issue, model_name, owner, repo, issue_id)
# TODO: Find a better error format
setkey(job.result_key, orjson.dumps({"available": False}))
if not issues:
return "NOK"
model = MODEL_CACHE.get(model_name)
if not model:
LOGGER.info("Missing model %r, aborting", model_name)
return "NOK"
model_extra_data = model.get_extra_data()
# TODO: Classify could choke on a single issue, which could make the whole
# job fail. What should we do here?
probs = model.classify(list(issues.values()), True)
indexes = probs.argmax(axis=-1)
suggestions = model.le.inverse_transform(indexes)
probs_list = probs.tolist()
indexes_list = indexes.tolist()
suggestions_list = suggestions.tolist()
for i, issue_id in enumerate(issues.keys()):
data = {
"prob": probs_list[i],
"index": indexes_list[i],
"class": suggestions_list[i],
"extra_data": model_extra_data,
}
job = JobInfo(classify_issue, model_name, owner, repo, issue_id)
setkey(job.result_key, orjson.dumps(data), compress=True)
# Save the issue's last change time
setkey(job.change_time_key, issues[issue_id]["updated_at"].encode())
return "OK"
@lru_cache(maxsize=None)
def get_known_tasks() -> Tuple[str, ...]:
with open("known_tasks", "r") as f:

View file

@@ -6,6 +6,7 @@ services:
image: mozilla/bugbug-http-service
environment:
- BUGBUG_BUGZILLA_TOKEN
- BUGBUG_GITHUB_TOKEN
- REDIS_URL=redis://redis:6379/0
- PORT=8000
- PULSE_USER
@@ -26,6 +27,7 @@ services:
image: mozilla/bugbug-http-service-bg-worker
environment:
- BUGBUG_BUGZILLA_TOKEN
- BUGBUG_GITHUB_TOKEN
- REDIS_URL=redis://redis:6379/0
- BUGBUG_ALLOW_MISSING_MODELS
- BUGBUG_REPO_DIR

View file

@@ -61,6 +61,47 @@ def test_model_predict_id(client, jobs, add_result, responses):
assert retrieve_compressed_reponse(rv) == result
def test_model_predict_id_github(client, jobs, add_result, responses):
issue_id = "12345"
result = {
"prob": [0.11845558881759644, 0.8815444111824036],
"index": 1,
"class": 1,
"extra_data": {},
}
responses.add(
responses.GET,
f"https://api.github.com/repos/webcompat/web-bugs/issues/{issue_id}",
status=200,
json={"number": issue_id, "updated_at": time.time()},
)
def do_request():
return client.get(
"/needsdiagnosis/predict/github/webcompat/web-bugs/12345",
headers={API_TOKEN: "test"},
)
rv = do_request()
assert rv.status_code == 202
assert retrieve_compressed_reponse(rv) == {"ready": False}
# request still not ready
rv = do_request()
assert rv.status_code == 202
assert retrieve_compressed_reponse(rv) == {"ready": False}
assert len(jobs) == 1
# now it's ready
keys = next(iter(jobs.values()))
add_result(keys[0], result)
rv = do_request()
assert rv.status_code == 200
assert retrieve_compressed_reponse(rv) == result
def test_model_predict_batch(client, jobs, add_result, add_change_time, responses):
bug_ids = [123, 456]
result = {