Add support for GitHub issue classification to the HTTP service (#2330)

This commit is contained in:
Ksenia 2021-05-11 02:48:07 -04:00, committed by GitHub
Parent 7e58f6d1db
Commit fe6f06c704
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
6 changed files: 247 additions and 1 deletion

View file

@@ -30,6 +30,7 @@ services:
image: mozilla/bugbug-http-service
environment:
- BUGBUG_BUGZILLA_TOKEN
- BUGBUG_GITHUB_TOKEN
- PORT=8000
ports:
- target: 8000
@@ -44,6 +45,7 @@ services:
image: mozilla/bugbug-http-service-bg-worker
environment:
- BUGBUG_BUGZILLA_TOKEN
- BUGBUG_GITHUB_TOKEN
bugbug-spawn-pipeline:
build:

http_service/README.md (new file, 23 additions)
View file

@@ -0,0 +1,23 @@
### Local development

**To start the service locally, run the following commands.**

Start Redis:

    docker-compose up redis

Build the HTTP service image:

    docker build -t mozilla/bugbug-http-service -f Dockerfile .

Start the HTTP service:

    docker-compose up bugbug-http-service

Build the background worker image:

    docker build -t mozilla/bugbug-http-service-bg-worker --build-arg TAG=latest -f Dockerfile.bg_worker .

Run the background worker:

    docker-compose up bugbug-http-service-bg-worker
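Once Redis, the HTTP service, and the worker are up, the new GitHub endpoint can be smoke-tested with a short script. This is a minimal sketch, not part of this change: the host/port, the `X-Api-Key` header name, and the example model/owner/repo/issue values are assumptions taken from the compose files, the route docstring, and the tests in this PR.

    import requests

    # Hypothetical smoke test for the new endpoint; adjust the URL and
    # API key to match your local deployment.
    url = "http://localhost:8000/needsdiagnosis/predict/github/webcompat/web-bugs/12345"
    response = requests.get(url, headers={"X-Api-Key": "test"})

    if response.status_code == 202:
        # The classification job was scheduled; poll again later.
        print("Not ready yet:", response.json())
    else:
        # Note: the service may compress bodies (see compress_response in
        # app.py); requests transparently decodes gzip when the server sets
        # Content-Encoding, so .json() is expected to work here.
        response.raise_for_status()
        print("Prediction:", response.json())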

View file

@@ -32,6 +32,7 @@ from bugbug import get_bugbug_version, utils
from bugbug_http.models import (
MODELS_NAMES,
classify_bug,
classify_issue,
get_config_specific_groups,
schedule_tests,
)
@@ -80,6 +81,8 @@ q = Queue(
VALIDATOR = Validator()
BUGZILLA_TOKEN = os.environ.get("BUGBUG_BUGZILLA_TOKEN")
GITHUB_TOKEN = os.environ.get("BUGBUG_GITHUB_TOKEN")
BUGZILLA_API_URL = (
libmozdata.config.get("Bugzilla", "URL", "https://bugzilla.mozilla.org")
+ "/rest/bug"
@@ -212,6 +215,27 @@ def schedule_bug_classification(model_name: str, bug_ids: Sequence[int]) -> None
)
def schedule_issue_classification(
model_name: str, owner: str, repo: str, issue_nums: Sequence[int]
) -> None:
"""Schedule the classification of a issue_id list"""
job_id = get_job_id()
# Set the mapping before queuing to avoid some race conditions
job_id_mapping = {}
for issue_num in issue_nums:
key = JobInfo(classify_issue, model_name, owner, repo, issue_num).mapping_key
job_id_mapping[key] = job_id
redis_conn.mset(job_id_mapping)
schedule_job(
JobInfo(classify_issue, model_name, owner, repo, issue_nums),
job_id=job_id,
timeout=BUGZILLA_JOB_TIMEOUT,
)
def is_pending(job):
# Check if there is a job
job_id = redis_conn.get(job.mapping_key)
@@ -275,6 +299,23 @@ def get_bugs_last_change_time(bug_ids):
return bugs
def get_github_issues_update_time(
owner: str, repo: str, issue_nums: Sequence[int]
) -> dict:
header = {"Authorization": "token {}".format(GITHUB_TOKEN)}
repo_url = f"https://api.github.com/repos/{owner}/{repo}/issues/"
issues = {}
for issue_num in issue_nums:
issue_url = repo_url + str(issue_num)
response = utils.get_session("github").get(issue_url, headers=header)
response.raise_for_status()
raw_issue = response.json()
issues[raw_issue["number"]] = raw_issue["updated_at"]
return issues
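A sketch of the helper's return shape (not part of this diff; assumes a valid BUGBUG_GITHUB_TOKEN and reuses the webcompat/web-bugs example from the route docstring):

    # Hypothetical call; the timestamp value is illustrative.
    update_times = get_github_issues_update_time("webcompat", "web-bugs", [12345])
    # e.g. {12345: "2021-05-11T06:48:07Z"}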
def is_prediction_invalidated(job, change_time):
# First get the saved change time
saved_change_time = redis_conn.get(job.change_time_key)
@@ -407,6 +448,84 @@ def model_prediction(model_name, bug_id):
return compress_response(data, status_code)
@application.route(
"/<model_name>/predict/github/<string:owner>/<string:repo>/<int:issue_num>"
)
@cross_origin()
def model_prediction_github(model_name, owner, repo, issue_num):
"""
---
get:
description: Classify a single issue using the given model; returns 200 if the issue has been processed or 202 if it is still being processed
summary: Classify a single issue
parameters:
- name: model_name
in: path
schema: ModelName
- name: owner
in: path
schema:
type: string
example: webcompat
- name: repo
in: path
schema:
type: string
example: web-bugs
- name: issue_num
in: path
schema:
type: integer
example: 123456
responses:
200:
description: A single issue prediction
content:
application/json:
schema: BugPrediction
202:
description: A temporary answer for the issue being processed
content:
application/json:
schema: NotAvailableYet
401:
description: API key is missing
content:
application/json:
schema: UnauthorizedError
"""
headers = request.headers
auth = headers.get(API_TOKEN)
if not auth:
return jsonify(UnauthorizedError().dump({})), 401
else:
LOGGER.info("Request with API TOKEN %r", auth)
if model_name not in MODELS_NAMES:
return jsonify({"error": f"Model {model_name} doesn't exist"}), 404
# Get the latest change date from GitHub for the issue
update_time = get_github_issues_update_time(owner, repo, [issue_num])
job = JobInfo(classify_issue, model_name, owner, repo, issue_num)
issue_change_time = update_time.get(issue_num)
if issue_change_time and is_prediction_invalidated(job, issue_change_time):
clean_prediction_cache(job)
status_code = 200
data = get_result(job)
if not data:
if not is_pending(job):
schedule_issue_classification(model_name, owner, repo, [issue_num])
status_code = 202
data = {"ready": False}
return compress_response(data, status_code)
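For reference, a 200 response body has the shape asserted in the tests below; a sketch with illustrative values, not part of this diff:

    # Example prediction payload for a two-class model such as needsdiagnosis:
    example_response = {
        "prob": [0.118, 0.881],  # per-class probabilities
        "index": 1,              # argmax over prob
        "class": 1,              # label decoded via the model's label encoder
        "extra_data": {},        # model-specific extra data
    }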
@application.route("/<model_name>/predict/batch", methods=["POST"])
@cross_origin()
def batch_prediction(model_name):

View file

@@ -15,7 +15,7 @@ import zstandard
from libmozdata.bugzilla import Bugzilla
from redis import Redis
-from bugbug import bugzilla, repository, test_scheduling
+from bugbug import bugzilla, github, repository, test_scheduling
from bugbug.model import Model
from bugbug.utils import get_hgmo_stack
from bugbug_http.readthrough_cache import ReadthroughTTLCache
@@ -26,6 +26,7 @@ LOGGER = logging.getLogger()
MODELS_NAMES = [
"defectenhancementtask",
"component",
"needsdiagnosis",
"regression",
"stepstoreproduce",
"spambug",
@@ -110,6 +111,64 @@ def classify_bug(model_name: str, bug_ids: Sequence[int], bugzilla_token: str) -
return "OK"
def classify_issue(
model_name: str, owner: str, repo: str, issue_nums: Sequence[int]
) -> str:
from bugbug_http.app import JobInfo
issue_ids_set = set(map(int, issue_nums))
issues = {
issue_num: github.fetch_issue_by_number(owner, repo, issue_num, True)
for issue_num in issue_nums
}
missing_issues = issue_ids_set.difference(issues.keys())
for issue_id in missing_issues:
job = JobInfo(classify_issue, model_name, owner, repo, issue_id)
# TODO: Find a better error format
setkey(job.result_key, orjson.dumps({"available": False}))
if not issues:
return "NOK"
model = MODEL_CACHE.get(model_name)
if not model:
LOGGER.info("Missing model %r, aborting", model_name)
return "NOK"
model_extra_data = model.get_extra_data()
# TODO: Classify could choke on a single issue, which could make the whole
# job fail. What should we do here?
probs = model.classify(list(issues.values()), True)
indexes = probs.argmax(axis=-1)
suggestions = model.le.inverse_transform(indexes)
probs_list = probs.tolist()
indexes_list = indexes.tolist()
suggestions_list = suggestions.tolist()
for i, issue_id in enumerate(issues.keys()):
data = {
"prob": probs_list[i],
"index": indexes_list[i],
"class": suggestions_list[i],
"extra_data": model_extra_data,
}
job = JobInfo(classify_issue, model_name, owner, repo, issue_id)
setkey(job.result_key, orjson.dumps(data), compress=True)
# Save the issue's last change time
setkey(job.change_time_key, issues[issue_id]["updated_at"].encode())
return "OK"
@lru_cache(maxsize=None)
def get_known_tasks() -> Tuple[str, ...]:
with open("known_tasks", "r") as f:

View file

@@ -6,6 +6,7 @@ services:
image: mozilla/bugbug-http-service
environment:
- BUGBUG_BUGZILLA_TOKEN
- BUGBUG_GITHUB_TOKEN
- REDIS_URL=redis://redis:6379/0
- PORT=8000
- PULSE_USER
@@ -26,6 +27,7 @@ services:
image: mozilla/bugbug-http-service-bg-worker
environment:
- BUGBUG_BUGZILLA_TOKEN
- BUGBUG_GITHUB_TOKEN
- REDIS_URL=redis://redis:6379/0
- BUGBUG_ALLOW_MISSING_MODELS
- BUGBUG_REPO_DIR

View file

@@ -61,6 +61,47 @@ def test_model_predict_id(client, jobs, add_result, responses):
assert retrieve_compressed_reponse(rv) == result
def test_model_predict_id_github(client, jobs, add_result, responses):
issue_id = "12345"
result = {
"prob": [0.11845558881759644, 0.8815444111824036],
"index": 1,
"class": 1,
"extra_data": {},
}
responses.add(
responses.GET,
f"https://api.github.com/repos/webcompat/web-bugs/issues/{issue_id}",
status=200,
json={"number": issue_id, "updated_at": time.time()},
)
def do_request():
return client.get(
"/needsdiagnosis/predict/github/webcompat/web-bugs/12345",
headers={API_TOKEN: "test"},
)
rv = do_request()
assert rv.status_code == 202
assert retrieve_compressed_reponse(rv) == {"ready": False}
# request still not ready
rv = do_request()
assert rv.status_code == 202
assert retrieve_compressed_reponse(rv) == {"ready": False}
assert len(jobs) == 1
# now it's ready
keys = next(iter(jobs.values()))
add_result(keys[0], result)
rv = do_request()
assert rv.status_code == 200
assert retrieve_compressed_reponse(rv) == result
def test_model_predict_batch(client, jobs, add_result, add_change_time, responses):
bug_ids = [123, 456]
result = {