Mirror of https://github.com/mozilla/bugbug.git
Parent 9da5a4faa5
Commit e4db6b219f
@@ -3,4 +3,4 @@ multi_line_output=3
 include_trailing_comma=True
 line_length=88
 known_first_party = bugbug,bugbug_http
-known_third_party = _pytest,apispec,apispec_webframeworks,boto3,cerberus,dateutil,flask,flask_cors,hglib,hypothesis,igraph,imblearn,jsone,jsonschema,kombu,libmozdata,lmdb,marshmallow,matplotlib,microannotate,mozci,numpy,orjson,ortools,pandas,pkg_resources,psutil,py,pyemd,pytest,redis,requests,responses,rq,rs_parsepatch,scipy,sentry_sdk,setuptools,shap,sklearn,tabulate,taskcluster,tenacity,tqdm,xgboost,yaml,zstandard
+known_third_party = _pytest,apispec,apispec_webframeworks,boto3,cerberus,dateutil,flask,flask_cors,hglib,hypothesis,igraph,imblearn,jsone,jsonschema,kombu,libmozdata,lmdb,marshmallow,matplotlib,microannotate,mozci,numpy,orjson,ortools,pandas,pkg_resources,psutil,py,pyemd,pytest,ratelimit,redis,requests,responses,rq,rs_parsepatch,scipy,sentry_sdk,setuptools,shap,sklearn,tabulate,taskcluster,tenacity,tqdm,xgboost,yaml,zstandard
@@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import logging
from typing import Iterator, List, NewType, Tuple

import requests
from ratelimit import limits, sleep_and_retry

from bugbug import db
from bugbug.utils import get_secret

logger = logging.getLogger(__name__)

GITHUB_ISSUES_DB = "data/github_issues.json"
db.register(
    GITHUB_ISSUES_DB,
    "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_github_issues.latest/artifacts/public/github_issues.json.zst",
    1,
)

IssueDict = NewType("IssueDict", dict)

PER_PAGE = 100
# Rate limit period in seconds
RATE_LIMIT_PERIOD = 900


def get_issues() -> Iterator[IssueDict]:
    yield from db.read(GITHUB_ISSUES_DB)


@sleep_and_retry
@limits(calls=1200, period=RATE_LIMIT_PERIOD)
def api_limit():
    # Allow a limited number of requests to account for rate limiting
    pass


def get_token() -> str:
    return get_secret("GITHUB_TOKEN")


def fetch_events(events_url: str) -> list:
    api_limit()
    logger.info(f"Fetching {events_url}")
    headers = {"Authorization": "token {}".format(get_token())}
    response = requests.get(events_url, headers=headers)
    response.raise_for_status()
    events_raw = response.json()
    return events_raw


def fetch_issues(
    url: str, retrieve_events: bool, params: dict = None
) -> Tuple[List[IssueDict], dict]:
    api_limit()
    headers = {"Authorization": "token {}".format(get_token())}
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()
    data = response.json()

    logger.info(f"Fetching {url}")

    if retrieve_events:
        for item in data:
            events = fetch_events(item["events_url"])
            item.update({"events": events})

    return data, response.links


def get_start_page() -> int:
    # Determine next page to fetch based on number of downloaded issues
    issues = get_issues()
    count = sum(1 for _ in issues)
    return int(count / PER_PAGE) + 1


def download_issues(
    owner: str, repo: str, state: str, retrieve_events: bool = False
) -> None:
    url = "https://api.github.com/repos/{}/{}/issues".format(owner, repo)
    start_page = get_start_page()

    params = {
        "state": state,
        "sort": "created",
        "direction": "asc",
        "per_page": PER_PAGE,
        "page": start_page,
    }

    data, response_links = fetch_issues(
        url=url, retrieve_events=retrieve_events, params=params
    )

    db.append(GITHUB_ISSUES_DB, data)
    # Fetch next page
    while "next" in response_links.keys():
        next_page_data, response_links = fetch_issues(
            response_links["next"]["url"], retrieve_events
        )
        db.append(GITHUB_ISSUES_DB, next_page_data)

    logger.info("Done fetching")
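The module above throttles itself to 1200 calls per 900-second window (4800 requests per hour), which stays under GitHub's documented limit of 5000 requests per hour for authenticated REST clients. As a rough sketch of how the module is meant to be driven (illustration only, not part of this commit; it assumes the GITHUB_TOKEN secret is configured so that get_secret("GITHUB_TOKEN") succeeds, and that the registered DB has been downloaded first, as the retriever script below does via db.download):

# Illustrative sketch, not part of this commit.
from bugbug import db, github

# Make sure the local github_issues DB exists before appending to it.
db.download(github.GITHUB_ISSUES_DB)

# Fetch all issues (and their event streams) of the example repository used
# in the tests, resuming from the page implied by the number of stored issues.
github.download_issues("webcompat", "web-bugs", state="all", retrieve_events=True)

# Iterate over the locally stored issues.
count = sum(1 for _ in github.get_issues())
print(f"{count} issues stored in {github.GITHUB_ISSUES_DB}")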
@@ -16,6 +16,7 @@ pydriller==1.12
 pyOpenSSL>=0.14 # Could not find a version that satisfies the requirement pyOpenSSL>=0.14; extra == "security" (from requests[security]>=2.7.0->libmozdata==0.1.43)
 python-dateutil==2.8.1
 python-hglib==2.6.2
+ratelimit==2.2.1
 requests==2.25.1
 rs_parsepatch==0.3.3
 scikit-learn==0.24.1
@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import argparse

from bugbug import db, github
from bugbug.utils import zstd_compress


class Retriever(object):
    def retrieve_issues(
        self, owner: str, repo: str, state: str, retrieve_events: bool
    ) -> None:
        db.download(github.GITHUB_ISSUES_DB)
        github.download_issues(owner, repo, state, retrieve_events)
        zstd_compress(github.GITHUB_ISSUES_DB)


def main() -> None:
    description = "Retrieve GitHub issues"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        "--owner",
        help="GitHub repository owner.",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--repo",
        help="GitHub repository name.",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--state",
        type=str,
        default="all",
        help="Indicates the state of the issues to return. Can be either open, closed, or all",
    )
    parser.add_argument(
        "--retrieve-events",
        action="store_true",
        help="Whether to retrieve events for each issue.",
    )

    # Parse args to show the help if `--help` is passed
    args = parser.parse_args()

    retriever = Retriever()
    retriever.retrieve_issues(args.owner, args.repo, args.state, args.retrieve_events)


if __name__ == "__main__":
    main()
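For illustration only (not part of the commit), the same retrieval can also be triggered from Python instead of the command line; the example arguments mirror the repository used in the tests:

# Illustration only: programmatic use of the Retriever class defined above.
# Requires network access and a valid GITHUB_TOKEN secret, exactly like the
# command-line entry point.
from scripts.github_issue_retriever import Retriever

Retriever().retrieve_issues(
    owner="webcompat", repo="web-bugs", state="all", retrieve_events=True
)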
setup.py
@@ -61,6 +61,7 @@ setup(
             "bugbug-testing-policy-stats = scripts.testing_policy_stats:main",
             "bugbug-generate-landings-risk-report = scripts.generate_landings_risk_report:main",
             "bugbug-shadow-scheduler-stats = scripts.shadow_scheduler_stats:main",
+            "bugbug-data-github = scripts.github_issue_retriever:main",
         ]
     },
     classifiers=[
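With this entry point, installing the package exposes a bugbug-data-github console command that runs scripts.github_issue_retriever:main, so the retriever can be invoked as, for example, bugbug-data-github --owner webcompat --repo web-bugs --state all (example arguments mirroring the repository used in the tests below).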
@@ -9,7 +9,7 @@ import shutil
 import pytest
 import zstandard
 
-from bugbug import bugzilla, repository
+from bugbug import bugzilla, github, repository
 
 FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
 
@@ -18,7 +18,11 @@ FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
 def mock_data(tmp_path):
     os.mkdir(tmp_path / "data")
 
-    DBs = [os.path.basename(bugzilla.BUGS_DB), os.path.basename(repository.COMMITS_DB)]
+    DBs = [
+        os.path.basename(bugzilla.BUGS_DB),
+        os.path.basename(repository.COMMITS_DB),
+        os.path.basename(github.GITHUB_ISSUES_DB),
+    ]
 
     for f in DBs:
         shutil.copyfile(os.path.join(FIXTURES_DIR, f), tmp_path / "data" / f)
File diffs are hidden because one or more lines are too long.
@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

from unittest import mock

import responses

from bugbug import github

github.get_token = mock.Mock(return_value="mocked_token")

TEST_URL = "https://api.github.com/repos/webcompat/web-bugs/issues"
TEST_EVENTS_URL = "https://api.github.com/repos/webcompat/web-bugs/issues/1/events"
HEADERS = {"link": "<https://api.github.com/test&page=2>; rel='next'"}


def test_get_start_page():
    assert github.get_start_page() == 2


def test_fetch_issues():
    expected = [{"issue_id": "1", "events_url": TEST_EVENTS_URL}]
    expected_headers = {
        "next": {"url": "https://api.github.com/test&page=2", "rel": "next"}
    }

    # Mock main request
    responses.add(responses.GET, TEST_URL, json=expected, status=200, headers=HEADERS)

    # Assert that response without events has expected format
    response = github.fetch_issues(TEST_URL, False)
    assert response == (expected, expected_headers)


def test_fetch_issues_with_events():
    expected = [{"issue_id": "1", "events_url": TEST_EVENTS_URL}]
    expected_events = [{"event_id": "1"}]
    expected_headers = {
        "next": {"url": "https://api.github.com/test&page=2", "rel": "next"}
    }

    # Mock main request
    responses.add(responses.GET, TEST_URL, json=expected, status=200, headers=HEADERS)
    # Mock events request
    responses.add(responses.GET, TEST_EVENTS_URL, json=expected_events, status=200)

    # Assert that response with events has expected format
    response_with_events = github.fetch_issues(TEST_URL, True)
    expected_with_events = expected
    expected_with_events[0]["events"] = expected_events

    assert response_with_events == (expected_with_events, expected_headers)


def test_fetch_issues_empty_header():
    expected = [{"issue_id": "1", "events_url": TEST_EVENTS_URL}]

    # Mock main request with no headers
    responses.add(responses.GET, TEST_URL, json=expected, status=200)
    response_no_headers = github.fetch_issues(TEST_URL, False)

    assert response_no_headers == (expected, {})


def test_download_issues():
    expected = [{"issue_id": "1", "events_url": TEST_EVENTS_URL}]
    next_url_headers = {"link": "<https://api.github.com/test&page=3>; rel='next'"}

    # Make sure required requests are made as long as next link is present in the header
    responses.add(responses.GET, TEST_URL, json=expected, status=200, headers=HEADERS)
    responses.add(
        responses.GET,
        "https://api.github.com/test&page=2",
        json=expected,
        status=200,
        headers=next_url_headers,
    )
    responses.add(
        responses.GET, "https://api.github.com/test&page=3", json=expected, status=200
    )

    github.download_issues("webcompat", "web-bugs", "all")
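A note on the expected_headers values asserted above: fetch_issues returns response.links, which is requests' parsed form of the Link response header, so the raw header stored in HEADERS maps to a dict keyed by link relation. A minimal sketch of that parsing using requests' own helper (illustration only, not part of the commit):

# Illustration only: how requests turns the Link header used in HEADERS into
# the dict that github.fetch_issues returns as its second element.
from requests.utils import parse_header_links

links = parse_header_links("<https://api.github.com/test&page=2>; rel='next'")
print(links)  # [{'url': 'https://api.github.com/test&page=2', 'rel': 'next'}]
# requests.Response.links keys these entries by "rel", giving:
# {'next': {'url': 'https://api.github.com/test&page=2', 'rel': 'next'}}

Also note that responses.add only intercepts outgoing requests while the responses mock is active, for example under the @responses.activate decorator or between responses.start() and responses.stop().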