Add support for GitHub issue retrieval (#2242)

First step for #259
Ksenia 2021-03-24 03:46:37 -04:00 committed by GitHub
Parent 9da5a4faa5
Commit e4db6b219f
No key found matching this signature
GPG Key ID: 4AEE18F83AFDEB23
8 changed files: 357 additions and 3 deletions


@@ -3,4 +3,4 @@ multi_line_output=3
include_trailing_comma=True
line_length=88
known_first_party = bugbug,bugbug_http
-known_third_party = _pytest,apispec,apispec_webframeworks,boto3,cerberus,dateutil,flask,flask_cors,hglib,hypothesis,igraph,imblearn,jsone,jsonschema,kombu,libmozdata,lmdb,marshmallow,matplotlib,microannotate,mozci,numpy,orjson,ortools,pandas,pkg_resources,psutil,py,pyemd,pytest,redis,requests,responses,rq,rs_parsepatch,scipy,sentry_sdk,setuptools,shap,sklearn,tabulate,taskcluster,tenacity,tqdm,xgboost,yaml,zstandard
+known_third_party = _pytest,apispec,apispec_webframeworks,boto3,cerberus,dateutil,flask,flask_cors,hglib,hypothesis,igraph,imblearn,jsone,jsonschema,kombu,libmozdata,lmdb,marshmallow,matplotlib,microannotate,mozci,numpy,orjson,ortools,pandas,pkg_resources,psutil,py,pyemd,pytest,ratelimit,redis,requests,responses,rq,rs_parsepatch,scipy,sentry_sdk,setuptools,shap,sklearn,tabulate,taskcluster,tenacity,tqdm,xgboost,yaml,zstandard

bugbug/github.py (new file, 108 lines added)

@@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import logging
from typing import Iterator, List, NewType, Optional, Tuple

import requests
from ratelimit import limits, sleep_and_retry

from bugbug import db
from bugbug.utils import get_secret

logger = logging.getLogger(__name__)

GITHUB_ISSUES_DB = "data/github_issues.json"
db.register(
    GITHUB_ISSUES_DB,
    "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_github_issues.latest/artifacts/public/github_issues.json.zst",
    1,
)

IssueDict = NewType("IssueDict", dict)

PER_PAGE = 100
# Rate limit period in seconds
RATE_LIMIT_PERIOD = 900


def get_issues() -> Iterator[IssueDict]:
    yield from db.read(GITHUB_ISSUES_DB)


@sleep_and_retry
@limits(calls=1200, period=RATE_LIMIT_PERIOD)
def api_limit():
    # Allow a limited number of requests to account for rate limiting
    pass


def get_token() -> str:
    return get_secret("GITHUB_TOKEN")


def fetch_events(events_url: str) -> List[dict]:
    api_limit()
    logger.info(f"Fetching {events_url}")
    headers = {"Authorization": "token {}".format(get_token())}
    response = requests.get(events_url, headers=headers)
    response.raise_for_status()
    events_raw = response.json()
    return events_raw


def fetch_issues(
    url: str, retrieve_events: bool, params: Optional[dict] = None
) -> Tuple[List[IssueDict], dict]:
    api_limit()
    headers = {"Authorization": "token {}".format(get_token())}
    response = requests.get(url, params=params, headers=headers)
    response.raise_for_status()
    data = response.json()
    logger.info(f"Fetching {url}")

    if retrieve_events:
        for item in data:
            events = fetch_events(item["events_url"])
            item.update({"events": events})

    return data, response.links


def get_start_page() -> int:
    # Determine the next page to fetch based on the number of already downloaded issues
    issues = get_issues()
    count = sum(1 for _ in issues)
    return int(count / PER_PAGE) + 1


def download_issues(
    owner: str, repo: str, state: str, retrieve_events: bool = False
) -> None:
    url = "https://api.github.com/repos/{}/{}/issues".format(owner, repo)
    start_page = get_start_page()

    params = {
        "state": state,
        "sort": "created",
        "direction": "asc",
        "per_page": PER_PAGE,
        "page": start_page,
    }

    data, response_links = fetch_issues(
        url=url, retrieve_events=retrieve_events, params=params
    )
    db.append(GITHUB_ISSUES_DB, data)

    # Fetch the next page as long as a "next" link is present
    while "next" in response_links.keys():
        next_page_data, response_links = fetch_issues(
            response_links["next"]["url"], retrieve_events
        )
        db.append(GITHUB_ISSUES_DB, next_page_data)

    logger.info("Done fetching")


@@ -16,6 +16,7 @@ pydriller==1.12
pyOpenSSL>=0.14 # Could not find a version that satisfies the requirement pyOpenSSL>=0.14; extra == "security" (from requests[security]>=2.7.0->libmozdata==0.1.43)
python-dateutil==2.8.1
python-hglib==2.6.2
+ratelimit==2.2.1
requests==2.25.1
rs_parsepatch==0.3.3
scikit-learn==0.24.1


@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import argparse

from bugbug import db, github
from bugbug.utils import zstd_compress


class Retriever(object):
    def retrieve_issues(
        self, owner: str, repo: str, state: str, retrieve_events: bool
    ) -> None:
        db.download(github.GITHUB_ISSUES_DB)
        github.download_issues(owner, repo, state, retrieve_events)

        zstd_compress(github.GITHUB_ISSUES_DB)


def main() -> None:
    description = "Retrieve GitHub issues"
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument(
        "--owner",
        help="GitHub repository owner.",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--repo",
        help="GitHub repository name.",
        type=str,
        required=True,
    )
    parser.add_argument(
        "--state",
        type=str,
        default="all",
        help="Indicates the state of the issues to return. Can be either open, closed, or all.",
    )
    parser.add_argument(
        "--retrieve-events",
        action="store_true",
        help="Whether to retrieve events for each issue.",
    )

    # Parse args to show the help if `--help` is passed
    args = parser.parse_args()

    retriever = Retriever()
    retriever.retrieve_issues(args.owner, args.repo, args.state, args.retrieve_events)


if __name__ == "__main__":
    main()
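
For context, a hedged sketch of using the retriever programmatically rather than through the console script registered in setup.py below; it assumes the package is installed from the repository root and that a GITHUB_TOKEN secret is available to bugbug.utils.get_secret. The webcompat/web-bugs repository is the one exercised by the tests further down.

# Programmatic equivalent of:
#   bugbug-data-github --owner webcompat --repo web-bugs --state all
from scripts.github_issue_retriever import Retriever

retriever = Retriever()
retriever.retrieve_issues("webcompat", "web-bugs", "all", retrieve_events=False)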


@@ -61,6 +61,7 @@ setup(
            "bugbug-testing-policy-stats = scripts.testing_policy_stats:main",
            "bugbug-generate-landings-risk-report = scripts.generate_landings_risk_report:main",
            "bugbug-shadow-scheduler-stats = scripts.shadow_scheduler_stats:main",
+           "bugbug-data-github = scripts.github_issue_retriever:main",
        ]
    },
    classifiers=[


@@ -9,7 +9,7 @@ import shutil
import pytest
import zstandard

-from bugbug import bugzilla, repository
+from bugbug import bugzilla, github, repository

FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")

@@ -18,7 +18,11 @@ FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
def mock_data(tmp_path):
    os.mkdir(tmp_path / "data")

-    DBs = [os.path.basename(bugzilla.BUGS_DB), os.path.basename(repository.COMMITS_DB)]
+    DBs = [
+        os.path.basename(bugzilla.BUGS_DB),
+        os.path.basename(repository.COMMITS_DB),
+        os.path.basename(github.GITHUB_ISSUES_DB),
+    ]

    for f in DBs:
        shutil.copyfile(os.path.join(FIXTURES_DIR, f), tmp_path / "data" / f)

tests/fixtures/github_issues.json (vendored, new file, 100 lines added)

File diff suppressed because one or more lines are too long

tests/test_github.py (new file, 84 lines added)

@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

from unittest import mock

import responses

from bugbug import github

github.get_token = mock.Mock(return_value="mocked_token")

TEST_URL = "https://api.github.com/repos/webcompat/web-bugs/issues"
TEST_EVENTS_URL = "https://api.github.com/repos/webcompat/web-bugs/issues/1/events"
HEADERS = {"link": "<https://api.github.com/test&page=2>; rel='next'"}


def test_get_start_page():
    assert github.get_start_page() == 2


@responses.activate
def test_fetch_issues():
    expected = [{"issue_id": "1", "events_url": TEST_EVENTS_URL}]

    expected_headers = {
        "next": {"url": "https://api.github.com/test&page=2", "rel": "next"}
    }

    # Mock the main request
    responses.add(responses.GET, TEST_URL, json=expected, status=200, headers=HEADERS)

    # Assert that the response without events has the expected format
    response = github.fetch_issues(TEST_URL, False)
    assert response == (expected, expected_headers)


@responses.activate
def test_fetch_issues_with_events():
    expected = [{"issue_id": "1", "events_url": TEST_EVENTS_URL}]
    expected_events = [{"event_id": "1"}]

    expected_headers = {
        "next": {"url": "https://api.github.com/test&page=2", "rel": "next"}
    }

    # Mock the main request
    responses.add(responses.GET, TEST_URL, json=expected, status=200, headers=HEADERS)
    # Mock the events request
    responses.add(responses.GET, TEST_EVENTS_URL, json=expected_events, status=200)

    # Assert that the response with events has the expected format
    response_with_events = github.fetch_issues(TEST_URL, True)
    expected_with_events = expected
    expected_with_events[0]["events"] = expected_events

    assert response_with_events == (expected_with_events, expected_headers)


@responses.activate
def test_fetch_issues_empty_header():
    expected = [{"issue_id": "1", "events_url": TEST_EVENTS_URL}]

    # Mock the main request with no link headers
    responses.add(responses.GET, TEST_URL, json=expected, status=200)

    response_no_headers = github.fetch_issues(TEST_URL, False)
    assert response_no_headers == (expected, {})


@responses.activate
def test_download_issues():
    expected = [{"issue_id": "1", "events_url": TEST_EVENTS_URL}]
    next_url_headers = {"link": "<https://api.github.com/test&page=3>; rel='next'"}

    # Make sure the required requests are made as long as a next link is present in the header
    responses.add(responses.GET, TEST_URL, json=expected, status=200, headers=HEADERS)
    responses.add(
        responses.GET,
        "https://api.github.com/test&page=2",
        json=expected,
        status=200,
        headers=next_url_headers,
    )
    responses.add(
        responses.GET, "https://api.github.com/test&page=3", json=expected, status=200
    )

    github.download_issues("webcompat", "web-bugs", "all")
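
The expected_headers dictionaries in these tests mirror what requests itself produces: response.links is just the Link header parsed and keyed by rel, which is also what download_issues walks for pagination. A small standalone sketch of that parsing, using requests.utils.parse_header_links, the helper behind response.links:

from requests.utils import parse_header_links

# The same Link header value the tests register via HEADERS
link_header = "<https://api.github.com/test&page=2>; rel='next'"

# parse_header_links returns a list like [{'url': ..., 'rel': 'next'}]
parsed = parse_header_links(link_header)

# response.links keys that list by rel, which is what the pagination loop checks
links = {link["rel"]: link for link in parsed}
assert links == {"next": {"url": "https://api.github.com/test&page=2", "rel": "next"}}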