From 627e182061a063f60c12265aeec3269eab56c0e4 Mon Sep 17 00:00:00 2001
From: Ksenia
Date: Tue, 5 Oct 2021 08:31:00 -0400
Subject: [PATCH] Add support for multiple DBs for github (#2465)

Fixes #2460

Co-authored-by: Marco Castelluccio
---
 bugbug/github.py                              | 205 +++++++++---------
 bugbug/model.py                               |  11 +-
 bugbug/models/needsdiagnosis.py               |   9 +-
 http_service/bugbug_http/models.py            |   5 +-
 infra/data-pipeline.yml                       |  12 +-
 scripts/github_issue_classifier.py            |   9 +-
 scripts/github_issue_retriever.py             | 105 +++++----
 tests/conftest.py                             |   4 +-
 ... => github_webcompat_web-bugs_issues.json} |   0
 tests/test_github.py                          |  37 +++-
 tests/test_github_issue_retriever.py          |   6 +-
 11 files changed, 228 insertions(+), 175 deletions(-)
 rename tests/fixtures/{github_issues.json => github_webcompat_web-bugs_issues.json} (100%)

diff --git a/bugbug/github.py b/bugbug/github.py
index 1c3dd614..c7d3e19b 100644
--- a/bugbug/github.py
+++ b/bugbug/github.py
@@ -14,141 +14,142 @@ from bugbug.utils import get_secret
 
 logger = logging.getLogger(__name__)
 
-GITHUB_ISSUES_DB = "data/github_issues.json"
-db.register(
-    GITHUB_ISSUES_DB,
-    "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_github_issues.latest/artifacts/public/github_issues.json.zst",
-    1,
-)
-
 IssueDict = NewType("IssueDict", dict)
 
+DB_VERSION = 1
+DB_URL = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_github_{}_{}_issues.latest/artifacts/public/github_{}_{}_issues.json.zst"
+
 PER_PAGE = 100
 # Rate limit period in seconds
 RATE_LIMIT_PERIOD = 900
 
 
-def get_issues() -> Iterator[IssueDict]:
-    yield from db.read(GITHUB_ISSUES_DB)
+class Github:
+    def __init__(
+        self, owner: str, repo: str, state: str = "all", retrieve_events: bool = False
+    ) -> None:
+        self.owner = owner
+        self.repo = repo
+        self.state = state
+        self.retrieve_events = retrieve_events
+        self.db_path = "data/github_{}_{}_issues.json".format(self.owner, self.repo)
 
+        if not db.is_registered(self.db_path):
+            db.register(
+                self.db_path,
+                DB_URL.format(self.owner, self.repo, self.owner, self.repo),
+                DB_VERSION,
+            )
 
-def delete_issues(match: Callable[[IssueDict], bool]) -> None:
-    db.delete(GITHUB_ISSUES_DB, match)
+    def get_issues(self) -> Iterator[IssueDict]:
+        yield from db.read(self.db_path)
 
+    def delete_issues(self, match: Callable[[IssueDict], bool]) -> None:
+        db.delete(self.db_path, match)
 
-@sleep_and_retry
-@limits(calls=1200, period=RATE_LIMIT_PERIOD)
-def api_limit():
-    # Allow a limited number of requests to account for rate limiting
-    pass
+    @sleep_and_retry
+    @limits(calls=1200, period=RATE_LIMIT_PERIOD)
+    def api_limit(self):
+        # Allow a limited number of requests to account for rate limiting
+        pass
 
+    def get_token(self) -> str:
+        return get_secret("GITHUB_TOKEN")
 
-def get_token() -> str:
-    return get_secret("GITHUB_TOKEN")
+    def fetch_events(self, events_url: str) -> list:
+        self.api_limit()
+        logger.info(f"Fetching {events_url}")
+        headers = {"Authorization": "token {}".format(self.get_token())}
+        response = requests.get(events_url, headers=headers)
+        response.raise_for_status()
+        events_raw = response.json()
+        return events_raw
 
-def fetch_events(events_url: str) -> list:
-    api_limit()
-    logger.info(f"Fetching {events_url}")
-    headers = {"Authorization": "token {}".format(get_token())}
-    response = requests.get(events_url, headers=headers)
-    response.raise_for_status()
-    events_raw = response.json()
-    return events_raw
+    def fetch_issues(
+        self, url: str, retrieve_events: bool, params: dict = None
+    ) -> Tuple[List[IssueDict], dict]:
+        self.api_limit()
+        headers = {"Authorization": "token {}".format(self.get_token())}
+        response = requests.get(url, params=params, headers=headers)
+        response.raise_for_status()
+        data = response.json()
 
+        # If only one issue is requested, add it to a list
+        if isinstance(data, dict):
+            data = [data]
 
-def fetch_issues(
-    url: str, retrieve_events: bool, params: dict = None
-) -> Tuple[List[IssueDict], dict]:
-    api_limit()
-    headers = {"Authorization": "token {}".format(get_token())}
-    response = requests.get(url, params=params, headers=headers)
-    response.raise_for_status()
-    data = response.json()
+        logger.info(f"Fetching {url}")
 
-    # If only one issue is requested, add it to a list
-    if isinstance(data, dict):
-        data = [data]
+        if retrieve_events:
+            for item in data:
+                events = self.fetch_events(item["events_url"])
+                item.update({"events": events})
 
-    logger.info(f"Fetching {url}")
+        return data, response.links
 
-    if retrieve_events:
-        for item in data:
-            events = fetch_events(item["events_url"])
-            item.update({"events": events})
+    def get_start_page(self) -> int:
+        # Determine next page to fetch based on number of downloaded issues
+        issues = self.get_issues()
+        count = sum(1 for _ in issues)
+        return int(count / PER_PAGE) + 1
 
-    return data, response.links
+    def fetch_issues_updated_since_timestamp(self, since: str) -> List[IssueDict]:
+        # Fetches changed and new issues since a specified timestamp
+        url = "https://api.github.com/repos/{}/{}/issues".format(self.owner, self.repo)
 
+        params = {"state": self.state, "since": since, "per_page": PER_PAGE, "page": 1}
 
-def get_start_page() -> int:
-    # Determine next page to fetch based on number of downloaded issues
-    issues = get_issues()
-    count = sum(1 for _ in issues)
-    return int(count / PER_PAGE) + 1
-
-
-def fetch_issues_updated_since_timestamp(
-    owner: str, repo: str, state: str, since: str, retrieve_events: bool = False
-) -> List[IssueDict]:
-    # Fetches changed and new issues since a specified timestamp
-    url = "https://api.github.com/repos/{}/{}/issues".format(owner, repo)
-
-    params = {"state": state, "since": since, "per_page": PER_PAGE, "page": 1}
-
-    data, response_links = fetch_issues(
-        url=url, retrieve_events=retrieve_events, params=params
-    )
-
-    # Fetch next page
-    while "next" in response_links.keys():
-        next_page_data, response_links = fetch_issues(
-            response_links["next"]["url"], retrieve_events
+        data, response_links = self.fetch_issues(
+            url=url, retrieve_events=self.retrieve_events, params=params
         )
-        data += next_page_data
 
-    logger.info("Done fetching updates")
+        # Fetch next page
+        while "next" in response_links.keys():
+            next_page_data, response_links = self.fetch_issues(
+                response_links["next"]["url"], self.retrieve_events
+            )
+            data += next_page_data
 
-    return data
+        logger.info("Done fetching updates")
 
+        return data
 
-def download_issues(
-    owner: str, repo: str, state: str, retrieve_events: bool = False
-) -> None:
-    # Fetches all issues sorted by date of creation in ascending order
-    url = "https://api.github.com/repos/{}/{}/issues".format(owner, repo)
-    start_page = get_start_page()
+    def download_issues(self) -> None:
+        # Fetches all issues sorted by date of creation in ascending order
+        url = "https://api.github.com/repos/{}/{}/issues".format(self.owner, self.repo)
+        start_page = self.get_start_page()
 
-    params = {
-        "state": state,
-        "sort": "created",
-        "direction": "asc",
-        "per_page": PER_PAGE,
-        "page": start_page,
-    }
+        params = {
+            "state": self.state,
+            "sort": "created",
+            "direction": "asc",
+            "per_page": PER_PAGE,
+            "page": start_page,
+        }
 
-    data, response_links = fetch_issues(
-        url=url, retrieve_events=retrieve_events, params=params
-    )
-
-    db.append(GITHUB_ISSUES_DB, data)
-    # Fetch next page
-    while "next" in response_links.keys():
-        next_page_data, response_links = fetch_issues(
-            response_links["next"]["url"], retrieve_events
+        data, response_links = self.fetch_issues(
+            url=url, retrieve_events=self.retrieve_events, params=params
        )
-        db.append(GITHUB_ISSUES_DB, next_page_data)
 
-    logger.info("Done downloading")
+        db.append(self.db_path, data)
+        # Fetch next page
+        while "next" in response_links.keys():
+            next_page_data, response_links = self.fetch_issues(
+                response_links["next"]["url"], self.retrieve_events
+            )
+            db.append(self.db_path, next_page_data)
 
+        logger.info("Done downloading")
 
-def fetch_issue_by_number(
-    owner: str, repo: str, issue_number: int, retrieve_events: bool = False
-) -> IssueDict:
-    # Fetches an issue by id
-    url = "https://api.github.com/repos/{}/{}/issues/{}".format(
-        owner, repo, issue_number
-    )
+    def fetch_issue_by_number(
+        self, owner: str, repo: str, issue_number: int, retrieve_events: bool = False
+    ) -> IssueDict:
+        # Fetches an issue by id
+        url = "https://api.github.com/repos/{}/{}/issues/{}".format(
+            owner, repo, issue_number
+        )
 
-    data = fetch_issues(url=url, retrieve_events=retrieve_events)
+        data = self.fetch_issues(url=url, retrieve_events=retrieve_events)
 
-    return data[0][0]
+        return data[0][0]
diff --git a/bugbug/model.py b/bugbug/model.py
index 8cae7954..f68af71d 100644
--- a/bugbug/model.py
+++ b/bugbug/model.py
@@ -23,7 +23,8 @@ from sklearn.metrics import precision_recall_fscore_support
 from sklearn.model_selection import cross_validate, train_test_split
 from tabulate import tabulate
 
-from bugbug import bugzilla, db, github, repository
+from bugbug import bugzilla, db, repository
+from bugbug.github import Github
 from bugbug.nlp import SpacyVectorizer
 from bugbug.utils import split_tuple_generator, to_array
 
@@ -754,12 +755,14 @@ class BugCoupleModel(Model):
 
 
 class IssueModel(Model):
-    def __init__(self, lemmatization=False):
+    def __init__(self, owner, repo, lemmatization=False):
         Model.__init__(self, lemmatization)
-        self.training_dbs = [github.GITHUB_ISSUES_DB]
+
+        self.github = Github(owner=owner, repo=repo)
+        self.training_dbs = [self.github.db_path]
 
     def items_gen(self, classes):
-        for issue in github.get_issues():
+        for issue in self.github.get_issues():
             issue_number = issue["number"]
             if issue_number not in classes:
                 continue
diff --git a/bugbug/models/needsdiagnosis.py b/bugbug/models/needsdiagnosis.py
index edd50769..71fe8b00 100644
--- a/bugbug/models/needsdiagnosis.py
+++ b/bugbug/models/needsdiagnosis.py
@@ -9,7 +9,7 @@ import xgboost
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 
-from bugbug import feature_cleanup, github, issue_features, utils
+from bugbug import feature_cleanup, issue_features, utils
 from bugbug.model import IssueModel
 
 logger = logging.getLogger(__name__)
@@ -17,7 +17,9 @@ logger = logging.getLogger(__name__)
 
 class NeedsDiagnosisModel(IssueModel):
     def __init__(self, lemmatization=False):
-        IssueModel.__init__(self, lemmatization)
+        IssueModel.__init__(
+            self, owner="webcompat", repo="web-bugs", lemmatization=lemmatization
+        )
 
         self.calculate_importance = False
 
@@ -59,10 +61,11 @@ class NeedsDiagnosisModel(IssueModel):
     def get_labels(self):
         classes = {}
 
-        for issue in github.get_issues():
+        for issue in self.github.get_issues():
             # Skip issues with empty title or body
             if issue["title"] is None or issue["body"] is None:
                 continue
 
+            # Skip issues that are not moderated yet as they don't have a meaningful title or body
             if issue["title"] == "In the moderation queue.":
                 continue
diff --git a/http_service/bugbug_http/models.py b/http_service/bugbug_http/models.py
index b46de8f9..32f850f3 100644
--- a/http_service/bugbug_http/models.py
+++ b/http_service/bugbug_http/models.py
@@ -14,7 +14,8 @@ import requests
 import zstandard
 from redis import Redis
 
-from bugbug import bugzilla, github, repository, test_scheduling
+from bugbug import bugzilla, repository, test_scheduling
+from bugbug.github import Github
 from bugbug.model import Model
 from bugbug.utils import get_hgmo_stack
 from bugbug_http.readthrough_cache import ReadthroughTTLCache
@@ -113,6 +114,8 @@ def classify_issue(
 ) -> str:
     from bugbug_http.app import JobInfo
 
+    github = Github(owner=owner, repo=repo)
+
     issue_ids_set = set(map(int, issue_nums))
 
     issues = {
diff --git a/infra/data-pipeline.yml b/infra/data-pipeline.yml
index 1e773445..e439a7c2 100644
--- a/infra/data-pipeline.yml
+++ b/infra/data-pipeline.yml
@@ -249,11 +249,11 @@ tasks:
           - --retrieve-private
         artifacts:
-          public/github_issues.json.zst:
-            path: /data/github_issues.json.zst
+          public/github_webcompat_web-bugs_issues.json.zst:
+            path: /data/github_webcompat_web-bugs_issues.json.zst
             type: file
-          public/github_issues.json.version:
-            path: /data/github_issues.json.version
+          public/github_webcompat_web-bugs_issues.json.version:
+            path: /data/github_webcompat_web-bugs_issues.json.version
             type: file
       features:
@@ -263,8 +263,8 @@ tasks:
       routes:
         - notify.email.release-mgmt-analysis@mozilla.com.on-failed
        - notify.irc-channel.#bugbug.on-failed
-        - index.project.bugbug.data_github_issues.${version}
-        - index.project.bugbug.data_github_issues.latest
+        - index.project.bugbug.data_github_webcompat_web-bugs_issues.${version}
+        - index.project.bugbug.data_github_webcompat_web-bugs_issues.latest
      metadata:
        name: bugbug webcompat issues retrieval
        description: bugbug webcompat issues retrieval
diff --git a/scripts/github_issue_classifier.py b/scripts/github_issue_classifier.py
index 2e6ce75c..10a49d0b 100644
--- a/scripts/github_issue_classifier.py
+++ b/scripts/github_issue_classifier.py
@@ -7,7 +7,8 @@ from logging import INFO, basicConfig, getLogger
 import numpy as np
 import requests
 
-from bugbug import db, github
+from bugbug import db
+from bugbug.github import Github
 from bugbug.models import get_model_class
 from bugbug.utils import download_model
 
@@ -34,13 +35,17 @@ def classify_issues(
     model_class = get_model_class(model_name)
     model = model_class.load(model_file_name)
 
+    github = Github(
+        owner=owner, repo=repo, state="all", retrieve_events=retrieve_events
+    )
+
     if issue_number:
         issues = iter(
             [github.fetch_issue_by_number(owner, repo, issue_number, retrieve_events)]
         )
         assert issues, f"An issue with a number of {issue_number} was not found"
     else:
-        assert db.download(github.GITHUB_ISSUES_DB)
+        assert db.download(github.db_path)
         issues = github.get_issues()
 
     for issue in issues:
diff --git a/scripts/github_issue_retriever.py b/scripts/github_issue_retriever.py
index 99ddaf5e..57b859dd 100644
--- a/scripts/github_issue_retriever.py
+++ b/scripts/github_issue_retriever.py
@@ -7,53 +7,66 @@ import argparse
 from logging import getLogger
 from typing import List, Tuple
 
-from bugbug import db, github
-from bugbug.github import IssueDict
+from bugbug import db
+from bugbug.github import Github, IssueDict
 from bugbug.utils import extract_private, zstd_compress
 
 logger = getLogger(__name__)
 
 
-def replace_with_private(original_data: List[IssueDict]) -> Tuple[List[IssueDict], set]:
-    """Replace title and body of automatically closed public issues.
-
-    Replace them with title and body of a corresponding private issue
-    to account for moderation workflow in webcompat repository
-    """
-    updated_ids = set()
-    updated_issues = []
-    for item in original_data:
-        if item["title"] == "Issue closed.":
-            extracted = extract_private(item["body"])
-            if extracted is None:
-                continue
-
-            owner, repo, issue_number = extracted
-            private_issue = github.fetch_issue_by_number(owner, repo, issue_number)
-            if private_issue:
-                item["title"] = private_issue["title"]
-                item["body"] = private_issue["body"]
-                updated_ids.add(item["id"])
-                updated_issues.append(item)
-
-    return updated_issues, updated_ids
-
-
 class Retriever(object):
-    def retrieve_issues(
+    def __init__(
         self,
         owner: str,
         repo: str,
         state: str,
         retrieve_events: bool,
         retrieve_private: bool,
-    ) -> None:
+    ):
+        self.owner = owner
+        self.repo = repo
+        self.state = state
+        self.retrieve_events = retrieve_events
+        self.retrieve_private = retrieve_private
+        self.github = Github(
+            owner=owner, repo=repo, state=state, retrieve_events=retrieve_events
+        )
+
+    def replace_with_private(
+        self, original_data: List[IssueDict]
+    ) -> Tuple[List[IssueDict], set]:
+        """Replace title and body of automatically closed public issues.
+
+        Replace them with title and body of a corresponding private issue
+        to account for moderation workflow in webcompat repository
+        """
+        updated_ids = set()
+        updated_issues = []
+        for item in original_data:
+            if item["title"] == "Issue closed.":
+                extracted = extract_private(item["body"])
+                if extracted is None:
+                    continue
+
+                owner, repo, issue_number = extracted
+                private_issue = self.github.fetch_issue_by_number(
+                    owner, repo, issue_number
+                )
+                if private_issue:
+                    item["title"] = private_issue["title"]
+                    item["body"] = private_issue["body"]
+                    updated_ids.add(item["id"])
+                    updated_issues.append(item)
+
+        return updated_issues, updated_ids
+
+    def retrieve_issues(self) -> None:
         last_modified = None
-        db.download(github.GITHUB_ISSUES_DB)
+        db.download(self.github.db_path)
 
         try:
-            last_modified = db.last_modified(github.GITHUB_ISSUES_DB)
+            last_modified = db.last_modified(self.github.db_path)
         except Exception:
             pass
 
@@ -61,44 +74,44 @@ class Retriever(object):
             logger.info(
                 f"Retrieving issues modified or created since the last run on {last_modified.isoformat()}"
             )
-            data = github.fetch_issues_updated_since_timestamp(
-                owner, repo, state, last_modified.isoformat(), retrieve_events
+            data = self.github.fetch_issues_updated_since_timestamp(
+                last_modified.isoformat()
             )
 
-            if retrieve_private:
+            if self.retrieve_private:
                 logger.info(
                     "Replacing contents of auto closed public issues with private issues content"
                 )
-                replace_with_private(data)
+                self.replace_with_private(data)
 
             updated_ids = set(issue["id"] for issue in data)
             logger.info(
                 "Deleting issues that were changed since the last run and saving updates"
             )
-            github.delete_issues(lambda issue: issue["id"] in updated_ids)
+            self.github.delete_issues(lambda issue: issue["id"] in updated_ids)
 
-            db.append(github.GITHUB_ISSUES_DB, data)
+            db.append(self.github.db_path, data)
 
             logger.info("Updating finished")
         else:
             logger.info("Retrieving all issues since last_modified is not available")
-            github.download_issues(owner, repo, state, retrieve_events)
+            self.github.download_issues()
 
-            if retrieve_private:
+            if self.retrieve_private:
                 logger.info(
                     "Replacing contents of auto closed public issues with private issues content"
                 )
-                all_issues = list(github.get_issues())
-                updated_issues, updated_ids = replace_with_private(all_issues)
+                all_issues = list(self.github.get_issues())
+                updated_issues, updated_ids = self.replace_with_private(all_issues)
                 logger.info(
                     "Deleting public issues that were updated and saving updates"
                 )
-                github.delete_issues(lambda issue: issue["id"] in updated_ids)
-                db.append(github.GITHUB_ISSUES_DB, updated_issues)
+                self.github.delete_issues(lambda issue: issue["id"] in updated_ids)
+                db.append(self.github.db_path, updated_issues)
 
-        zstd_compress(github.GITHUB_ISSUES_DB)
+        zstd_compress(self.github.db_path)
 
 
 def main() -> None:
@@ -136,10 +149,10 @@ def main() -> None:
     # Parse args to show the help if `--help` is passed
     args = parser.parse_args()
 
-    retriever = Retriever()
-    retriever.retrieve_issues(
+    retriever = Retriever(
         args.owner, args.repo, args.state, args.retrieve_events, args.retrieve_private
     )
+    retriever.retrieve_issues()
 
 
 if __name__ == "__main__":
diff --git a/tests/conftest.py b/tests/conftest.py
index e5c64598..803deda0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -9,7 +9,7 @@ import shutil
 import pytest
 import zstandard
 
-from bugbug import bugzilla, github, repository
+from bugbug import bugzilla, repository
 
 FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
 
@@ -21,7 +21,7 @@ def mock_data(tmp_path):
     DBs = [
         os.path.basename(bugzilla.BUGS_DB),
         os.path.basename(repository.COMMITS_DB),
-        os.path.basename(github.GITHUB_ISSUES_DB),
+        os.path.basename("data/github_webcompat_web-bugs_issues.json"),
     ]
 
     for f in DBs:
diff --git a/tests/fixtures/github_issues.json b/tests/fixtures/github_webcompat_web-bugs_issues.json
similarity index 100%
rename from tests/fixtures/github_issues.json
rename to tests/fixtures/github_webcompat_web-bugs_issues.json
diff --git a/tests/test_github.py b/tests/test_github.py
index 67b46972..b1021efb 100644
--- a/tests/test_github.py
+++ b/tests/test_github.py
@@ -7,9 +7,10 @@ from unittest import mock
 
 import responses
 
-from bugbug import github
+from bugbug.github import Github
 
-github.get_token = mock.Mock(return_value="mocked_token")
+github = Github(owner="webcompat", repo="web-bugs")
+github.get_token = mock.Mock(return_value="mocked_token")  # type: ignore
 
 TEST_URL = "https://api.github.com/repos/webcompat/web-bugs/issues"
 TEST_EVENTS_URL = "https://api.github.com/repos/webcompat/web-bugs/issues/1/events"
@@ -84,10 +85,35 @@ def test_download_issues() -> None:
         responses.GET, "https://api.github.com/test&page=3", json=expected, status=200
     )
 
-    github.download_issues("webcompat", "web-bugs", "all")
+    github.download_issues()
+
+
+def test_download_issues_with_events() -> None:
+    github.retrieve_events = True
+    expected = [{"issue_id": "1", "events_url": TEST_EVENTS_URL}]
+    expected_events = [{"event_id": "1"}]
+    next_url_headers = {"link": "<https://api.github.com/test&page=3>; rel='next'"}
+
+    # Make sure required requests are made as long as next link is present in the header
+    responses.add(responses.GET, TEST_URL, json=expected, status=200, headers=HEADERS)
+    responses.add(
+        responses.GET,
+        "https://api.github.com/test&page=2",
+        json=expected,
+        status=200,
+        headers=next_url_headers,
+    )
+    responses.add(
+        responses.GET, "https://api.github.com/test&page=3", json=expected, status=200
+    )
+    # Mock events request
+    responses.add(responses.GET, TEST_EVENTS_URL, json=expected_events, status=200)
+
+    github.download_issues()
 
 
 def test_download_issues_updated_since_timestamp() -> None:
+    github.retrieve_events = False
     first_page = [
         {"id": 30515129, "issue_id": "1"},
         {"id": 30536238, "issue_id": "2"},
@@ -135,14 +161,13 @@ def test_download_issues_updated_since_timestamp() -> None:
 
     result = first_page + second_page + third_page
 
-    data = github.fetch_issues_updated_since_timestamp(
-        "webcompat", "web-bugs", "all", "2021-04-03T20:14:04+00:00"
-    )
+    data = github.fetch_issues_updated_since_timestamp("2021-04-03T20:14:04+00:00")
 
     assert data == result
 
 
 def test_fetch_issue_by_number() -> None:
+    github.retrieve_events = False
     expected = [
         {"issue_id": "1", "events_url": TEST_EVENTS_URL, "labels": [{"name": "test"}]}
     ]
diff --git a/tests/test_github_issue_retriever.py b/tests/test_github_issue_retriever.py
index d8a61a38..18db722c 100644
--- a/tests/test_github_issue_retriever.py
+++ b/tests/test_github_issue_retriever.py
@@ -7,11 +7,11 @@ from unittest import mock
 
 import responses
 
-from bugbug import github
 from bugbug.github import IssueDict
-from scripts import github_issue_retriever
+from scripts.github_issue_retriever import Retriever
 
-github.get_token = mock.Mock(return_value="mocked_token")
+github_issue_retriever = Retriever("webcompat", "web-bugs", "all", False, True)
+github_issue_retriever.github.get_token = mock.Mock(return_value="mocked_token")  # type: ignore
 
 PUBLIC_BODY = """

Thanks for the report. We have closed this issue\n