Add support for multiple DBs for github (#2465)

Fixes #2460

Co-authored-by: Marco Castelluccio <mcastelluccio@mozilla.com>
Author: Ksenia, 2021-10-05 08:31:00 -04:00 (committed by GitHub)
Parent: 819358b139
Commit: 627e182061
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 228 additions and 175 deletions
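
At a glance, the refactor replaces the module-level helpers keyed to a single GITHUB_ISSUES_DB with a Github class that registers one database per (owner, repo) pair. A minimal usage sketch, assuming only the signatures visible in this diff (webcompat/web-bugs are the values the rest of the patch uses):

    from bugbug.github import Github

    github = Github(owner="webcompat", repo="web-bugs", state="all", retrieve_events=False)
    print(github.db_path)  # data/github_webcompat_web-bugs_issues.json
    github.download_issues()  # network calls authenticate via the GITHUB_TOKEN secret
    for issue in github.get_issues():
        print(issue["number"])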

bugbug/github.py

@@ -14,141 +14,142 @@ from bugbug.utils import get_secret
 logger = logging.getLogger(__name__)
 
-GITHUB_ISSUES_DB = "data/github_issues.json"
-db.register(
-    GITHUB_ISSUES_DB,
-    "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_github_issues.latest/artifacts/public/github_issues.json.zst",
-    1,
-)
-
 IssueDict = NewType("IssueDict", dict)
 
+DB_VERSION = 1
+DB_URL = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_github_{}_{}_issues.latest/artifacts/public/github_{}_{}_issues.json.zst"
+
 PER_PAGE = 100
 # Rate limit period in seconds
 RATE_LIMIT_PERIOD = 900
 
 
-def get_issues() -> Iterator[IssueDict]:
-    yield from db.read(GITHUB_ISSUES_DB)
-
-
-def delete_issues(match: Callable[[IssueDict], bool]) -> None:
-    db.delete(GITHUB_ISSUES_DB, match)
-
-
-@sleep_and_retry
-@limits(calls=1200, period=RATE_LIMIT_PERIOD)
-def api_limit():
-    # Allow a limited number of requests to account for rate limiting
-    pass
-
-
-def get_token() -> str:
-    return get_secret("GITHUB_TOKEN")
-
-
-def fetch_events(events_url: str) -> list:
-    api_limit()
-    logger.info(f"Fetching {events_url}")
-    headers = {"Authorization": "token {}".format(get_token())}
-    response = requests.get(events_url, headers=headers)
-    response.raise_for_status()
-    events_raw = response.json()
-    return events_raw
-
-
-def fetch_issues(
-    url: str, retrieve_events: bool, params: dict = None
-) -> Tuple[List[IssueDict], dict]:
-    api_limit()
-    headers = {"Authorization": "token {}".format(get_token())}
-    response = requests.get(url, params=params, headers=headers)
-    response.raise_for_status()
-    data = response.json()
-
-    # If only one issue is requested, add it to a list
-    if isinstance(data, dict):
-        data = [data]
-
-    logger.info(f"Fetching {url}")
-
-    if retrieve_events:
-        for item in data:
-            events = fetch_events(item["events_url"])
-            item.update({"events": events})
-
-    return data, response.links
-
-
-def get_start_page() -> int:
-    # Determine next page to fetch based on number of downloaded issues
-    issues = get_issues()
-    count = sum(1 for _ in issues)
-    return int(count / PER_PAGE) + 1
-
-
-def fetch_issues_updated_since_timestamp(
-    owner: str, repo: str, state: str, since: str, retrieve_events: bool = False
-) -> List[IssueDict]:
-    # Fetches changed and new issues since a specified timestamp
-    url = "https://api.github.com/repos/{}/{}/issues".format(owner, repo)
-    params = {"state": state, "since": since, "per_page": PER_PAGE, "page": 1}
-    data, response_links = fetch_issues(
-        url=url, retrieve_events=retrieve_events, params=params
-    )
-    # Fetch next page
-    while "next" in response_links.keys():
-        next_page_data, response_links = fetch_issues(
-            response_links["next"]["url"], retrieve_events
-        )
-        data += next_page_data
-    logger.info("Done fetching updates")
-    return data
-
-
-def download_issues(
-    owner: str, repo: str, state: str, retrieve_events: bool = False
-) -> None:
-    # Fetches all issues sorted by date of creation in ascending order
-    url = "https://api.github.com/repos/{}/{}/issues".format(owner, repo)
-    start_page = get_start_page()
-    params = {
-        "state": state,
-        "sort": "created",
-        "direction": "asc",
-        "per_page": PER_PAGE,
-        "page": start_page,
-    }
-    data, response_links = fetch_issues(
-        url=url, retrieve_events=retrieve_events, params=params
-    )
-    db.append(GITHUB_ISSUES_DB, data)
-    # Fetch next page
-    while "next" in response_links.keys():
-        next_page_data, response_links = fetch_issues(
-            response_links["next"]["url"], retrieve_events
-        )
-        db.append(GITHUB_ISSUES_DB, next_page_data)
-    logger.info("Done downloading")
-
-
-def fetch_issue_by_number(
-    owner: str, repo: str, issue_number: int, retrieve_events: bool = False
-) -> IssueDict:
-    # Fetches an issue by id
-    url = "https://api.github.com/repos/{}/{}/issues/{}".format(
-        owner, repo, issue_number
-    )
-    data = fetch_issues(url=url, retrieve_events=retrieve_events)
-    return data[0][0]
+class Github:
+    def __init__(
+        self, owner: str, repo: str, state: str = "all", retrieve_events: bool = False
+    ) -> None:
+        self.owner = owner
+        self.repo = repo
+        self.state = state
+        self.retrieve_events = retrieve_events
+
+        self.db_path = "data/github_{}_{}_issues.json".format(self.owner, self.repo)
+
+        if not db.is_registered(self.db_path):
+            db.register(
+                self.db_path,
+                DB_URL.format(self.owner, self.repo, self.owner, self.repo),
+                DB_VERSION,
+            )
+
+    def get_issues(self) -> Iterator[IssueDict]:
+        yield from db.read(self.db_path)
+
+    def delete_issues(self, match: Callable[[IssueDict], bool]) -> None:
+        db.delete(self.db_path, match)
+
+    @sleep_and_retry
+    @limits(calls=1200, period=RATE_LIMIT_PERIOD)
+    def api_limit(self):
+        # Allow a limited number of requests to account for rate limiting
+        pass
+
+    def get_token(self) -> str:
+        return get_secret("GITHUB_TOKEN")
+
+    def fetch_events(self, events_url: str) -> list:
+        self.api_limit()
+        logger.info(f"Fetching {events_url}")
+        headers = {"Authorization": "token {}".format(self.get_token())}
+        response = requests.get(events_url, headers=headers)
+        response.raise_for_status()
+        events_raw = response.json()
+        return events_raw
+
+    def fetch_issues(
+        self, url: str, retrieve_events: bool, params: dict = None
+    ) -> Tuple[List[IssueDict], dict]:
+        self.api_limit()
+        headers = {"Authorization": "token {}".format(self.get_token())}
+        response = requests.get(url, params=params, headers=headers)
+        response.raise_for_status()
+        data = response.json()
+
+        # If only one issue is requested, add it to a list
+        if isinstance(data, dict):
+            data = [data]
+
+        logger.info(f"Fetching {url}")
+
+        if retrieve_events:
+            for item in data:
+                events = self.fetch_events(item["events_url"])
+                item.update({"events": events})
+
+        return data, response.links
+
+    def get_start_page(self) -> int:
+        # Determine next page to fetch based on number of downloaded issues
+        issues = self.get_issues()
+        count = sum(1 for _ in issues)
+        return int(count / PER_PAGE) + 1
+
+    def fetch_issues_updated_since_timestamp(self, since: str) -> List[IssueDict]:
+        # Fetches changed and new issues since a specified timestamp
+        url = "https://api.github.com/repos/{}/{}/issues".format(self.owner, self.repo)
+        params = {"state": self.state, "since": since, "per_page": PER_PAGE, "page": 1}
+        data, response_links = self.fetch_issues(
+            url=url, retrieve_events=self.retrieve_events, params=params
+        )
+        # Fetch next page
+        while "next" in response_links.keys():
+            next_page_data, response_links = self.fetch_issues(
+                response_links["next"]["url"], self.retrieve_events
+            )
+            data += next_page_data
+        logger.info("Done fetching updates")
+        return data
+
+    def download_issues(self) -> None:
+        # Fetches all issues sorted by date of creation in ascending order
+        url = "https://api.github.com/repos/{}/{}/issues".format(self.owner, self.repo)
+        start_page = self.get_start_page()
+        params = {
+            "state": self.state,
+            "sort": "created",
+            "direction": "asc",
+            "per_page": PER_PAGE,
+            "page": start_page,
+        }
+        data, response_links = self.fetch_issues(
+            url=url, retrieve_events=self.retrieve_events, params=params
+        )
+        db.append(self.db_path, data)
+        # Fetch next page
+        while "next" in response_links.keys():
+            next_page_data, response_links = self.fetch_issues(
+                response_links["next"]["url"], self.retrieve_events
+            )
+            db.append(self.db_path, next_page_data)
+        logger.info("Done downloading")
+
+    def fetch_issue_by_number(
+        self, owner: str, repo: str, issue_number: int, retrieve_events: bool = False
+    ) -> IssueDict:
+        # Fetches an issue by id
+        url = "https://api.github.com/repos/{}/{}/issues/{}".format(
+            owner, repo, issue_number
+        )
+        data = self.fetch_issues(url=url, retrieve_events=retrieve_events)
+        return data[0][0]
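
The resume logic in get_start_page deserves a worked example: it counts the issues already stored in the per-repo database and maps that count onto a page number. With PER_PAGE = 100 and 250 issues on disk, int(250 / 100) + 1 = 3, so download_issues resumes from page 3 instead of refetching everything from page 1.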

bugbug/model.py

@@ -23,7 +23,8 @@ from sklearn.metrics import precision_recall_fscore_support
 from sklearn.model_selection import cross_validate, train_test_split
 from tabulate import tabulate
 
-from bugbug import bugzilla, db, github, repository
+from bugbug import bugzilla, db, repository
+from bugbug.github import Github
 from bugbug.nlp import SpacyVectorizer
 from bugbug.utils import split_tuple_generator, to_array
 
@@ -754,12 +755,14 @@ class BugCoupleModel(Model):
 
 class IssueModel(Model):
-    def __init__(self, lemmatization=False):
+    def __init__(self, owner, repo, lemmatization=False):
         Model.__init__(self, lemmatization)
-        self.training_dbs = [github.GITHUB_ISSUES_DB]
+
+        self.github = Github(owner=owner, repo=repo)
+
+        self.training_dbs = [self.github.db_path]
 
     def items_gen(self, classes):
-        for issue in github.get_issues():
+        for issue in self.github.get_issues():
             issue_number = issue["number"]
             if issue_number not in classes:
                 continue

bugbug/models/needsdiagnosis.py

@@ -9,7 +9,7 @@ import xgboost
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 
-from bugbug import feature_cleanup, github, issue_features, utils
+from bugbug import feature_cleanup, issue_features, utils
 from bugbug.model import IssueModel
 
 logger = logging.getLogger(__name__)
 
@@ -17,7 +17,9 @@ logger = logging.getLogger(__name__)
 
 class NeedsDiagnosisModel(IssueModel):
     def __init__(self, lemmatization=False):
-        IssueModel.__init__(self, lemmatization)
+        IssueModel.__init__(
+            self, owner="webcompat", repo="web-bugs", lemmatization=lemmatization
+        )
 
         self.calculate_importance = False
 
@@ -59,10 +61,11 @@ class NeedsDiagnosisModel(IssueModel):
     def get_labels(self):
         classes = {}
 
-        for issue in github.get_issues():
+        for issue in self.github.get_issues():
             # Skip issues with empty title or body
             if issue["title"] is None or issue["body"] is None:
                 continue
 
             # Skip issues that are not moderated yet as they don't have a meaningful title or body
             if issue["title"] == "In the moderation queue.":
                 continue
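
NeedsDiagnosisModel now pins itself to a single repository instead of reading the shared database. A sketch of what instantiation implies, assuming the usual bugbug.models.needsdiagnosis module path (the db_path value follows the template in bugbug/github.py above):

    from bugbug.models.needsdiagnosis import NeedsDiagnosisModel

    model = NeedsDiagnosisModel()
    # IssueModel.__init__ created a Github instance bound to webcompat/web-bugs,
    # so training reads from the per-repo database:
    assert model.github.db_path == "data/github_webcompat_web-bugs_issues.json"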

bugbug_http/models.py

@@ -14,7 +14,8 @@ import requests
 import zstandard
 from redis import Redis
 
-from bugbug import bugzilla, github, repository, test_scheduling
+from bugbug import bugzilla, repository, test_scheduling
+from bugbug.github import Github
 from bugbug.model import Model
 from bugbug.utils import get_hgmo_stack
 from bugbug_http.readthrough_cache import ReadthroughTTLCache
 
@@ -113,6 +114,8 @@ def classify_issue(
 ) -> str:
     from bugbug_http.app import JobInfo
 
+    github = Github(owner=owner, repo=repo)
+
     issue_ids_set = set(map(int, issue_nums))
 
     issues = {

infra/data-pipeline.yml

@@ -249,11 +249,11 @@ tasks:
             - --retrieve-private
       artifacts:
-        public/github_issues.json.zst:
-          path: /data/github_issues.json.zst
+        public/github_webcompat_web-bugs_issues.json.zst:
+          path: /data/github_webcompat_web-bugs_issues.json.zst
           type: file
-        public/github_issues.json.version:
-          path: /data/github_issues.json.version
+        public/github_webcompat_web-bugs_issues.json.version:
+          path: /data/github_webcompat_web-bugs_issues.json.version
           type: file
       features:
@@ -263,8 +263,8 @@ tasks:
       routes:
         - notify.email.release-mgmt-analysis@mozilla.com.on-failed
         - notify.irc-channel.#bugbug.on-failed
-        - index.project.bugbug.data_github_issues.${version}
-        - index.project.bugbug.data_github_issues.latest
+        - index.project.bugbug.data_github_webcompat_web-bugs_issues.${version}
+        - index.project.bugbug.data_github_webcompat_web-bugs_issues.latest
       metadata:
         name: bugbug webcompat issues retrieval
         description: bugbug webcompat issues retrieval
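
The renamed artifacts and index routes line up with the DB_URL template from bugbug/github.py, which embeds owner and repo twice. A quick check of the substitution, using the webcompat/web-bugs values from this task:

    DB_URL = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_github_{}_{}_issues.latest/artifacts/public/github_{}_{}_issues.json.zst"

    url = DB_URL.format("webcompat", "web-bugs", "webcompat", "web-bugs")
    assert "data_github_webcompat_web-bugs_issues.latest" in url  # matches the new index route
    assert url.endswith("github_webcompat_web-bugs_issues.json.zst")  # matches the new artifact name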

scripts/github_issue_classifier.py

@@ -7,7 +7,8 @@ from logging import INFO, basicConfig, getLogger
 import numpy as np
 import requests
 
-from bugbug import db, github
+from bugbug import db
+from bugbug.github import Github
 from bugbug.models import get_model_class
 from bugbug.utils import download_model
 
@@ -34,13 +35,17 @@ def classify_issues(
     model_class = get_model_class(model_name)
     model = model_class.load(model_file_name)
 
+    github = Github(
+        owner=owner, repo=repo, state="all", retrieve_events=retrieve_events
+    )
+
     if issue_number:
         issues = iter(
             [github.fetch_issue_by_number(owner, repo, issue_number, retrieve_events)]
         )
         assert issues, f"An issue with a number of {issue_number} was not found"
     else:
-        assert db.download(github.GITHUB_ISSUES_DB)
+        assert db.download(github.db_path)
         issues = github.get_issues()
 
     for issue in issues:

scripts/github_issue_retriever.py

@@ -7,53 +7,66 @@ import argparse
 from logging import getLogger
 from typing import List, Tuple
 
-from bugbug import db, github
-from bugbug.github import IssueDict
+from bugbug import db
+from bugbug.github import Github, IssueDict
 from bugbug.utils import extract_private, zstd_compress
 
 logger = getLogger(__name__)
 
 
-def replace_with_private(original_data: List[IssueDict]) -> Tuple[List[IssueDict], set]:
-    """Replace title and body of automatically closed public issues.
-
-    Replace them with title and body of a corresponding private issue
-    to account for moderation workflow in webcompat repository
-    """
-    updated_ids = set()
-    updated_issues = []
-    for item in original_data:
-        if item["title"] == "Issue closed.":
-            extracted = extract_private(item["body"])
-            if extracted is None:
-                continue
-            owner, repo, issue_number = extracted
-            private_issue = github.fetch_issue_by_number(owner, repo, issue_number)
-            if private_issue:
-                item["title"] = private_issue["title"]
-                item["body"] = private_issue["body"]
-                updated_ids.add(item["id"])
-                updated_issues.append(item)
-
-    return updated_issues, updated_ids
-
-
 class Retriever(object):
-    def retrieve_issues(
+    def __init__(
         self,
         owner: str,
         repo: str,
         state: str,
         retrieve_events: bool,
         retrieve_private: bool,
-    ) -> None:
+    ):
+        self.owner = owner
+        self.repo = repo
+        self.state = state
+        self.retrieve_events = retrieve_events
+        self.retrieve_private = retrieve_private
+        self.github = Github(
+            owner=owner, repo=repo, state=state, retrieve_events=retrieve_events
+        )
+
+    def replace_with_private(
+        self, original_data: List[IssueDict]
+    ) -> Tuple[List[IssueDict], set]:
+        """Replace title and body of automatically closed public issues.
+
+        Replace them with title and body of a corresponding private issue
+        to account for moderation workflow in webcompat repository
+        """
+        updated_ids = set()
+        updated_issues = []
+        for item in original_data:
+            if item["title"] == "Issue closed.":
+                extracted = extract_private(item["body"])
+                if extracted is None:
+                    continue
+                owner, repo, issue_number = extracted
+                private_issue = self.github.fetch_issue_by_number(
+                    owner, repo, issue_number
+                )
+                if private_issue:
+                    item["title"] = private_issue["title"]
+                    item["body"] = private_issue["body"]
+                    updated_ids.add(item["id"])
+                    updated_issues.append(item)
+
+        return updated_issues, updated_ids
+
+    def retrieve_issues(self) -> None:
         last_modified = None
-        db.download(github.GITHUB_ISSUES_DB)
+        db.download(self.github.db_path)
 
         try:
-            last_modified = db.last_modified(github.GITHUB_ISSUES_DB)
+            last_modified = db.last_modified(self.github.db_path)
         except Exception:
             pass
 
@@ -61,44 +74,44 @@ class Retriever(object):
             logger.info(
                 f"Retrieving issues modified or created since the last run on {last_modified.isoformat()}"
             )
-            data = github.fetch_issues_updated_since_timestamp(
-                owner, repo, state, last_modified.isoformat(), retrieve_events
-            )
+            data = self.github.fetch_issues_updated_since_timestamp(
+                last_modified.isoformat()
+            )
 
-            if retrieve_private:
+            if self.retrieve_private:
                 logger.info(
                     "Replacing contents of auto closed public issues with private issues content"
                 )
-                replace_with_private(data)
+                self.replace_with_private(data)
 
             updated_ids = set(issue["id"] for issue in data)
 
             logger.info(
                 "Deleting issues that were changed since the last run and saving updates"
             )
-            github.delete_issues(lambda issue: issue["id"] in updated_ids)
-            db.append(github.GITHUB_ISSUES_DB, data)
+            self.github.delete_issues(lambda issue: issue["id"] in updated_ids)
+            db.append(self.github.db_path, data)
             logger.info("Updating finished")
         else:
             logger.info("Retrieving all issues since last_modified is not available")
-            github.download_issues(owner, repo, state, retrieve_events)
+            self.github.download_issues()
 
-            if retrieve_private:
+            if self.retrieve_private:
                 logger.info(
                     "Replacing contents of auto closed public issues with private issues content"
                 )
-                all_issues = list(github.get_issues())
-                updated_issues, updated_ids = replace_with_private(all_issues)
+                all_issues = list(self.github.get_issues())
+                updated_issues, updated_ids = self.replace_with_private(all_issues)
                 logger.info(
                     "Deleting public issues that were updated and saving updates"
                 )
-                github.delete_issues(lambda issue: issue["id"] in updated_ids)
-                db.append(github.GITHUB_ISSUES_DB, updated_issues)
+                self.github.delete_issues(lambda issue: issue["id"] in updated_ids)
+                db.append(self.github.db_path, updated_issues)
 
-        zstd_compress(github.GITHUB_ISSUES_DB)
+        zstd_compress(self.github.db_path)
 
 
 def main() -> None:
@@ -136,10 +149,10 @@ def main() -> None:
     # Parse args to show the help if `--help` is passed
     args = parser.parse_args()
 
-    retriever = Retriever()
-    retriever.retrieve_issues(
-        args.owner, args.repo, args.state, args.retrieve_events, args.retrieve_private
-    )
+    retriever = Retriever(
+        args.owner, args.repo, args.state, args.retrieve_events, args.retrieve_private
+    )
+    retriever.retrieve_issues()
 
 
 if __name__ == "__main__":
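
The main() change inverts the old call shape: configuration moves from retrieve_issues() into the constructor. A minimal sketch of the new entry-point wiring (the keyword names mirror the positional parameters shown above; the argparse definitions themselves sit outside this hunk):

    retriever = Retriever(
        owner="webcompat",
        repo="web-bugs",
        state="all",
        retrieve_events=False,
        retrieve_private=True,
    )
    retriever.retrieve_issues()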

tests/conftest.py

@@ -9,7 +9,7 @@ import shutil
 import pytest
 import zstandard
 
-from bugbug import bugzilla, github, repository
+from bugbug import bugzilla, repository
 
 FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
 
@@ -21,7 +21,7 @@ def mock_data(tmp_path):
     DBs = [
         os.path.basename(bugzilla.BUGS_DB),
         os.path.basename(repository.COMMITS_DB),
-        os.path.basename(github.GITHUB_ISSUES_DB),
+        os.path.basename("data/github_webcompat_web-bugs_issues.json"),
     ]
 
     for f in DBs:


tests/test_github.py

@@ -7,9 +7,10 @@ from unittest import mock
 import responses
 
-from bugbug import github
+from bugbug.github import Github
 
-github.get_token = mock.Mock(return_value="mocked_token")
+github = Github(owner="webcompat", repo="web-bugs")
+github.get_token = mock.Mock(return_value="mocked_token")  # type: ignore
 
 TEST_URL = "https://api.github.com/repos/webcompat/web-bugs/issues"
 TEST_EVENTS_URL = "https://api.github.com/repos/webcompat/web-bugs/issues/1/events"
 
@@ -84,10 +85,35 @@ def test_download_issues() -> None:
         responses.GET, "https://api.github.com/test&page=3", json=expected, status=200
     )
 
-    github.download_issues("webcompat", "web-bugs", "all")
+    github.download_issues()
+
+
+def test_download_issues_with_events() -> None:
+    github.retrieve_events = True
+
+    expected = [{"issue_id": "1", "events_url": TEST_EVENTS_URL}]
+    expected_events = [{"event_id": "1"}]
+
+    next_url_headers = {"link": "<https://api.github.com/test&page=3>; rel='next'"}
+
+    # Make sure required requests are made as long as next link is present in the header
+    responses.add(responses.GET, TEST_URL, json=expected, status=200, headers=HEADERS)
+    responses.add(
+        responses.GET,
+        "https://api.github.com/test&page=2",
+        json=expected,
+        status=200,
+        headers=next_url_headers,
+    )
+    responses.add(
+        responses.GET, "https://api.github.com/test&page=3", json=expected, status=200
+    )
+
+    # Mock events request
+    responses.add(responses.GET, TEST_EVENTS_URL, json=expected_events, status=200)
+
+    github.download_issues()
 
 
 def test_download_issues_updated_since_timestamp() -> None:
+    github.retrieve_events = False
+
     first_page = [
         {"id": 30515129, "issue_id": "1"},
         {"id": 30536238, "issue_id": "2"},
 
@@ -135,14 +161,13 @@ def test_download_issues_updated_since_timestamp() -> None:
 
     result = first_page + second_page + third_page
 
-    data = github.fetch_issues_updated_since_timestamp(
-        "webcompat", "web-bugs", "all", "2021-04-03T20:14:04+00:00"
-    )
+    data = github.fetch_issues_updated_since_timestamp("2021-04-03T20:14:04+00:00")
 
     assert data == result
 
 
 def test_fetch_issue_by_number() -> None:
+    github.retrieve_events = False
+
     expected = [
         {"issue_id": "1", "events_url": TEST_EVENTS_URL, "labels": [{"name": "test"}]}
     ]

tests/test_github_issue_retriever.py

@@ -7,11 +7,11 @@ from unittest import mock
 import responses
 
-from bugbug import github
 from bugbug.github import IssueDict
-from scripts import github_issue_retriever
+from scripts.github_issue_retriever import Retriever
 
-github.get_token = mock.Mock(return_value="mocked_token")
+github_issue_retriever = Retriever("webcompat", "web-bugs", "all", False, True)
+github_issue_retriever.github.get_token = mock.Mock(return_value="mocked_token")  # type: ignore
 
 PUBLIC_BODY = """
 <p>Thanks for the report. We have closed this issue\n