Add support for multiple DBs for github (#2465)

Fixes #2460

Co-authored-by: Marco Castelluccio <mcastelluccio@mozilla.com>
Ksenia 2021-10-05 08:31:00 -04:00 committed by GitHub
Parent 819358b139
Commit 627e182061
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 228 additions and 175 deletions

View file

@@ -14,141 +14,142 @@ from bugbug.utils import get_secret
logger = logging.getLogger(__name__)
GITHUB_ISSUES_DB = "data/github_issues.json"
db.register(
GITHUB_ISSUES_DB,
"https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_github_issues.latest/artifacts/public/github_issues.json.zst",
1,
)
IssueDict = NewType("IssueDict", dict)
DB_VERSION = 1
DB_URL = "https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_github_{}_{}_issues.latest/artifacts/public/github_{}_{}_issues.json.zst"
PER_PAGE = 100
# Rate limit period in seconds
RATE_LIMIT_PERIOD = 900
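Instead of the single hard-coded GITHUB_ISSUES_DB, the DB path and artifact URL are now templates keyed by owner and repo. A quick sketch of what they expand to (the webcompat/web-bugs values are only an example):

owner, repo = "webcompat", "web-bugs"
db_path = "data/github_{}_{}_issues.json".format(owner, repo)
# -> "data/github_webcompat_web-bugs_issues.json"
url = DB_URL.format(owner, repo, owner, repo)
# -> ".../project.bugbug.data_github_webcompat_web-bugs_issues.latest"
#    "/artifacts/public/github_webcompat_web-bugs_issues.json.zst"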
def get_issues() -> Iterator[IssueDict]:
yield from db.read(GITHUB_ISSUES_DB)
class Github:
def __init__(
self, owner: str, repo: str, state: str = "all", retrieve_events: bool = False
) -> None:
self.owner = owner
self.repo = repo
self.state = state
self.retrieve_events = retrieve_events
self.db_path = "data/github_{}_{}_issues.json".format(self.owner, self.repo)
def delete_issues(match: Callable[[IssueDict], bool]) -> None:
db.delete(GITHUB_ISSUES_DB, match)
if not db.is_registered(self.db_path):
db.register(
self.db_path,
DB_URL.format(self.owner, self.repo, self.owner, self.repo),
DB_VERSION,
)
def get_issues(self) -> Iterator[IssueDict]:
yield from db.read(self.db_path)
@sleep_and_retry
@limits(calls=1200, period=RATE_LIMIT_PERIOD)
def api_limit():
# Allow a limited number of requests to account for rate limiting
pass
def delete_issues(self, match: Callable[[IssueDict], bool]) -> None:
db.delete(self.db_path, match)
@sleep_and_retry
@limits(calls=1200, period=RATE_LIMIT_PERIOD)
def api_limit(self):
# Allow a limited number of requests to account for rate limiting
pass
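The budget sits just below GitHub's documented limit of 5,000 authenticated REST requests per hour: 1200 calls per 900-second window works out to 4,800 per hour. The decorators come from the ratelimit package; a minimal standalone sketch of the same guard:

from ratelimit import limits, sleep_and_retry

@sleep_and_retry                 # sleep until the window resets instead of raising
@limits(calls=1200, period=900)  # at most 1200 calls per 15 minutes
def api_limit():
    pass                         # call this before every request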
def get_token() -> str:
return get_secret("GITHUB_TOKEN")
def get_token(self) -> str:
return get_secret("GITHUB_TOKEN")
def fetch_events(self, events_url: str) -> list:
self.api_limit()
logger.info(f"Fetching {events_url}")
headers = {"Authorization": "token {}".format(self.get_token())}
response = requests.get(events_url, headers=headers)
response.raise_for_status()
events_raw = response.json()
return events_raw
def fetch_events(events_url: str) -> list:
api_limit()
logger.info(f"Fetching {events_url}")
headers = {"Authorization": "token {}".format(get_token())}
response = requests.get(events_url, headers=headers)
response.raise_for_status()
events_raw = response.json()
return events_raw
def fetch_issues(
self, url: str, retrieve_events: bool, params: dict = None
) -> Tuple[List[IssueDict], dict]:
self.api_limit()
headers = {"Authorization": "token {}".format(self.get_token())}
response = requests.get(url, params=params, headers=headers)
response.raise_for_status()
data = response.json()
# If only one issue is requested, add it to a list
if isinstance(data, dict):
data = [data]
def fetch_issues(
url: str, retrieve_events: bool, params: dict = None
) -> Tuple[List[IssueDict], dict]:
api_limit()
headers = {"Authorization": "token {}".format(get_token())}
response = requests.get(url, params=params, headers=headers)
response.raise_for_status()
data = response.json()
logger.info(f"Fetching {url}")
# If only one issue is requested, add it to a list
if isinstance(data, dict):
data = [data]
if retrieve_events:
for item in data:
events = self.fetch_events(item["events_url"])
item.update({"events": events})
logger.info(f"Fetching {url}")
return data, response.links
if retrieve_events:
for item in data:
events = fetch_events(item["events_url"])
item.update({"events": events})
def get_start_page(self) -> int:
# Determine next page to fetch based on number of downloaded issues
issues = self.get_issues()
count = sum(1 for _ in issues)
return int(count / PER_PAGE) + 1
return data, response.links
def fetch_issues_updated_since_timestamp(self, since: str) -> List[IssueDict]:
# Fetches changed and new issues since a specified timestamp
url = "https://api.github.com/repos/{}/{}/issues".format(self.owner, self.repo)
params = {"state": self.state, "since": since, "per_page": PER_PAGE, "page": 1}
def get_start_page() -> int:
# Determine next page to fetch based on number of downloaded issues
issues = get_issues()
count = sum(1 for _ in issues)
return int(count / PER_PAGE) + 1
def fetch_issues_updated_since_timestamp(
owner: str, repo: str, state: str, since: str, retrieve_events: bool = False
) -> List[IssueDict]:
# Fetches changed and new issues since a specified timestamp
url = "https://api.github.com/repos/{}/{}/issues".format(owner, repo)
params = {"state": state, "since": since, "per_page": PER_PAGE, "page": 1}
data, response_links = fetch_issues(
url=url, retrieve_events=retrieve_events, params=params
)
# Fetch next page
while "next" in response_links.keys():
next_page_data, response_links = fetch_issues(
response_links["next"]["url"], retrieve_events
data, response_links = self.fetch_issues(
url=url, retrieve_events=self.retrieve_events, params=params
)
data += next_page_data
logger.info("Done fetching updates")
# Fetch next page
while "next" in response_links.keys():
next_page_data, response_links = self.fetch_issues(
response_links["next"]["url"], self.retrieve_events
)
data += next_page_data
return data
logger.info("Done fetching updates")
return data
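Both fetch paths walk GitHub's Link-header pagination, which requests exposes as response.links. A hedged sketch of that loop in isolation (assumes each page returns a JSON list):

import requests

def fetch_all_pages(url: str, headers: dict, params: dict = None) -> list:
    data = []
    while url:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        data += response.json()
        # requests parses the Link header into response.links;
        # the "next" entry is absent on the last page
        url = response.links.get("next", {}).get("url")
        params = None  # the next URL already carries the query string
    return data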
def download_issues(
owner: str, repo: str, state: str, retrieve_events: bool = False
) -> None:
# Fetches all issues sorted by date of creation in ascending order
url = "https://api.github.com/repos/{}/{}/issues".format(owner, repo)
start_page = get_start_page()
def download_issues(self) -> None:
# Fetches all issues sorted by date of creation in ascending order
url = "https://api.github.com/repos/{}/{}/issues".format(self.owner, self.repo)
start_page = self.get_start_page()
params = {
"state": state,
"sort": "created",
"direction": "asc",
"per_page": PER_PAGE,
"page": start_page,
}
params = {
"state": self.state,
"sort": "created",
"direction": "asc",
"per_page": PER_PAGE,
"page": start_page,
}
data, response_links = fetch_issues(
url=url, retrieve_events=retrieve_events, params=params
)
db.append(GITHUB_ISSUES_DB, data)
# Fetch next page
while "next" in response_links.keys():
next_page_data, response_links = fetch_issues(
response_links["next"]["url"], retrieve_events
data, response_links = self.fetch_issues(
url=url, retrieve_events=self.retrieve_events, params=params
)
db.append(GITHUB_ISSUES_DB, next_page_data)
logger.info("Done downloading")
db.append(self.db_path, data)
# Fetch next page
while "next" in response_links.keys():
next_page_data, response_links = self.fetch_issues(
response_links["next"]["url"], self.retrieve_events
)
db.append(self.db_path, next_page_data)
logger.info("Done downloading")
def fetch_issue_by_number(
owner: str, repo: str, issue_number: int, retrieve_events: bool = False
) -> IssueDict:
# Fetches an issue by id
url = "https://api.github.com/repos/{}/{}/issues/{}".format(
owner, repo, issue_number
)
def fetch_issue_by_number(
self, owner: str, repo: str, issue_number: int, retrieve_events: bool = False
) -> IssueDict:
# Fetches an issue by id
url = "https://api.github.com/repos/{}/{}/issues/{}".format(
owner, repo, issue_number
)
data = fetch_issues(url=url, retrieve_events=retrieve_events)
data = self.fetch_issues(url=url, retrieve_events=retrieve_events)
return data[0][0]
return data[0][0]
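Taken together, the module turns into a per-repository client. A minimal usage sketch, assuming the class exactly as in this diff (the repository values are an example):

from bugbug.github import Github

github = Github(owner="webcompat", repo="web-bugs", state="all")
# __init__ registered data/github_webcompat_web-bugs_issues.json with db,
# so several repositories can be mirrored side by side
github.download_issues()
for issue in github.get_issues():
    print(issue["number"], issue["title"])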

View file

@@ -23,7 +23,8 @@ from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_validate, train_test_split
from tabulate import tabulate
from bugbug import bugzilla, db, github, repository
from bugbug import bugzilla, db, repository
from bugbug.github import Github
from bugbug.nlp import SpacyVectorizer
from bugbug.utils import split_tuple_generator, to_array
@@ -754,12 +755,14 @@ class BugCoupleModel(Model):
class IssueModel(Model):
def __init__(self, lemmatization=False):
def __init__(self, owner, repo, lemmatization=False):
Model.__init__(self, lemmatization)
self.training_dbs = [github.GITHUB_ISSUES_DB]
self.github = Github(owner=owner, repo=repo)
self.training_dbs = [self.github.db_path]
def items_gen(self, classes):
for issue in github.get_issues():
for issue in self.github.get_issues():
issue_number = issue["number"]
if issue_number not in classes:
continue
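Since training_dbs is now derived from the instance, every model's training data follows its repository. Under the constructor above (instantiating the base class directly, purely for illustration):

model = IssueModel(owner="webcompat", repo="web-bugs")
# model.github.db_path == "data/github_webcompat_web-bugs_issues.json"
# model.training_dbs == [model.github.db_path]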

View file

@@ -9,7 +9,7 @@ import xgboost
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from bugbug import feature_cleanup, github, issue_features, utils
from bugbug import feature_cleanup, issue_features, utils
from bugbug.model import IssueModel
logger = logging.getLogger(__name__)
@@ -17,7 +17,9 @@ logger = logging.getLogger(__name__)
class NeedsDiagnosisModel(IssueModel):
def __init__(self, lemmatization=False):
IssueModel.__init__(self, lemmatization)
IssueModel.__init__(
self, owner="webcompat", repo="web-bugs", lemmatization=lemmatization
)
self.calculate_importance = False
@@ -59,10 +61,11 @@ class NeedsDiagnosisModel(IssueModel):
def get_labels(self):
classes = {}
for issue in github.get_issues():
for issue in self.github.get_issues():
# Skip issues with empty title or body
if issue["title"] is None or issue["body"] is None:
continue
# Skip issues that are not moderated yet as they don't have a meaningful title or body
if issue["title"] == "In the moderation queue.":
continue
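Callers of the subclass are unaffected, since the repository is baked into its constructor. A sketch, assuming the usual bugbug module layout:

from bugbug.models.needsdiagnosis import NeedsDiagnosisModel

model = NeedsDiagnosisModel()
# internally equivalent to IssueModel(owner="webcompat", repo="web-bugs", ...),
# training on data/github_webcompat_web-bugs_issues.json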

View file

@@ -14,7 +14,8 @@ import requests
import zstandard
from redis import Redis
from bugbug import bugzilla, github, repository, test_scheduling
from bugbug import bugzilla, repository, test_scheduling
from bugbug.github import Github
from bugbug.model import Model
from bugbug.utils import get_hgmo_stack
from bugbug_http.readthrough_cache import ReadthroughTTLCache
@@ -113,6 +114,8 @@ def classify_issue(
) -> str:
from bugbug_http.app import JobInfo
github = Github(owner=owner, repo=repo)
issue_ids_set = set(map(int, issue_nums))
issues = {

View file

@@ -249,11 +249,11 @@ tasks:
- --retrieve-private
artifacts:
public/github_issues.json.zst:
path: /data/github_issues.json.zst
public/github_webcompat_web-bugs_issues.json.zst:
path: /data/github_webcompat_web-bugs_issues.json.zst
type: file
public/github_issues.json.version:
path: /data/github_issues.json.version
public/github_webcompat_web-bugs_issues.json.version:
path: /data/github_webcompat_web-bugs_issues.json.version
type: file
features:
@@ -263,8 +263,8 @@ tasks:
routes:
- notify.email.release-mgmt-analysis@mozilla.com.on-failed
- notify.irc-channel.#bugbug.on-failed
- index.project.bugbug.data_github_issues.${version}
- index.project.bugbug.data_github_issues.latest
- index.project.bugbug.data_github_webcompat_web-bugs_issues.${version}
- index.project.bugbug.data_github_webcompat_web-bugs_issues.latest
metadata:
name: bugbug webcompat issues retrieval
description: bugbug webcompat issues retrieval
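The Taskcluster artifact and index names follow the same owner_repo scheme as the DB path, so adding another repository means adding a task whose names expand the same pattern. An illustrative expansion:

owner, repo = "webcompat", "web-bugs"
artifact = f"public/github_{owner}_{repo}_issues.json.zst"
index = f"index.project.bugbug.data_github_{owner}_{repo}_issues.latest"
# -> matches the artifact and route names in the YAML above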

View file

@@ -7,7 +7,8 @@ from logging import INFO, basicConfig, getLogger
import numpy as np
import requests
from bugbug import db, github
from bugbug import db
from bugbug.github import Github
from bugbug.models import get_model_class
from bugbug.utils import download_model
@@ -34,13 +35,17 @@ def classify_issues(
model_class = get_model_class(model_name)
model = model_class.load(model_file_name)
github = Github(
owner=owner, repo=repo, state="all", retrieve_events=retrieve_events
)
if issue_number:
issues = iter(
[github.fetch_issue_by_number(owner, repo, issue_number, retrieve_events)]
)
assert issues, f"An issue with a number of {issue_number} was not found"
else:
assert db.download(github.GITHUB_ISSUES_DB)
assert db.download(github.db_path)
issues = github.get_issues()
for issue in issues:

View file

@@ -7,53 +7,66 @@ import argparse
from logging import getLogger
from typing import List, Tuple
from bugbug import db, github
from bugbug.github import IssueDict
from bugbug import db
from bugbug.github import Github, IssueDict
from bugbug.utils import extract_private, zstd_compress
logger = getLogger(__name__)
def replace_with_private(original_data: List[IssueDict]) -> Tuple[List[IssueDict], set]:
"""Replace title and body of automatically closed public issues.
Replace them with title and body of a corresponding private issue
to account for moderation workflow in webcompat repository
"""
updated_ids = set()
updated_issues = []
for item in original_data:
if item["title"] == "Issue closed.":
extracted = extract_private(item["body"])
if extracted is None:
continue
owner, repo, issue_number = extracted
private_issue = github.fetch_issue_by_number(owner, repo, issue_number)
if private_issue:
item["title"] = private_issue["title"]
item["body"] = private_issue["body"]
updated_ids.add(item["id"])
updated_issues.append(item)
return updated_issues, updated_ids
class Retriever(object):
def retrieve_issues(
def __init__(
self,
owner: str,
repo: str,
state: str,
retrieve_events: bool,
retrieve_private: bool,
) -> None:
):
self.owner = owner
self.repo = repo
self.state = state
self.retrieve_events = retrieve_events
self.retrieve_private = retrieve_private
self.github = Github(
owner=owner, repo=repo, state=state, retrieve_events=retrieve_events
)
def replace_with_private(
self, original_data: List[IssueDict]
) -> Tuple[List[IssueDict], set]:
"""Replace title and body of automatically closed public issues.
Replace them with title and body of a corresponding private issue
to account for moderation workflow in webcompat repository
"""
updated_ids = set()
updated_issues = []
for item in original_data:
if item["title"] == "Issue closed.":
extracted = extract_private(item["body"])
if extracted is None:
continue
owner, repo, issue_number = extracted
private_issue = self.github.fetch_issue_by_number(
owner, repo, issue_number
)
if private_issue:
item["title"] = private_issue["title"]
item["body"] = private_issue["body"]
updated_ids.add(item["id"])
updated_issues.append(item)
return updated_issues, updated_ids
def retrieve_issues(self) -> None:
last_modified = None
db.download(github.GITHUB_ISSUES_DB)
db.download(self.github.db_path)
try:
last_modified = db.last_modified(github.GITHUB_ISSUES_DB)
last_modified = db.last_modified(self.github.db_path)
except Exception:
pass
@@ -61,44 +74,44 @@ class Retriever(object):
logger.info(
f"Retrieving issues modified or created since the last run on {last_modified.isoformat()}"
)
data = github.fetch_issues_updated_since_timestamp(
owner, repo, state, last_modified.isoformat(), retrieve_events
data = self.github.fetch_issues_updated_since_timestamp(
last_modified.isoformat()
)
if retrieve_private:
if self.retrieve_private:
logger.info(
"Replacing contents of auto closed public issues with private issues content"
)
replace_with_private(data)
self.replace_with_private(data)
updated_ids = set(issue["id"] for issue in data)
logger.info(
"Deleting issues that were changed since the last run and saving updates"
)
github.delete_issues(lambda issue: issue["id"] in updated_ids)
self.github.delete_issues(lambda issue: issue["id"] in updated_ids)
db.append(github.GITHUB_ISSUES_DB, data)
db.append(self.github.db_path, data)
logger.info("Updating finished")
else:
logger.info("Retrieving all issues since last_modified is not available")
github.download_issues(owner, repo, state, retrieve_events)
self.github.download_issues()
if retrieve_private:
if self.retrieve_private:
logger.info(
"Replacing contents of auto closed public issues with private issues content"
)
all_issues = list(github.get_issues())
updated_issues, updated_ids = replace_with_private(all_issues)
all_issues = list(self.github.get_issues())
updated_issues, updated_ids = self.replace_with_private(all_issues)
logger.info(
"Deleting public issues that were updated and saving updates"
)
github.delete_issues(lambda issue: issue["id"] in updated_ids)
db.append(github.GITHUB_ISSUES_DB, updated_issues)
self.github.delete_issues(lambda issue: issue["id"] in updated_ids)
db.append(self.github.db_path, updated_issues)
zstd_compress(github.GITHUB_ISSUES_DB)
zstd_compress(self.github.db_path)
def main() -> None:
@@ -136,10 +149,10 @@ def main() -> None:
# Parse args to show the help if `--help` is passed
args = parser.parse_args()
retriever = Retriever()
retriever.retrieve_issues(
retriever = Retriever(
args.owner, args.repo, args.state, args.retrieve_events, args.retrieve_private
)
retriever.retrieve_issues()
if __name__ == "__main__":
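Retriever now takes all of its configuration in the constructor, and retrieve_issues() takes none. A hedged sketch of driving it directly rather than through the CLI (argument values illustrative):

from scripts.github_issue_retriever import Retriever

retriever = Retriever(
    owner="webcompat",
    repo="web-bugs",
    state="all",
    retrieve_events=False,
    retrieve_private=True,
)
retriever.retrieve_issues()  # update the per-repo DB, then zstd-compress it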

View file

@@ -9,7 +9,7 @@ import shutil
import pytest
import zstandard
from bugbug import bugzilla, github, repository
from bugbug import bugzilla, repository
FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")
@@ -21,7 +21,7 @@ def mock_data(tmp_path):
DBs = [
os.path.basename(bugzilla.BUGS_DB),
os.path.basename(repository.COMMITS_DB),
os.path.basename(github.GITHUB_ISSUES_DB),
os.path.basename("data/github_webcompat_web-bugs_issues.json"),
]
for f in DBs:

View file

View file

@@ -7,9 +7,10 @@ from unittest import mock
import responses
from bugbug import github
from bugbug.github import Github
github.get_token = mock.Mock(return_value="mocked_token")
github = Github(owner="webcompat", repo="web-bugs")
github.get_token = mock.Mock(return_value="mocked_token") # type: ignore
TEST_URL = "https://api.github.com/repos/webcompat/web-bugs/issues"
TEST_EVENTS_URL = "https://api.github.com/repos/webcompat/web-bugs/issues/1/events"
@@ -84,10 +85,35 @@ def test_download_issues() -> None:
responses.GET, "https://api.github.com/test&page=3", json=expected, status=200
)
github.download_issues("webcompat", "web-bugs", "all")
github.download_issues()
def test_download_issues_with_events() -> None:
github.retrieve_events = True
expected = [{"issue_id": "1", "events_url": TEST_EVENTS_URL}]
expected_events = [{"event_id": "1"}]
next_url_headers = {"link": "<https://api.github.com/test&page=3>; rel='next'"}
# Make sure required requests are made as long as next link is present in the header
responses.add(responses.GET, TEST_URL, json=expected, status=200, headers=HEADERS)
responses.add(
responses.GET,
"https://api.github.com/test&page=2",
json=expected,
status=200,
headers=next_url_headers,
)
responses.add(
responses.GET, "https://api.github.com/test&page=3", json=expected, status=200
)
# Mock events request
responses.add(responses.GET, TEST_EVENTS_URL, json=expected_events, status=200)
github.download_issues()
def test_download_issues_updated_since_timestamp() -> None:
github.retrieve_events = False
first_page = [
{"id": 30515129, "issue_id": "1"},
{"id": 30536238, "issue_id": "2"},
@@ -135,14 +161,13 @@ def test_download_issues_updated_since_timestamp() -> None:
result = first_page + second_page + third_page
data = github.fetch_issues_updated_since_timestamp(
"webcompat", "web-bugs", "all", "2021-04-03T20:14:04+00:00"
)
data = github.fetch_issues_updated_since_timestamp("2021-04-03T20:14:04+00:00")
assert data == result
def test_fetch_issue_by_number() -> None:
github.retrieve_events = False
expected = [
{"issue_id": "1", "events_url": TEST_EVENTS_URL, "labels": [{"name": "test"}]}
]
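The tests build one real instance, patch get_token on it, and flip retrieve_events per test instead of passing flags around. A condensed sketch of the responses-based pattern used above:

from unittest import mock

import responses

from bugbug.github import Github

TEST_URL = "https://api.github.com/repos/webcompat/web-bugs/issues"

github = Github(owner="webcompat", repo="web-bugs")
github.get_token = mock.Mock(return_value="mocked_token")

@responses.activate
def test_single_page() -> None:
    responses.add(responses.GET, TEST_URL, json=[{"issue_id": "1"}], status=200)
    data, links = github.fetch_issues(TEST_URL, retrieve_events=False)
    assert data == [{"issue_id": "1"}]
    assert "next" not in links  # no Link header means no further pages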

View file

@@ -7,11 +7,11 @@ from unittest import mock
import responses
from bugbug import github
from bugbug.github import IssueDict
from scripts import github_issue_retriever
from scripts.github_issue_retriever import Retriever
github.get_token = mock.Mock(return_value="mocked_token")
github_issue_retriever = Retriever("webcompat", "web-bugs", "all", False, True)
github_issue_retriever.github.get_token = mock.Mock(return_value="mocked_token") # type: ignore
PUBLIC_BODY = """
<p>Thanks for the report. We have closed this issue\n