Fetch private issue contents to replace public issue body and title in the dataset to account for moderation workflow (#2387)

This commit is contained in:
Ksenia 2021-07-06 12:31:44 -04:00 коммит произвёл GitHub
Родитель 750e6a73c5
Коммит d1cb24fb78
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
6 изменённых файлов: 220 добавлений и 3 удалений

Просмотреть файл

@ -13,6 +13,7 @@ def rollback(issue, when=None):
if (
event["event"] == "renamed"
and event["rename"]["from"] != "In the moderation queue."
and event["rename"]["from"] != "Issue closed."
):
issue["title"] = event["rename"]["from"]

Просмотреть файл

@ -8,9 +8,11 @@ import errno
import json
import logging
import os
import re
import socket
import subprocess
import tarfile
import urllib.parse
from collections import deque
from contextlib import contextmanager
from datetime import datetime
@ -468,3 +470,29 @@ def get_hgmo_stack(branch: str, revision: str) -> List[bytes]:
def get_physical_cpu_count() -> int:
    """Return the CPU count reported by psutil with logical cores excluded."""
    physical_cores = psutil.cpu_count(logical=False)
    return physical_cores
def extract_metadata(body: str) -> dict:
    """Extract metadata as dict from github issue body.

    Metadata items are HTML comments of the form
    ``<!-- @key: value -->``, for example ``<!-- @public_url: *** -->``.

    Args:
        body: Raw issue body. An empty or missing body yields no metadata.

    Returns:
        A dictionary mapping every metadata key found to its value. When a
        key occurs more than once, the last occurrence wins.
    """
    # Tolerate an empty/None body instead of failing inside the regex engine.
    if not body:
        return {}

    # The value group is non-greedy so each comment is matched on its own,
    # even when several metadata comments share a single line.
    match_list = re.findall(r"<!--\s@(\w+):\s(.+?)\s-->", body)
    return dict(match_list)
def extract_private(issue_body: str) -> Optional[tuple]:
    """Extract private issue information from public issue body.

    Parse the public issue body, read its ``private_url`` metadata item and
    split the URL path into the private issue coordinates (webcompat
    repository usecase).

    Args:
        issue_body: Body of the public issue.

    Returns:
        A ``(owner, repo, number)`` tuple of strings, or ``None`` when the
        body has no ``private_url`` metadata or the URL path does not have
        the expected ``/<owner>/<repo>/<segment>/<number>`` shape.
    """
    private_url = extract_metadata(issue_body).get("private_url", "").strip()
    private_issue_path = urllib.parse.urlparse(private_url).path

    # Drop empty segments so leading/trailing slashes do not change the count.
    segments = [part for part in private_issue_path.split("/") if part]

    # Anything other than exactly four segments cannot be unpacked into
    # owner/repo/<segment>/number; report "no private issue" rather than
    # raising ValueError from the tuple unpacking.
    if len(segments) != 4:
        return None

    owner, repo, _, number = segments
    return owner, repo, number

Просмотреть файл

@ -245,6 +245,7 @@ tasks:
- --owner=webcompat
- --repo=web-bugs
- --retrieve-events
- --retrieve-private
artifacts:
public/github_issues.json.zst:

Просмотреть файл

@ -5,16 +5,48 @@
import argparse
from logging import getLogger
from typing import List, Tuple
from bugbug import db, github
from bugbug.utils import zstd_compress
from bugbug.github import IssueDict
from bugbug.utils import extract_private, zstd_compress
logger = getLogger(__name__)
def replace_with_private(original_data: List[IssueDict]) -> Tuple[List[IssueDict], set]:
    """Swap in private issue content for automatically closed public issues.

    Every issue titled "Issue closed." whose body links to a private issue
    gets its title and body overwritten, in place, with those of the private
    issue (moderation workflow of the webcompat repository).

    Returns a tuple of the list of issues that were modified and the set of
    their ids.
    """
    replaced_issues = []
    replaced_ids = set()

    for issue in original_data:
        # Only automatically closed issues are candidates for replacement
        if issue["title"] != "Issue closed.":
            continue

        private_ref = extract_private(issue["body"])
        if private_ref is None:
            continue

        owner, repo, issue_number = private_ref
        private_issue = github.fetch_issue_by_number(owner, repo, issue_number)
        # Leave the public issue untouched when nothing usable was fetched
        if not private_issue:
            continue

        issue["title"] = private_issue["title"]
        issue["body"] = private_issue["body"]
        replaced_ids.add(issue["id"])
        replaced_issues.append(issue)

    return replaced_issues, replaced_ids
class Retriever(object):
def retrieve_issues(
self, owner: str, repo: str, state: str, retrieve_events: bool
self,
owner: str,
repo: str,
state: str,
retrieve_events: bool,
retrieve_private: bool,
) -> None:
last_modified = None
@ -33,6 +65,12 @@ class Retriever(object):
owner, repo, state, last_modified.isoformat(), retrieve_events
)
if retrieve_private:
logger.info(
"Replacing contents of auto closed public issues with private issues content"
)
replace_with_private(data)
updated_ids = set(issue["id"] for issue in data)
logger.info(
@ -46,6 +84,20 @@ class Retriever(object):
logger.info("Retrieving all issues since last_modified is not available")
github.download_issues(owner, repo, state, retrieve_events)
if retrieve_private:
logger.info(
"Replacing contents of auto closed public issues with private issues content"
)
all_issues = list(github.get_issues())
updated_issues, updated_ids = replace_with_private(all_issues)
logger.info(
"Deleting public issues that were updated and saving updates"
)
github.delete_issues(lambda issue: issue["id"] in updated_ids)
db.append(github.GITHUB_ISSUES_DB, updated_issues)
zstd_compress(github.GITHUB_ISSUES_DB)
@ -75,12 +127,19 @@ def main() -> None:
action="store_true",
help="Whether to retrieve events for each issue.",
)
parser.add_argument(
"--retrieve-private",
action="store_true",
help="Whether to retrieve private issue content (only webcompat repository usecase).",
)
# Parse args to show the help if `--help` is passed
args = parser.parse_args()
retriever = Retriever()
retriever.retrieve_issues(args.owner, args.repo, args.state, args.retrieve_events)
retriever.retrieve_issues(
args.owner, args.repo, args.state, args.retrieve_events, args.retrieve_private
)
if __name__ == "__main__":

Просмотреть файл

@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
from unittest import mock
import responses
from bugbug import github
from bugbug.github import IssueDict
from scripts import github_issue_retriever
# Replace github.get_token with a mock that returns a fixed dummy token.
github.get_token = mock.Mock(return_value="mocked_token")
# Body of an automatically closed public issue. The HTML comment at the end
# carries the @private_url metadata item that links to the private issue.
PUBLIC_BODY = """
<p>Thanks for the report. We have closed this issue\n
automatically as we suspect it is invalid. If we made
a mistake, please\nfile a new issue and try to provide
more context.</p>\n
<!-- @private_url: https://github.com/webcompat/web-bugs-private/issues/12345 -->\n
"""
def test_replace_with_private() -> None:
    """A closed public issue takes the title and body of its private issue."""
    closed_public = IssueDict(
        {"title": "Issue closed.", "body": PUBLIC_BODY, "id": 3456}
    )
    open_public = IssueDict(
        {"title": "example.com - test issue", "body": "issue body", "id": 3457}
    )
    issues = [closed_public, open_public]

    private = IssueDict(
        {
            "title": "www.example.com - actual title",
            "body": "<p>Actual body</p>",
            "id": 1,
        }
    )

    # Register the mocked response for the private issue request
    responses.add(
        responses.GET,
        "https://api.github.com/repos/webcompat/web-bugs-private/issues/12345",
        json=private,
        status=200,
    )

    # Build the expectation from a copy taken before the call, because the
    # public issue in the original list is changed by it
    expected = IssueDict(
        {**closed_public, "title": private["title"], "body": private["body"]}
    )

    updated_issues, updated_ids = github_issue_retriever.replace_with_private(issues)

    assert len(updated_ids) == 1
    assert len(updated_issues) == 1
    assert len(issues) == 2
    assert closed_public["id"] in updated_ids
    # The public issue in the original list now holds the private content
    assert issues[0] == expected
    # The returned list contains the same updated issue
    assert updated_issues[0] == expected
def test_replace_missing_private() -> None:
    """A closed issue without a private link is left unchanged."""
    closed_without_link = IssueDict(
        {"title": "Issue closed.", "body": "no private link", "id": 3459}
    )
    open_public = IssueDict(
        {"title": "example.com - test issue 2", "body": "issue body", "id": 3458}
    )
    issues = [closed_without_link, open_public]

    # Snapshot of the closed issue to compare against after the call
    untouched = IssueDict(closed_without_link.copy())

    updated_issues, updated_ids = github_issue_retriever.replace_with_private(issues)

    assert len(updated_ids) == 0
    assert len(updated_issues) == 0
    assert len(issues) == 2
    assert issues[0] == untouched

Просмотреть файл

@ -293,3 +293,37 @@ def test_extract_db_bad_format(tmp_path):
with pytest.raises(AssertionError):
utils.extract_file(path)
def test_extract_metadata() -> None:
    """Metadata comments are parsed into a dict; plain text yields none."""
    issue_body = """
<!-- @private_url: https://github.com/webcompat/web-bugs-private/issues/12345 -->\n
"""
    assert utils.extract_metadata(issue_body) == {
        "private_url": "https://github.com/webcompat/web-bugs-private/issues/12345"
    }

    # A body without any metadata comment produces an empty dict
    assert utils.extract_metadata("test") == {}
def test_extract_private_url() -> None:
    """Owner, repository and issue number are read from the private URL."""
    public_body = """
<p>Thanks for the report. We have closed this issue\n
automatically as we suspect it is invalid. If we made
a mistake, please\nfile a new issue and try to provide
more context.</p>\n
<!-- @private_url: https://github.com/webcompat/web-bugs-private/issues/12345 -->\n
"""
    private_ref = utils.extract_private(public_body)
    assert private_ref == ("webcompat", "web-bugs-private", "12345")
def test_extract_private_url_empty() -> None:
    """A body without private_url metadata yields None."""
    body_without_metadata = """<p>Test content</p> """
    assert utils.extract_private(body_without_metadata) is None