Generate Phabricator revisions DB similarly to bugs DB

This way the scripts using Phabricator revisions don't have to redownload revisions
from Phabricator every time they run.
This commit is contained in:
Marco Castelluccio 2020-10-27 00:22:38 +01:00
Parent fc210ad23b
Commit f84945f86f
6 changed files with 182 additions and 22 deletions

View file

@ -3,9 +3,22 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
from typing import Collection, Iterator, List, NewType
from libmozdata.phabricator import PhabricatorAPI
from tqdm import tqdm
from bugbug import db
RevisionDict = NewType("RevisionDict", dict)
REVISIONS_DB = "data/revisions.json"
db.register(
REVISIONS_DB,
"https://community-tc.services.mozilla.com/api/index/v1/task/project.bugbug.data_revisions.latest/artifacts/public/revisions.json.zst",
1,
)
PHABRICATOR_API = None
TESTING_PROJECTS = {
@ -17,38 +30,54 @@ TESTING_PROJECTS = {
}
def set_api_key(url, api_key):
def get_revisions() -> Iterator[RevisionDict]:
yield from db.read(REVISIONS_DB)
def set_api_key(url: str, api_key: str) -> None:
global PHABRICATOR_API
PHABRICATOR_API = PhabricatorAPI(api_key, url)
def get(rev_ids):
def get(rev_ids: Collection[int]) -> Collection[RevisionDict]:
assert PHABRICATOR_API is not None
data = {}
rev_ids = list(set(rev_ids))
rev_ids_groups = (rev_ids[i : i + 100] for i in range(0, len(rev_ids), 100))
with tqdm(total=len(rev_ids)) as progress_bar:
for rev_ids_group in rev_ids_groups:
out = PHABRICATOR_API.request(
"differential.revision.search",
constraints={
"ids": rev_ids_group,
"ids": rev_ids,
},
attachments={"projects": True},
)
for result in out["data"]:
data[result["id"]] = result
return out["data"]
def download_revisions(rev_ids: Collection[int]) -> None:
old_rev_count = 0
new_rev_ids = set(int(rev_id) for rev_id in rev_ids)
for rev in get_revisions():
old_rev_count += 1
if rev["id"] in new_rev_ids:
new_rev_ids.remove(rev["id"])
print(f"Loaded {old_rev_count} revisions.")
new_rev_ids_list = sorted(list(new_rev_ids))
rev_ids_groups = (
new_rev_ids_list[i : i + 100] for i in range(0, len(new_rev_ids_list), 100)
)
with tqdm(total=len(new_rev_ids)) as progress_bar:
for rev_ids_group in rev_ids_groups:
revisions = get(rev_ids_group)
progress_bar.update(len(rev_ids_group))
return data
db.append(REVISIONS_DB, revisions)
def get_testing_projects(rev):
def get_testing_projects(rev: RevisionDict) -> List[str]:
return [
TESTING_PROJECTS[projectPHID]
for projectPHID in rev["attachments"]["projects"]["projectPHIDs"]

View file

@ -194,6 +194,46 @@ tasks:
owner: release-mgmt-analysis@mozilla.com
source: ${repository}/raw/master/data-pipeline.yml
- ID: revisions-retrieval
created: {$fromNow: ''}
deadline: {$fromNow: '2 days'}
expires: {$fromNow: '1 month'}
provisionerId: proj-bugbug
workerType: batch
dependencies:
- commit-retrieval
payload:
env:
TC_SECRET_ID: project/bugbug/production
maxRunTime: 86400
image: mozilla/bugbug-base:${version}
command:
- "bugbug-data-revisions"
artifacts:
public/revisions.json.zst:
path: /data/revisions.json.zst
type: file
public/revisions.json.version:
path: /data/revisions.json.version
type: file
features:
taskclusterProxy:
true
scopes:
- "secrets:get:project/bugbug/production"
routes:
- notify.email.release-mgmt-analysis@mozilla.com.on-failed
- notify.irc-channel.#bugbug.on-failed
- index.project.bugbug.data_revisions.${version}
- index.project.bugbug.data_revisions.latest
metadata:
name: bugbug revisions retrieval
description: bugbug revisions retrieval
owner: release-mgmt-analysis@mozilla.com
source: ${repository}/raw/master/data-pipeline.yml
- ID: test-label-scheduling-history-push_data-retrieval
created: {$fromNow: ''}
deadline: {$fromNow: '3 days'}
@ -1348,6 +1388,7 @@ tasks:
expires: {$fromNow: '1 month'}
dependencies:
- past-bugs-by-unit
- revisions-retrieval
scopes:
- hooks:modify-hook:project-bugbug/bugbug-landings-risk-report
- assume:hook-id:project-bugbug/bugbug-landings-risk-report

View file

@ -67,6 +67,9 @@ class LandingsRiskReportGenerator(object):
rev_start="children({})".format(commit["node"]),
)
logger.info("Downloading revisions database...")
assert db.download(phabricator.REVISIONS_DB)
logger.info("Downloading bugs database...")
assert db.download(bugzilla.BUGS_DB)
@ -154,10 +157,18 @@ class LandingsRiskReportGenerator(object):
}
logger.info("Retrieve Phabricator revisions linked to commits...")
revisions = list(
revision_ids = set(
filter(None, (repository.get_revision_id(commit) for commit in commits))
)
revision_map = phabricator.get(revisions)
logger.info("Download revisions of interest...")
phabricator.download_revisions(revision_ids)
revision_map = {
revision["id"]: revision
for revision in phabricator.get_revisions()
if revision["id"] in revision_ids
}
if meta_bugs is not None:
blocker_to_meta = collections.defaultdict(set)

View file

@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
import argparse
from datetime import datetime
from logging import getLogger
from typing import Optional
import dateutil.parser
from dateutil.relativedelta import relativedelta
from bugbug import db, phabricator, repository
from bugbug.utils import get_secret, zstd_compress
logger = getLogger(__name__)
class Retriever(object):
    """Build/update the Phabricator revisions DB from recently landed commits."""

    def retrieve_revisions(self, limit: Optional[int] = None) -> None:
        """Download revisions linked to commits pushed in the last year.

        Args:
            limit: when given, restrict the download to the last `limit`
                revision IDs (used mainly for integration tests).
        """
        phabricator.set_api_key(
            get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
        )

        # Best effort: reuse a previously built revisions DB when available
        # (no assert — on the very first run there is nothing to download).
        db.download(phabricator.REVISIONS_DB)

        # The commits DB is required to map recent commits to revision IDs.
        assert db.download(repository.COMMITS_DB)

        # Only consider commits pushed during the last year.
        cutoff = datetime.now() - relativedelta(years=1)
        revision_ids = [
            rev_id
            for rev_id in (
                repository.get_revision_id(commit)
                for commit in repository.get_commits()
                if dateutil.parser.parse(commit["pushdate"]) >= cutoff
            )
            if rev_id
        ]

        if limit is not None:
            revision_ids = revision_ids[-limit:]

        phabricator.download_revisions(revision_ids)
        zstd_compress(phabricator.REVISIONS_DB)
def main() -> None:
    """Entry point for the `bugbug-data-revisions` console script."""
    parser = argparse.ArgumentParser(description="Retrieve revisions from Phabricator")
    parser.add_argument(
        "--limit",
        type=int,
        help="Only download the N oldest revisions, used mainly for integration tests",
    )
    # Parsing also takes care of printing usage when `--help` is passed.
    args = parser.parse_args()

    Retriever().retrieve_revisions(args.limit)
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()

View file

@ -38,6 +38,9 @@ class TestingPolicyStatsGenerator(object):
rev_start="children({})".format(commit["node"]),
)
logger.info("Downloading revisions database...")
assert db.download(phabricator.REVISIONS_DB)
logger.info("Downloading bugs database...")
assert db.download(bugzilla.BUGS_DB)
@ -63,10 +66,18 @@ class TestingPolicyStatsGenerator(object):
commits = self.get_landed_since(days_start, days_end)
logger.info("Retrieve Phabricator revisions linked to commits...")
revision_ids = list(
revision_ids = set(
filter(None, (repository.get_revision_id(commit) for commit in commits))
)
revision_map = phabricator.get(revision_ids)
logger.info("Download revisions of interest...")
phabricator.download_revisions(revision_ids)
revision_map = {
revision["id"]: revision
for revision in phabricator.get_revisions()
if revision["id"] in revision_ids
}
logger.info("Download bugs of interest...")
bugzilla.download_bugs(

View file

@ -46,6 +46,7 @@ setup(
"bugbug-data-commits = scripts.commit_retriever:main",
"bugbug-data-bugzilla = scripts.bug_retriever:main",
"bugbug-data-test-scheduling-history = scripts.test_scheduling_history_retriever:main",
"bugbug-data-revisions = scripts.revisions_retriever:main",
"bugbug-train = scripts.trainer:main",
"bugbug-train-similarity = scripts.similarity_trainer:main",
"bugbug-check = scripts.check:main",