2019-04-09 17:30:09 +03:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
from datetime import datetime
|
2019-09-13 16:18:46 +03:00
|
|
|
from logging import getLogger
|
2019-04-09 17:30:09 +03:00
|
|
|
|
2019-07-25 02:04:15 +03:00
|
|
|
import dateutil.parser
|
2019-04-09 17:30:09 +03:00
|
|
|
from dateutil.relativedelta import relativedelta
|
|
|
|
|
2019-07-25 02:04:15 +03:00
|
|
|
from bugbug import bug_snapshot, bugzilla, db, labels, repository
|
2019-08-04 16:29:06 +03:00
|
|
|
from bugbug.utils import get_secret, zstd_compress
|
2019-04-09 17:30:09 +03:00
|
|
|
|
|
|
|
# Module-level logger named after this module, per the standard logging convention.
logger = getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
class Retriever(object):
    """Retrieves and maintains the local Bugzilla bugs database.

    Gathers bug IDs from several sources (recently changed bugs, a fixed
    2.5-year timespan, manually labelled bugs, bugs linked to recent commits,
    and bugs that caused regressions), downloads them, re-downloads bugs whose
    snapshots are inconsistent, and finally compresses the resulting DB.
    """

    def retrieve_bugs(self, limit=None):
        """Download all bugs of interest into the local bugs DB.

        Args:
            limit (int | None): if truthy, cap each ID source to about `limit`
                bugs; used mainly for integration tests.
        """
        bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

        # Reuse the existing DB snapshot unless its schema version is stale.
        if not db.is_old_version(bugzilla.BUGS_DB):
            db.download(bugzilla.BUGS_DB)

        # Get IDs of bugs changed since last run.
        changed_ids = self._get_changed_ids()

        # Get IDs of bugs between (two years and six months ago) and (six months ago).
        timespan_ids = self._get_timespan_ids(limit)

        # Get IDs of labelled bugs.
        labelled_bug_ids = self._get_labelled_bug_ids(limit)

        # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
        commit_bug_ids = self._get_commit_bug_ids(limit)

        # Get IDs of bugs which caused regressions fixed by commits (useful for the regressor model).
        regressed_by_bug_ids = self._get_regressed_by_bug_ids(commit_bug_ids, limit)

        all_ids = (
            timespan_ids + labelled_bug_ids + commit_bug_ids + regressed_by_bug_ids
        )
        all_ids_set = set(all_ids)
        # Sets give O(1) membership tests in the deletion predicate below.
        changed_ids_set = set(changed_ids)

        # We have to redownload bugs that were changed since the last download.
        # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
        bugzilla.delete_bugs(
            lambda bug: bug["id"] in changed_ids_set or bug["id"] not in all_ids_set
        )

        bugzilla.download_bugs(all_ids)

        # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
        regressed_by_bug_ids = self._get_regressed_by_bug_ids(commit_bug_ids)

        bugzilla.download_bugs(regressed_by_bug_ids)

        self._redownload_inconsistent_bugs()

        zstd_compress("data/bugs.json")

    def _get_changed_ids(self):
        """Return IDs of bugs modified since the bugs DB was last updated."""
        last_modified = db.last_modified(bugzilla.BUGS_DB)
        logger.info(
            f"Retrieving IDs of bugs modified since the last run on {last_modified}"
        )
        changed_ids = bugzilla.get_ids(
            {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
        )
        logger.info(f"Retrieved {len(changed_ids)} IDs.")
        return changed_ids

    def _get_timespan_ids(self, limit=None):
        """Return IDs of bugs filed between 2.5 years ago and 6 months ago."""
        six_months_ago = datetime.utcnow() - relativedelta(months=6)
        two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
        logger.info(
            f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
        )
        timespan_ids = bugzilla.get_ids_between(
            two_years_and_six_months_ago, six_months_ago
        )
        if limit:
            timespan_ids = timespan_ids[:limit]
        logger.info(f"Retrieved {len(timespan_ids)} IDs.")
        return timespan_ids

    def _get_labelled_bug_ids(self, limit=None):
        """Return IDs of bugs that have manual labels."""
        labelled_bug_ids = labels.get_all_bug_ids()
        if limit:
            labelled_bug_ids = labelled_bug_ids[:limit]
        logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")
        return labelled_bug_ids

    def _get_commit_bug_ids(self, limit=None):
        """Return IDs of bugs linked to commits pushed in the last 2.5 years.

        Downloads the commits DB first, as it is needed to map commits to bugs.
        """
        # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
        if db.is_old_version(repository.COMMITS_DB) or not db.exists(
            repository.COMMITS_DB
        ):
            db.download(repository.COMMITS_DB, force=True)

        # NOTE(review): this uses naive local time while the timespan query uses
        # utcnow(); presumably close enough for a 2.5-year window — confirm.
        start_date = datetime.now() - relativedelta(years=2, months=6)
        commit_bug_ids = [
            commit["bug_id"]
            for commit in repository.get_commits()
            if commit["bug_id"]
            and dateutil.parser.parse(commit["pushdate"]) >= start_date
        ]
        if limit:
            # Keep the tail, i.e. the most recently iterated commits.
            commit_bug_ids = commit_bug_ids[-limit:]
        logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")
        return commit_bug_ids

    def _get_regressed_by_bug_ids(self, commit_bug_ids, limit=None):
        """Return IDs of bugs that caused regressions fixed by the given bugs."""
        # Set avoids an O(n) list scan for every bug in the DB.
        commit_bug_id_set = set(commit_bug_ids)
        regressed_by_bug_ids = [
            regressor_id
            for bug in bugzilla.get_bugs()
            if bug["id"] in commit_bug_id_set
            for regressor_id in bug["regressed_by"]
        ]
        if limit:
            regressed_by_bug_ids = regressed_by_bug_ids[-limit:]
        logger.info(
            f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
        )
        return regressed_by_bug_ids

    def _redownload_inconsistent_bugs(self):
        """Try to re-download inconsistent bugs, up to three times."""
        inconsistent_bugs = bugzilla.get_bugs()
        for i in range(3):
            # We look for inconsistencies in all bugs first, then, on following passes,
            # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
            inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
            inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

            if len(inconsistent_bug_ids) == 0:
                break

            logger.info(
                f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
            )
            bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
            bugzilla.download_bugs(inconsistent_bug_ids)
|
2019-04-09 17:30:09 +03:00
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: parse arguments and run the bug retrieval."""
    parser = argparse.ArgumentParser(
        description="Retrieve and extract the information from Bugzilla instance"
    )
    parser.add_argument(
        "--limit",
        type=int,
        help="Only download the N oldest bugs, used mainly for integration tests",
    )

    # Parsing also surfaces the auto-generated help when `--help` is passed.
    args = parser.parse_args()

    Retriever().retrieve_bugs(args.limit)
|
2019-09-09 23:55:40 +03:00
|
|
|
|
|
|
|
|
|
|
|
# Run the retrieval only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|