In the retrieval task, download only new or changed bugs

To support it, refactor bugzilla methods:
- adding methods to get IDs given a query and given a time period;
- renaming the internal _download method to get, since it's used externally;
- changing delete_bugs to be more flexible: it now accepts a predicate (e.g. a lambda) that chooses which bugs to delete.

Fixes #440.
This commit is contained in:
Marco Castelluccio 2019-06-09 00:32:18 +02:00
Родитель cb8aacd71a
Коммит 735fccc4a9
4 изменённых файлов: 97 добавлений и 83 удалений

Просмотреть файл

@ -22,6 +22,30 @@ db.register(
1,
)
# Bugzilla product names used as the "product" search parameter when
# collecting bug IDs for a time period (see get_ids_between).
# NOTE(review): the commented-out entries below are presumably excluded on
# purpose -- confirm before re-enabling or deleting them.
PRODUCTS = (
"Add-on SDK",
"Android Background Services",
"Core",
"Core Graveyard",
"DevTools",
"DevTools Graveyard",
"External Software Affecting Firefox",
"Firefox",
"Firefox Graveyard",
"Firefox Build System",
"Firefox for Android",
"Firefox for Android Graveyard",
# 'Firefox for iOS',
"Firefox Health Report",
# 'Focus',
# 'Hello (Loop)',
"NSPR",
"NSS",
"Toolkit",
"Toolkit Graveyard",
"WebExtensions",
)
ATTACHMENT_INCLUDE_FIELDS = [
"id",
"is_obsolete",
@ -59,7 +83,28 @@ def set_token(token):
Bugzilla.TOKEN = token
def _download(ids_or_query):
def get_ids(params):
    """Return the IDs of all bugs matching a Bugzilla search query.

    Args:
        params: Mapping of Bugzilla search parameters. If it contains
            "include_fields", the value must be "id", because only bug IDs
            are requested. The mapping is not modified.

    Returns:
        A list with the "id" of every bug handed to the bug handler.

    Raises:
        AssertionError: If ``params`` requests fields other than "id".
    """
    assert "include_fields" not in params or params["include_fields"] == "id"

    # Query with a copy so the caller's mapping is not changed as a side
    # effect of asking for IDs only.
    query = dict(params)
    query["include_fields"] = "id"

    all_ids = []

    def bughandler(bug):
        all_ids.append(bug["id"])

    # Use a chunk size of 7000 for this query only (presumably larger than
    # the default because each result is just an ID -- TODO confirm). The
    # previous value is restored even if the request fails.
    old_chunk_size = Bugzilla.BUGZILLA_CHUNK_SIZE
    Bugzilla.BUGZILLA_CHUNK_SIZE = 7000
    try:
        Bugzilla(query, bughandler=bughandler).get_data().wait()
    finally:
        Bugzilla.BUGZILLA_CHUNK_SIZE = old_chunk_size

    return all_ids
def get(ids_or_query):
new_bugs = {}
def bughandler(bug):
@ -107,31 +152,7 @@ def _download(ids_or_query):
return new_bugs
def download_bugs_between(date_from, date_to, security=False, store=True):
products = {
"Add-on SDK",
"Android Background Services",
"Core",
"Core Graveyard",
"DevTools",
"DevTools Graveyard",
"External Software Affecting Firefox",
"Firefox",
"Firefox Graveyard",
"Firefox Build System",
"Firefox for Android",
"Firefox for Android Graveyard",
# 'Firefox for iOS',
"Firefox Health Report",
# 'Focus',
# 'Hello (Loop)',
"NSPR",
"NSS",
"Toolkit",
"Toolkit Graveyard",
"WebExtensions",
}
def get_ids_between(date_from, date_to, security=False):
params = {
"f1": "creation_ts",
"o1": "greaterthan",
@ -139,47 +160,14 @@ def download_bugs_between(date_from, date_to, security=False, store=True):
"f2": "creation_ts",
"o2": "lessthan",
"v2": date_to.strftime("%Y-%m-%d"),
"product": products,
"product": PRODUCTS,
}
if not security:
params["f3"] = "bug_group"
params["o3"] = "isempty"
params["count_only"] = 1
r = requests.get("https://bugzilla.mozilla.org/rest/bug", params=params)
r.raise_for_status()
count = r.json()["bug_count"]
del params["count_only"]
params["limit"] = 100
params["order"] = "bug_id"
old_bug_ids = set(bug["id"] for bug in get_bugs())
all_bugs = []
with tqdm(total=count) as progress_bar:
for offset in range(0, count, Bugzilla.BUGZILLA_CHUNK_SIZE):
params["offset"] = offset
new_bugs = _download(params)
progress_bar.update(Bugzilla.BUGZILLA_CHUNK_SIZE)
all_bugs += [bug for bug in new_bugs.values()]
if store:
db.append(
BUGS_DB,
(
bug
for bug_id, bug in new_bugs.items()
if bug_id not in old_bug_ids
),
)
return all_bugs
return get_ids(params)
def download_bugs(bug_ids, products=None, security=False):
@ -204,7 +192,7 @@ def download_bugs(bug_ids, products=None, security=False):
)
with tqdm(total=len(new_bug_ids)) as progress_bar:
for chunk in chunks:
new_bugs = _download(chunk)
new_bugs = get(chunk)
progress_bar.update(len(chunk))
@ -225,8 +213,8 @@ def download_bugs(bug_ids, products=None, security=False):
db.append(BUGS_DB, new_bugs.values())
def delete_bugs(bug_ids):
db.delete(BUGS_DB, lambda bug: bug["id"] in set(bug_ids))
def delete_bugs(match):
    """Delete from the bugs DB the entries selected by ``match``.

    Args:
        match: Passed unchanged to ``db.delete``. The callers visible in this
            change pass a predicate that takes a bug dict and returns True
            for bugs to remove; the exact contract is defined by
            ``db.delete`` -- TODO confirm.
    """
    db.delete(BUGS_DB, match)
def count_bugs(bug_query_params):

Просмотреть файл

@ -8,7 +8,7 @@ from logging import INFO, basicConfig, getLogger
from dateutil.relativedelta import relativedelta
from bugbug import bug_snapshot, bugzilla, labels
from bugbug import bug_snapshot, bugzilla, db, labels
from bugbug.utils import get_secret
basicConfig(level=INFO)
@ -19,29 +19,55 @@ class Retriever(object):
def retrieve_bugs(self):
bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
db.download_version(bugzilla.BUGS_DB)
if not db.is_old_version(bugzilla.BUGS_DB):
db.download(bugzilla.BUGS_DB)
# Get IDs of bugs changed since last run.
last_modified = db.last_modified(bugzilla.BUGS_DB)
logger.info(
f"Retrieving IDs of bugs modified since the last run on {last_modified}"
)
changed_ids = bugzilla.get_ids(
{"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
)
logger.info(f"Retrieved {len(changed_ids)} IDs.")
# Get IDs of bugs between (two years and six months ago) and (six months ago).
six_months_ago = datetime.utcnow() - relativedelta(months=6)
two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
logger.info(
"Downloading bugs from {} to {}".format(
two_years_and_six_months_ago, six_months_ago
)
f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
)
bugzilla.download_bugs_between(two_years_and_six_months_ago, six_months_ago)
timespan_ids = bugzilla.get_ids_between(
two_years_and_six_months_ago, six_months_ago
)
logger.info(f"Retrieved {len(timespan_ids)} IDs.")
logger.info("Downloading labelled bugs")
bug_ids = labels.get_all_bug_ids()
bugzilla.download_bugs(bug_ids)
# Get IDs of labelled bugs.
labelled_bug_ids = labels.get_all_bug_ids()
logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")
all_ids = set(timespan_ids + labelled_bug_ids)
# We have to redownload bugs that were changed since the last download.
# We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
bugzilla.delete_bugs(
lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids
)
bugzilla.download_bugs(timespan_ids + labelled_bug_ids)
# Try to re-download inconsistent bugs, up to three times.
for i in range(3):
bug_ids = bug_snapshot.get_inconsistencies()
bug_ids = set(bug_snapshot.get_inconsistencies())
if len(bug_ids) == 0:
break
logger.info(
f"Re-downloading {len(bug_ids)} bugs, as they were inconsistent"
)
bugzilla.delete_bugs(bug_ids)
bugzilla.delete_bugs(lambda bug: bug["id"] in bug_ids)
bugzilla.download_bugs(bug_ids)
self.compress_file("data/bugs.json")

Просмотреть файл

@ -17,10 +17,13 @@ try:
with open("duplicate_test_bugs.json", "r") as f:
test_bugs = json.load(f)
except FileNotFoundError:
test_bugs = bugzilla.download_bugs_between(
datetime.now() - timedelta(days=21), datetime.now(), store=False
test_bug_ids = bugzilla.get_ids_between(
datetime.now() - timedelta(days=21), datetime.now()
)
test_bugs = [bug for bug in test_bugs if not bug["creator"] in REPORTERS_TO_IGNORE]
test_bugs = bugzilla.get(test_bug_ids)
test_bugs = [
bug for bug in test_bugs.values() if not bug["creator"] in REPORTERS_TO_IGNORE
]
with open("duplicate_test_bugs.json", "w") as f:
json.dump(test_bugs, f)

Просмотреть файл

@ -22,15 +22,12 @@ def fetch_untriaged(args):
# Set bugzilla token and download bugs
bugzilla.set_token(args.token)
bug_ids = bugzilla.download_bugs_between(three_months_ago, today)
bug_ids = bugzilla.get_ids_between(three_months_ago, today)
bugs = bugzilla.get(bug_ids)
# Get untriaged bugs
bugs = bugzilla.get_bugs()
untriaged_bugs = []
for bug in bugs:
if bug["id"] not in bug_ids:
continue
for bug in bugs.values():
for history in bug["history"]:
for change in history["changes"]:
if (