Download data from Bugzilla without using the search API.

This is a temporary workaround, to be kept at least until https://bugzilla.mozilla.org/show_bug.cgi?id=1508695 is fixed.
This commit is contained in:
Marco Castelluccio 2018-12-13 23:03:46 +01:00
Родитель 2d555e2fa4
Коммит 368c7cb3d8
4 изменённых файлов: 41 добавлений и 40 удалений

Просмотреть файл

@ -6,6 +6,8 @@ install:
- pip install -r test-requirements.txt - pip install -r test-requirements.txt
script: script:
- flake8 - flake8
- python run.py --download --train --goal bug
- python run.py --goal bug
- python -m pytest tests/test_*.py - python -m pytest tests/test_*.py
- python setup.py sdist - python setup.py sdist
- pip install dist/bugbug-$(cat VERSION).tar.gz - pip install dist/bugbug-$(cat VERSION).tar.gz

Просмотреть файл

@ -3,7 +3,6 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file, # License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/. # You can obtain one at http://mozilla.org/MPL/2.0/.
import itertools
import json import json
import os import os
@ -86,7 +85,7 @@ def _download(ids_or_query):
def download_bugs_between(date_from, date_to, security=False): def download_bugs_between(date_from, date_to, security=False):
products = [ products = set([
'Add-on SDK', 'Add-on SDK',
'Android Background Services', 'Android Background Services',
'Core', 'Core',
@ -101,41 +100,22 @@ def download_bugs_between(date_from, date_to, security=False):
'NSPR', 'NSPR',
'NSS', 'NSS',
'Toolkit', 'Toolkit',
] ])
query = { r = requests.get('https://bugzilla.mozilla.org/rest/bug?include_fields=id&f1=creation_ts&o1=greaterthan&v1={}&limit=1&order=bug_id'.format(date_from.strftime('%Y-%m-%d')))
'limit': 500, first_id = r.json()['bugs'][0]['id']
'order': 'bug_id',
'product': products,
'f1': 'bug_id', 'o1': 'greaterthan', 'v1': '',
'f2': 'creation_ts', 'o2': 'greaterthan', 'v2': date_from.strftime('%Y-%m-%d'),
'f3': 'creation_ts', 'o3': 'lessthan', 'v3': date_to.strftime('%Y-%m-%d'),
'f4': 'cf_last_resolved', 'o4': 'lessthan', 'v4': date_to.strftime('%Y-%m-%d'),
}
if not security: r = requests.get('https://bugzilla.mozilla.org/rest/bug?include_fields=id&f1=creation_ts&o1=lessthan&v1={}&limit=1&order=bug_id%20desc'.format(date_to.strftime('%Y-%m-%d')))
query['f5'] = 'bug_group' last_id = r.json()['bugs'][0]['id']
query['o5'] = 'isempty'
last_id = 0 assert first_id < last_id
total_downloaded = 0
while True:
query['v1'] = last_id
bugs = _download(query)
last_id = max([last_id] + [bug for bug in bugs.keys()]) all_ids = range(first_id, last_id + 1)
total_downloaded += len(bugs) download_bugs(all_ids, security=security, products=products)
print('Downloaded {} bugs, up to ID {}'.format(total_downloaded, last_id))
db.append(BUGS_DB, bugs.values())
if len(bugs) < 500:
break
def download_bugs(bug_ids, security=False): def download_bugs(bug_ids, products=None, security=False):
old_bug_count = 0 old_bug_count = 0
old_bugs = [] old_bugs = []
new_bug_ids = set([int(bug_id) for bug_id in bug_ids]) new_bug_ids = set([int(bug_id) for bug_id in bug_ids])
@ -147,16 +127,27 @@ def download_bugs(bug_ids, security=False):
print('Loaded {} bugs.'.format(old_bug_count)) print('Loaded {} bugs.'.format(old_bug_count))
yield from old_bugs
print('To download {} bugs.'.format(len(new_bug_ids))) print('To download {} bugs.'.format(len(new_bug_ids)))
new_bugs = _download(new_bug_ids) new_bug_ids = sorted(list(new_bug_ids))
if not security: total_downloaded = 0
new_bugs = {bug_id: bug for bug_id, bug in new_bugs.items() if len(bug['groups']) == 0} chunks = (new_bug_ids[i:(i + 500)] for i in range(0, len(new_bug_ids), 500))
for chunk in chunks:
new_bugs = _download(chunk)
print('Total number of bugs: {}'.format(old_bug_count + len(new_bugs))) total_downloaded += len(new_bugs)
print('Downloaded {} bugs'.format(total_downloaded))
if not security:
new_bugs = {bug_id: bug for bug_id, bug in new_bugs.items() if len(bug['groups']) == 0}
if products is not None:
new_bugs = {bug_id: bug for bug_id, bug in new_bugs.items() if bug['product'] in products}
if len(new_bugs):
db.append(BUGS_DB, new_bugs.values()) db.append(BUGS_DB, new_bugs.values())
return itertools.chain(old_bugs, new_bugs.items()) yield from new_bugs.items()

Просмотреть файл

@ -21,10 +21,11 @@ def register(path, url):
os.makedirs(parent_dir, exist_ok=True) os.makedirs(parent_dir, exist_ok=True)
def read(path): def download():
assert path in DATABASES for path, url in DATABASES.items():
if os.path.exists(path):
continue
if not os.path.exists(path):
# Download and extract database. # Download and extract database.
xz_path = '{}.xz'.format(path) xz_path = '{}.xz'.format(path)
@ -37,8 +38,12 @@ def read(path):
with lzma.open(xz_path) as input_f: with lzma.open(xz_path) as input_f:
shutil.copyfileobj(input_f, output_f) shutil.copyfileobj(input_f, output_f)
def read(path):
assert path in DATABASES
if not os.path.exists(path): if not os.path.exists(path):
raise Exception('Database {} does not exist.'.format(path)) return ()
with open(path, 'r') as f: with open(path, 'r') as f:
for line in f: for line in f:

3
run.py
Просмотреть файл

@ -6,7 +6,9 @@
import argparse import argparse
from bugbug import bugzilla from bugbug import bugzilla
from bugbug import db
from bugbug import labels from bugbug import labels
from bugbug import repository # noqa
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
@ -17,6 +19,7 @@ if __name__ == '__main__':
args = parser.parse_args() args = parser.parse_args()
if args.download: if args.download:
db.download()
bug_ids = labels.get_all_bug_ids() bug_ids = labels.get_all_bug_ids()
bugzilla.download_bugs(bug_ids) bugzilla.download_bugs(bug_ids)