Mirror of https://github.com/mozilla/bugbug.git
Support retrieving commits in steps (#536)
* Support retrieving commits in steps
* Store component mapping ETag to actually avoid downloading it again when not needed
* Store a version file alongside the DBs
* Export the commits DB version file and the experiences values as artifacts of the commit-retriever task
This commit is contained in:
Parent: 32f024b9e3
Commit: d8b84ca798
bugbug/bugzilla.py

@@ -19,6 +19,7 @@ BUGS_DB = "data/bugs.json"
 db.register(
     BUGS_DB,
     "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_bugs.latest/artifacts/public/bugs.json.xz",
+    1,
 )
 
 ATTACHMENT_INCLUDE_FIELDS = [
bugbug/db.py (15 lines changed)
@@ -18,14 +18,25 @@ import zstandard
 DATABASES = {}
 
 
-def register(path, url):
-    DATABASES[path] = {"url": url}
+def register(path, url, version):
+    DATABASES[path] = {"url": url, "version": version}
 
     # Create DB parent directory.
     parent_dir = os.path.dirname(path)
     if not os.path.exists(parent_dir):
         os.makedirs(parent_dir, exist_ok=True)
 
+    if not os.path.exists(f"{path}.version"):
+        with open(f"{path}.version", "w") as f:
+            f.write(str(version))
+
+
+def is_old_version(path):
+    with open(f"{path}.version", "r") as f:
+        prev_version = int(f.read())
+
+    return DATABASES[path]["version"] > prev_version
+
 
 # Download and extract databases.
 def download():
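Taken together, register() now records the schema version a DB was built with, and is_old_version() flags a stale local copy. A minimal usage sketch, mirroring the new test_register_db test further down (the path and URL here are made up for illustration):

from bugbug import db

EXAMPLE_DB = "data/example.json"  # hypothetical path, for illustration only

# The first registration writes data/example.json.version containing "1"
# (the version file is only created if it doesn't exist yet).
db.register(EXAMPLE_DB, "https://example.com/example.json.xz", 1)
assert not db.is_old_version(EXAMPLE_DB)

# After a schema bump the code registers version 2, but the version file
# on disk still says "1", so is_old_version() reports the local copy as
# stale and it can be re-downloaded from scratch.
db.register(EXAMPLE_DB, "https://example.com/example.json.xz", 2)
assert db.is_old_version(EXAMPLE_DB)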
bugbug/repository.py

@@ -10,6 +10,7 @@ import itertools
 import json
 import multiprocessing
 import os
+import pickle
 import re
 import sys
 from collections import deque

@@ -17,7 +18,6 @@ from datetime import datetime
 
 import hglib
 import requests
-from dateutil.relativedelta import relativedelta
 from tqdm import tqdm
 
 from bugbug import db
@@ -26,6 +26,7 @@ COMMITS_DB = "data/commits.json"
 db.register(
     COMMITS_DB,
     "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.xz",
+    1,
 )
 
 path_to_component = {}
@@ -390,13 +391,6 @@ class exp_queue:
 def calculate_experiences(commits, commits_to_ignore):
     print(f"Analyzing experiences from {len(commits)} commits...")
 
-    last_commit = {
-        "author": {},
-        "reviewer": {},
-        "file": {},
-        "directory": {},
-        "component": {},
-    }
     first_commit_time = {}
 
     for commit in tqdm(commits):
@@ -407,33 +401,17 @@ def calculate_experiences(commits, commits_to_ignore):
             time_lapse = commit.pushdate - first_commit_time[commit.author]
             commit.seniority_author = time_lapse.days
 
-        if commit not in commits_to_ignore:
-            last_commit["author"][commit.author] = commit
-            for reviewer in commit.reviewers:
-                last_commit["reviewer"][reviewer] = commit
-            for path in commit.files:
-                last_commit["file"][path] = commit
-            for directory in get_directories(commit.files):
-                last_commit["directory"][directory] = commit
-
-            components = list(
-                set(
-                    path_to_component[path]
-                    for path in commit.files
-                    if path in path_to_component
-                )
-            )
-
-            for component in components:
-                last_commit["component"][component] = commit
-
     first_pushdate = commits[0].pushdate
 
     # Note: In the case of files, directories, components, we can't just use the sum of previous commits, as we could end
     # up overcounting them. For example, consider a commit A which modifies "dir1" and "dir2", a commit B which modifies
     # "dir1" and a commit C which modifies "dir1" and "dir2". The number of previous commits touching the same directories
    # for C should be 2 (A + B), and not 3 (A twice + B).
-    experiences = {}
+    try:
+        with open("data/commit_experiences.pickle", "rb") as f:
+            experiences = pickle.load(f)
+    except FileNotFoundError:
+        experiences = {}
 
     def get_experience(exp_type, commit_type, item, day, default):
         if exp_type not in experiences:
@@ -449,13 +427,6 @@ def calculate_experiences(commits, commits_to_ignore):
 
         return experiences[exp_type][commit_type][item][day]
 
-    def del_experience(exp_type, items):
-        for item in items:
-            if last_commit[exp_type][item] is commit:
-                del last_commit[exp_type][item]
-                del experiences[exp_type][""][item]
-                del experiences[exp_type]["backout"][item]
-
     def update_experiences(experience_type, day, items):
         for commit_type in ["", "backout"]:
             total_exps = [
@@ -502,8 +473,6 @@ def calculate_experiences(commits, commits_to_ignore):
                     total_exps[i] + 1
                 )
 
-        del_experience(experience_type, items)
-
     def update_complex_experiences(experience_type, day, items):
         for commit_type in ["", "backout"]:
             all_commit_lists = [
@@ -577,9 +546,7 @@ def calculate_experiences(commits, commits_to_ignore):
                         day
                     ] = all_commit_lists[i] + (commit.node,)
 
-        del_experience(experience_type, items)
-
-    for commit in tqdm(commits):
+    for i, commit in enumerate(tqdm(commits)):
         day = (commit.pushdate - first_pushdate).days
         assert day >= 0
@@ -645,6 +612,9 @@ def calculate_experiences(commits, commits_to_ignore):
 
         update_complex_experiences("component", day, components)
 
+    with open("data/commit_experiences.pickle", "wb") as f:
+        pickle.dump(experiences, f, protocol=pickle.HIGHEST_PROTOCOL)
+
 
 def get_commits_to_ignore(repo_dir, commits):
     # Skip commits which are in .hg-annotate-ignore-revs or which have
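Loading the pickle at the top of calculate_experiences() and dumping it at the end is what makes step-wise mining possible: a later run resumes from the state the previous run left behind instead of recomputing experiences over the whole history. This is presumably also why the del_experience() memory optimization is removed: an entry can no longer be dropped when its seemingly last commit is processed, since a future incremental run may extend the history. The round-trip pattern in isolation (a sketch; the update logic in between is elided):

import pickle

try:
    # State left behind by the previous incremental run, if any.
    with open("data/commit_experiences.pickle", "rb") as f:
        experiences = pickle.load(f)
except FileNotFoundError:
    experiences = {}  # first run: start from scratch

# ... update experiences with the newly mined commits only ...

with open("data/commit_experiences.pickle", "wb") as f:
    pickle.dump(experiences, f, protocol=pickle.HIGHEST_PROTOCOL)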
@@ -689,9 +659,13 @@ def download_component_mapping():
     if old_etag != new_etag:
         r = requests.get(component_mapping_url)
+        r.raise_for_status()
 
         with open("data/component_mapping.json", "w") as f:
             f.write(r.text)
 
+        with open(f"data/component_mapping.etag", "w") as f:
+            f.write(new_etag)
+
     with open("data/component_mapping.json", "r") as f:
         path_to_component = json.load(f)
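The key fix is the second write: the freshly observed ETag is now persisted next to the mapping, so the old_etag != new_etag comparison on the next run can actually skip an unchanged download. The same conditional-download pattern as a self-contained sketch (the helper name and its arguments are illustrative, not bugbug API):

import os

import requests


def fetch_if_changed(url, dest):
    etag_path = f"{dest}.etag"
    old_etag = None
    if os.path.exists(etag_path):
        with open(etag_path, "r") as f:
            old_etag = f.read()

    # A HEAD request is enough to read the current ETag without
    # downloading the body.
    new_etag = requests.head(url).headers.get("ETag")

    if old_etag != new_etag:
        r = requests.get(url)
        r.raise_for_status()

        with open(dest, "w") as f:
            f.write(r.text)

        # Persist the ETag so the next run can skip an unchanged file.
        with open(etag_path, "w") as f:
            f.write(new_etag or "")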
@@ -700,10 +674,10 @@
     }
 
 
-def download_commits(repo_dir, date_from):
+def download_commits(repo_dir, rev_start=0):
     hg = hglib.open(repo_dir)
 
-    revs = get_revs(hg)
+    revs = get_revs(hg, rev_start)
 
     assert (
         len(revs) > 0
@@ -737,9 +711,6 @@ def download_commits(repo_dir, date_from):
     # Exclude commits to ignore.
     commits = [commit for commit in commits if commit not in commits_to_ignore]
 
-    # Exclude commits outside the range we care about.
-    commits = [commit for commit in commits if commit.pushdate > date_from]
-
     commits_num = len(commits)
 
     print(f"Mining {commits_num} commits using {processes} processes...")
@@ -752,7 +723,7 @@
     ) as executor:
         commits = executor.map(_transform, commits, chunksize=64)
         commits = tqdm(commits, total=commits_num)
-        db.write(COMMITS_DB, commits)
+        db.append(COMMITS_DB, commits)
 
 
 def get_commit_map():
@@ -775,8 +746,9 @@ def get_commit_map():
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("repository_dir", help="Path to the repository", action="store")
+    parser.add_argument(
+        "rev_start", help="Which revision to start with", action="store"
+    )
     args = parser.parse_args()
 
-    two_years_and_six_months_ago = datetime.utcnow() - relativedelta(years=2, months=6)
-
-    download_commits(args.repository_dir, two_years_and_six_months_ago)
+    download_commits(args.repository_dir, args.rev_start)
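With rev_start exposed and db.write() swapped for db.append(), the retriever can be invoked repeatedly, each run mining only the revisions from rev_start onward and appending them to the existing DB. A hypothetical two-step run (the repository path and revision value are illustrative; test_download_commits below exercises the same flow):

from bugbug import repository

# Step 1: mine the full history, starting from revision 0.
repository.download_commits("path/to/mozilla-central", 0)

# Step 2, in a later run: resume from a known revision. Only the newer
# commits are transformed and appended to COMMITS_DB, and the pickled
# experiences from step 1 are reloaded and extended.
repository.download_commits("path/to/mozilla-central", "e0b1f5d16b0f")

The script's argparse interface exposes the same two values as positional arguments (repository_dir and rev_start).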
@@ -14,6 +14,12 @@ tasks:
           public/commits.json.xz:
             path: /data/commits.json.xz
             type: file
+          public/commits.json.version:
+            path: /data/commits.json.version
+            type: file
+          public/commits_experiences.pickle.xz:
+            path: /data/commits_experiences.pickle.xz
+            type: file
         cache:
           bugbug-mercurial-repository: /cache
         scopes:
@@ -4,11 +4,9 @@ import argparse
 import lzma
 import os
 import shutil
-from datetime import datetime
 from logging import INFO, basicConfig, getLogger
 
 import hglib
-from dateutil.relativedelta import relativedelta
 
 from bugbug import repository
@@ -54,14 +52,12 @@ class Retriever(object):
         hg.pull(update=True)
         hg.close()
 
-        two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
-            years=2, months=6
-        )
-        repository.download_commits(self.repo_dir, two_years_and_six_months_ago)
+        repository.download_commits(self.repo_dir)
 
         logger.info("commit data extracted from repository")
 
         self.compress_file("data/commits.json")
+        self.compress_file("data/commit_experiences.pickle")
 
     def compress_file(self, path):
         with open(path, "rb") as input_f:
tests/test_db.py

@@ -16,7 +16,7 @@ def mock_db(tmp_path):
         db_name += f".{db_compression}"
 
         db_path = tmp_path / db_name
-        db.register(db_path, "https://alink")
+        db.register(db_path, "https://alink", 1)
         return db_path
 
     return register_db
@@ -78,10 +78,22 @@ def test_unregistered_db(tmp_path):
 )
 def test_bad_format_compression(tmp_path, db_name):
     db_path = tmp_path / db_name
-    db.register(db_path, "https://alink")
+    db.register(db_path, "https://alink", 1)
 
     with pytest.raises(AssertionError):
         db.write(db_path, range(7))
 
     with pytest.raises(AssertionError):
         db.append(db_path, range(7))
+
+
+def test_register_db(tmp_path):
+    db_path = tmp_path / "prova.json"
+
+    db.register(db_path, "https://alink", 1)
+
+    assert not db.is_old_version(db_path)
+
+    db.register(db_path, "https://alink", 2)
+
+    assert db.is_old_version(db_path)
tests/test_repository.py

@@ -269,7 +269,7 @@ def test_download_commits(fake_hg_repo):
     hg.push(dest=bytes(remote, "ascii"))
     copy_pushlog_database(remote, local)
 
-    repository.download_commits(local, datetime(1970, 1, 1))
+    repository.download_commits(local)
     commits = list(repository.get_commits())
     assert len(commits) == 0
@@ -278,21 +278,24 @@
     hg.push(dest=bytes(remote, "ascii"))
     copy_pushlog_database(remote, local)
 
-    repository.download_commits(local, datetime(1970, 1, 1))
+    repository.download_commits(local)
     commits = list(repository.get_commits())
     assert len(commits) == 1
     assert commits[0]["node"] == revision2
+    assert commits[0]["touched_prev_total_author_sum"] == 0
 
     add_file(hg, local, "file3", "1\n2\n3\n4\n5\n6\n7\n")
     revision3 = commit(hg, "Bug 456 - Prova. r=moz")
     hg.push(dest=bytes(remote, "ascii"))
     copy_pushlog_database(remote, local)
 
-    repository.download_commits(local, datetime(1970, 1, 1))
+    repository.download_commits(local, revision3)
     commits = list(repository.get_commits())
     assert len(commits) == 2
     assert commits[0]["node"] == revision2
+    assert commits[0]["touched_prev_total_author_sum"] == 0
     assert commits[1]["node"] == revision3
+    assert commits[1]["touched_prev_total_author_sum"] == 1
 
 
 def test_get_directories():