Support retrieving commits in steps (#536)

* Support retrieving commits in steps

* Store component mapping ETag to actually avoid downloading it again when not needed

* Store a version file alongside the DBs

* Export the commits DB version file and the experiences values as artifacts of the commit-retriever task
Marco 2019-06-03 19:29:08 +02:00 committed by GitHub
Parent 32f024b9e3
Commit d8b84ca798
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 64 additions and 63 deletions
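Taken together, the changes let the retriever resume mining from a known revision instead of re-mining a fixed two-and-a-half-year window on every run. A minimal sketch of the intended flow, assuming the caller tracks the last mined revision itself (the retrieve_commits wrapper and its last_mined_rev argument are illustrative, not part of this commit):

from bugbug import db, repository

def retrieve_commits(repo_dir, last_mined_rev):
    # If the on-disk DB predates the current schema version, re-mine
    # everything; otherwise resume from the last retrieved revision and
    # let db.append add only the new commits.
    if db.is_old_version(repository.COMMITS_DB):
        repository.download_commits(repo_dir, rev_start=0)
    else:
        repository.download_commits(repo_dir, last_mined_rev)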

View file

@@ -19,6 +19,7 @@ BUGS_DB = "data/bugs.json"
db.register(
BUGS_DB,
"https://index.taskcluster.net/v1/task/project.relman.bugbug.data_bugs.latest/artifacts/public/bugs.json.xz",
1,
)
ATTACHMENT_INCLUDE_FIELDS = [

View file

@@ -18,14 +18,25 @@ import zstandard
DATABASES = {}
def register(path, url):
DATABASES[path] = {"url": url}
def register(path, url, version):
DATABASES[path] = {"url": url, "version": version}
# Create DB parent directory.
parent_dir = os.path.dirname(path)
if not os.path.exists(parent_dir):
os.makedirs(parent_dir, exist_ok=True)
if not os.path.exists(f"{path}.version"):
with open(f"{path}.version", "w") as f:
f.write(str(version))
def is_old_version(path):
with open(f"{path}.version", "r") as f:
prev_version = int(f.read())
return DATABASES[path]["version"] > prev_version
# Download and extract databases.
def download():
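The version file gives the cached DBs an invalidation handle: register records the schema version the code expects next to the DB (writing it only on first sight), and is_old_version flags a DB left behind by older code. A hedged usage sketch (the URL is a placeholder, and reacting with a full re-download is an assumption about how callers use the check):

from bugbug import db

DB_PATH = "data/example.json"
db.register(DB_PATH, "https://example.com/example.json.xz", 2)

# A stored version older than the code's version means the cached DB
# was produced by an incompatible schema and must be fetched again.
if db.is_old_version(DB_PATH):
    db.download()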

View file

@@ -10,6 +10,7 @@ import itertools
import json
import multiprocessing
import os
import pickle
import re
import sys
from collections import deque
@@ -17,7 +18,6 @@ from datetime import datetime
import hglib
import requests
from dateutil.relativedelta import relativedelta
from tqdm import tqdm
from bugbug import db
@@ -26,6 +26,7 @@ COMMITS_DB = "data/commits.json"
db.register(
COMMITS_DB,
"https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.xz",
1,
)
path_to_component = {}
@@ -390,13 +391,6 @@ class exp_queue:
def calculate_experiences(commits, commits_to_ignore):
print(f"Analyzing experiences from {len(commits)} commits...")
last_commit = {
"author": {},
"reviewer": {},
"file": {},
"directory": {},
"component": {},
}
first_commit_time = {}
for commit in tqdm(commits):
@@ -407,33 +401,17 @@ def calculate_experiences(commits, commits_to_ignore):
time_lapse = commit.pushdate - first_commit_time[commit.author]
commit.seniority_author = time_lapse.days
if commit not in commits_to_ignore:
last_commit["author"][commit.author] = commit
for reviewer in commit.reviewers:
last_commit["reviewer"][reviewer] = commit
for path in commit.files:
last_commit["file"][path] = commit
for directory in get_directories(commit.files):
last_commit["directory"][directory] = commit
components = list(
set(
path_to_component[path]
for path in commit.files
if path in path_to_component
)
)
for component in components:
last_commit["component"][component] = commit
first_pushdate = commits[0].pushdate
# Note: In the case of files, directories, components, we can't just use the sum of previous commits, as we could end
# up overcounting them. For example, consider a commit A which modifies "dir1" and "dir2", a commit B which modifies
# "dir1" and a commit C which modifies "dir1" and "dir2". The number of previous commits touching the same directories
# for C should be 2 (A + B), and not 3 (A twice + B).
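The overcounting argument is worth making concrete: summing per-directory counts for C gives 3 (dir1 was touched by A and B, dir2 by A), while deduplicating commits across directories gives the intended 2. A toy illustration with hypothetical data:

# Commits A, B, C touch these directories, in push order.
touched = {"A": {"dir1", "dir2"}, "B": {"dir1"}, "C": {"dir1", "dir2"}}

# Union of the per-directory predecessor sets counts A only once.
prior = set()
for directory in touched["C"]:
    prior |= {c for c in ("A", "B") if directory in touched[c]}
assert len(prior) == 2  # A and B, not A twice plus B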
experiences = {}
try:
with open("data/commit_experiences.pickle", "rb") as f:
experiences = pickle.load(f)
except FileNotFoundError:
experiences = {}
def get_experience(exp_type, commit_type, item, day, default):
if exp_type not in experiences:
@@ -449,13 +427,6 @@ def calculate_experiences(commits, commits_to_ignore):
return experiences[exp_type][commit_type][item][day]
def del_experience(exp_type, items):
for item in items:
if last_commit[exp_type][item] is commit:
del last_commit[exp_type][item]
del experiences[exp_type][""][item]
del experiences[exp_type]["backout"][item]
def update_experiences(experience_type, day, items):
for commit_type in ["", "backout"]:
total_exps = [
@@ -502,8 +473,6 @@ def calculate_experiences(commits, commits_to_ignore):
total_exps[i] + 1
)
del_experience(experience_type, items)
def update_complex_experiences(experience_type, day, items):
for commit_type in ["", "backout"]:
all_commit_lists = [
@@ -577,9 +546,7 @@ def calculate_experiences(commits, commits_to_ignore):
day
] = all_commit_lists[i] + (commit.node,)
del_experience(experience_type, items)
for commit in tqdm(commits):
for i, commit in enumerate(tqdm(commits)):
day = (commit.pushdate - first_pushdate).days
assert day >= 0
@@ -645,6 +612,9 @@ def calculate_experiences(commits, commits_to_ignore):
update_complex_experiences("component", day, components)
with open("data/commit_experiences.pickle", "wb") as f:
pickle.dump(experiences, f, protocol=pickle.HIGHEST_PROTOCOL)
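This dump pairs with the pickle.load at the top of the function: together they checkpoint the experience counters so an incremental run continues from the previous totals instead of replaying all of history. The same pattern in isolation:

import pickle

STATE_PATH = "data/commit_experiences.pickle"

def load_state():
    # A missing file means this is the first (full) run.
    try:
        with open(STATE_PATH, "rb") as f:
            return pickle.load(f)
    except FileNotFoundError:
        return {}

def save_state(state):
    with open(STATE_PATH, "wb") as f:
        pickle.dump(state, f, protocol=pickle.HIGHEST_PROTOCOL)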
def get_commits_to_ignore(repo_dir, commits):
# Skip commits which are in .hg-annotate-ignore-revs or which have
@@ -689,9 +659,13 @@ def download_component_mapping():
if old_etag != new_etag:
r = requests.get(component_mapping_url)
r.raise_for_status()
with open("data/component_mapping.json", "w") as f:
f.write(r.text)
with open(f"data/component_mapping.etag", "w") as f:
f.write(new_etag)
with open("data/component_mapping.json", "r") as f:
path_to_component = json.load(f)
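Persisting the ETag next to the JSON is what makes the old_etag != new_etag check effective across runs: the current ETag can be fetched cheaply (typically via a HEAD request) and the body is downloaded only on a mismatch. A self-contained sketch of the pattern, with placeholder URL and paths, assuming the server actually sends an ETag header:

import os
import requests

def fetch_if_changed(url, json_path, etag_path):
    # Cheap HEAD request: headers only, no body.
    new_etag = requests.head(url).headers["ETag"]

    old_etag = None
    if os.path.exists(etag_path):
        with open(etag_path, "r") as f:
            old_etag = f.read()

    if old_etag != new_etag:
        r = requests.get(url)
        r.raise_for_status()
        with open(json_path, "w") as f:
            f.write(r.text)
        with open(etag_path, "w") as f:
            f.write(new_etag)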
@@ -700,10 +674,10 @@ def download_component_mapping():
}
def download_commits(repo_dir, date_from):
def download_commits(repo_dir, rev_start=0):
hg = hglib.open(repo_dir)
revs = get_revs(hg)
revs = get_revs(hg, rev_start)
assert (
len(revs) > 0
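get_revs now receives the starting revision, so only commits from rev_start up to tip are listed. The helper itself is not shown in this hunk; a plausible sketch with hglib (the revset string and field access are assumptions, not code from this commit):

import hglib

def get_revs(hg, rev_start=0):
    # List the node of every revision from rev_start up to tip.
    return [rev.node for rev in hg.log(revrange=f"{rev_start}:tip")]

hg = hglib.open("/path/to/repo")  # placeholder path
revs = get_revs(hg, rev_start=42)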
@@ -737,9 +711,6 @@ def download_commits(repo_dir, date_from):
# Exclude commits to ignore.
commits = [commit for commit in commits if commit not in commits_to_ignore]
# Exclude commits outside the range we care about.
commits = [commit for commit in commits if commit.pushdate > date_from]
commits_num = len(commits)
print(f"Mining {commits_num} commits using {processes} processes...")
@@ -752,7 +723,7 @@ def download_commits(repo_dir, date_from):
) as executor:
commits = executor.map(_transform, commits, chunksize=64)
commits = tqdm(commits, total=commits_num)
db.write(COMMITS_DB, commits)
db.append(COMMITS_DB, commits)
def get_commit_map():
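Switching from db.write to db.append above is what makes stepwise retrieval cumulative: newly mined commits extend the existing DB instead of replacing it. If the DB were a plain line-delimited JSON file, append would amount to something like this (a sketch, not bugbug's actual implementation, which also handles compression):

import json

def append(path, items):
    # One JSON object per line; appending preserves earlier entries.
    with open(path, "a") as f:
        for item in items:
            f.write(json.dumps(item) + "\n")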
@@ -775,8 +746,9 @@ def get_commit_map():
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("repository_dir", help="Path to the repository", action="store")
parser.add_argument(
"rev_start", help="Which revision to start with", action="store"
)
args = parser.parse_args()
two_years_and_six_months_ago = datetime.utcnow() - relativedelta(years=2, months=6)
download_commits(args.repository_dir, two_years_and_six_months_ago)
download_commits(args.repository_dir, args.rev_start)

View file

@@ -14,6 +14,12 @@ tasks:
public/commits.json.xz:
path: /data/commits.json.xz
type: file
public/commits.json.version:
path: /data/commits.json.version
type: file
public/commit_experiences.pickle.xz:
path: /data/commit_experiences.pickle.xz
type: file
cache:
bugbug-mercurial-repository: /cache
scopes:

View file

@@ -4,11 +4,9 @@ import argparse
import lzma
import os
import shutil
from datetime import datetime
from logging import INFO, basicConfig, getLogger
import hglib
from dateutil.relativedelta import relativedelta
from bugbug import repository
@@ -54,14 +52,12 @@ class Retriever(object):
hg.pull(update=True)
hg.close()
two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
years=2, months=6
)
repository.download_commits(self.repo_dir, two_years_and_six_months_ago)
repository.download_commits(self.repo_dir)
logger.info("commit data extracted from repository")
self.compress_file("data/commits.json")
self.compress_file("data/commit_experiences.pickle")
def compress_file(self, path):
with open(path, "rb") as input_f:
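compress_file produces the .xz artifacts the pipeline exports; its body is cut off by the hunk boundary, but given the lzma and shutil imports above, a likely shape is (an assumption beyond what this diff shows):

import lzma
import shutil

def compress_file(path):
    # Stream the file through LZMA into a sibling .xz file.
    with open(path, "rb") as input_f:
        with lzma.open(f"{path}.xz", "wb") as output_f:
            shutil.copyfileobj(input_f, output_f)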

View file

@@ -16,7 +16,7 @@ def mock_db(tmp_path):
db_name += f".{db_compression}"
db_path = tmp_path / db_name
db.register(db_path, "https://alink")
db.register(db_path, "https://alink", 1)
return db_path
return register_db
@@ -78,10 +78,22 @@ def test_unregistered_db(tmp_path):
)
def test_bad_format_compression(tmp_path, db_name):
db_path = tmp_path / db_name
db.register(db_path, "https://alink")
db.register(db_path, "https://alink", 1)
with pytest.raises(AssertionError):
db.write(db_path, range(7))
with pytest.raises(AssertionError):
db.append(db_path, range(7))
def test_register_db(tmp_path):
db_path = tmp_path / "prova.json"
db.register(db_path, "https://alink", 1)
assert not db.is_old_version(db_path)
db.register(db_path, "https://alink", 2)
assert db.is_old_version(db_path)

View file

@@ -269,7 +269,7 @@ def test_download_commits(fake_hg_repo):
hg.push(dest=bytes(remote, "ascii"))
copy_pushlog_database(remote, local)
repository.download_commits(local, datetime(1970, 1, 1))
repository.download_commits(local)
commits = list(repository.get_commits())
assert len(commits) == 0
@@ -278,21 +278,24 @@ def test_download_commits(fake_hg_repo):
hg.push(dest=bytes(remote, "ascii"))
copy_pushlog_database(remote, local)
repository.download_commits(local, datetime(1970, 1, 1))
repository.download_commits(local)
commits = list(repository.get_commits())
assert len(commits) == 1
assert commits[0]["node"] == revision2
assert commits[0]["touched_prev_total_author_sum"] == 0
add_file(hg, local, "file3", "1\n2\n3\n4\n5\n6\n7\n")
revision3 = commit(hg, "Bug 456 - Prova. r=moz")
hg.push(dest=bytes(remote, "ascii"))
copy_pushlog_database(remote, local)
repository.download_commits(local, datetime(1970, 1, 1))
repository.download_commits(local, revision3)
commits = list(repository.get_commits())
assert len(commits) == 2
assert commits[0]["node"] == revision2
assert commits[0]["touched_prev_total_author_sum"] == 0
assert commits[1]["node"] == revision3
assert commits[1]["touched_prev_total_author_sum"] == 1
def test_get_directories():