diff --git a/bugbug/bugzilla.py b/bugbug/bugzilla.py index 545594f5..3736e59c 100644 --- a/bugbug/bugzilla.py +++ b/bugbug/bugzilla.py @@ -19,6 +19,7 @@ BUGS_DB = "data/bugs.json" db.register( BUGS_DB, "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_bugs.latest/artifacts/public/bugs.json.xz", + 1, ) ATTACHMENT_INCLUDE_FIELDS = [ diff --git a/bugbug/db.py b/bugbug/db.py index c07cd68e..81865224 100644 --- a/bugbug/db.py +++ b/bugbug/db.py @@ -18,14 +18,25 @@ import zstandard DATABASES = {} -def register(path, url): - DATABASES[path] = {"url": url} +def register(path, url, version): + DATABASES[path] = {"url": url, "version": version} # Create DB parent directory. parent_dir = os.path.dirname(path) if not os.path.exists(parent_dir): os.makedirs(parent_dir, exist_ok=True) + if not os.path.exists(f"{path}.version"): + with open(f"{path}.version", "w") as f: + f.write(str(version)) + + +def is_old_version(path): + with open(f"{path}.version", "r") as f: + prev_version = int(f.read()) + + return DATABASES[path]["version"] > prev_version + # Download and extract databases. 
def download(): diff --git a/bugbug/repository.py b/bugbug/repository.py index 22a2ffcf..7efa193f 100644 --- a/bugbug/repository.py +++ b/bugbug/repository.py @@ -10,6 +10,7 @@ import itertools import json import multiprocessing import os +import pickle import re import sys from collections import deque @@ -17,7 +18,6 @@ from datetime import datetime import hglib import requests -from dateutil.relativedelta import relativedelta from tqdm import tqdm from bugbug import db @@ -26,6 +26,7 @@ COMMITS_DB = "data/commits.json" db.register( COMMITS_DB, "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/commits.json.xz", + 1, ) path_to_component = {} @@ -390,13 +391,6 @@ class exp_queue: def calculate_experiences(commits, commits_to_ignore): print(f"Analyzing experiences from {len(commits)} commits...") - last_commit = { - "author": {}, - "reviewer": {}, - "file": {}, - "directory": {}, - "component": {}, - } first_commit_time = {} for commit in tqdm(commits): @@ -407,33 +401,17 @@ def calculate_experiences(commits, commits_to_ignore): time_lapse = commit.pushdate - first_commit_time[commit.author] commit.seniority_author = time_lapse.days - if commit not in commits_to_ignore: - last_commit["author"][commit.author] = commit - for reviewer in commit.reviewers: - last_commit["reviewer"][reviewer] = commit - for path in commit.files: - last_commit["file"][path] = commit - for directory in get_directories(commit.files): - last_commit["directory"][directory] = commit - - components = list( - set( - path_to_component[path] - for path in commit.files - if path in path_to_component - ) - ) - - for component in components: - last_commit["component"][component] = commit - first_pushdate = commits[0].pushdate # Note: In the case of files, directories, components, we can't just use the sum of previous commits, as we could end # up overcounting them. 
For example, consider a commit A which modifies "dir1" and "dir2", a commit B which modifies # "dir1" and a commit C which modifies "dir1" and "dir2". The number of previous commits touching the same directories # for C should be 2 (A + B), and not 3 (A twice + B). - experiences = {} + try: + with open("data/commit_experiences.pickle", "rb") as f: + experiences = pickle.load(f) + except FileNotFoundError: + experiences = {} def get_experience(exp_type, commit_type, item, day, default): if exp_type not in experiences: @@ -449,13 +427,6 @@ def calculate_experiences(commits, commits_to_ignore): return experiences[exp_type][commit_type][item][day] - def del_experience(exp_type, items): - for item in items: - if last_commit[exp_type][item] is commit: - del last_commit[exp_type][item] - del experiences[exp_type][""][item] - del experiences[exp_type]["backout"][item] - def update_experiences(experience_type, day, items): for commit_type in ["", "backout"]: total_exps = [ @@ -502,8 +473,6 @@ def calculate_experiences(commits, commits_to_ignore): total_exps[i] + 1 ) - del_experience(experience_type, items) - def update_complex_experiences(experience_type, day, items): for commit_type in ["", "backout"]: all_commit_lists = [ @@ -577,9 +546,7 @@ def calculate_experiences(commits, commits_to_ignore): day ] = all_commit_lists[i] + (commit.node,) - del_experience(experience_type, items) - - for commit in tqdm(commits): + for i, commit in enumerate(tqdm(commits)): day = (commit.pushdate - first_pushdate).days assert day >= 0 @@ -645,6 +612,9 @@ def calculate_experiences(commits, commits_to_ignore): update_complex_experiences("component", day, components) + with open("data/commit_experiences.pickle", "wb") as f: + pickle.dump(experiences, f, protocol=pickle.HIGHEST_PROTOCOL) + def get_commits_to_ignore(repo_dir, commits): # Skip commits which are in .hg-annotate-ignore-revs or which have @@ -689,9 +659,13 @@ def download_component_mapping(): if old_etag != new_etag: r = 
requests.get(component_mapping_url) r.raise_for_status() + with open("data/component_mapping.json", "w") as f: f.write(r.text) + with open(f"data/component_mapping.etag", "w") as f: + f.write(new_etag) + with open("data/component_mapping.json", "r") as f: path_to_component = json.load(f) @@ -700,10 +674,10 @@ def download_component_mapping(): } -def download_commits(repo_dir, date_from): +def download_commits(repo_dir, rev_start=0): hg = hglib.open(repo_dir) - revs = get_revs(hg) + revs = get_revs(hg, rev_start) assert ( len(revs) > 0 @@ -737,9 +711,6 @@ def download_commits(repo_dir, date_from): # Exclude commits to ignore. commits = [commit for commit in commits if commit not in commits_to_ignore] - # Exclude commits outside the range we care about. - commits = [commit for commit in commits if commit.pushdate > date_from] - commits_num = len(commits) print(f"Mining {commits_num} commits using {processes} processes...") @@ -752,7 +723,7 @@ def download_commits(repo_dir, date_from): ) as executor: commits = executor.map(_transform, commits, chunksize=64) commits = tqdm(commits, total=commits_num) - db.write(COMMITS_DB, commits) + db.append(COMMITS_DB, commits) def get_commit_map(): @@ -775,8 +746,9 @@ def get_commit_map(): if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("repository_dir", help="Path to the repository", action="store") + parser.add_argument( + "rev_start", help="Which revision to start with", action="store" + ) args = parser.parse_args() - two_years_and_six_months_ago = datetime.utcnow() - relativedelta(years=2, months=6) - - download_commits(args.repository_dir, two_years_and_six_months_ago) + download_commits(args.repository_dir, args.rev_start) diff --git a/infra/data-pipeline.yml b/infra/data-pipeline.yml index 63a65843..b07d4190 100644 --- a/infra/data-pipeline.yml +++ b/infra/data-pipeline.yml @@ -14,6 +14,12 @@ tasks: public/commits.json.xz: path: /data/commits.json.xz type: file + public/commits.json.version: 
+ path: /data/commits.json.version + type: file + public/commit_experiences.pickle.xz: + path: /data/commit_experiences.pickle.xz + type: file cache: bugbug-mercurial-repository: /cache scopes: diff --git a/scripts/commit_retriever.py b/scripts/commit_retriever.py index abb41345..8898d0cf 100644 --- a/scripts/commit_retriever.py +++ b/scripts/commit_retriever.py @@ -4,11 +4,9 @@ import argparse import lzma import os import shutil -from datetime import datetime from logging import INFO, basicConfig, getLogger import hglib -from dateutil.relativedelta import relativedelta from bugbug import repository @@ -54,14 +52,12 @@ class Retriever(object): hg.pull(update=True) hg.close() - two_years_and_six_months_ago = datetime.utcnow() - relativedelta( - years=2, months=6 - ) - repository.download_commits(self.repo_dir, two_years_and_six_months_ago) + repository.download_commits(self.repo_dir) logger.info("commit data extracted from repository") self.compress_file("data/commits.json") + self.compress_file("data/commit_experiences.pickle") def compress_file(self, path): with open(path, "rb") as input_f: diff --git a/tests/test_db.py b/tests/test_db.py index dd7def71..e9508fa1 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -16,7 +16,7 @@ def mock_db(tmp_path): db_name += f".{db_compression}" db_path = tmp_path / db_name - db.register(db_path, "https://alink") + db.register(db_path, "https://alink", 1) return db_path return register_db @@ -78,10 +78,22 @@ def test_unregistered_db(tmp_path): ) def test_bad_format_compression(tmp_path, db_name): db_path = tmp_path / db_name - db.register(db_path, "https://alink") + db.register(db_path, "https://alink", 1) with pytest.raises(AssertionError): db.write(db_path, range(7)) with pytest.raises(AssertionError): db.append(db_path, range(7)) + + +def test_register_db(tmp_path): + db_path = tmp_path / "prova.json" + + db.register(db_path, "https://alink", 1) + + assert not db.is_old_version(db_path) + + db.register(db_path, 
"https://alink", 2) + + assert db.is_old_version(db_path) diff --git a/tests/test_repository.py b/tests/test_repository.py index 29034d72..618936dd 100644 --- a/tests/test_repository.py +++ b/tests/test_repository.py @@ -269,7 +269,7 @@ def test_download_commits(fake_hg_repo): hg.push(dest=bytes(remote, "ascii")) copy_pushlog_database(remote, local) - repository.download_commits(local, datetime(1970, 1, 1)) + repository.download_commits(local) commits = list(repository.get_commits()) assert len(commits) == 0 @@ -278,21 +278,24 @@ def test_download_commits(fake_hg_repo): hg.push(dest=bytes(remote, "ascii")) copy_pushlog_database(remote, local) - repository.download_commits(local, datetime(1970, 1, 1)) + repository.download_commits(local) commits = list(repository.get_commits()) assert len(commits) == 1 assert commits[0]["node"] == revision2 + assert commits[0]["touched_prev_total_author_sum"] == 0 add_file(hg, local, "file3", "1\n2\n3\n4\n5\n6\n7\n") revision3 = commit(hg, "Bug 456 - Prova. r=moz") hg.push(dest=bytes(remote, "ascii")) copy_pushlog_database(remote, local) - repository.download_commits(local, datetime(1970, 1, 1)) + repository.download_commits(local, revision3) commits = list(repository.get_commits()) assert len(commits) == 2 assert commits[0]["node"] == revision2 + assert commits[0]["touched_prev_total_author_sum"] == 0 assert commits[1]["node"] == revision3 + assert commits[1]["touched_prev_total_author_sum"] == 1 def test_get_directories():