diff --git a/infra/annotate-pipeline.yml b/infra/annotate-pipeline.yml
index 7a5247ef..f0b8d864 100644
--- a/infra/annotate-pipeline.yml
+++ b/infra/annotate-pipeline.yml
@@ -61,11 +61,11 @@ tasks:
         public/commits_to_ignore.csv:
           path: /commits_to_ignore.csv
           type: file
-        public/fix_commits.csv:
-          path: /fix_commits.csv
+        public/bug_fixing_commits.json.zst:
+          path: /bug_fixing_commits.json.zst
           type: file
-        public/bug_introducing_commits.csv:
-          path: /bug_introducing_commits.csv
+        public/bug_introducing_commits.json.zst:
+          path: /bug_introducing_commits.json.zst
           type: file
       cache:
         bugbug-mercurial-repository: /cache
@@ -77,6 +77,7 @@ tasks:
     routes:
      - notify.email.release-mgmt-analysis@mozilla.com.on-failed
      - notify.irc-channel.#bugbug.on-failed
+     - index.project.relman.bugbug_annotate.regressor_finder.latest
    metadata:
      name: bugbug regressor finder
      description: bugbug regressor finder
diff --git a/scripts/regressor_finder.py b/scripts/regressor_finder.py
index 862f188f..16634e9e 100644
--- a/scripts/regressor_finder.py
+++ b/scripts/regressor_finder.py
@@ -33,11 +33,34 @@ logger = getLogger(__name__)
 MAX_MODIFICATION_NUMBER = 50
 
 # TODO: Set to 2 years and 6 months. If it takes too long, make the task work incrementally like microannotate-generate.
 RELATIVE_START_DATE = relativedelta(days=49)
+# Only needed because the mercurial<->git mapping could lag behind.
+RELATIVE_END_DATE = relativedelta(days=3)
+
+BUG_FIXING_COMMITS_DB = "data/bug_fixing_commits.json"
+db.register(
+    BUG_FIXING_COMMITS_DB,
+    "https://index.taskcluster.net/v1/task/project.relman.bugbug_annotate.regressor_finder.latest/artifacts/public/bug_fixing_commits.json.zst",
+    1,
+)
+
+BUG_INTRODUCING_COMMITS_DB = "data/bug_introducing_commits.json"
+db.register(
+    BUG_INTRODUCING_COMMITS_DB,
+    "https://index.taskcluster.net/v1/task/project.relman.bugbug_annotate.regressor_finder.latest/artifacts/public/bug_introducing_commits.json.zst",
+    1,
+)
 
 BASE_URL = "https://index.taskcluster.net/v1/task/project.relman.bugbug.train_{model_name}.latest/artifacts/public/{model_name}model.zst"
 
 
+def compress_file(path):
+    cctx = zstandard.ZstdCompressor()
+    with open(path, "rb") as input_f:
+        with open(f"{path}.zst", "wb") as output_f:
+            cctx.copy_stream(input_f, output_f)
+
+
 def download_model(model_name):
     if not os.path.exists(f"{model_name}model"):
         url = BASE_URL.format(model_name=model_name)
@@ -111,7 +134,7 @@ def get_commits_to_ignore(repo_dir):
     return commits_to_ignore
 
 
-def find_fix_commits():
+def find_bug_fixing_commits():
     logger.info("Downloading commits database...")
     db.download_version(repository.COMMITS_DB)
     if db.is_old_version(repository.COMMITS_DB) or not os.path.exists(
@@ -124,6 +147,21 @@ def find_fix_commits():
     if db.is_old_version(bugzilla.BUGS_DB) or not os.path.exists(bugzilla.BUGS_DB):
         db.download(bugzilla.BUGS_DB, force=True)
 
+    logger.info("Downloading previous classifications...")
+    db.download_version(BUG_FIXING_COMMITS_DB)
+    if db.is_old_version(BUG_FIXING_COMMITS_DB) or not os.path.exists(
+        BUG_FIXING_COMMITS_DB
+    ):
+        db.download(BUG_FIXING_COMMITS_DB, force=True)
+
+    logger.info("Getting previously classified commits...")
+    prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
+    prev_bug_fixing_commits_nodes = set(
+        bug_fixing_commit["mercurial_rev"]
+        for bug_fixing_commit in prev_bug_fixing_commits
+    )
+    logger.info(f"Already classified {len(prev_bug_fixing_commits)} commits...")
+
     # TODO: Switch to the pure Defect model, as it's better in this case.
logger.info("Downloading defect/enhancement/task model...") download_model("defectenhancementtask") @@ -134,15 +172,24 @@ def find_fix_commits(): regression_model = RegressionModel.load("regressionmodel") start_date = datetime.now() - RELATIVE_START_DATE - logger.info(f"Gathering bug IDs associated to commits (since {start_date})...") + end_date = datetime.now() - RELATIVE_END_DATE + logger.info( + f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..." + ) commit_map = defaultdict(list) for commit in repository.get_commits(): - if dateutil.parser.parse(commit["pushdate"]) < start_date: + if commit["node"] in prev_bug_fixing_commits_nodes: + continue + + commit_date = dateutil.parser.parse(commit["pushdate"]) + if commit_date < start_date or commit_date > end_date: continue commit_map[commit["bug_id"]].append(commit) - logger.info(f"{len(commit_map)} commits found") + logger.info( + f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits" + ) assert len(commit_map) > 0 def get_relevant_bugs(): @@ -156,11 +203,11 @@ def find_fix_commits(): known_defect_labels = defect_model.get_labels() known_regression_labels = regression_model.get_labels() - fix_commits = [] + bug_fixing_commits = [] - def append_fix_commits(bug_id, type_): + def append_bug_fixing_commits(bug_id, type_): for commit in commit_map[bug_id]: - fix_commits.append( + bug_fixing_commits.append( { "mercurial_rev": commit["node"], "git_rev": vcs_map.mercurial_to_git(commit["node"]), @@ -178,26 +225,33 @@ def find_fix_commits(): bug["id"] in known_regression_labels and known_regression_labels[bug["id"]] == 1 ): - append_fix_commits(bug["id"], "r") + append_bug_fixing_commits(bug["id"], "r") continue if bug["id"] in known_defect_labels: if known_defect_labels[bug["id"]] == "defect": - append_fix_commits(bug["id"], "d") + append_bug_fixing_commits(bug["id"], "d") + else: + append_bug_fixing_commits(bug["id"], "e") continue if defect_model.classify(bug)[0] == "defect": if regression_model.classify(bug)[0] == 1: - append_fix_commits(bug["id"], "r") + append_bug_fixing_commits(bug["id"], "r") else: - append_fix_commits(bug["id"], "d") + append_bug_fixing_commits(bug["id"], "d") + else: + append_bug_fixing_commits(bug["id"], "e") - with open("fix_commits.csv", "w") as f: - writer = csv.DictWriter(f, fieldnames=["mercurial_rev", "git_rev", "type"]) - writer.writeheader() - writer.writerows(fix_commits) + db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits) + compress_file(BUG_FIXING_COMMITS_DB) - return fix_commits + bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits + return [ + bug_fixing_commit + for bug_fixing_commit in bug_fixing_commits + if bug_fixing_commit["type"] in ["r", "d"] + ] def find_bug_introducing_commits(cache_dir, git_repo_dir): @@ -212,7 +266,21 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir): logger.info(f"Cloning git repository to {git_repo_dir}...") clone_gecko_dev(git_repo_dir) - fix_commits = find_fix_commits() + logger.info("Download previously found bug-introducing commits...") + db.download_version(BUG_INTRODUCING_COMMITS_DB) + if db.is_old_version(BUG_INTRODUCING_COMMITS_DB) or not os.path.exists( + BUG_INTRODUCING_COMMITS_DB + ): + db.download(BUG_INTRODUCING_COMMITS_DB, force=True) + + logger.info("Get previously found bug-introducing commits...") + prev_bug_introducing_commits = list(db.read(BUG_INTRODUCING_COMMITS_DB)) + prev_bug_introducing_commits_nodes = set( + 
bug_introducing_commit["bug_fixing_mercurial_rev"] + for bug_introducing_commit in prev_bug_introducing_commits + ) + logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...") + commits_to_ignore = get_commits_to_ignore(mercurial_repo_dir) git_hashes_to_ignore = set(commit["git_rev"] for commit in commits_to_ignore) @@ -220,26 +288,38 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir): with open("git_hashes_to_ignore", "w") as f: f.writelines(f"{git_hash}\n" for git_hash in git_hashes_to_ignore) - total_fix_commits_num = len(fix_commits) - fix_commits = [ - fix_commit - for fix_commit in fix_commits - if fix_commit["git_rev"] not in git_hashes_to_ignore + bug_fixing_commits = find_bug_fixing_commits() + + logger.info(f"{len(bug_fixing_commits)} commits to analyze") + + # Skip already found bug-introducing commits. + bug_fixing_commits = [ + bug_fixing_commit + for bug_fixing_commit in bug_fixing_commits + if bug_fixing_commit["mercurial_rev"] not in prev_bug_introducing_commits_nodes ] + logger.info( - f"Skipped {total_fix_commits_num - len(fix_commits)} commits as they were in the ignore list" + f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones" ) - logger.info(f"{len(fix_commits)} commits to analyze") + bug_fixing_commits = [ + bug_fixing_commit + for bug_fixing_commit in bug_fixing_commits + if bug_fixing_commit["git_rev"] not in git_hashes_to_ignore + ] + logger.info( + f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list" + ) def _init(git_repo_dir): global GIT_REPO GIT_REPO = GitRepository(git_repo_dir) - def find_bic(fix_commit): - logger.info("Analyzing {}...".format(fix_commit["git_rev"])) + def find_bic(bug_fixing_commit): + logger.info("Analyzing {}...".format(bug_fixing_commit["git_rev"])) - commit = GIT_REPO.get_commit(fix_commit["git_rev"]) + commit = GIT_REPO.get_commit(bug_fixing_commit["git_rev"]) # Skip huge changes, we'll likely be wrong with them. if len(commit.modifications) > MAX_MODIFICATION_NUMBER: @@ -249,48 +329,53 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir): commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore") ) logger.info(bug_introducing_modifications) + bug_introducing_commits = [] for bug_introducing_hashes in bug_introducing_modifications.values(): for bug_introducing_hash in bug_introducing_hashes: - bug_introducing_commit = fix_commit.copy() - bug_introducing_commit.update( + bug_introducing_commits.append( { + "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"], + "bug_fixing_git_rev": bug_fixing_commit["git_rev"], "bug_introducing_mercurial_rev": vcs_map.git_to_mercurial( bug_introducing_hash ), "bug_introducing_git_rev": bug_introducing_hash, } ) - bug_introducing_commits.append(bug_introducing_commit) + + # Add an empty result, just so that we don't reanalyze this again. 
+        if len(bug_introducing_commits) == 0:
+            bug_introducing_commits.append(
+                {
+                    "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
+                    "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
+                    "bug_introducing_mercurial_rev": "",
+                    "bug_introducing_git_rev": "",
+                }
+            )
 
         return bug_introducing_commits
 
     with concurrent.futures.ThreadPoolExecutor(
         initializer=_init, initargs=(git_repo_dir,), max_workers=os.cpu_count() + 1
     ) as executor:
-        results = executor.map(find_bic, fix_commits)
-        results = tqdm(results, total=len(fix_commits))
-        results = list(itertools.chain.from_iterable(results))
+        bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
+        bug_introducing_commits = tqdm(
+            bug_introducing_commits, total=len(bug_fixing_commits)
+        )
+        bug_introducing_commits = list(
+            itertools.chain.from_iterable(bug_introducing_commits)
+        )
 
-    total_results_num = len(results)
-    results = list(filter(None, results))
+    total_results_num = len(bug_introducing_commits)
+    bug_introducing_commits = list(filter(None, bug_introducing_commits))
     logger.info(
-        f"Skipped {total_results_num - len(results)} commits as they were too big"
+        f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
    )
 
-    with open("bug_introducing_commits.csv", "w") as f:
-        writer = csv.DictWriter(
-            f,
-            fieldnames=[
-                "mercurial_rev",
-                "git_rev",
-                "type",
-                "bug_introducing_mercurial_rev",
-                "bug_introducing_git_rev",
-            ],
-        )
-        writer.writeheader()
-        writer.writerows(results)
+    db.append(BUG_INTRODUCING_COMMITS_DB, bug_introducing_commits)
+    compress_file(BUG_INTRODUCING_COMMITS_DB)
 
 
 def main():
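
Note for reviewers: the patch replaces the ad-hoc CSV outputs with the bugbug db helpers, and compress_file() produces the .zst artifacts exposed through the Taskcluster index route added above. Below is a minimal sketch, not part of the patch, of how a downstream consumer could fetch and read the new bug_fixing_commits artifact. It uses only the db calls and record fields that appear in this diff; the "from bugbug import db" module path is an assumption (the script's import hunk is not shown in this excerpt), and the zstandard decompression is handled by db itself per the registered .zst URL.

    # consumer_sketch.py -- hypothetical example, for illustration only.
    import os

    from bugbug import db  # assumed module path; matches the db.* calls in the patch

    BUG_FIXING_COMMITS_DB = "data/bug_fixing_commits.json"
    db.register(
        BUG_FIXING_COMMITS_DB,
        "https://index.taskcluster.net/v1/task/project.relman.bugbug_annotate.regressor_finder.latest/artifacts/public/bug_fixing_commits.json.zst",
        1,
    )

    # Same refresh dance as find_bug_fixing_commits(): re-download when the
    # remote schema version moved on or the local copy is missing.
    db.download_version(BUG_FIXING_COMMITS_DB)
    if db.is_old_version(BUG_FIXING_COMMITS_DB) or not os.path.exists(
        BUG_FIXING_COMMITS_DB
    ):
        db.download(BUG_FIXING_COMMITS_DB, force=True)

    for bug_fixing_commit in db.read(BUG_FIXING_COMMITS_DB):
        # "type" is "r" (fixes a regression), "d" (fixes a defect) or
        # "e" (enhancement); only "r" and "d" records are returned to the
        # bug-introducing-commit analysis.
        print(bug_fixing_commit["mercurial_rev"], bug_fixing_commit["type"])

This mirrors the incremental pattern the patch introduces: each run classifies only commits whose nodes are not already in the database, appends the new records with db.append(), and recompresses the file, so the published artifact always contains the full history.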