зеркало из https://github.com/mozilla/bugbug.git
Store regressor finder results in bugbug DBs and make it run only on commits which haven't been analyzed yet
This commit is contained in:
Родитель
d54472f027
Коммит
fbaef0661d
|
@ -61,11 +61,11 @@ tasks:
|
|||
public/commits_to_ignore.csv:
|
||||
path: /commits_to_ignore.csv
|
||||
type: file
|
||||
public/fix_commits.csv:
|
||||
path: /fix_commits.csv
|
||||
public/bug_fixing_commits.json.zst:
|
||||
path: /bug_fixing_commits.json.zst
|
||||
type: file
|
||||
public/bug_introducing_commits.csv:
|
||||
path: /bug_introducing_commits.csv
|
||||
public/bug_introducing_commits.json.zst:
|
||||
path: /bug_introducing_commits.json.zst
|
||||
type: file
|
||||
cache:
|
||||
bugbug-mercurial-repository: /cache
|
||||
|
@ -77,6 +77,7 @@ tasks:
|
|||
routes:
  # Failure notifications: e-mail and IRC.
  - notify.email.release-mgmt-analysis@mozilla.com.on-failed
  - notify.irc-channel.#bugbug.on-failed
  # Index route under which this task's artifacts are published; the
  # regressor finder's DB download URLs point at this ".latest" index path.
  - index.project.relman.bugbug_annotate.regressor_finder.latest
|
||||
metadata:
|
||||
name: bugbug regressor finder
|
||||
description: bugbug regressor finder
|
||||
|
|
|
@ -33,11 +33,34 @@ logger = getLogger(__name__)
|
|||
MAX_MODIFICATION_NUMBER = 50
|
||||
# TODO: Set to 2 years and 6 months. If it takes too long, make the task work incrementally like microannotate-generate.
|
||||
RELATIVE_START_DATE = relativedelta(days=49)
|
||||
# Only needed because mercurial<->git mapping could be behind.
|
||||
RELATIVE_END_DATE = relativedelta(days=3)
|
||||
|
||||
BUG_FIXING_COMMITS_DB = "data/bug_fixing_commits.json"
|
||||
db.register(
|
||||
BUG_FIXING_COMMITS_DB,
|
||||
"https://index.taskcluster.net/v1/task/project.relman.bugbug_annotate.regressor_finder.latest/artifacts/public/bug_fixing_commits.json.zst",
|
||||
1,
|
||||
)
|
||||
|
||||
BUG_INTRODUCING_COMMITS_DB = "data/bug_introducing_commits.json"
|
||||
db.register(
|
||||
BUG_INTRODUCING_COMMITS_DB,
|
||||
"https://index.taskcluster.net/v1/task/project.relman.bugbug_annotate.regressor_finder.latest/artifacts/public/bug_introducing_commits.json.zst",
|
||||
1,
|
||||
)
|
||||
|
||||
|
||||
BASE_URL = "https://index.taskcluster.net/v1/task/project.relman.bugbug.train_{model_name}.latest/artifacts/public/{model_name}model.zst"
|
||||
|
||||
|
||||
def compress_file(path):
    """Write a zstandard-compressed copy of the file at ``path``.

    The compressed data is written next to the source as ``<path>.zst``;
    the source file itself is left untouched.
    """
    compressor = zstandard.ZstdCompressor()
    # Stream from the source into the target rather than reading it whole.
    with open(path, "rb") as source, open(f"{path}.zst", "wb") as target:
        compressor.copy_stream(source, target)
|
||||
|
||||
|
||||
def download_model(model_name):
|
||||
if not os.path.exists(f"{model_name}model"):
|
||||
url = BASE_URL.format(model_name=model_name)
|
||||
|
@ -111,7 +134,7 @@ def get_commits_to_ignore(repo_dir):
|
|||
return commits_to_ignore
|
||||
|
||||
|
||||
def find_fix_commits():
|
||||
def find_bug_fixing_commits():
|
||||
logger.info("Downloading commits database...")
|
||||
db.download_version(repository.COMMITS_DB)
|
||||
if db.is_old_version(repository.COMMITS_DB) or not os.path.exists(
|
||||
|
@ -124,6 +147,21 @@ def find_fix_commits():
|
|||
if db.is_old_version(bugzilla.BUGS_DB) or not os.path.exists(bugzilla.BUGS_DB):
|
||||
db.download(bugzilla.BUGS_DB, force=True)
|
||||
|
||||
logger.info("Download previous classifications...")
|
||||
db.download_version(BUG_FIXING_COMMITS_DB)
|
||||
if db.is_old_version(BUG_FIXING_COMMITS_DB) or not os.path.exists(
|
||||
BUG_FIXING_COMMITS_DB
|
||||
):
|
||||
db.download(BUG_FIXING_COMMITS_DB, force=True)
|
||||
|
||||
logger.info("Get previously classified commits...")
|
||||
prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
|
||||
prev_bug_fixing_commits_nodes = set(
|
||||
bug_fixing_commit["mercurial_rev"]
|
||||
for bug_fixing_commit in prev_bug_fixing_commits
|
||||
)
|
||||
logger.info(f"Already classified {len(prev_bug_fixing_commits)} commits...")
|
||||
|
||||
# TODO: Switch to the pure Defect model, as it's better in this case.
|
||||
logger.info("Downloading defect/enhancement/task model...")
|
||||
download_model("defectenhancementtask")
|
||||
|
@ -134,15 +172,24 @@ def find_fix_commits():
|
|||
regression_model = RegressionModel.load("regressionmodel")
|
||||
|
||||
start_date = datetime.now() - RELATIVE_START_DATE
|
||||
logger.info(f"Gathering bug IDs associated to commits (since {start_date})...")
|
||||
end_date = datetime.now() - RELATIVE_END_DATE
|
||||
logger.info(
|
||||
f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
|
||||
)
|
||||
commit_map = defaultdict(list)
|
||||
for commit in repository.get_commits():
|
||||
if dateutil.parser.parse(commit["pushdate"]) < start_date:
|
||||
if commit["node"] in prev_bug_fixing_commits_nodes:
|
||||
continue
|
||||
|
||||
commit_date = dateutil.parser.parse(commit["pushdate"])
|
||||
if commit_date < start_date or commit_date > end_date:
|
||||
continue
|
||||
|
||||
commit_map[commit["bug_id"]].append(commit)
|
||||
|
||||
logger.info(f"{len(commit_map)} commits found")
|
||||
logger.info(
|
||||
f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
|
||||
)
|
||||
assert len(commit_map) > 0
|
||||
|
||||
def get_relevant_bugs():
|
||||
|
@ -156,11 +203,11 @@ def find_fix_commits():
|
|||
known_defect_labels = defect_model.get_labels()
|
||||
known_regression_labels = regression_model.get_labels()
|
||||
|
||||
fix_commits = []
|
||||
bug_fixing_commits = []
|
||||
|
||||
def append_fix_commits(bug_id, type_):
|
||||
def append_bug_fixing_commits(bug_id, type_):
|
||||
for commit in commit_map[bug_id]:
|
||||
fix_commits.append(
|
||||
bug_fixing_commits.append(
|
||||
{
|
||||
"mercurial_rev": commit["node"],
|
||||
"git_rev": vcs_map.mercurial_to_git(commit["node"]),
|
||||
|
@ -178,26 +225,33 @@ def find_fix_commits():
|
|||
bug["id"] in known_regression_labels
|
||||
and known_regression_labels[bug["id"]] == 1
|
||||
):
|
||||
append_fix_commits(bug["id"], "r")
|
||||
append_bug_fixing_commits(bug["id"], "r")
|
||||
continue
|
||||
|
||||
if bug["id"] in known_defect_labels:
|
||||
if known_defect_labels[bug["id"]] == "defect":
|
||||
append_fix_commits(bug["id"], "d")
|
||||
append_bug_fixing_commits(bug["id"], "d")
|
||||
else:
|
||||
append_bug_fixing_commits(bug["id"], "e")
|
||||
continue
|
||||
|
||||
if defect_model.classify(bug)[0] == "defect":
|
||||
if regression_model.classify(bug)[0] == 1:
|
||||
append_fix_commits(bug["id"], "r")
|
||||
append_bug_fixing_commits(bug["id"], "r")
|
||||
else:
|
||||
append_fix_commits(bug["id"], "d")
|
||||
append_bug_fixing_commits(bug["id"], "d")
|
||||
else:
|
||||
append_bug_fixing_commits(bug["id"], "e")
|
||||
|
||||
with open("fix_commits.csv", "w") as f:
|
||||
writer = csv.DictWriter(f, fieldnames=["mercurial_rev", "git_rev", "type"])
|
||||
writer.writeheader()
|
||||
writer.writerows(fix_commits)
|
||||
db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
|
||||
compress_file(BUG_FIXING_COMMITS_DB)
|
||||
|
||||
return fix_commits
|
||||
bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits
|
||||
return [
|
||||
bug_fixing_commit
|
||||
for bug_fixing_commit in bug_fixing_commits
|
||||
if bug_fixing_commit["type"] in ["r", "d"]
|
||||
]
|
||||
|
||||
|
||||
def find_bug_introducing_commits(cache_dir, git_repo_dir):
|
||||
|
@ -212,7 +266,21 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir):
|
|||
logger.info(f"Cloning git repository to {git_repo_dir}...")
|
||||
clone_gecko_dev(git_repo_dir)
|
||||
|
||||
fix_commits = find_fix_commits()
|
||||
logger.info("Download previously found bug-introducing commits...")
|
||||
db.download_version(BUG_INTRODUCING_COMMITS_DB)
|
||||
if db.is_old_version(BUG_INTRODUCING_COMMITS_DB) or not os.path.exists(
|
||||
BUG_INTRODUCING_COMMITS_DB
|
||||
):
|
||||
db.download(BUG_INTRODUCING_COMMITS_DB, force=True)
|
||||
|
||||
logger.info("Get previously found bug-introducing commits...")
|
||||
prev_bug_introducing_commits = list(db.read(BUG_INTRODUCING_COMMITS_DB))
|
||||
prev_bug_introducing_commits_nodes = set(
|
||||
bug_introducing_commit["bug_fixing_mercurial_rev"]
|
||||
for bug_introducing_commit in prev_bug_introducing_commits
|
||||
)
|
||||
logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")
|
||||
|
||||
commits_to_ignore = get_commits_to_ignore(mercurial_repo_dir)
|
||||
|
||||
git_hashes_to_ignore = set(commit["git_rev"] for commit in commits_to_ignore)
|
||||
|
@ -220,26 +288,38 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir):
|
|||
with open("git_hashes_to_ignore", "w") as f:
|
||||
f.writelines(f"{git_hash}\n" for git_hash in git_hashes_to_ignore)
|
||||
|
||||
total_fix_commits_num = len(fix_commits)
|
||||
fix_commits = [
|
||||
fix_commit
|
||||
for fix_commit in fix_commits
|
||||
if fix_commit["git_rev"] not in git_hashes_to_ignore
|
||||
bug_fixing_commits = find_bug_fixing_commits()
|
||||
|
||||
logger.info(f"{len(bug_fixing_commits)} commits to analyze")
|
||||
|
||||
# Skip already found bug-introducing commits.
|
||||
bug_fixing_commits = [
|
||||
bug_fixing_commit
|
||||
for bug_fixing_commit in bug_fixing_commits
|
||||
if bug_fixing_commit["mercurial_rev"] not in prev_bug_introducing_commits_nodes
|
||||
]
|
||||
|
||||
logger.info(
|
||||
f"Skipped {total_fix_commits_num - len(fix_commits)} commits as they were in the ignore list"
|
||||
f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
|
||||
)
|
||||
|
||||
logger.info(f"{len(fix_commits)} commits to analyze")
|
||||
bug_fixing_commits = [
|
||||
bug_fixing_commit
|
||||
for bug_fixing_commit in bug_fixing_commits
|
||||
if bug_fixing_commit["git_rev"] not in git_hashes_to_ignore
|
||||
]
|
||||
logger.info(
|
||||
f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
|
||||
)
|
||||
|
||||
def _init(git_repo_dir):
|
||||
global GIT_REPO
|
||||
GIT_REPO = GitRepository(git_repo_dir)
|
||||
|
||||
def find_bic(fix_commit):
|
||||
logger.info("Analyzing {}...".format(fix_commit["git_rev"]))
|
||||
def find_bic(bug_fixing_commit):
|
||||
logger.info("Analyzing {}...".format(bug_fixing_commit["git_rev"]))
|
||||
|
||||
commit = GIT_REPO.get_commit(fix_commit["git_rev"])
|
||||
commit = GIT_REPO.get_commit(bug_fixing_commit["git_rev"])
|
||||
|
||||
# Skip huge changes, we'll likely be wrong with them.
|
||||
if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
|
||||
|
@ -249,48 +329,53 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir):
|
|||
commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
|
||||
)
|
||||
logger.info(bug_introducing_modifications)
|
||||
|
||||
bug_introducing_commits = []
|
||||
for bug_introducing_hashes in bug_introducing_modifications.values():
|
||||
for bug_introducing_hash in bug_introducing_hashes:
|
||||
bug_introducing_commit = fix_commit.copy()
|
||||
bug_introducing_commit.update(
|
||||
bug_introducing_commits.append(
|
||||
{
|
||||
"bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
|
||||
"bug_fixing_git_rev": bug_fixing_commit["git_rev"],
|
||||
"bug_introducing_mercurial_rev": vcs_map.git_to_mercurial(
|
||||
bug_introducing_hash
|
||||
),
|
||||
"bug_introducing_git_rev": bug_introducing_hash,
|
||||
}
|
||||
)
|
||||
bug_introducing_commits.append(bug_introducing_commit)
|
||||
|
||||
# Add an empty result so that this bug-fixing commit is not analyzed again.
|
||||
if len(bug_introducing_commits) == 0:
|
||||
bug_introducing_commits.append(
|
||||
{
|
||||
"bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
|
||||
"bug_fixing_git_rev": bug_fixing_commit["git_rev"],
|
||||
"bug_introducing_mercurial_rev": "",
|
||||
"bug_introducing_git_rev": "",
|
||||
}
|
||||
)
|
||||
|
||||
return bug_introducing_commits
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(
|
||||
initializer=_init, initargs=(git_repo_dir,), max_workers=os.cpu_count() + 1
|
||||
) as executor:
|
||||
results = executor.map(find_bic, fix_commits)
|
||||
results = tqdm(results, total=len(fix_commits))
|
||||
results = list(itertools.chain.from_iterable(results))
|
||||
bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
|
||||
bug_introducing_commits = tqdm(
|
||||
bug_introducing_commits, total=len(bug_fixing_commits)
|
||||
)
|
||||
bug_introducing_commits = list(
|
||||
itertools.chain.from_iterable(bug_introducing_commits)
|
||||
)
|
||||
|
||||
total_results_num = len(results)
|
||||
results = list(filter(None, results))
|
||||
total_results_num = len(bug_introducing_commits)
|
||||
bug_introducing_commits = list(filter(None, bug_introducing_commits))
|
||||
logger.info(
|
||||
f"Skipped {total_results_num - len(results)} commits as they were too big"
|
||||
f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
|
||||
)
|
||||
|
||||
with open("bug_introducing_commits.csv", "w") as f:
|
||||
writer = csv.DictWriter(
|
||||
f,
|
||||
fieldnames=[
|
||||
"mercurial_rev",
|
||||
"git_rev",
|
||||
"type",
|
||||
"bug_introducing_mercurial_rev",
|
||||
"bug_introducing_git_rev",
|
||||
],
|
||||
)
|
||||
writer.writeheader()
|
||||
writer.writerows(results)
|
||||
db.append(BUG_INTRODUCING_COMMITS_DB, bug_introducing_commits)
|
||||
compress_file(BUG_INTRODUCING_COMMITS_DB)
|
||||
|
||||
|
||||
def main():
|
||||
|
|
Загрузка…
Ссылка в новой задаче