Mirror of https://github.com/mozilla/bugbug.git
Store regressor finder results in bugbug DBs and make it run only on commits which haven't been analyzed yet
Parent: d54472f027
Commit: fbaef0661d
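
The new storage scheme follows the pattern the bugbug db helpers are built around: register a database together with the URL of its latest published artifact, download the previous version, read the entries that were already produced, skip them, then append only the new results and re-compress the file so it can be uploaded as a task artifact. The sketch below illustrates that flow; it is schematic only, and EXAMPLE_DB, its URL, and the process callable are illustrative placeholders rather than part of this commit.

import os

import zstandard

from bugbug import db

# Hypothetical database, registered the same way as BUG_FIXING_COMMITS_DB in the diff below.
EXAMPLE_DB = "data/example.json"
db.register(EXAMPLE_DB, "https://example.invalid/example.json.zst", 1)


def compress_file(path):
    # Same helper as the one added in this commit: write a .zst copy next to the file.
    cctx = zstandard.ZstdCompressor()
    with open(path, "rb") as input_f:
        with open(f"{path}.zst", "wb") as output_f:
            cctx.copy_stream(input_f, output_f)


def run_incrementally(items, process):
    # Fetch the previously published results, if any.
    db.download_version(EXAMPLE_DB)
    if db.is_old_version(EXAMPLE_DB) or not os.path.exists(EXAMPLE_DB):
        db.download(EXAMPLE_DB, force=True)

    previous = list(db.read(EXAMPLE_DB))
    already_done = set(item["id"] for item in previous)

    # Analyze only what hasn't been analyzed yet, then append and re-compress.
    new_results = [process(item) for item in items if item["id"] not in already_done]
    db.append(EXAMPLE_DB, new_results)
    compress_file(EXAMPLE_DB)

    return previous + new_results

In the actual script this pattern is applied twice, once for bug_fixing_commits (keyed on mercurial_rev) and once for bug_introducing_commits (keyed on bug_fixing_mercurial_rev), as the diff shows.
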
@@ -61,11 +61,11 @@ tasks:
         public/commits_to_ignore.csv:
           path: /commits_to_ignore.csv
           type: file
-        public/fix_commits.csv:
-          path: /fix_commits.csv
+        public/bug_fixing_commits.json.zst:
+          path: /bug_fixing_commits.json.zst
           type: file
-        public/bug_introducing_commits.csv:
-          path: /bug_introducing_commits.csv
+        public/bug_introducing_commits.json.zst:
+          path: /bug_introducing_commits.json.zst
           type: file
       cache:
         bugbug-mercurial-repository: /cache
@@ -77,6 +77,7 @@ tasks:
     routes:
       - notify.email.release-mgmt-analysis@mozilla.com.on-failed"
       - notify.irc-channel.#bugbug.on-failed
+      - index.project.relman.bugbug_annotate.regressor_finder.latest
     metadata:
       name: bugbug regressor finder
       description: bugbug regressor finder

@@ -33,11 +33,34 @@ logger = getLogger(__name__)
 MAX_MODIFICATION_NUMBER = 50
 # TODO: Set to 2 years and 6 months. If it takes too long, make the task work incrementally like microannotate-generate.
 RELATIVE_START_DATE = relativedelta(days=49)
+# Only needed because mercurial<->git mapping could be behind.
+RELATIVE_END_DATE = relativedelta(days=3)
+
+BUG_FIXING_COMMITS_DB = "data/bug_fixing_commits.json"
+db.register(
+    BUG_FIXING_COMMITS_DB,
+    "https://index.taskcluster.net/v1/task/project.relman.bugbug_annotate.regressor_finder.latest/artifacts/public/bug_fixing_commits.json.zst",
+    1,
+)
+
+BUG_INTRODUCING_COMMITS_DB = "data/bug_introducing_commits.json"
+db.register(
+    BUG_INTRODUCING_COMMITS_DB,
+    "https://index.taskcluster.net/v1/task/project.relman.bugbug_annotate.regressor_finder.latest/artifacts/public/bug_introducing_commits.json.zst",
+    1,
+)
+

 BASE_URL = "https://index.taskcluster.net/v1/task/project.relman.bugbug.train_{model_name}.latest/artifacts/public/{model_name}model.zst"


+def compress_file(path):
+    cctx = zstandard.ZstdCompressor()
+    with open(path, "rb") as input_f:
+        with open(f"{path}.zst", "wb") as output_f:
+            cctx.copy_stream(input_f, output_f)
+
+
 def download_model(model_name):
     if not os.path.exists(f"{model_name}model"):
         url = BASE_URL.format(model_name=model_name)
@@ -111,7 +134,7 @@ def get_commits_to_ignore(repo_dir):
     return commits_to_ignore


-def find_fix_commits():
+def find_bug_fixing_commits():
     logger.info("Downloading commits database...")
     db.download_version(repository.COMMITS_DB)
     if db.is_old_version(repository.COMMITS_DB) or not os.path.exists(
@@ -124,6 +147,21 @@ def find_fix_commits():
     if db.is_old_version(bugzilla.BUGS_DB) or not os.path.exists(bugzilla.BUGS_DB):
         db.download(bugzilla.BUGS_DB, force=True)

+    logger.info("Download previous classifications...")
+    db.download_version(BUG_FIXING_COMMITS_DB)
+    if db.is_old_version(BUG_FIXING_COMMITS_DB) or not os.path.exists(
+        BUG_FIXING_COMMITS_DB
+    ):
+        db.download(BUG_FIXING_COMMITS_DB, force=True)
+
+    logger.info("Get previously classified commits...")
+    prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
+    prev_bug_fixing_commits_nodes = set(
+        bug_fixing_commit["mercurial_rev"]
+        for bug_fixing_commit in prev_bug_fixing_commits
+    )
+    logger.info(f"Already classified {len(prev_bug_fixing_commits)} commits...")
+
     # TODO: Switch to the pure Defect model, as it's better in this case.
     logger.info("Downloading defect/enhancement/task model...")
     download_model("defectenhancementtask")
@@ -134,15 +172,24 @@ def find_fix_commits():
     regression_model = RegressionModel.load("regressionmodel")

     start_date = datetime.now() - RELATIVE_START_DATE
-    logger.info(f"Gathering bug IDs associated to commits (since {start_date})...")
+    end_date = datetime.now() - RELATIVE_END_DATE
+    logger.info(
+        f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
+    )
     commit_map = defaultdict(list)
     for commit in repository.get_commits():
-        if dateutil.parser.parse(commit["pushdate"]) < start_date:
+        if commit["node"] in prev_bug_fixing_commits_nodes:
+            continue
+
+        commit_date = dateutil.parser.parse(commit["pushdate"])
+        if commit_date < start_date or commit_date > end_date:
             continue

         commit_map[commit["bug_id"]].append(commit)

-    logger.info(f"{len(commit_map)} commits found")
+    logger.info(
+        f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
+    )
     assert len(commit_map) > 0

     def get_relevant_bugs():
@@ -156,11 +203,11 @@ def find_fix_commits():
     known_defect_labels = defect_model.get_labels()
     known_regression_labels = regression_model.get_labels()

-    fix_commits = []
+    bug_fixing_commits = []

-    def append_fix_commits(bug_id, type_):
+    def append_bug_fixing_commits(bug_id, type_):
         for commit in commit_map[bug_id]:
-            fix_commits.append(
+            bug_fixing_commits.append(
                 {
                     "mercurial_rev": commit["node"],
                     "git_rev": vcs_map.mercurial_to_git(commit["node"]),
@@ -178,26 +225,33 @@ def find_fix_commits():
             bug["id"] in known_regression_labels
             and known_regression_labels[bug["id"]] == 1
         ):
-            append_fix_commits(bug["id"], "r")
+            append_bug_fixing_commits(bug["id"], "r")
             continue

         if bug["id"] in known_defect_labels:
             if known_defect_labels[bug["id"]] == "defect":
-                append_fix_commits(bug["id"], "d")
+                append_bug_fixing_commits(bug["id"], "d")
+            else:
+                append_bug_fixing_commits(bug["id"], "e")
             continue

         if defect_model.classify(bug)[0] == "defect":
             if regression_model.classify(bug)[0] == 1:
-                append_fix_commits(bug["id"], "r")
+                append_bug_fixing_commits(bug["id"], "r")
             else:
-                append_fix_commits(bug["id"], "d")
+                append_bug_fixing_commits(bug["id"], "d")
+        else:
+            append_bug_fixing_commits(bug["id"], "e")

-    with open("fix_commits.csv", "w") as f:
-        writer = csv.DictWriter(f, fieldnames=["mercurial_rev", "git_rev", "type"])
-        writer.writeheader()
-        writer.writerows(fix_commits)
+    db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
+    compress_file(BUG_FIXING_COMMITS_DB)

-    return fix_commits
+    bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits
+    return [
+        bug_fixing_commit
+        for bug_fixing_commit in bug_fixing_commits
+        if bug_fixing_commit["type"] in ["r", "d"]
+    ]


 def find_bug_introducing_commits(cache_dir, git_repo_dir):
@@ -212,7 +266,21 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir):
     logger.info(f"Cloning git repository to {git_repo_dir}...")
     clone_gecko_dev(git_repo_dir)

-    fix_commits = find_fix_commits()
+    logger.info("Download previously found bug-introducing commits...")
+    db.download_version(BUG_INTRODUCING_COMMITS_DB)
+    if db.is_old_version(BUG_INTRODUCING_COMMITS_DB) or not os.path.exists(
+        BUG_INTRODUCING_COMMITS_DB
+    ):
+        db.download(BUG_INTRODUCING_COMMITS_DB, force=True)
+
+    logger.info("Get previously found bug-introducing commits...")
+    prev_bug_introducing_commits = list(db.read(BUG_INTRODUCING_COMMITS_DB))
+    prev_bug_introducing_commits_nodes = set(
+        bug_introducing_commit["bug_fixing_mercurial_rev"]
+        for bug_introducing_commit in prev_bug_introducing_commits
+    )
+    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")
+
     commits_to_ignore = get_commits_to_ignore(mercurial_repo_dir)

     git_hashes_to_ignore = set(commit["git_rev"] for commit in commits_to_ignore)
@@ -220,26 +288,38 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir):
     with open("git_hashes_to_ignore", "w") as f:
         f.writelines(f"{git_hash}\n" for git_hash in git_hashes_to_ignore)

-    total_fix_commits_num = len(fix_commits)
-    fix_commits = [
-        fix_commit
-        for fix_commit in fix_commits
-        if fix_commit["git_rev"] not in git_hashes_to_ignore
+    bug_fixing_commits = find_bug_fixing_commits()
+
+    logger.info(f"{len(bug_fixing_commits)} commits to analyze")
+
+    # Skip already found bug-introducing commits.
+    bug_fixing_commits = [
+        bug_fixing_commit
+        for bug_fixing_commit in bug_fixing_commits
+        if bug_fixing_commit["mercurial_rev"] not in prev_bug_introducing_commits_nodes
     ]

     logger.info(
-        f"Skipped {total_fix_commits_num - len(fix_commits)} commits as they were in the ignore list"
+        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
     )

-    logger.info(f"{len(fix_commits)} commits to analyze")
+    bug_fixing_commits = [
+        bug_fixing_commit
+        for bug_fixing_commit in bug_fixing_commits
+        if bug_fixing_commit["git_rev"] not in git_hashes_to_ignore
+    ]
+    logger.info(
+        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
+    )

     def _init(git_repo_dir):
         global GIT_REPO
         GIT_REPO = GitRepository(git_repo_dir)

-    def find_bic(fix_commit):
-        logger.info("Analyzing {}...".format(fix_commit["git_rev"]))
+    def find_bic(bug_fixing_commit):
+        logger.info("Analyzing {}...".format(bug_fixing_commit["git_rev"]))

-        commit = GIT_REPO.get_commit(fix_commit["git_rev"])
+        commit = GIT_REPO.get_commit(bug_fixing_commit["git_rev"])

         # Skip huge changes, we'll likely be wrong with them.
         if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
@@ -249,48 +329,53 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir):
             commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
         )
         logger.info(bug_introducing_modifications)

         bug_introducing_commits = []
         for bug_introducing_hashes in bug_introducing_modifications.values():
             for bug_introducing_hash in bug_introducing_hashes:
-                bug_introducing_commit = fix_commit.copy()
-                bug_introducing_commit.update(
+                bug_introducing_commits.append(
                     {
+                        "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
+                        "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                         "bug_introducing_mercurial_rev": vcs_map.git_to_mercurial(
                             bug_introducing_hash
                         ),
                         "bug_introducing_git_rev": bug_introducing_hash,
                     }
                 )
-                bug_introducing_commits.append(bug_introducing_commit)
+
+        # Add an empty result, just so that we don't reanalyze this again.
+        if len(bug_introducing_commits) == 0:
+            bug_introducing_commits.append(
+                {
+                    "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
+                    "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
+                    "bug_introducing_mercurial_rev": "",
+                    "bug_introducing_git_rev": "",
+                }
+            )

         return bug_introducing_commits

     with concurrent.futures.ThreadPoolExecutor(
         initializer=_init, initargs=(git_repo_dir,), max_workers=os.cpu_count() + 1
     ) as executor:
-        results = executor.map(find_bic, fix_commits)
-        results = tqdm(results, total=len(fix_commits))
-        results = list(itertools.chain.from_iterable(results))
+        bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
+        bug_introducing_commits = tqdm(
+            bug_introducing_commits, total=len(bug_fixing_commits)
+        )
+        bug_introducing_commits = list(
+            itertools.chain.from_iterable(bug_introducing_commits)
+        )

-    total_results_num = len(results)
-    results = list(filter(None, results))
+    total_results_num = len(bug_introducing_commits)
+    bug_introducing_commits = list(filter(None, bug_introducing_commits))
     logger.info(
-        f"Skipped {total_results_num - len(results)} commits as they were too big"
+        f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
     )

-    with open("bug_introducing_commits.csv", "w") as f:
-        writer = csv.DictWriter(
-            f,
-            fieldnames=[
-                "mercurial_rev",
-                "git_rev",
-                "type",
-                "bug_introducing_mercurial_rev",
-                "bug_introducing_git_rev",
-            ],
-        )
-        writer.writeheader()
-        writer.writerows(results)
+    db.append(BUG_INTRODUCING_COMMITS_DB, bug_introducing_commits)
+    compress_file(BUG_INTRODUCING_COMMITS_DB)


 def main():