Store regressor finder results in bugbug DBs and make it run only on commits which haven't been analyzed yet

Marco Castelluccio 2019-07-23 02:01:26 +02:00
Parent d54472f027
Commit fbaef0661d
2 changed files with 139 additions and 53 deletions
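The change below replaces one-shot CSV outputs with bugbug DBs so each scheduled run can resume where the previous one stopped. A minimal sketch of that incremental pattern, assuming the DB has been registered via `bugbug.db` as in the diff (`classify_new_commits` is a hypothetical stand-in for the model-based classification step):

    from bugbug import db

    BUG_FIXING_COMMITS_DB = "data/bug_fixing_commits.json"

    def classify_incrementally(commits, classify_new_commits):
        # Fetch the previous run's results from the bugbug DB...
        db.download(BUG_FIXING_COMMITS_DB, force=True)
        prev = list(db.read(BUG_FIXING_COMMITS_DB))
        seen = set(c["mercurial_rev"] for c in prev)

        # ...classify only the commits which haven't been analyzed yet...
        new = classify_new_commits(c for c in commits if c["node"] not in seen)

        # ...and append just the delta, so the next run starts from here.
        db.append(BUG_FIXING_COMMITS_DB, new)
        return prev + new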

View file

@@ -61,11 +61,11 @@ tasks:
         public/commits_to_ignore.csv:
           path: /commits_to_ignore.csv
           type: file
-        public/fix_commits.csv:
-          path: /fix_commits.csv
+        public/bug_fixing_commits.json.zst:
+          path: /bug_fixing_commits.json.zst
           type: file
-        public/bug_introducing_commits.csv:
-          path: /bug_introducing_commits.csv
+        public/bug_introducing_commits.json.zst:
+          path: /bug_introducing_commits.json.zst
           type: file
       cache:
         bugbug-mercurial-repository: /cache
@@ -77,6 +77,7 @@ tasks:
       routes:
         - notify.email.release-mgmt-analysis@mozilla.com.on-failed"
         - notify.irc-channel.#bugbug.on-failed
+        - index.project.relman.bugbug_annotate.regressor_finder.latest
     metadata:
       name: bugbug regressor finder
       description: bugbug regressor finder
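The new index route is what makes the `db.register` URLs in the script below resolvable: Taskcluster's index maps a route plus an artifact name to a concrete URL under the latest task. A hedged sketch of that mapping, using only the URL pattern visible in this diff (the helper itself is hypothetical):

    INDEX_ROOT = "https://index.taskcluster.net/v1/task"
    ROUTE = "project.relman.bugbug_annotate.regressor_finder.latest"

    def artifact_url(artifact_name):
        # e.g. artifact_url("public/bug_fixing_commits.json.zst")
        return f"{INDEX_ROOT}/{ROUTE}/artifacts/{artifact_name}"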

View file

@@ -33,11 +33,34 @@ logger = getLogger(__name__)
 MAX_MODIFICATION_NUMBER = 50
 # TODO: Set to 2 years and 6 months. If it takes too long, make the task work incrementally like microannotate-generate.
 RELATIVE_START_DATE = relativedelta(days=49)
+# Only needed because mercurial<->git mapping could be behind.
+RELATIVE_END_DATE = relativedelta(days=3)
+
+BUG_FIXING_COMMITS_DB = "data/bug_fixing_commits.json"
+db.register(
+    BUG_FIXING_COMMITS_DB,
+    "https://index.taskcluster.net/v1/task/project.relman.bugbug_annotate.regressor_finder.latest/artifacts/public/bug_fixing_commits.json.zst",
+    1,
+)
+
+BUG_INTRODUCING_COMMITS_DB = "data/bug_introducing_commits.json"
+db.register(
+    BUG_INTRODUCING_COMMITS_DB,
+    "https://index.taskcluster.net/v1/task/project.relman.bugbug_annotate.regressor_finder.latest/artifacts/public/bug_introducing_commits.json.zst",
+    1,
+)

 BASE_URL = "https://index.taskcluster.net/v1/task/project.relman.bugbug.train_{model_name}.latest/artifacts/public/{model_name}model.zst"


+def compress_file(path):
+    cctx = zstandard.ZstdCompressor()
+    with open(path, "rb") as input_f:
+        with open(f"{path}.zst", "wb") as output_f:
+            cctx.copy_stream(input_f, output_f)
+
+
 def download_model(model_name):
     if not os.path.exists(f"{model_name}model"):
         url = BASE_URL.format(model_name=model_name)
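The new `compress_file` streams a DB file into a `.zst` artifact using zstandard's stream-copy API. The matching decompression a consumer of the artifact would need is symmetric; a minimal sketch (the `decompress_file` name is hypothetical, not part of this commit):

    import zstandard

    def decompress_file(path):
        # Inverse of compress_file: stream path.zst back into path.
        dctx = zstandard.ZstdDecompressor()
        with open(f"{path}.zst", "rb") as input_f:
            with open(path, "wb") as output_f:
                dctx.copy_stream(input_f, output_f)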
@@ -111,7 +134,7 @@ def get_commits_to_ignore(repo_dir):
     return commits_to_ignore


-def find_fix_commits():
+def find_bug_fixing_commits():
     logger.info("Downloading commits database...")
     db.download_version(repository.COMMITS_DB)
     if db.is_old_version(repository.COMMITS_DB) or not os.path.exists(
@@ -124,6 +147,21 @@ def find_fix_commits():
     if db.is_old_version(bugzilla.BUGS_DB) or not os.path.exists(bugzilla.BUGS_DB):
         db.download(bugzilla.BUGS_DB, force=True)

+    logger.info("Download previous classifications...")
+    db.download_version(BUG_FIXING_COMMITS_DB)
+    if db.is_old_version(BUG_FIXING_COMMITS_DB) or not os.path.exists(
+        BUG_FIXING_COMMITS_DB
+    ):
+        db.download(BUG_FIXING_COMMITS_DB, force=True)
+
+    logger.info("Get previously classified commits...")
+    prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
+    prev_bug_fixing_commits_nodes = set(
+        bug_fixing_commit["mercurial_rev"]
+        for bug_fixing_commit in prev_bug_fixing_commits
+    )
+    logger.info(f"Already classified {len(prev_bug_fixing_commits)} commits...")
+
     # TODO: Switch to the pure Defect model, as it's better in this case.
     logger.info("Downloading defect/enhancement/task model...")
     download_model("defectenhancementtask")
@@ -134,15 +172,24 @@ def find_fix_commits():
     regression_model = RegressionModel.load("regressionmodel")

     start_date = datetime.now() - RELATIVE_START_DATE
-    logger.info(f"Gathering bug IDs associated to commits (since {start_date})...")
+    end_date = datetime.now() - RELATIVE_END_DATE
+    logger.info(
+        f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
+    )

     commit_map = defaultdict(list)
     for commit in repository.get_commits():
-        if dateutil.parser.parse(commit["pushdate"]) < start_date:
+        if commit["node"] in prev_bug_fixing_commits_nodes:
+            continue
+
+        commit_date = dateutil.parser.parse(commit["pushdate"])
+        if commit_date < start_date or commit_date > end_date:
             continue

         commit_map[commit["bug_id"]].append(commit)

-    logger.info(f"{len(commit_map)} commits found")
+    logger.info(
+        f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
+    )

     assert len(commit_map) > 0

     def get_relevant_bugs():
@@ -156,11 +203,11 @@ def find_fix_commits():
     known_defect_labels = defect_model.get_labels()
     known_regression_labels = regression_model.get_labels()

-    fix_commits = []
+    bug_fixing_commits = []

-    def append_fix_commits(bug_id, type_):
+    def append_bug_fixing_commits(bug_id, type_):
         for commit in commit_map[bug_id]:
-            fix_commits.append(
+            bug_fixing_commits.append(
                 {
                     "mercurial_rev": commit["node"],
                     "git_rev": vcs_map.mercurial_to_git(commit["node"]),
@@ -178,26 +225,33 @@ def find_fix_commits():
             bug["id"] in known_regression_labels
             and known_regression_labels[bug["id"]] == 1
         ):
-            append_fix_commits(bug["id"], "r")
+            append_bug_fixing_commits(bug["id"], "r")
             continue

         if bug["id"] in known_defect_labels:
             if known_defect_labels[bug["id"]] == "defect":
-                append_fix_commits(bug["id"], "d")
+                append_bug_fixing_commits(bug["id"], "d")
+            else:
+                append_bug_fixing_commits(bug["id"], "e")
             continue

         if defect_model.classify(bug)[0] == "defect":
             if regression_model.classify(bug)[0] == 1:
-                append_fix_commits(bug["id"], "r")
+                append_bug_fixing_commits(bug["id"], "r")
             else:
-                append_fix_commits(bug["id"], "d")
+                append_bug_fixing_commits(bug["id"], "d")
+        else:
+            append_bug_fixing_commits(bug["id"], "e")

-    with open("fix_commits.csv", "w") as f:
-        writer = csv.DictWriter(f, fieldnames=["mercurial_rev", "git_rev", "type"])
-        writer.writeheader()
-        writer.writerows(fix_commits)
+    db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
+    compress_file(BUG_FIXING_COMMITS_DB)

-    return fix_commits
+    bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits
+    return [
+        bug_fixing_commit
+        for bug_fixing_commit in bug_fixing_commits
+        if bug_fixing_commit["type"] in ["r", "d"]
+    ]


 def find_bug_introducing_commits(cache_dir, git_repo_dir):
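Note the contract this hunk establishes: every classification, including the new type "e" (not a defect), is appended to the DB so the commit is never reclassified, but only "r" and "d" fixes are returned for the SZZ analysis. A small illustration with hypothetical records:

    # Records mirror the fields written by append_bug_fixing_commits.
    bug_fixing_commits = [
        {"mercurial_rev": "abc123", "git_rev": "def456", "type": "r"},
        {"mercurial_rev": "789abc", "git_rev": "012def", "type": "e"},
    ]

    # All records get persisted (db.append), so the "e" commit is skipped via
    # prev_bug_fixing_commits_nodes on the next run; only fixes are analyzed.
    to_analyze = [c for c in bug_fixing_commits if c["type"] in ["r", "d"]]
    assert len(to_analyze) == 1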
@@ -212,7 +266,21 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir):
         logger.info(f"Cloning git repository to {git_repo_dir}...")
         clone_gecko_dev(git_repo_dir)

-    fix_commits = find_fix_commits()
+    logger.info("Download previously found bug-introducing commits...")
+    db.download_version(BUG_INTRODUCING_COMMITS_DB)
+    if db.is_old_version(BUG_INTRODUCING_COMMITS_DB) or not os.path.exists(
+        BUG_INTRODUCING_COMMITS_DB
+    ):
+        db.download(BUG_INTRODUCING_COMMITS_DB, force=True)
+
+    logger.info("Get previously found bug-introducing commits...")
+    prev_bug_introducing_commits = list(db.read(BUG_INTRODUCING_COMMITS_DB))
+    prev_bug_introducing_commits_nodes = set(
+        bug_introducing_commit["bug_fixing_mercurial_rev"]
+        for bug_introducing_commit in prev_bug_introducing_commits
+    )
+    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")

     commits_to_ignore = get_commits_to_ignore(mercurial_repo_dir)
     git_hashes_to_ignore = set(commit["git_rev"] for commit in commits_to_ignore)
@@ -220,26 +288,38 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir):
     with open("git_hashes_to_ignore", "w") as f:
         f.writelines(f"{git_hash}\n" for git_hash in git_hashes_to_ignore)

-    total_fix_commits_num = len(fix_commits)
-    fix_commits = [
-        fix_commit
-        for fix_commit in fix_commits
-        if fix_commit["git_rev"] not in git_hashes_to_ignore
-    ]
-    logger.info(
-        f"Skipped {total_fix_commits_num - len(fix_commits)} commits as they were in the ignore list"
-    )
-    logger.info(f"{len(fix_commits)} commits to analyze")
+    bug_fixing_commits = find_bug_fixing_commits()
+
+    logger.info(f"{len(bug_fixing_commits)} commits to analyze")
+
+    # Skip already found bug-introducing commits.
+    bug_fixing_commits = [
+        bug_fixing_commit
+        for bug_fixing_commit in bug_fixing_commits
+        if bug_fixing_commit["mercurial_rev"] not in prev_bug_introducing_commits_nodes
+    ]
+
+    logger.info(
+        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
+    )
+
+    bug_fixing_commits = [
+        bug_fixing_commit
+        for bug_fixing_commit in bug_fixing_commits
+        if bug_fixing_commit["git_rev"] not in git_hashes_to_ignore
+    ]
+    logger.info(
+        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
+    )

     def _init(git_repo_dir):
         global GIT_REPO
         GIT_REPO = GitRepository(git_repo_dir)

-    def find_bic(fix_commit):
-        logger.info("Analyzing {}...".format(fix_commit["git_rev"]))
+    def find_bic(bug_fixing_commit):
+        logger.info("Analyzing {}...".format(bug_fixing_commit["git_rev"]))

-        commit = GIT_REPO.get_commit(fix_commit["git_rev"])
+        commit = GIT_REPO.get_commit(bug_fixing_commit["git_rev"])

         # Skip huge changes, we'll likely be wrong with them.
         if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
@@ -249,48 +329,53 @@ def find_bug_introducing_commits(cache_dir, git_repo_dir):
             commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
         )

         logger.info(bug_introducing_modifications)

         bug_introducing_commits = []
         for bug_introducing_hashes in bug_introducing_modifications.values():
             for bug_introducing_hash in bug_introducing_hashes:
-                bug_introducing_commit = fix_commit.copy()
-                bug_introducing_commit.update(
+                bug_introducing_commits.append(
                     {
+                        "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
+                        "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                         "bug_introducing_mercurial_rev": vcs_map.git_to_mercurial(
                             bug_introducing_hash
                         ),
                         "bug_introducing_git_rev": bug_introducing_hash,
                     }
                 )
-                bug_introducing_commits.append(bug_introducing_commit)
+
+        # Add an empty result, just so that we don't reanalyze this again.
+        if len(bug_introducing_commits) == 0:
+            bug_introducing_commits.append(
+                {
+                    "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
+                    "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
+                    "bug_introducing_mercurial_rev": "",
+                    "bug_introducing_git_rev": "",
+                }
+            )

         return bug_introducing_commits

     with concurrent.futures.ThreadPoolExecutor(
         initializer=_init, initargs=(git_repo_dir,), max_workers=os.cpu_count() + 1
     ) as executor:
-        results = executor.map(find_bic, fix_commits)
-        results = tqdm(results, total=len(fix_commits))
-        results = list(itertools.chain.from_iterable(results))
+        bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
+        bug_introducing_commits = tqdm(
+            bug_introducing_commits, total=len(bug_fixing_commits)
+        )
+        bug_introducing_commits = list(
+            itertools.chain.from_iterable(bug_introducing_commits)
+        )

-    total_results_num = len(results)
-    results = list(filter(None, results))
+    total_results_num = len(bug_introducing_commits)
+    bug_introducing_commits = list(filter(None, bug_introducing_commits))
     logger.info(
-        f"Skipped {total_results_num - len(results)} commits as they were too big"
+        f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
     )

-    with open("bug_introducing_commits.csv", "w") as f:
-        writer = csv.DictWriter(
-            f,
-            fieldnames=[
-                "mercurial_rev",
-                "git_rev",
-                "type",
-                "bug_introducing_mercurial_rev",
-                "bug_introducing_git_rev",
-            ],
-        )
-        writer.writeheader()
-        writer.writerows(results)
+    db.append(BUG_INTRODUCING_COMMITS_DB, bug_introducing_commits)
+    compress_file(BUG_INTRODUCING_COMMITS_DB)


 def main():