Mirror of https://github.com/mozilla/bugbug.git
Make it possible to run the bugbug-commit-classify script with either a Phabricator diff or a revision
This commit is contained in:
Parent: 8c4c017602
Commit: 936f2ccb3c
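Before the diff itself, a rough sketch of the two invocation modes this commit enables. The command name, model, and flag names are taken from the hook definitions and argparse changes below; the concrete paths and values are hypothetical, and the optional flags (--git_repo_dir, --method_defect_predictor_dir, --runnable-jobs) are omitted.

# Illustrative only: all values below are made up.
import subprocess

# Mode 1: classify a not-yet-landed Phabricator diff (existing behaviour).
subprocess.run([
    "bugbug-classify-commit",
    "regressor",
    "/cache/mozilla-central",
    "--phabricator-deployment=prod",  # hypothetical deployment name
    "--diff-id=123456",               # hypothetical diff ID
])

# Mode 2: classify an existing Mercurial revision (new behaviour).
subprocess.run([
    "bugbug-classify-commit",
    "regressor",
    "/cache/mozilla-central",
    "--revision=abcdef0123456789",    # hypothetical revision hash
])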
@@ -50,8 +50,8 @@
             "bugbug-classify-commit",
             "regressor",
             "/cache/mozilla-central",
-            "${payload['DIFF_ID']}",
-            "${payload['PHABRICATOR_DEPLOYMENT']}",
+            "--phabricator-deployment=${payload['PHABRICATOR_DEPLOYMENT']}",
+            "--diff-id=${payload['DIFF_ID']}",
             "--git_repo_dir=/gecko-dev",
             "--method_defect_predictor_dir=/MethodDefectPredictor"
           ],
@@ -49,8 +49,8 @@
             "bugbug-classify-commit",
             "testselect",
             "/cache/mozilla-central",
-            "${payload['DIFF_ID']}",
-            "${payload['PHABRICATOR_DEPLOYMENT']}",
+            "--phabricator-deployment=${payload['PHABRICATOR_DEPLOYMENT']}",
+            "--diff-id=${payload['DIFF_ID']}",
             "--runnable-jobs=${payload['RUNNABLE_JOBS']}"
           ],
           "image": "mozilla/bugbug-commit-retrieval",
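As a small aside, the ${payload[...]} placeholders in both hooks are filled from the trigger payload. A minimal sketch of what the resolved testselect command array could look like, assuming a payload with the three keys used above (all values hypothetical); this is plain string formatting, not Taskcluster's actual templating:

payload = {
    "PHABRICATOR_DEPLOYMENT": "prod",  # hypothetical
    "DIFF_ID": "123456",  # hypothetical
    "RUNNABLE_JOBS": "https://example.com/runnable-jobs.json",  # hypothetical
}

command = [
    "bugbug-classify-commit",
    "testselect",
    "/cache/mozilla-central",
    f"--phabricator-deployment={payload['PHABRICATOR_DEPLOYMENT']}",
    f"--diff-id={payload['DIFF_ID']}",
    f"--runnable-jobs={payload['RUNNABLE_JOBS']}",
]
print(command)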
@@ -222,16 +222,16 @@ class CommitClassifier(object):
 
         repository.download_commits(self.repo_dir, rev_start)
 
-    def apply_phab(self, hg, phabricator_deployment, diff_id):
-        def has_revision(revision):
-            if not revision:
-                return False
-            try:
-                hg.identify(revision)
-                return True
-            except hglib.error.CommandError:
-                return False
-
+    def has_revision(self, hg, revision):
+        if not revision:
+            return False
+        try:
+            hg.identify(revision)
+            return True
+        except hglib.error.CommandError:
+            return False
+
+    def apply_phab(self, hg, phabricator_deployment, diff_id):
         if phabricator_deployment == PHAB_PROD:
             api_key = get_secret("PHABRICATOR_TOKEN")
             url = get_secret("PHABRICATOR_URL")
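The revision check moves from a closure inside apply_phab to a method, so the new revision-based code path can reuse it. A standalone sketch of the same check against an hglib client (the repository path is a placeholder):

import hglib


def has_revision(hg, revision):
    # Mirrors the method above: hg identify fails for unknown revisions.
    if not revision:
        return False
    try:
        hg.identify(revision)
        return True
    except hglib.error.CommandError:
        return False


# Usage sketch; replace the path with a real Mercurial clone.
with hglib.open("/path/to/mozilla-central") as hg:
    print(has_revision(hg, b"tip"))
    print(has_revision(hg, b"0000000000000000000000000000000000000001"))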
@@ -252,7 +252,7 @@
             needed_stack.insert(0, patch)
 
             # Stop as soon as a base revision is available
-            if has_revision(patch.base_revision):
+            if self.has_revision(hg, patch.base_revision):
                 logger.info(
                     f"Stopping at diff {patch.id} and revision {patch.base_revision}"
                 )
@@ -273,7 +273,7 @@
 
         # Update repo to base revision
         hg_base = needed_stack[0].base_revision
-        if not has_revision(hg_base):
+        if not self.has_revision(hg, hg_base):
             logger.warning("Missing base revision {} from Phabricator".format(hg_base))
             hg_base = "tip"
 
@@ -545,129 +545,147 @@ class CommitClassifier(object):
         with open("importances.json", "w") as f:
             json.dump(features, f)
 
-    def classify(self, phabricator_deployment, diff_id, runnable_jobs_path):
-        self.update_commit_db()
-
-        with hglib.open(self.repo_dir) as hg:
-            self.apply_phab(hg, phabricator_deployment, diff_id)
-
-            patch_rev = hg.log(revrange="not public()")[0].node
-
-        # Analyze patch.
-        commits = repository.download_commits(
-            self.repo_dir, rev_start=patch_rev.decode("utf-8"), save=False
-        )
-
-        # We use "clean" (or "dirty") commits as the background dataset for feature importance.
-        # This way, we can see the features which are most important in differentiating
-        # the current commit from the "clean" (or "dirty") commits.
-        if not self.use_test_history:
-            probs, importance = self.model.classify(
-                commits[-1],
-                probabilities=True,
-                importances=True,
-                background_dataset=lambda v: self.X[self.y != v],
-                importance_cutoff=0.05,
-            )
-
-            self.generate_feature_importance_data(probs, importance)
-
-            with open("probs.json", "w") as f:
-                json.dump(probs[0].tolist(), f)
-
-            if self.model_name == "regressor" and self.method_defect_predictor_dir:
-                self.classify_methods(commits[-1])
-        else:
-            testfailure_probs = self.testfailure_model.classify(
-                commits[-1], probabilities=True
-            )
-
-            logger.info(f"Test failure risk: {testfailure_probs[0][1]}")
-
-            commit_data = commit_features.merge_commits(commits)
-
-            push_num = self.past_failures_data["push_num"]
-
-            # XXX: Consider using mozilla-central built-in rules to filter some of these out, e.g. SCHEDULES.
-            all_tasks = self.past_failures_data["all_tasks"]
-
-            if not runnable_jobs_path:
-                runnable_jobs = {task for task in all_tasks}
-            elif runnable_jobs_path.startswith("http"):
-                r = requests.get(runnable_jobs_path)
-                r.raise_for_status()
-                runnable_jobs = r.json()
-            else:
-                with open(runnable_jobs_path, "r") as f:
-                    runnable_jobs = json.load(f)
-
-            # XXX: For now, only restrict to linux64 test tasks.
-            all_tasks = [
-                t
-                for t in all_tasks
-                if t.startswith("test-linux1804-64/") and "test-verify" not in t
-            ]
-
-            # XXX: Remove tasks which are not in runnable jobs right away, so we avoid classifying them.
-
-            commit_tests = []
-            for data in test_scheduling.generate_data(
-                self.past_failures_data, commit_data, push_num, all_tasks, [], []
-            ):
-                if not data["name"].startswith("test-"):
-                    continue
-
-                commit_test = commit_data.copy()
-                commit_test["test_job"] = data
-                commit_tests.append(commit_test)
-
-            probs = self.model.classify(commit_tests, probabilities=True)
-            selected_indexes = np.argwhere(
-                probs[:, 1] > float(get_secret("TEST_SELECTION_CONFIDENCE_THRESHOLD"))
-            )[:, 0]
-            selected_tasks = [
-                commit_tests[i]["test_job"]["name"] for i in selected_indexes
-            ]
-
-            with open("failure_risk", "w") as f:
-                f.write(
-                    "1"
-                    if testfailure_probs[0][1]
-                    > float(get_secret("TEST_FAILURE_CONFIDENCE_THRESHOLD"))
-                    else "0"
-                )
-
-            # This should be kept in sync with the test scheduling history retriever script.
-            cleaned_selected_tasks = []
-            for selected_task in selected_tasks:
-                if (
-                    selected_task.startswith("test-linux64")
-                    and selected_task not in runnable_jobs
-                ):
-                    selected_task = selected_task.replace(
-                        "test-linux64-", "test-linux1804-64-"
-                    )
-
-                if (
-                    selected_task.startswith("test-linux1804-64-")
-                    and selected_task not in runnable_jobs
-                ):
-                    selected_task = selected_task.replace(
-                        "test-linux1804-64-", "test-linux64-"
-                    )
-
-                if selected_task in runnable_jobs:
-                    cleaned_selected_tasks.append(selected_task)
-
-            # It isn't worth running the build associated to the tests, if we only run three test tasks.
-            if len(cleaned_selected_tasks) < 3:
-                cleaned_selected_tasks = []
-
-            with open("selected_tasks", "w") as f:
-                f.writelines(
-                    f"{selected_task}\n" for selected_task in cleaned_selected_tasks
-                )
+    def classify(
+        self,
+        revision=None,
+        phabricator_deployment=None,
+        diff_id=None,
+        runnable_jobs_path=None,
+    ):
+        if revision is not None:
+            assert phabricator_deployment is None
+            assert diff_id is None
+
+        if diff_id is not None:
+            assert phabricator_deployment is not None
+            assert revision is None
+
+        self.update_commit_db()
+
+        with hglib.open(self.repo_dir) as hg:
+            if phabricator_deployment is not None and diff_id is not None:
+                self.apply_phab(hg, phabricator_deployment, diff_id)
+
+                revision = hg.log(revrange="not public()")[0].node.decode("utf-8")
+
+        # Analyze patch.
+        commits = repository.download_commits(
+            self.repo_dir, rev_start=revision, save=False
+        )
+
+        if not self.use_test_history:
+            self.classify_regressor(commits)
+        else:
+            self.classify_test_select(commits, runnable_jobs_path)
+
+    def classify_regressor(self, commits):
+        # We use "clean" (or "dirty") commits as the background dataset for feature importance.
+        # This way, we can see the features which are most important in differentiating
+        # the current commit from the "clean" (or "dirty") commits.
+        probs, importance = self.model.classify(
+            commits[-1],
+            probabilities=True,
+            importances=True,
+            background_dataset=lambda v: self.X[self.y != v],
+            importance_cutoff=0.05,
+        )
+
+        self.generate_feature_importance_data(probs, importance)
+
+        with open("probs.json", "w") as f:
+            json.dump(probs[0].tolist(), f)
+
+        if self.model_name == "regressor" and self.method_defect_predictor_dir:
+            self.classify_methods(commits[-1])
+
+    def classify_test_select(self, commits, runnable_jobs_path):
+        testfailure_probs = self.testfailure_model.classify(
+            commits[-1], probabilities=True
+        )
+
+        logger.info(f"Test failure risk: {testfailure_probs[0][1]}")
+
+        commit_data = commit_features.merge_commits(commits)
+
+        push_num = self.past_failures_data["push_num"]
+
+        # XXX: Consider using mozilla-central built-in rules to filter some of these out, e.g. SCHEDULES.
+        all_tasks = self.past_failures_data["all_tasks"]
+
+        if not runnable_jobs_path:
+            runnable_jobs = {task for task in all_tasks}
+        elif runnable_jobs_path.startswith("http"):
+            r = requests.get(runnable_jobs_path)
+            r.raise_for_status()
+            runnable_jobs = r.json()
+        else:
+            with open(runnable_jobs_path, "r") as f:
+                runnable_jobs = json.load(f)
+
+        # XXX: For now, only restrict to linux64 test tasks.
+        all_tasks = [
+            t
+            for t in all_tasks
+            if t.startswith("test-linux1804-64/") and "test-verify" not in t
+        ]
+
+        # XXX: Remove tasks which are not in runnable jobs right away, so we avoid classifying them.
+
+        commit_tests = []
+        for data in test_scheduling.generate_data(
+            self.past_failures_data, commit_data, push_num, all_tasks, [], []
+        ):
+            if not data["name"].startswith("test-"):
+                continue
+
+            commit_test = commit_data.copy()
+            commit_test["test_job"] = data
+            commit_tests.append(commit_test)
+
+        probs = self.model.classify(commit_tests, probabilities=True)
+        selected_indexes = np.argwhere(
+            probs[:, 1] > float(get_secret("TEST_SELECTION_CONFIDENCE_THRESHOLD"))
+        )[:, 0]
+        selected_tasks = [commit_tests[i]["test_job"]["name"] for i in selected_indexes]
+
+        with open("failure_risk", "w") as f:
+            f.write(
+                "1"
+                if testfailure_probs[0][1]
+                > float(get_secret("TEST_FAILURE_CONFIDENCE_THRESHOLD"))
+                else "0"
+            )
+
+        # This should be kept in sync with the test scheduling history retriever script.
+        cleaned_selected_tasks = []
+        for selected_task in selected_tasks:
+            if (
+                selected_task.startswith("test-linux64")
+                and selected_task not in runnable_jobs
+            ):
+                selected_task = selected_task.replace(
+                    "test-linux64-", "test-linux1804-64-"
+                )
+
+            if (
+                selected_task.startswith("test-linux1804-64-")
+                and selected_task not in runnable_jobs
+            ):
+                selected_task = selected_task.replace(
+                    "test-linux1804-64-", "test-linux64-"
+                )
+
+            if selected_task in runnable_jobs:
+                cleaned_selected_tasks.append(selected_task)
+
+        # It isn't worth running the build associated to the tests, if we only run three test tasks.
+        if len(cleaned_selected_tasks) < 3:
+            cleaned_selected_tasks = []
+
+        with open("selected_tasks", "w") as f:
+            f.writelines(
+                f"{selected_task}\n" for selected_task in cleaned_selected_tasks
+            )
 
     def classify_methods(self, commit):
         # Get commit hash from 4 months before the analysis time.
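The refactored classify() now accepts either a revision or a Phabricator deployment plus diff ID, never both. A minimal self-contained sketch of that argument contract, using an illustrative helper name rather than bugbug's API:

def check_classify_args(revision=None, phabricator_deployment=None, diff_id=None):
    # Same rules as the asserts in classify() above: a revision excludes the
    # Phabricator arguments, and a diff ID requires a deployment and no revision.
    if revision is not None:
        assert phabricator_deployment is None
        assert diff_id is None
    if diff_id is not None:
        assert phabricator_deployment is not None
        assert revision is None


check_classify_args(revision="abcdef0123456789")
check_classify_args(phabricator_deployment="prod", diff_id=123456)
print("both argument combinations pass the checks")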
@@ -748,13 +766,14 @@ def main():
         "repo_dir",
         help="Path to a Gecko repository. If no repository exists, it will be cloned to this location.",
     )
-    parser.add_argument("diff_id", help="diff ID to analyze.", type=int)
     parser.add_argument(
-        "phabricator_deployment",
+        "--phabricator-deployment",
         help="Which Phabricator deployment to hit.",
         type=str,
         choices=[PHAB_PROD, PHAB_DEV],
     )
+    parser.add_argument("--diff-id", help="diff ID to analyze.", type=int)
+    parser.add_argument("--revision", help="revision to analyze.", type=str)
     parser.add_argument(
         "--runnable-jobs",
         help="Path or URL to a file containing runnable jobs.",
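A self-contained sketch of the resulting command-line surface, trimmed to the arguments touched here (the real parser also defines model, repo_dir, and the other flags; the choices below stand in for PHAB_PROD and PHAB_DEV, whose values this diff does not show):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--phabricator-deployment",
    help="Which Phabricator deployment to hit.",
    type=str,
    choices=["prod", "dev"],  # placeholders for PHAB_PROD / PHAB_DEV
)
parser.add_argument("--diff-id", help="diff ID to analyze.", type=int)
parser.add_argument("--revision", help="revision to analyze.", type=str)

# The two supported modes (values hypothetical):
print(parser.parse_args(["--phabricator-deployment", "prod", "--diff-id", "123456"]))
print(parser.parse_args(["--revision", "abcdef0123456789"]))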
@@ -773,7 +792,9 @@ def main():
     classifier = CommitClassifier(
         args.model, args.repo_dir, args.git_repo_dir, args.method_defect_predictor_dir
     )
-    classifier.classify(args.phabricator_deployment, args.diff_id, args.runnable_jobs)
+    classifier.classify(
+        args.revision, args.phabricator_deployment, args.diff_id, args.runnable_jobs
+    )
 
 
 if __name__ == "__main__":