2019-09-11 21:17:02 +03:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
import argparse
|
2019-10-18 03:23:53 +03:00
|
|
|
import json
|
2019-09-11 21:17:02 +03:00
|
|
|
import os
|
|
|
|
import subprocess
|
|
|
|
import tarfile
|
2019-09-18 14:10:22 +03:00
|
|
|
from datetime import datetime
|
2019-09-11 21:17:02 +03:00
|
|
|
from logging import INFO, basicConfig, getLogger
|
|
|
|
|
2019-09-18 14:10:22 +03:00
|
|
|
import dateutil.parser
|
|
|
|
from dateutil.relativedelta import relativedelta
|
2019-09-19 02:32:01 +03:00
|
|
|
from tqdm import tqdm
|
2019-09-11 21:17:02 +03:00
|
|
|
|
2019-11-10 21:57:05 +03:00
|
|
|
from bugbug import commit_features, db, repository, test_scheduling
|
2019-11-18 17:26:03 +03:00
|
|
|
from bugbug.utils import (
|
|
|
|
download_check_etag,
|
|
|
|
open_tar_zst,
|
|
|
|
zstd_compress,
|
|
|
|
zstd_decompress,
|
|
|
|
)
|
2019-09-11 21:17:02 +03:00
|
|
|
|
|
|
|
# Configure logging for the script: records at INFO level and above are emitted.
basicConfig(level=INFO)
logger = getLogger(__name__)

# Task-name prefixes that filter_tasks() requires a task to start with.
JOBS_TO_CONSIDER = ("test-", "build-")
# Task-name prefixes that filter_tasks() rejects even when a prefix above matches.
JOBS_TO_IGNORE = ("build-docker-image-",)

# Local path of the tar archive holding the adr cache; the extracted cache
# directory is this path without its extension (see Retriever.retrieve_push_data).
ADR_CACHE_DB = "data/adr_cache.tar"
db.register(
    ADR_CACHE_DB,
    "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.data_test_scheduling_history_push_data.latest/artifacts/public/adr_cache.tar.zst",
    # NOTE(review): presumably the DB version consulted by db.is_old_version() - confirm in bugbug.db.
    2,
    support_files=[],
)
# Location of the zstd-compressed push data, downloaded when no local
# push_data.json is available.
PUSH_DATA_URL = "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.data_test_scheduling_history_push_data.latest/artifacts/public/push_data.json.zst"

# Number of months of history kept for training; retrieval asks for
# TRAINING_MONTHS + 3 months of pushes.
TRAINING_MONTHS = 6
|
|
|
|
|
2019-09-11 21:17:02 +03:00
|
|
|
|
2019-11-15 01:04:57 +03:00
|
|
|
def filter_tasks(tasks, all_tasks):
    """Select the tasks worth keeping and return them as a tuple.

    A task is kept when it is a member of `all_tasks`, its name starts with
    one of the JOBS_TO_CONSIDER prefixes, and its name does not start with any
    of the JOBS_TO_IGNORE prefixes. The order of `tasks` is preserved.
    """

    def is_wanted(name):
        # Membership is checked first, exactly as the prefix checks come last.
        if name not in all_tasks:
            return False
        # str.startswith accepts a tuple of prefixes and matches any of them.
        if not name.startswith(JOBS_TO_CONSIDER):
            return False
        return not name.startswith(JOBS_TO_IGNORE)

    return tuple(filter(is_wanted, tasks))
|
|
|
|
|
|
|
|
|
2019-09-11 21:17:02 +03:00
|
|
|
class Retriever(object):
    """Drive the two stages of building the test scheduling history.

    `retrieve_push_data` runs the adr "push_data" recipe and stores its output
    in push_data.json, while `generate_test_scheduling_history` combines that
    file with the commits DB and appends the result to the test scheduling DB.
    """

    def __init__(self):
        # Several files handled below (ADR_CACHE_DB, data/past_failures.lmdb)
        # live under the local "data" directory.
        os.makedirs("data", exist_ok=True)

    def retrieve_push_data(self):
        """Run the adr push_data recipe and package its results.

        Steps: restore the previous adr cache when one can be downloaded,
        point adr at that cache through ~/.config/adr/config.toml, run the
        recipe for the autoland branch, then archive the cache and compress
        push_data.json.

        Raises:
            subprocess.CalledProcessError: if run-adr exits with a non-zero
                status (the call uses check=True).
            AssertionError: if the downloaded cache archive did not contain
                the expected cache directory.
        """
        # Download previous cache.
        cache_path = os.path.splitext(ADR_CACHE_DB)[0]
        if not db.is_old_version(ADR_CACHE_DB):
            db.download(ADR_CACHE_DB)
            if os.path.exists(ADR_CACHE_DB):
                # NOTE(review): extractall() trusts the member paths stored in
                # the archive; that is acceptable only because the archive is
                # the project's own artifact.
                with tarfile.open(ADR_CACHE_DB, "r") as tar:
                    tar.extractall()
                assert os.path.exists(cache_path), "Decompressed adr cache exists"

        # Setup adr cache configuration.
        os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True)
        with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as f:
            f.write(
                f"""[adr.cache.stores]
file = {{ driver = "file", path = "{os.path.abspath(cache_path)}" }}
"""
            )

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use 3 months more than that to calculate the failure statistics.
        subprocess.run(
            [
                "run-adr",
                "ahal/ci-recipes",
                "recipe",
                "-o",
                os.path.abspath("push_data.json"),
                "-f",
                "json",
                "push_data",
                "--",
                "--from",
                f"today-{TRAINING_MONTHS + 3}month",
                "--to",
                "today-3day",
                "--branch",
                "autoland",
            ],
            check=True,
            stdout=subprocess.DEVNULL,  # Redirect to /dev/null, as the logs are too big otherwise.
        )

        # Archive the (possibly updated) cache directory next to ADR_CACHE_DB.
        with open_tar_zst(f"{ADR_CACHE_DB}.zst") as tar:
            tar.add(cache_path)

        zstd_compress("push_data.json")

    def generate_test_scheduling_history(self):
        """Build test scheduling entries from push data and append them to the DB.

        Uses the local push_data.json (downloading and decompressing it when
        missing) and the commits DB. When a current test scheduling DB can be
        downloaded, generation resumes after the last push it contains;
        otherwise it starts from the beginning. Only pushes newer than
        TRAINING_MONTHS months are stored. Finally the DB is compressed and
        data/past_failures.lmdb is archived.

        Raises:
            AssertionError: if push_data.json is still missing after the
                download and decompression step.
        """
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"
            ), "Decompressed push data file exists"

        # Get the commits DB.
        if db.is_old_version(repository.COMMITS_DB) or not db.exists(
            repository.COMMITS_DB
        ):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

        # Node of the last push handled by a previous run, or None to start
        # from scratch.
        last_node = None
        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

            # Walk the whole stored history only to reach its final entry.
            # The variable is pre-bound so that an empty history leaves us
            # starting from scratch instead of failing on an unbound name.
            test_data = None
            for test_data in test_scheduling.get_test_scheduling_history():
                pass

            if test_data is not None:
                # The generator below stores the push revisions under
                # "revisions" while this lookup historically read "revs";
                # accept either so that resuming works with both layouts.
                # NOTE(review): confirm which key the stored entries really use.
                revs = (
                    test_data["revs"]
                    if "revs" in test_data
                    else test_data["revisions"]
                )
                last_node = revs[0]

        def generate_all_data():
            """Yield one entry per considered task of every eligible push."""
            past_failures = test_scheduling.get_past_failures()

            push_num = past_failures["push_num"] if "push_num" in past_failures else 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = last_node is None

            commit_map = {}
            for commit_data in tqdm(repository.get_commits()):
                if not can_start:
                    if last_node == commit_data["node"]:
                        can_start = True

                    # The matching commit itself is skipped as well.
                    continue

                commit_map[commit_data["node"]] = commit_data

            with open("push_data.json", "r") as f:
                # The first element of the file is dropped.
                # NOTE(review): presumably a header row - confirm.
                push_data = json.load(f)[1:]

            logger.info(f"push data nodes: {len(push_data)}")

            # In the last 28 pushes, we definitely run all possible tasks.
            # A set comprehension avoids repeatedly concatenating lists.
            all_tasks_set = {
                task for _, push_tasks, _, _ in push_data[-28:] for task in push_tasks
            }
            # Filter tasks we don't need.
            all_tasks_set = set(filter_tasks(all_tasks_set, all_tasks_set))
            logger.info(f"{len(all_tasks_set)} tasks run in the last 28 pushes")

            saved_nodes = set()
            skipped_no_commits = 0
            skipped_too_big_commits = 0
            skipped_no_tasks = 0

            # We can start once we get to the last revision we added in the previous run.
            can_start = last_node is None

            for i in tqdm(range(len(push_data))):
                # Entries are removed from the list as they are consumed.
                (
                    revisions,
                    push_tasks,
                    possible_regressions,
                    likely_regressions,
                ) = push_data.pop(0)

                if not can_start:
                    if last_node == revisions[0]:
                        can_start = True

                    continue

                push_num += 1

                # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
                commits = tuple(
                    commit_map.pop(revision)
                    for revision in revisions
                    if revision in commit_map
                )
                if not commits:
                    skipped_no_commits += 1
                    continue

                merged_commits = commit_features.merge_commits(commits)

                # XXX: For now, skip commits which are too large.
                # In the future we can either:
                #  - Improve shelve perf and go back to consider all files;
                #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
                #  - Keep a limit of number of files.
                if len(merged_commits["files"]) > 20:
                    skipped_too_big_commits += 1
                    continue

                # If we considered all_tasks, we'd generate a huge amount of data.
                # So we consider only the tasks which run in this push, and the possible and likely regressions
                # from this push.
                tasks_to_consider = list(
                    set(push_tasks + possible_regressions + likely_regressions)
                )
                tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set)

                if not tasks_to_consider:
                    skipped_no_tasks += 1
                    continue

                # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
                # NOTE(review): this check sits after the skip branches above, so a
                # push whose index is a multiple of 250 but which gets skipped does
                # not trigger a sync.
                if i % 250 == 0:
                    past_failures.sync()

                pushdate = dateutil.parser.parse(merged_commits["pushdate"])

                for data in test_scheduling.generate_data(
                    past_failures,
                    merged_commits,
                    push_num,
                    tasks_to_consider,
                    possible_regressions,
                    likely_regressions,
                ):
                    # generate_data is consumed for every push, but only
                    # entries from recent pushes are emitted.
                    if pushdate > HISTORY_DATE_START:
                        saved_nodes.add(i)
                        data["revisions"] = revisions
                        yield data

            logger.info(f"saved push data nodes: {len(saved_nodes)}")
            logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
            logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
            logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

            past_failures["push_num"] = push_num
            past_failures.close()

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_all_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
            tar.add("data/past_failures.lmdb")
|
2019-10-18 14:59:24 +03:00
|
|
|
|
2019-09-11 21:17:02 +03:00
|
|
|
|
|
|
|
def main():
    """Parse the command line and run the requested retriever operation.

    The single positional argument "op" selects between retrieving the push
    data ("retrieve") and generating the test scheduling history ("generate").
    """
    parser = argparse.ArgumentParser(
        description="Retrieve and extract the test scheduling history from ActiveData"
    )
    parser.add_argument(
        "op", help="Which operation to perform.", choices=["retrieve", "generate"]
    )
    requested_op = parser.parse_args().op

    # The retriever is only created once the arguments have been accepted.
    retriever = Retriever()
    handlers = {
        "retrieve": retriever.retrieve_push_data,
        "generate": retriever.generate_test_scheduling_history,
    }
    # argparse's `choices` guarantees the operation is one of the keys above.
    handlers[requested_op]()
|
2019-09-11 21:17:02 +03:00
|
|
|
|
|
|
|
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|