2019-09-11 21:17:02 +03:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
import argparse
|
2019-10-18 03:23:53 +03:00
|
|
|
import json
|
2019-09-11 21:17:02 +03:00
|
|
|
import os
|
2019-10-18 14:59:24 +03:00
|
|
|
import pickle
|
2019-09-11 21:17:02 +03:00
|
|
|
import subprocess
|
|
|
|
import tarfile
|
2019-09-18 14:10:22 +03:00
|
|
|
from datetime import datetime
|
2019-09-11 21:17:02 +03:00
|
|
|
from logging import INFO, basicConfig, getLogger
|
|
|
|
|
2019-09-18 14:10:22 +03:00
|
|
|
import dateutil.parser
|
2019-09-11 21:17:02 +03:00
|
|
|
import requests
|
2019-09-18 14:10:22 +03:00
|
|
|
from dateutil.relativedelta import relativedelta
|
2019-09-19 02:32:01 +03:00
|
|
|
from tqdm import tqdm
|
2019-09-11 21:17:02 +03:00
|
|
|
|
2019-09-19 12:21:21 +03:00
|
|
|
from bugbug import db, repository, test_scheduling
|
2019-10-18 15:33:53 +03:00
|
|
|
from bugbug.utils import ExpQueue, download_check_etag, zstd_compress, zstd_decompress
|
2019-09-11 21:17:02 +03:00
|
|
|
|
|
|
|
# Configure the root logger at INFO level and create this module's logger.
basicConfig(level=INFO)
logger = getLogger(__name__)

# Task-name prefixes: only tasks starting with one of these are processed.
JOBS_TO_CONSIDER = ("test-", "build-")

# Location of the adr cache under the previous task name; used as a fallback
# when the cache at ADR_CACHE_URL cannot be downloaded.
OLD_ADR_CACHE_URL = (
    "https://index.taskcluster.net/v1/task/"
    "project.relman.bugbug.data_test_scheduling_history.latest"
    "/artifacts/public/adr_cache.tar.xz"
)

# Artifacts of the push data retrieval task.
ADR_CACHE_URL = (
    "https://index.taskcluster.net/v1/task/"
    "project.relman.bugbug.data_test_scheduling_history_push_data.latest"
    "/artifacts/public/adr_cache.tar.xz"
)
PUSH_DATA_URL = (
    "https://index.taskcluster.net/v1/task/"
    "project.relman.bugbug.data_test_scheduling_history_push_data.latest"
    "/artifacts/public/push_data.json.zst"
)

# Number of months of history used as the training window.
TRAINING_MONTHS = 6
|
|
|
|
|
2019-09-11 21:17:02 +03:00
|
|
|
|
|
|
|
class Retriever(object):
    """Retrieve push data and turn it into the test scheduling history DB.

    Two independent operations are offered:

    * ``retrieve_push_data`` runs the ``push_data`` adr recipe and stores its
      JSON output together with an archive of the adr cache.
    * ``generate_test_scheduling_history`` combines that push data with the
      commits DB and appends per-task failure statistics to the test
      scheduling DB.
    """

    def __init__(self):
        # Both operations read and write files under ./data.
        os.makedirs("data", exist_ok=True)

    def retrieve_push_data(self):
        """Run the ``push_data`` recipe and archive its output and cache.

        Side effects (all relative to the current working directory):
        downloads and extracts ``adr_cache.tar.xz`` when ``data/adr_cache``
        does not exist yet, overwrites ``~/.config/adr/config.toml``, writes
        ``push_data.json``, creates ``data/adr_cache.tar.xz`` and compresses
        ``push_data.json`` with ``zstd_compress``.

        Raises:
            subprocess.CalledProcessError: if ``run-adr`` exits with a
                non-zero status.
            AssertionError: if the downloaded cache archive did not contain
                ``data/adr_cache``.
        """
        # Download previous cache.
        cache_path = os.path.abspath("data/adr_cache")
        if not os.path.exists(cache_path):
            cache_available = True
            try:
                download_check_etag(ADR_CACHE_URL, "adr_cache.tar.xz")
            except requests.exceptions.HTTPError:
                logger.info("The adr cache is not available yet, trying fallback...")
                try:
                    download_check_etag(OLD_ADR_CACHE_URL, "adr_cache.tar.xz")
                except requests.exceptions.HTTPError:
                    # Not fatal: the recipe simply starts with a cold cache.
                    logger.info("The adr cache fallback is not available...")
                    cache_available = False

            if cache_available:
                # NOTE(review): extractall() trusts the member paths of the
                # downloaded archive; this assumes the artifact source is
                # trusted.
                with tarfile.open("adr_cache.tar.xz", "r:xz") as tar:
                    tar.extractall()
                assert os.path.exists("data/adr_cache"), "Decompressed adr cache exists"

        # Setup adr cache configuration.
        os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True)
        with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as f:
            f.write(
                f"""[adr.cache.stores]
file = {{ driver = "file", path = "{cache_path}" }}
"""
            )

        # We'll use the past TRAINING_MONTHS months only for training the model,
        # but we use 3 months more than that to calculate the failure statistics.
        subprocess.run(
            [
                "run-adr",
                "ahal/ci-recipes",
                "recipe",
                "-o",
                os.path.abspath("push_data.json"),
                "-f",
                "json",
                "push_data",
                "--",
                "--from",
                f"today-{TRAINING_MONTHS + 3}month",
                "--to",
                "today-2day",
                "--branch",
                "autoland",
            ],
            check=True,
            # Redirect to /dev/null, as the logs are too big otherwise.
            stdout=subprocess.DEVNULL,
        )

        # Archive the (possibly updated) cache so a later run can reuse it.
        with tarfile.open("data/adr_cache.tar.xz", "w:xz") as tar:
            tar.add("data/adr_cache")

        zstd_compress("push_data.json")

    def generate_test_scheduling_history(self):
        """Append per-task failure statistics to the test scheduling DB.

        Reads ``push_data.json`` (downloading and decompressing it first if it
        is missing), walks the commits DB, and for every commit that has push
        data yields one row per considered task. Rows are only emitted for
        pushes newer than ``TRAINING_MONTHS`` months, but the failure counters
        are updated for every processed push. The counters and the current
        push number are persisted in ``data/past_failures.pickle`` so that a
        later run can resume after the last revision already in the DB.

        Raises:
            AssertionError: if ``push_data.json`` is still missing after the
                download and decompression step.
        """
        if not os.path.exists("push_data.json"):
            download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
            zstd_decompress("push_data.json")
            assert os.path.exists(
                "push_data.json"
            ), "Decompressed push data file exists"

        # Get the commits DB.
        if db.is_old_version(repository.COMMITS_DB) or not db.exists(
            repository.COMMITS_DB
        ):
            db.download(repository.COMMITS_DB, force=True)

        HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

        with open("push_data.json", "r") as f:
            data = json.load(f)

        # The first row is skipped (presumably a header row - confirm against
        # the recipe output).
        push_data = {}
        for row in data[1:]:
            # Revision -> (all tasks, possible regressions, likely regressions)
            push_data[row[0]] = (row[1], row[2], row[3])

        logger.info(f"push data nodes: {len(push_data)}")

        HISTORICAL_TIMESPAN = 56

        # Names of the five statistics returned by get_and_update_past_failures,
        # in return order; a per-group suffix is appended to build row keys.
        STAT_NAMES = (
            "failures",
            "failures_past_7_pushes",
            "failures_past_14_pushes",
            "failures_past_28_pushes",
            "failures_past_56_pushes",
        )
        # (row key suffix, past_failures type, commit_data key holding the items).
        # A commit_data key of None means the single catch-all item "all".
        FAILURE_GROUPS = (
            ("", "all", None),
            ("_in_types", "type", "types"),
            ("_in_files", "file", "files"),
            ("_in_directories", "directory", "directories"),
            ("_in_components", "component", "components"),
        )

        if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
            db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

            # Exhaust the iterator to find the last stored row. An empty
            # history leaves last_entry as None instead of raising NameError.
            last_entry = None
            for last_entry in test_scheduling.get_test_scheduling_history():
                pass

            last_node = last_entry["rev"] if last_entry is not None else None
        else:
            last_node = None

        # NOTE(review): pickle.load executes arbitrary code from the file;
        # this assumes data/past_failures.pickle comes from a trusted source.
        try:
            with open("data/past_failures.pickle", "rb") as f:
                past_failures, push_num = pickle.load(f)
        except FileNotFoundError:
            past_failures = {}
            push_num = 0

        def get_and_update_past_failures(type_, task, items, push_num, is_regression):
            """Return failure counts for ``task`` over ``items`` and record this push.

            For every item the count stored at ``push_num`` is read, together
            with its difference from the counts 7, 14, 28 and 56 pushes
            earlier. When ``is_regression`` is true the stored count for
            ``push_num`` is then incremented by one.

            Returns:
                A 5-tuple of sums over all items: (total, past 7 pushes,
                past 14 pushes, past 28 pushes, past 56 pushes).
            """
            task_failures = past_failures.setdefault(type_, {}).setdefault(task, {})

            total = prev_7 = prev_14 = prev_28 = prev_56 = 0

            for item in items:
                if item not in task_failures:
                    cur = task_failures[item] = ExpQueue(
                        push_num, HISTORICAL_TIMESPAN + 1, 0
                    )
                else:
                    cur = task_failures[item]

                # Keep this read order: the current push first, then the
                # older ones (ExpQueue indexing may depend on access order -
                # TODO confirm).
                value = cur[push_num]

                total += value
                prev_7 += value - cur[push_num - 7]
                prev_14 += value - cur[push_num - 14]
                prev_28 += value - cur[push_num - 28]
                prev_56 += value - cur[push_num - 56]

                if is_regression:
                    cur[push_num] = value + 1

            return (total, prev_7, prev_14, prev_28, prev_56)

        def generate_data():
            """Yield one test scheduling row per considered task of recent pushes."""
            nonlocal push_num
            commits_with_data = set()
            saved_nodes = set()

            # We can start once we get to the last revision we added in the previous run.
            can_start = last_node is None
            for commit_data in tqdm(repository.get_commits()):
                node = commit_data["node"]

                if node == last_node:
                    can_start = True
                    continue

                if not can_start:
                    continue

                if node not in push_data:
                    continue

                commits_with_data.add(node)

                commit_push_data = push_data[node]

                # The push date is the same for every task of this commit, so
                # parse it once instead of once per task.
                pushdate = dateutil.parser.parse(commit_data["pushdate"])
                is_recent = pushdate > HISTORY_DATE_START

                for task in commit_push_data[0]:
                    # str.startswith accepts a tuple of prefixes.
                    if not task.startswith(JOBS_TO_CONSIDER):
                        continue

                    is_possible_regression = task in commit_push_data[1]
                    is_likely_regression = task in commit_push_data[2]
                    is_regression = is_possible_regression or is_likely_regression

                    # The counters must be updated for every processed push,
                    # even when the row itself is too old to be saved.
                    features = {}
                    for suffix, type_, commit_key in FAILURE_GROUPS:
                        if commit_key is None:
                            items = ["all"]
                        else:
                            items = commit_data[commit_key]

                        stats = get_and_update_past_failures(
                            type_, task, items, push_num, is_regression
                        )
                        for stat_name, stat in zip(STAT_NAMES, stats):
                            features[f"{stat_name}{suffix}"] = stat

                    if is_recent:
                        saved_nodes.add(node)

                        yield {
                            "rev": node,
                            "name": task,
                            **features,
                            "is_possible_regression": is_possible_regression,
                            "is_likely_regression": is_likely_regression,
                        }

                # We no longer need the push data for this node, we can free the memory.
                del push_data[node]

                push_num += 1

            logger.info(f"commits linked to push data: {len(commits_with_data)}")

            logger.info(f"saved push data nodes: {len(saved_nodes)}")

        db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

        zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

        # Persist the counters and push number so the next run can resume.
        with open("data/past_failures.pickle", "wb") as f:
            pickle.dump((past_failures, push_num), f, protocol=pickle.HIGHEST_PROTOCOL)

        zstd_compress("data/past_failures.pickle")
|
|
|
|
|
2019-09-11 21:17:02 +03:00
|
|
|
|
|
|
|
def main():
    """Parse the command line and run the requested Retriever operation."""
    parser = argparse.ArgumentParser(
        description="Retrieve and extract the test scheduling history from ActiveData"
    )
    parser.add_argument(
        "op", choices=["retrieve", "generate"], help="Which operation to perform."
    )
    operation = parser.parse_args().op

    # Arguments are validated before the Retriever is constructed.
    retriever = Retriever()

    if operation == "generate":
        retriever.generate_test_scheduling_history()
    elif operation == "retrieve":
        retriever.retrieve_push_data()


# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|