gmierz 2021-04-14 08:30:40 -04:00
Parent b355731bc6
Commit 288cb553cc
1 changed file with 284 additions and 0 deletions

pageload-summary/summarize.py Executable file

@@ -0,0 +1,284 @@
#!/usr/bin/python3
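"""Summarize pageload test numbers from a telemetry CSV export.

Example invocation (the file and platform names are illustrative):

    python3 summarize.py query_results.csv \
        --platforms windows10-64-shippable \
        --timespan 24 \
        --output results/summary.json
"""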
import argparse
import csv
import datetime
import json
import os
import pathlib

import matplotlib.dates as md
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats.mstats import gmean

def summary_parser():
    parser = argparse.ArgumentParser(
        description="This tool can be used to generate a summary of the pageload "
        "numbers for a single given subtest, e.g., ContentfulSpeedIndex. We summarize "
        "the values with a geometric mean, and you can also compare competing "
        "browsers using `--compare-browsers`. You must provide data in the CSV "
        "format that is returned from this query: "
        "https://sql.telemetry.mozilla.org/queries/79289"
    )
parser.add_argument("data", metavar="CSV_DATA", type=str,
help="The data to summarize.")
parser.add_argument("--compare-browsers", action="store_true", default=False,
help="Provide a comparison between the browsers found.")
parser.add_argument("--timespan", type=int, default=24,
help="Minimum time between each data point in hours.")
parser.add_argument("--platforms", nargs="*", default=[],
help="Platforms to summarize. Default is all platforms.")
parser.add_argument("--output", type=str, default=os.getcwd(),
help="This is where the data will be saved in JSON format. If the "
"path has a `.json` suffix then we'll use the part immediately "
"before it as the file name.")
    return parser

def open_csv_data(path):
"""Opens a CSV data file from a given path."""
rows = []
with path.open() as f:
reader = csv.reader(f)
for row in reader:
rows.append(row)
return rows
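

# Fields that organize_data() below looks up in the CSV (see the query
# linked in the parser description): platform, suite, extra_options,
# tags, value, push_timestamp, application.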
def get_data_ind(data, fieldname):
"""Returns an index for the requested field."""
for i, entry in enumerate(data[0]):
if fieldname in entry:
return i
return None


def organize_data(data, platforms):
    """Organizes the data into a format that is easier to handle.

    Returns a nested dict keyed as
    platform -> application -> variant -> pageload type -> test name,
    where each leaf holds the test's extra_options and its values
    grouped by push timestamp.
    """
platform_ind = get_data_ind(data, "platform")
test_ind = get_data_ind(data, "suite")
extra_ind = get_data_ind(data, "extra_options")
tag_ind = get_data_ind(data, "tags")
val_ind = get_data_ind(data, "value")
time_ind = get_data_ind(data, "push_timestamp")
app_ind = get_data_ind(data, "application")
org_data = {}
for entry in data[1:]:
platform = entry[platform_ind]
if platforms and platform not in platforms:
continue
test = entry[test_ind]
app = entry[app_ind]
extras = entry[extra_ind].split()
tags = entry[tag_ind].split()
variants = "e10s"
pl_type = "cold"
# Without this, we might start pulling in data
# from mozperftest tests
if "warm" not in extras and "cold" not in extras:
continue
# Make sure we always ignore live site data
if "live" in extras:
continue
if "warm" in extras:
pl_type = "warm"
if "fission" in extras:
variants += "fission-"
if "webrender" in extras:
variants += "webrender"
# Newer data no longer has the nocondprof option
if "nocondprof" in extras:
extras.remove("nocondprof")
# Older data didn't have this flag
if "visual" not in extras:
extras.append("visual")
if variants != "e10s":
variants = variants.replace("e10s", "")
mod_test_name = f"{test}-{app}" + "-".join(sorted(extras))
test_data = org_data.setdefault(
platform, {}
).setdefault(
app, {}
).setdefault(
variants, {}
).setdefault(
pl_type, {}
).setdefault(
mod_test_name, {}
)
# Make sure we're never mixing data
if "extra_options" in test_data:
assert test_data["extra_options"] == set(list(extras))
else:
test_data["extra_options"] = set(list(extras))
test_data.setdefault("values", {}).setdefault(
entry[time_ind], []
).append(float(entry[val_ind]))
if not org_data:
        possible_platforms = {entry[platform_ind] for entry in data[1:]}
raise Exception(
"Could not find any requested platforms in the data. Possible choices are: "
f"{possible_platforms}"
)
return org_data


def temporal_aggregation(times, timespan=24):
"""Aggregates times formatted like `YYYY-mm-dd HH:MM`.
After aggregation, the result will contain lists of all
points that were grouped together. Timespan distancing
starts from the newest data point.
"""
aggr_times = []
diff = datetime.timedelta(hours=timespan)
curr = []
for t in sorted(times)[::-1]:
dt = datetime.datetime.strptime(t, "%Y-%m-%d %H:%M")
if len(curr) == 0:
curr.append(dt)
elif curr[0] - dt < diff:
curr.append(dt)
else:
aggr_times.append([c.strftime("%Y-%m-%d %H:%M") for c in curr])
curr = [dt]
    # Keep the final (oldest) group, which the loop above never appends
    if curr:
        aggr_times.append([c.strftime("%Y-%m-%d %H:%M") for c in curr])
    return aggr_times[::-1]


def summarize(data, platforms, timespan):
    """Summarizes the organized data into one geometric mean per
    aggregated push window for each platform/app/variant/pageload type.
    """
org_data = organize_data(data, platforms)
summary = {}
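    # Resulting shape (sketch):
    # {platform: {app: {variant: {pl_type: {"values": [(push time, geomean), ...]}}}}}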
for platform, apps in org_data.items():
for app, variants in apps.items():
for variant, pl_types in variants.items():
for pl_type, tests in pl_types.items():
# Get all the push times and aggregate them
all_push_times = []
for _, info in tests.items():
all_push_times.extend(list(info["values"].keys()))
all_push_times = temporal_aggregation(list(set(all_push_times)), timespan)
                    # Get a summary value for each push window: average each
                    # test's replicates, then take the geometric mean across tests
summarized_vals = []
for times in sorted(all_push_times):
vals = {}
for time in times:
for test, info in tests.items():
if time not in info["values"]:
continue
vals.setdefault(test, []).extend(info["values"][time])
vals = [np.mean(v) for _, v in vals.items()]
summarized_vals.append((times[-1], gmean(np.asarray(vals))))
summary.setdefault(
platform, {}
).setdefault(
app, {}
).setdefault(
variant, {}
)[pl_type] = {
"values": summarized_vals,
}
return summary


def view(summary):
    """A simple visualization to show the metric. It can be
    modified to anything.
    """
    for platform, apps in summary.items():
        for app, variants in apps.items():
            for variant, pl_types in variants.items():
plt.figure()
figc = 1
                # One subplot per pageload type (cold and warm)
                for pl_type, vals in pl_types.items():
                    plt.subplot(1, 2, figc)
                    figc += 1
variant = variant if variant != "None" else "e10s"
plt.title(platform + f"\n{app}-{pl_type}-{variant}")
times = [
datetime.datetime.strptime(x, "%Y-%m-%d %H:%M")
for x, y in vals["values"]
]
vals = [y for x, y in vals["values"]]
md_times = md.date2num(times)
ax = plt.gca()
xfmt = md.DateFormatter('%Y-%m-%d %H:%M:%S')
ax.xaxis.set_major_formatter(xfmt)
plt.xticks(rotation=25)
plt.plot(md_times, vals)
plt.show()


def main():
args = summary_parser().parse_args()
# Check data path and setup output
data_path = pathlib.Path(args.data)
if not data_path.exists():
raise Exception(f"The given data file doesn't exist: {args.data}")
output_folder = pathlib.Path(args.output)
output_file = "summary.json"
if output_folder.exists() and output_folder.is_file():
print(f"Deleting existing JSON file at: {output_folder}")
output_folder.unlink()
if not output_folder.exists():
if pathlib.Path(output_folder.parts[-1]).suffixes:
# A JSON file name was given
output_file = output_folder.parts[-1]
output_folder = pathlib.Path(*output_folder.parts[:-1])
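            # e.g. --output /tmp/results/my_summary.json (hypothetical path)
            # writes my_summary.json into the /tmp/results/ folder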
output_folder.mkdir(parents=True, exist_ok=True)
# Process the data and visualize the results (after saving)
data = open_csv_data(data_path)
results = summarize(data, args.platforms, args.timespan)
with pathlib.Path(output_folder, output_file).open("w") as f:
json.dump(results, f)
view(results)
if __name__ == "__main__":
main()