Bug 1700915: Python scripts that fetch and analyze QM_TRY errors. r=dom-storage-reviewers,janv

These scripts are not shipped with Firefox but serve to analyze the
telemetry data sent by the QM_TRY family of macros.

Differential Revision: https://phabricator.services.mozilla.com/D110277
Jens Stutte 2021-04-08 10:01:36 +00:00
Parent 9ac846cb5c
Commit 9f7b34c24e
7 changed files: 821 additions and 0 deletions

dom/quota/scripts/analyze_qm_failures.py
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import sys
import getopt
import utils
import stackanalysis
import fn_anchors
"""
The analysis is based on stack frames of the following form:
[
{
"event_timeabs": 1617121013137,
"session_startabs": 1617120840000,
"build_id": "20210329095128",
"client_id": "0013a68f-9893-461a-93d4-2d7a2f85583f",
"session_id": "8cd37159-bd5c-481c-99ad-9eace9ea726a",
"seq": 1,
"context": "Initialization::TemporaryStorage",
"source_file": "dom/localstorage/ActorsParent.cpp",
"source_line": "1018",
"severity": "ERROR",
"result": "NS_ERROR_FILE_NOT_FOUND"
},
...
]
The location of the input file is expected to be found in the
last item of the list inside qmexecutions.json.
"""
def usage():
    print("analyze_qm_failures.py -w <workdir=.>")
    print("")
    print("Analyzes the results from fetch_qm_failures.py's JSON file.")
    print(
        "Writes out several JSON results as files and a bugzilla markup table on stdout."
    )
    print("-w <workdir>: Working directory, default is '.'")
    sys.exit(2)
workdir = "."

try:
    opts, args = getopt.getopt(sys.argv[1:], "w:", ["workdir="])
    for opt, arg in opts:
        if opt == "-w":
            workdir = arg
except getopt.GetoptError:
    usage()
run = utils.getLastRunFromExecutionFile(workdir)
if "numrows" not in run:
    print("No previous execution from fetch_qm_failures.py found.")
    usage()
if run["numrows"] == 0:
    print("The last execution yielded no result.")
    usage()

infile = run["rawfile"]
def getFname(prefix):
    return "{}/{}_until_{}.json".format(workdir, prefix, run["lasteventtime"])


# read rows from the raw JSON file written by fetch_qm_failures.py
rows = utils.readJSONFile(infile)
print("Found {} rows of data.".format(len(rows)))
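# For example, with workdir "." and lasteventtime 1617303855145, getFname("qmerrors")
# yields "./qmerrors_until_1617303855145.json".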
# enrich rows with hg locations
buildids = stackanalysis.extractBuildIDs(rows)
utils.fetchBuildRevisions(buildids)
stackanalysis.constructHGLinks(buildids, rows)
# transform rows to unique stacks
raw_stacks = stackanalysis.collectRawStacks(rows)
all_stacks = stackanalysis.mergeEqualStacks(raw_stacks)
# enrich with function anchors
for stack in all_stacks:
for frame in stack["frames"]:
frame["anchor"] = "{}:{}".format(
frame["source_file"], fn_anchors.getFunctionName(frame["location"])
)
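# Each frame now carries an anchor like "dom/localstorage/ActorsParent.cpp:SomeFunction"
# (the function name is resolved by fn_anchors; "SomeFunction" is a hypothetical example).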
# separate stacks for relevance
error_stacks = []
warn_stacks = []
info_stacks = []
abort_stacks = []
stackanalysis.filterStacksForPropagation(
all_stacks, error_stacks, warn_stacks, info_stacks, abort_stacks
)
run["errorfile"] = getFname("qmerrors")
utils.writeJSONFile(run["errorfile"], error_stacks)
run["warnfile"] = getFname("qmwarnings")
utils.writeJSONFile(run["warnfile"], warn_stacks)
run["infofile"] = getFname("qminfo")
utils.writeJSONFile(run["infofile"], info_stacks)
run["abortfile"] = getFname("qmabort")
utils.writeJSONFile(run["abortfile"], abort_stacks)
utils.updateLastRunToExecutionFile(workdir, run)
# print results to stdout
print("Found {} error stacks.".format(len(error_stacks)))
print("Found {} warning stacks.".format(len(warn_stacks)))
print("Found {} info stacks.".format(len(info_stacks)))
print("Found {} aborted stacks.".format(len(abort_stacks)))
print("")
print("Error stacks:")
print(stackanalysis.printStacks(error_stacks))
print("")
print("Error stacks grouped by anchors:")
anchors = stackanalysis.groupStacksForAnchors(error_stacks)
anchornames = list(anchors.keys())
for a in anchornames:
print(stackanalysis.printStacks(anchors[a]["stacks"]))
print("")
print("")
print("Warning stacks:")
print(stackanalysis.printStacks(warn_stacks))
print("")
print("Info stacks:")
print(stackanalysis.printStacks(info_stacks))
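# Typical workflow (the API key is a placeholder):
#   python3 fetch_qm_failures.py -k <apikey> -w .
#   python3 analyze_qm_failures.py -w .
# The analysis step reads the last run recorded in ./qmexecutions.json by the fetch step.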

dom/quota/scripts/fetch_fn_names.sh
@@ -0,0 +1,17 @@
#!/bin/bash
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# This script assumes rust-code-analysis-cli to be in the PATH.
HG_URL=$1
TEMPDIR=/tmp/fetch_fn_names_$BASHPID
TEMPSRC=$TEMPDIR/src
mkdir "$TEMPDIR"
# Ensure that the `cat` below finds at least one JSON file, even if the
# analysis produces no output.
echo "" > "$TEMPDIR/empty.json"
HG_URL=$(echo "$HG_URL" | sed 's/annotate/raw-file/g')
wget -q -O "$TEMPSRC" "$HG_URL"
rust-code-analysis-cli -m -O json -o "$TEMPDIR" -p "$TEMPSRC"
CONTENT=$(cat "$TEMPDIR"/*.json)
rm -rf "$TEMPDIR"
echo "$CONTENT"
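# Example invocation (revision hash hypothetical); prints the rust-code-analysis
# metrics JSON for the given file to stdout:
#   ./fetch_fn_names.sh https://hg.mozilla.org/mozilla-central/annotate/abcdef123456/dom/quota/ActorsParent.cpp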

dom/quota/scripts/fetch_qm_failures.py
@@ -0,0 +1,140 @@
#!/usr/bin/env python3
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import sys
import getopt
import telemetry
import utils
"""
The analysis is based on the following query:
https://sql.telemetry.mozilla.org/queries/78691/source?p_day=28&p_month=03&p_year=2021
SELECT UNIX_MILLIS(timestamp) AS submit_timeabs,
session_start_time,
submission_date,
build_id,
client_id,
session_id,
event_timestamp,
CAST(mozfun.map.get_key(event_map_values, "seq") AS INT64) AS seq,
mozfun.map.get_key(event_map_values, "context") AS context,
mozfun.map.get_key(event_map_values, "source_file") AS source_file,
mozfun.map.get_key(event_map_values, "source_line") AS source_line,
mozfun.map.get_key(event_map_values, "severity") AS severity,
mozfun.map.get_key(event_map_values, "result") AS result,
FROM telemetry.events
WHERE submission_date >= CAST('{{ year }}-{{ month }}-{{ day }}' AS DATE)
AND event_category='dom.quota.try'
AND build_id >= '{{ build }}'
AND UNIX_MILLIS(timestamp) > {{ last }}
ORDER BY submit_timeabs
LIMIT 600000
We fetch events in chronological order, as we want to keep track of how far our
analysis has already progressed. To accomplish this we record our runs in qmexecutions.json.
[
{
"workdir": ".",
"daysback": 1,
"numrows": 17377,
"lasteventtime": 1617303855145,
"rawfile": "./qmrows_until_1617303855145.json"
}
]
lasteventtime is the highest value of event_timeabs we found in our data.
analyze_qm_failures instead needs the rows to be ordered by
client_id
session_id
seq
Thus we sort the rows accordingly before writing them.
"""
def usage():
    print(
        "fetch_qm_failures.py -k <apikey> -b <minimum build=20210329000000> "
        "-d <days back=1> -l <last event time> -w <workdir=.>"
    )
    print("")
    print("Invokes the query 78691 and stores the result in a JSON file.")
    print("-k <apikey>: Your personal telemetry API key (not the query key!).")
    print("-d <daysback>: Number of days to go back. Default is 1.")
    print("-b <minimum build>: The lowest build id we will fetch data for.")
    print("-l <last event time>: Fetch only events after this. Default is 0.")
    print("-w <workdir>: Working directory, default is '.'")
    sys.exit(2)
days = 1
lasteventtime = 0
key = "undefined"
workdir = "."
minbuild = "20210329000000"
try:
opts, args = getopt.getopt(
sys.argv[1:],
"k:b:d:l:w:",
["key=", "build=", "days=", "lasteventtime=", "workdir="],
)
for opt, arg in opts:
if opt == "-k":
key = arg
elif opt == "-d":
days = int(arg)
elif opt == "-l":
lasteventtime = int(arg)
elif opt == "-b":
minbuild = arg
elif opt == "-w":
workdir = arg
except getopt.GetoptError:
usage()
if key == "undefined":
usage()
start = utils.dateback(days)
year = start.year
month = start.month
day = start.day
run = {}
lastrun = utils.getLastRunFromExecutionFile(workdir)
if "lasteventtime" in lastrun:
lasteventtime = lastrun["lasteventtime"]
run["workdir"] = workdir
run["daysback"] = days
run["minbuild"] = minbuild
p_params = "p_year={:04d}&p_month={:02d}&p_day={:02d}&p_build={}" "&p_last={}".format(
year, month, day, minbuild, lasteventtime
)
print(p_params)
result = telemetry.query(key, 78691, p_params)
rows = result["query_result"]["data"]["rows"]
run["numrows"] = len(rows)
if run["numrows"] > 0:
lasteventtime = telemetry.getLastEventTimeAbs(rows)
run["lasteventtime"] = lasteventtime
rows.sort(
key=lambda row: "{}.{}.{:06d}".format(
row["client_id"], row["session_id"], int(row["seq"])
),
reverse=False,
)
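    # The key groups rows per client and session and orders them by "seq", e.g.
    # "0013a68f-9893-461a-93d4-2d7a2f85583f.8cd37159-bd5c-481c-99ad-9eace9ea726a.000001".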
outfile = "{}/qmrows_until_{}.json".format(workdir, lasteventtime)
utils.writeJSONFile(outfile, rows)
run["rawfile"] = outfile
else:
print("No results found, maybe next time.")
run["lasteventtime"] = lasteventtime
utils.addNewRunToExecutionFile(workdir, run)

dom/quota/scripts/fn_anchors.py
@@ -0,0 +1,64 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import json
import subprocess
cached_functions = {}
def getMetricsJson(src_url):
print("Fetching source for function extraction: {}".format(src_url))
metrics = subprocess.check_output(["./fetch_fn_names.sh", src_url])
try:
return json.loads(metrics)
except ValueError:
return {"kind": "empty", "name": "anonymous", "spaces": []}
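# The fallback above mirrors the top-level shape of a rust-code-analysis
# "spaces" tree, so getSpaceFunctionsRecursive can recurse over it unconditionally.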
def getSpaceFunctionsRecursive(metrics_space):
functions = []
if (
metrics_space["kind"] == "function"
and metrics_space["name"]
and metrics_space["name"] != "<anonymous>"
):
functions.append(
{
"name": metrics_space["name"],
"start_line": int(metrics_space["start_line"]),
"end_line": int(metrics_space["end_line"]),
}
)
for space in metrics_space["spaces"]:
functions += getSpaceFunctionsRecursive(space)
return functions
def getSourceFunctions(src_url):
if src_url not in cached_functions:
metrics_space = getMetricsJson(src_url)
cached_functions[src_url] = getSpaceFunctionsRecursive(metrics_space)
return cached_functions[src_url]
def getFunctionName(location):
    # str.replace returns a new string; the result must be assigned.
    location = location.replace("annotate", "raw-file")
    pieces = location.split("#l")
    src_url = pieces[0]
    line = int(pieces[1])
    closest_name = "<Unknown {}>".format(line)
    closest_start = 0
functions = getSourceFunctions(src_url)
for fn in functions:
if (
fn["start_line"] > closest_start
and line >= fn["start_line"]
and line <= fn["end_line"]
):
closest_start = fn["start_line"]
closest_name = fn["name"]
return closest_name
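# getFunctionName picks the innermost enclosing function: among all functions
# whose start_line/end_line range contains the line, the one with the largest
# start_line wins, which matters for nested definitions.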

dom/quota/scripts/stackanalysis.py
@@ -0,0 +1,327 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Given a set of rows, find all distinct build ids
def extractBuildIDs(rows):
buildids = {}
for row in rows:
id = row["build_id"]
if id in buildids:
buildids[id] = buildids[id] + 1
else:
buildids[id] = 1
return buildids
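# The result maps each build id to the number of rows seen for it,
# e.g. {"20210329095128": 2}.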
# Given a set of build ids and rows, enrich each row by an hg link.
# Relies on the result of utils.fetchBuildRevisions in buildids.
def constructHGLinks(buildids, rows):
for row in rows:
id = row["build_id"]
if id in buildids:
row["location"] = (
buildids[id] + "/" + row["source_file"] + "#l" + row["source_line"]
)
else:
row["location"] = id + "/" + row["source_file"] + "#l" + row["source_line"]
topmost_stackframes = set()
def isTopmostFrame(frame):
f = (frame["location"], frame["result"])
return f in topmost_stackframes
def addTopmostFrame(frame):
f = (frame["location"], frame["result"])
if not isTopmostFrame(frame):
# print("Found new topmost frame {}.".format(frame))
topmost_stackframes.add(f)
frame["topmost"] = True
# A topmost frame is considered to initiate a new
# raw stack. We collect all candidates before we
# actually apply them. This implies that we should
# run this function on a "large enough" sample of rows
# to be more accurate.
# As a side effect, we mark all rows that are part of
# a "complete" session (a session that started within
# our data scope).
def collectTopmostFrames(rows):
prev_cid = "unset"
prev_sid = "unset"
prev_ctx = "unset"
prev_sev = "ERROR"
session_complete = False
after_severity_downgrade = False
for row in rows:
cid = row["client_id"]
sid = row["session_id"]
ctx = row["context"]
seq = row["seq"]
sev = row["severity"]
# If we have a new session, ensure it is complete from start,
# otherwise we will ignore it entirely.
if cid != prev_cid or sid != prev_sid:
if seq == 1:
session_complete = True
else:
session_complete = False
row["session_complete"] = session_complete
if session_complete:
# If we change client, session or context, we can be sure to have
# a new topmost frame.
if seq == 1 or cid != prev_cid or sid != prev_sid or ctx != prev_ctx:
addTopmostFrame(row)
after_severity_downgrade = False
# If we just had a severity downgrade, we assume that we wanted
# to break the error propagation after this point and split, too
elif after_severity_downgrade:
addTopmostFrame(row)
after_severity_downgrade = False
elif prev_sev == "ERROR" and sev != "ERROR":
after_severity_downgrade = True
prev_cid = cid
prev_sid = sid
prev_ctx = ctx
prev_sev = sev
# The delta is expressed in ms. We have seen significant runtime between
# stack frames in the wild. We might want to make this configurable.
# In general we prefer local context over letting some topmost frame
# slip through unrecognized, assuming that fixing the issues one by one
# will uncover the others successively. This is achieved by a rather
# high delta value.
delta = 500
prev_event_time = -99999
for row in rows:
et = int(row["event_timestamp"])
if row["session_complete"]:
if "topmost" not in row and et - prev_event_time >= delta:
addTopmostFrame(row)
prev_event_time = et
else:
prev_event_time = -99999
def getFrameKey(frame):
return "{}.{}|".format(frame["location"], frame["result"])
def getStackKey(stack):
stack_key = ""
for frame in stack["frames"]:
stack_key += getFrameKey(frame)
return hash(stack_key)
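# Note: hash() of a string is randomized per Python process (PYTHONHASHSEED),
# so stack keys are stable only within a single run, which is sufficient for
# mergeEqualStacks below.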
# A "raw stack" is a list of frames that
# - share the same build_id (implicitly through location),
# - share the same client_id,
# - share the same session_id,
# - have growing sequence numbers,
# - stop at the first downgrade of severity from ERROR,
# - XXX: contain each location at most once (no recursion),
# - span a reasonably short timeframe.
# collectRawStacks also calculates a hash key that identifies identical stacks.
def collectRawStacks(rows):
collectTopmostFrames(rows)
raw_stacks = []
stack = {
"stack_id": "unset",
"client_id": "unset",
"session_id": "unset",
"submit_timeabs": "unset",
"frames": [{"location": "unset"}],
}
stack_id = 1
first = True
for row in rows:
if isTopmostFrame(row):
if not first:
stack["stack_key"] = getStackKey(stack)
raw_stacks.append(stack)
stack_id += 1
stack = {
"stack_id": stack_id,
"client_id": row["client_id"],
"session_id": row["session_id"],
"submit_timeabs": row["submit_timeabs"],
"context": row["context"],
"frames": [],
}
stack["frames"].append(
{
"location": row["location"],
"source_file": row["source_file"],
"source_line": row["source_line"],
"seq": row["seq"],
"severity": row["severity"],
"result": row["result"],
}
)
first = False
return raw_stacks
# Merge all stacks that have the same hash key and count occurrences.
# Relies on the ordering per client_id/session_id for correct counting.
def mergeEqualStacks(raw_stacks):
merged_stacks = {}
last_client_id = "none"
last_session_id = "none"
for stack in raw_stacks:
stack_key = stack["stack_key"]
merged_stack = stack
if stack_key in merged_stacks:
merged_stack = merged_stacks[stack_key]
if stack["client_id"] != last_client_id:
last_client_id = stack["client_id"]
merged_stack["client_count"] += 1
if stack["session_id"] != last_session_id:
last_session_id = stack["session_id"]
merged_stack["session_count"] += 1
merged_stack["hit_count"] += 1
else:
merged_stack["client_count"] = 1
last_client_id = merged_stack["client_id"]
merged_stack["session_count"] = 1
last_session_id = merged_stack["session_id"]
merged_stack["hit_count"] = 1
merged_stacks[stack_key] = merged_stack
merged_list = list(merged_stacks.values())
merged_list.sort(key=lambda x: x.get("client_count"), reverse=True)
return merged_list
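# The counting above relies on raw stacks arriving ordered by client_id and
# session_id (as prepared by fetch_qm_failures.py): each change of client or
# session bumps the respective counter of the merged stack exactly once.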
# Split the list of stacks into:
# - aborted (has at least one frame with NS_ERROR_ABORT)
# - info/warning (has at least one frame with that severity)
# - error (has only error frames)
def filterStacksForPropagation(
all_stacks, error_stacks, warn_stacks, info_stacks, abort_stacks
):
for stack in all_stacks:
warn = list(filter(lambda x: x["severity"] == "WARNING", stack["frames"]))
info = list(filter(lambda x: x["severity"] == "INFO", stack["frames"]))
abort = list(filter(lambda x: x["result"] == "NS_ERROR_ABORT", stack["frames"]))
if len(abort) > 0:
abort_stacks.append(stack)
elif len(info) > 0:
info_stacks.append(stack)
elif len(warn) > 0:
warn_stacks.append(stack)
else:
error_stacks.append(stack)
# Bugzilla comment markup
def printStacks(stacks):
out = ""
row_format = "{} | {} | {} | {} | {}\n"
out += row_format.format("Clients", "Sessions", "Hits", "Anchor", "Stack")
out += row_format.format("-------", "-------", "--------", "--------", "--------")
for stack in stacks:
framestr = ""
first = True
for frame in stack["frames"]:
if not first:
framestr += " <- "
framestr += "[{}#{}:{}]({})".format(
frame["source_file"],
frame["source_line"],
frame["result"],
frame["location"],
)
first = False
out += row_format.format(
stack["client_count"],
stack["session_count"],
stack["hit_count"],
stack["frames"][0]["anchor"],
framestr,
)
return out
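# Sketch of the resulting Bugzilla markup (counts, anchor, and link are illustrative):
# Clients | Sessions | Hits | Anchor | Stack
# ------- | ------- | -------- | -------- | --------
# 2 | 3 | 5 | ActorsParent.cpp:SomeFunction | [dom/localstorage/ActorsParent.cpp#1018:NS_ERROR_FILE_NOT_FOUND](<hg link>)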
def groupStacksForAnchors(stacks):
anchors = {}
for stack in stacks:
anchor_name = stack["frames"][0]["anchor"]
if anchor_name in anchors:
anchors[anchor_name]["stacks"].append(stack)
else:
anchor = {"anchor": anchor_name, "stacks": [stack]}
anchors[anchor_name] = anchor
return anchors
"""
def getSummaryForAnchor(anchor):
return "[QM_TRY] Errors in function {}".format(anchor)
def searchBugForAnchor(bugzilla_key, anchor):
summary = getSummaryForAnchor(anchor)
bug_url = "https://bugzilla.mozilla.org/rest/bug?" \
"summary={}&api_key={}".format(summary, bugzilla_key)
return requests.get(url=bug_url).json()["bugs"]
def createBugForAnchor(bugzilla_key, anchor):
summary = getSummaryForAnchor(anchor)
bug_url = "https://bugzilla.mozilla.org/rest/bug?" \
"Bugzilla_api_key={}".format(bugzilla_key)
    body = {
        "product": "Core",
        "component": "Storage: Quota Manager",
        "version": "unspecified",
        "summary": summary,
        "description": "This bug collects errors reported by QM_TRY "
        "macros for function {}.".format(anchor),
    }
resp = requests.post(url=bug_url, json=body)
if resp.status_code != 200:
print(resp)
return 0
id = resp.json()["id"]
print("Added new bug {}:".format(id))
return id
def ensureBugForAnchor(bugzilla_key, anchor):
buglist = searchBugForAnchor(bugzilla_key, anchor)
if (len(buglist) > 0):
id = buglist[0]["id"]
print("Found existing bug {}:".format(id))
return id
return createBugForAnchor(bugzilla_key, anchor)
def addCommentForAnchor(bugzilla_key, anchor, stacks):
id = ensureBugForAnchor(bugzilla_key, anchor)
if (id <= 0):
print("Unable to create a bug for {}.".format(anchor))
return
comment = printStacks(stacks)
print("")
print("Add comment to bug {}:".format(id))
print(comment)
def addCommentsForStacks(bugzilla_key, stacks):
anchors = groupStacksForAnchors(stacks)
for anchor in anchors:
addCommentForAnchor(bugzilla_key, anchors[anchor]["anchor"], anchors[anchor]["stacks"])
"""

dom/quota/scripts/telemetry.py
@@ -0,0 +1,53 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import requests
import time
def query(key, query, p_params):
headers = {"Authorization": "Key {}".format(key)}
start_url = "https://sql.telemetry.mozilla.org/api/" "queries/{}/refresh?{}".format(
query, p_params
)
print(start_url)
resp = requests.post(url=start_url, headers=headers)
job = resp.json()["job"]
jid = job["id"]
print("Started job {}".format(jid))
poll_url = "https://sql.telemetry.mozilla.org/api/" "jobs/{}".format(jid)
print(poll_url)
poll = True
status = 0
qresultid = 0
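    # Redash reports job status as: 1 = pending, 2 = started, 3 = finished,
    # 4 = failed, 5 = cancelled; status > 2 therefore means the job is done.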
while poll:
print(".", end="", flush=True)
resp = requests.get(url=poll_url, headers=headers)
status = resp.json()["job"]["status"]
if status > 2:
print(resp.json())
poll = False
qresultid = resp.json()["job"]["query_result_id"]
else:
time.sleep(0.2)
print(".")
print("Finished with status {}".format(status))
    if status == 3:
        fetch_url = (
            "https://sql.telemetry.mozilla.org/api/"
            "queries/{}/results/{}.json".format(query, qresultid)
        )
        print(fetch_url)
        resp = requests.get(url=fetch_url, headers=headers)
        return resp.json()
    # Return an empty row list on failure so callers can safely len() it.
    return {"query_result": {"data": {"rows": []}}}
def getLastEventTimeAbs(rows):
if len(rows) == 0:
return 0
return rows[len(rows) - 1]["submit_timeabs"]
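# Minimal usage sketch (hypothetical API key; parameters as built by
# fetch_qm_failures.py):
#   result = query("my_api_key", 78691,
#                  "p_year=2021&p_month=03&p_day=28&p_build=20210329000000&p_last=0")
#   rows = result["query_result"]["data"]["rows"]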

dom/quota/scripts/utils.py
@@ -0,0 +1,88 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import json
import datetime
import requests
def readJSONFile(FileName):
    with open(FileName, "r") as f:
        return json.load(f)
def writeJSONFile(FileName, Content):
with open(FileName, "w") as outfile:
json.dump(Content, outfile, indent=4)
def dateback(days):
today = datetime.date.today()
delta = datetime.timedelta(days)
return today - delta
def lastweek():
today = datetime.date.today()
delta = datetime.timedelta(days=7)
return today - delta
# Given a set of build ids, fetch the repository base URL for each id.
def fetchBuildRevisions(buildids):
buildhub_url = "https://buildhub.moz.tools/api/search"
delids = {}
for bid in buildids:
print("Fetching revision for build {}.".format(bid))
body = {"size": 1, "query": {"term": {"build.id": bid}}}
resp = requests.post(url=buildhub_url, json=body)
hits = resp.json()["hits"]["hits"]
if len(hits) > 0:
buildids[bid] = (
hits[0]["_source"]["source"]["repository"]
+ "/annotate/"
+ hits[0]["_source"]["source"]["revision"]
)
else:
print("No revision for build.id {}".format(bid))
delids[bid] = "x"
for bid in delids:
buildids.pop(bid)
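# Afterwards buildids maps each remaining build id to an hg annotate base URL,
# e.g. "20210329095128" -> "https://hg.mozilla.org/mozilla-central/annotate/<rev>"
# (revision placeholder); ids without a buildhub hit have been removed.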
def readExecutionFile(workdir):
exefile = "{}/qmexecutions.json".format(workdir)
try:
return readJSONFile(exefile)
except OSError:
return []
def writeExecutionFile(workdir, executions):
exefile = "{}/qmexecutions.json".format(workdir)
try:
writeJSONFile(exefile, executions)
except OSError:
print("Error writing execution record.")
def getLastRunFromExecutionFile(workdir):
executions = readExecutionFile(workdir)
if len(executions) > 0:
return executions[len(executions) - 1]
return {}
def updateLastRunToExecutionFile(workdir, run):
executions = readExecutionFile(workdir)
executions[len(executions) - 1] = run
writeExecutionFile(workdir, executions)
def addNewRunToExecutionFile(workdir, run):
executions = readExecutionFile(workdir)
executions.append(run)
writeExecutionFile(workdir, executions)