Python: CG trace: reconstruct call expr from bytecode

So we can differentiate multiple calls in one line.
This commit is contained in:
Rasmus Wriedt Larsen 2020-07-20 11:28:05 +02:00
Родитель d46b410111
Коммит c2748bf7cf
9 изменённых файлов: 447 добавлений и 71 удалений

Просмотреть файл

@ -1,6 +1,105 @@
<root>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="1" call_inst_index="4" externalcallee_module="builtins" externalcallee_qualname="print" externalcallee_is_builtin="True"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="2" call_inst_index="12" externalcallee_module="builtins" externalcallee_qualname="len" externalcallee_is_builtin="True"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="4" call_inst_index="28" externalcallee_module="None" externalcallee_qualname="list.append" externalcallee_is_builtin="True"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py" call_linenum="7" call_inst_index="44" externalcallee_module="sys" externalcallee_qualname="getdefaultencoding" externalcallee_is_builtin="True"/>
<info>
<cg_trace_version>0.0.2</cg_trace_version>
<args>--xml example-traces/builtins.xml example/builtins.py</args>
<exit_status>completed</exit_status>
<elapsed>0.00 seconds</elapsed>
<utctimestamp>2020-07-20T09:27:00</utctimestamp>
</info>
<recorded_calls>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py</filename>
<linenum>1</linenum>
<inst_index>4</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>print</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<ExternalCallee>
<module>builtins</module>
<qualname>print</qualname>
<is_builtin>True</is_builtin>
</ExternalCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py</filename>
<linenum>2</linenum>
<inst_index>12</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>len</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<ExternalCallee>
<module>builtins</module>
<qualname>len</qualname>
<is_builtin>True</is_builtin>
</ExternalCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py</filename>
<linenum>4</linenum>
<inst_index>28</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeAttribute>
<attr_name>append</attr_name>
<object>
<BytecodeVariableName>
<name>l</name>
</BytecodeVariableName>
</object>
</BytecodeAttribute>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<ExternalCallee>
<module>None</module>
<qualname>list.append</qualname>
<is_builtin>True</is_builtin>
</ExternalCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/builtins.py</filename>
<linenum>7</linenum>
<inst_index>44</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeAttribute>
<attr_name>getdefaultencoding</attr_name>
<object>
<BytecodeVariableName>
<name>sys</name>
</BytecodeVariableName>
</object>
</BytecodeAttribute>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<ExternalCallee>
<module>sys</module>
<qualname>getdefaultencoding</qualname>
<is_builtin>True</is_builtin>
</ExternalCallee>
</recorded_call>
</recorded_calls>
</root>

Просмотреть файл

@ -1,8 +1,137 @@
<root>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="2" call_inst_index="4" externalcallee_module="builtins" externalcallee_qualname="print" externalcallee_is_builtin="True"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="5" call_inst_index="4" externalcallee_module="builtins" externalcallee_qualname="print" externalcallee_is_builtin="True"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="7" call_inst_index="18" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="1" pythoncallee_funcname="foo"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="8" call_inst_index="24" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="4" pythoncallee_funcname="bar"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="30" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="1" pythoncallee_funcname="foo"/>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="36" pythoncallee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" pythoncallee_linenum="4" pythoncallee_funcname="bar"/>
<info>
<cg_trace_version>0.0.2</cg_trace_version>
<args>--xml example-traces/simple.xml example/simple.py</args>
<exit_status>completed</exit_status>
<elapsed>0.00 seconds</elapsed>
<utctimestamp>2020-07-20T09:27:01</utctimestamp>
</info>
<recorded_calls>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>2</linenum>
<inst_index>4</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>print</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<ExternalCallee>
<module>builtins</module>
<qualname>print</qualname>
<is_builtin>True</is_builtin>
</ExternalCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>5</linenum>
<inst_index>4</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>print</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<ExternalCallee>
<module>builtins</module>
<qualname>print</qualname>
<is_builtin>True</is_builtin>
</ExternalCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>7</linenum>
<inst_index>18</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>foo</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<PythonCallee>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>1</linenum>
<funcname>foo</funcname>
</PythonCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>8</linenum>
<inst_index>24</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>bar</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<PythonCallee>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>4</linenum>
<funcname>bar</funcname>
</PythonCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>10</linenum>
<inst_index>30</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>foo</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<PythonCallee>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>1</linenum>
<funcname>foo</funcname>
</PythonCallee>
</recorded_call>
<recorded_call>
<Call>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>10</linenum>
<inst_index>36</inst_index>
<bytecode_expr>
<BytecodeCall>
<function>
<BytecodeVariableName>
<name>bar</name>
</BytecodeVariableName>
</function>
</BytecodeCall>
</bytecode_expr>
</Call>
<PythonCallee>
<filename>/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py</filename>
<linenum>4</linenum>
<funcname>bar</funcname>
</PythonCallee>
</recorded_call>
</recorded_calls>
</root>

Просмотреть файл

@ -4,7 +4,7 @@ from setuptools import find_packages, setup
setup(
name="cg_trace",
version="0.0.1",
version="0.0.2", # Remember to update src/cg_trace/__init__.py
description="Call graph tracing",
packages=find_packages("src"),
package_dir={"": "src"},

Просмотреть файл

@ -1,6 +1,6 @@
import sys
__version__ = "0.0.1"
__version__ = "0.0.2" # remember to update setup.py
# Since the virtual machine opcodes changed in 3.6, not going to attempt to support
# anything before that. Using dataclasses, which is a new feature in Python 3.7

Просмотреть файл

@ -0,0 +1,135 @@
import dataclasses
import dis
import logging
from dis import Instruction
from types import FrameType
from typing import List
LOGGER = logging.getLogger(__name__)
# See https://docs.python.org/3/library/dis.html#python-bytecode-instructions for
# details on the bytecode instructions
# TODO: read https://opensource.com/article/18/4/introduction-python-bytecode
class BytecodeExpr:
"""An expression reconstructed from Python bytecode
"""
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class BytecodeVariableName(BytecodeExpr):
name: str
def __str__(self):
return self.name
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class BytecodeAttribute(BytecodeExpr):
attr_name: str
object: BytecodeExpr
def __str__(self):
return f"{self.object}.{self.attr_name}"
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class BytecodeCall(BytecodeExpr):
function: BytecodeExpr
def __str__(self):
return f"{self.function}()"
@dataclasses.dataclass(frozen=True, eq=True, order=True)
class BytecodeUnknown(BytecodeExpr):
opname: str
def __str__(self):
return f"<{self.opname}>"
def find_inst_that_added_elem_to_stack(
instructions: List[Instruction], start_index: int, stack_pos: int
):
"""Backwards traverse instructions
Backwards traverse the instructions starting at `start_index` until we find the
instruction that added the element at stack position `stack_pos` (where 0 means top
of stack). For example, if the instructions are:
```
0: LOAD_GLOBAL 0 (func)
1: LOAD_CONST 1 (42)
2: CALL_FUNCTION 1
```
We can look for the function that is called by invoking this function with
`start_index = 1` and `stack_pos = 1`. It will see that `LOAD_CONST` added the top
element to the stack, and find that `LOAD_GLOBAL` was the instruction to add element
in stack position 1 to the stack -- so the index 0 is returned.
It is assumed that if `stack_pos == 0` then the instruction you are looking for is
the one at `instructions[start_index]`. This might not hold, in case of using `NOP`
instructions.
"""
LOGGER.debug(f"find_inst_that_added_elem_to_stack {start_index=} {stack_pos=}")
assert stack_pos >= 0
for inst in reversed(instructions[: start_index + 1]):
if stack_pos == 0:
LOGGER.debug(f"Found it: {inst}")
return instructions.index(inst)
LOGGER.debug(f"Skipping {inst}")
stack_pos -= dis.stack_effect(inst.opcode, inst.arg)
raise Exception("inst_index_for_stack_diff failed")
def expr_from_instruction(instructions: List[Instruction], index: int) -> BytecodeExpr:
inst = instructions[index]
LOGGER.debug(f"expr_from_instruction: {inst} {index=}")
if inst.opname in ["LOAD_GLOBAL", "LOAD_FAST", "LOAD_NAME"]:
return BytecodeVariableName(inst.argval)
# https://docs.python.org/3/library/dis.html#opcode-LOAD_METHOD
# https://docs.python.org/3/library/dis.html#opcode-LOAD_ATTR
elif inst.opname in ["LOAD_METHOD", "LOAD_ATTR"]:
attr_name = inst.argval
obj_index = find_inst_that_added_elem_to_stack(instructions, index - 1, 0)
obj_expr = expr_from_instruction(instructions, obj_index)
return BytecodeAttribute(attr_name=attr_name, object=obj_expr)
# https://docs.python.org/3/library/dis.html#opcode-CALL_FUNCTION
elif inst.opname in ["CALL_FUNCTION", "CALL_METHOD", "CALL_FUNCTION_KW"]:
assert index > 0
assert isinstance(inst.arg, int)
if inst.opname in ["CALL_FUNCTION", "CALL_METHOD"]:
num_stack_elems = inst.arg
elif inst.opname == "CALL_FUNCTION_KW":
num_stack_elems = inst.arg + 1
func_index = find_inst_that_added_elem_to_stack(
instructions, index - 1, num_stack_elems
)
func_expr = expr_from_instruction(instructions, func_index)
return BytecodeCall(function=func_expr)
else:
# LOAD_BUILD_CLASS is included here intentionally for now, since I don't really
# know what to do about it.
LOGGER.warning(f"Don't know how to handle this type of instruction: {inst}")
return BytecodeUnknown(inst.opname)
def expr_from_frame(frame: FrameType) -> BytecodeExpr:
bytecode = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti)
LOGGER.debug(f"bytecode: \n{bytecode.dis()}")
instructions = list(iter(bytecode))
last_instruction_index = [inst.offset for inst in instructions].index(frame.f_lasti)
return expr_from_instruction(instructions, last_instruction_index)

Просмотреть файл

@ -4,7 +4,6 @@ import argparse
def parse(args):
parser = argparse.ArgumentParser()
parser.add_argument("--csv")
parser.add_argument("--xml")
parser.add_argument("progname", help="file to run as main program")

Просмотреть файл

@ -1,57 +1,40 @@
import csv
import dataclasses
from typing import Dict
from lxml import etree
class Exporter:
def dataclass_to_xml(obj, parent):
obj_elem = etree.SubElement(parent, obj.__class__.__name__)
for field in dataclasses.fields(obj):
field_elem = etree.SubElement(obj_elem, field.name)
value = getattr(obj, field.name)
if isinstance(value, (str, int)) or value is None:
field_elem.text = str(value)
elif dataclasses.is_dataclass(value):
dataclass_to_xml(value, field_elem)
else:
raise ValueError(
f"Can't export key {field.name!r} with value {value!r} (type {type(value)}"
)
class XMLExporter:
@staticmethod
def export(recorded_calls, outfile_path):
raise NotImplementedError()
@staticmethod
def dataclass_to_dict(obj):
d = dataclasses.asdict(obj)
prefix = obj.__class__.__name__.lower()
return {f"{prefix}_{key}": val for (key, val) in d.items()}
class CSVExporter(Exporter):
@staticmethod
def export(recorded_calls, outfile_path):
with open(outfile_path, "w", newline="") as csv_file:
writer = None
for (call, callee) in sorted(recorded_calls):
data = {
**Exporter.dataclass_to_dict(call),
**Exporter.dataclass_to_dict(callee),
}
if writer is None:
writer = csv.DictWriter(csv_file, fieldnames=data.keys())
writer.writeheader()
writer.writerow(data)
print(f"output written to {outfile_path}")
class XMLExporter(Exporter):
@staticmethod
def export(recorded_calls, outfile_path):
def export(outfile_path, recorded_calls, info: Dict[str, str]):
root = etree.Element("root")
for (call, callee) in sorted(recorded_calls):
data = {
**Exporter.dataclass_to_dict(call),
**Exporter.dataclass_to_dict(callee),
}
info_elem = etree.SubElement(root, "info")
for k, v in info.items():
etree.SubElement(info_elem, k).text = v
rc = etree.SubElement(root, "recorded_call")
for k, v in data.items():
# xml library only supports serializing attributes that have string values
rc.set(k, str(v))
rcs = etree.SubElement(root, "recorded_calls")
for (call, callee) in sorted(recorded_calls):
rc = etree.SubElement(rcs, "recorded_call")
dataclass_to_xml(call, rc)
dataclass_to_xml(callee, rc)
tree = etree.ElementTree(root)
tree.write(outfile_path, encoding="utf-8", pretty_print=True)

Просмотреть файл

@ -1,10 +1,12 @@
import logging
import os
import sys
import time
from datetime import datetime
from io import StringIO
from cg_trace import cmdline, tracer
from cg_trace.exporter import CSVExporter, XMLExporter
from cg_trace import __version__, cmdline, tracer
from cg_trace.exporter import XMLExporter
def record_calls(code, globals):
@ -17,15 +19,19 @@ def record_calls(code, globals):
sys.stderr = captured_stderr
cgt = tracer.CallGraphTracer()
cgt.run(code, globals, globals)
exit_status = cgt.run(code, globals, globals)
sys.stdout = real_stdout
sys.stderr = real_stderr
return sorted(cgt.recorded_calls), captured_stdout, captured_stderr
return sorted(cgt.recorded_calls), captured_stdout, captured_stderr, exit_status
def main(args=None) -> int:
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
logging.basicConfig(stream=sys.stderr, level=logging.INFO)
from . import bytecode_reconstructor
logging.getLogger(bytecode_reconstructor.__name__).setLevel(logging.INFO)
if args is None:
# first element in argv is program name
@ -49,14 +55,27 @@ def main(args=None) -> int:
"__cached__": None,
}
recorded_calls, captured_stdout, captured_stderr = record_calls(code, globs)
start = time.time()
recorded_calls, captured_stdout, captured_stderr, exit_status = record_calls(
code, globs
)
end = time.time()
elapsed_formatted = f"{end-start:.2f} seconds"
if opts.csv:
CSVExporter.export(recorded_calls, opts.csv)
elif opts.xml:
XMLExporter.export(recorded_calls, opts.xml)
if opts.xml:
XMLExporter.export(
opts.xml,
recorded_calls,
info={
"cg_trace_version": __version__,
"args": " ".join(args),
"exit_status": exit_status,
"elapsed": elapsed_formatted,
"utctimestamp": datetime.utcnow().replace(microsecond=0).isoformat(),
},
)
else:
print("Recorded calls:")
print(f"--- Recorded calls (in {elapsed_formatted}) ---")
for (call, callee) in recorded_calls:
print(f"{call} --> {callee}")

Просмотреть файл

@ -1,11 +1,12 @@
import dataclasses
import dis
import logging
import os
import sys
from types import FrameType
from typing import Optional
from cg_trace.bytecode_reconstructor import BytecodeExpr, expr_from_frame
LOGGER = logging.getLogger(__name__)
@ -42,18 +43,26 @@ class Call:
filename: str
linenum: int
inst_index: int
bytecode_expr: BytecodeExpr
def __str__(self):
d = dataclasses.asdict(self)
del d["bytecode_expr"]
normal_fields = ", ".join(f"{k}={v!r}" for k, v in d.items())
return f"{type(self).__name__}({normal_fields}, bytecode_expr≈{repr(str(self.bytecode_expr))})"
@classmethod
def from_frame(cls, frame: FrameType):
code = frame.f_code
b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti)
LOGGER.debug(f"bytecode: \n{b.dis()}")
bytecode_expr = expr_from_frame(frame)
return cls(
filename=canonic_filename(code.co_filename),
linenum=frame.f_lineno,
inst_index=frame.f_lasti,
bytecode_expr=bytecode_expr,
)
@ -165,9 +174,11 @@ class CallGraphTracer:
sys.setprofile(self.profilefunc)
exec(code, globals, locals)
sys.setprofile(None)
return "completed"
except Exception:
sys.setprofile(None)
LOGGER.info("Exception occurred while running program:", exc_info=True)
return "exception occurred"
def profilefunc(self, frame: FrameType, event: str, arg):
# ignore everything until the first call, since that is `exec` from the `run`
@ -191,6 +202,7 @@ class CallGraphTracer:
LOGGER.debug(f"profilefunc {event=}")
if event == "call":
assert frame.f_back is not None
# in call, the `frame` argument is new the frame for entering the callee
call = Call.from_frame(frame.f_back)
callee = PythonCallee.from_frame(frame)