зеркало из https://github.com/github/codeql.git
Merge pull request #3911 from RasmusWL/python-call-graph-tracing
Approved by tausbn
This commit is contained in:
Коммит
0bee0687cb
|
@ -0,0 +1,17 @@
|
|||
# Recorded Call Graph Metrics
|
||||
|
||||
also known as _call graph tracing_.
|
||||
|
||||
Execute a Python program and, for each call being made, record the call and callee. This allows us to compare call graph resolution from static analysis with actual data -- that is, can we statically determine the target of each actual call correctly?
|
||||
|
||||
This is still in the early stages, and currently only supports a very minimal working example (to show that this approach might work).
|
||||
|
||||
The next hurdle is being able to handle multiple calls on the same line, such as
|
||||
|
||||
- `foo(); bar()`
|
||||
- `foo(bar())`
|
||||
- `foo().bar()`
|
||||
|
||||
## How do I give it a spin?
|
||||
|
||||
Run the `recreate-db.sh` script to create the database `cg-trace-example-db`, which will include the `example/simple.xml` trace from executing the `example/simple.py` code. Then run the queries inside the `ql/` directory.
|
|
@ -0,0 +1,222 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
"""Call Graph tracing.
|
||||
|
||||
Execute a python program and for each call being made, record the call and callee. This
|
||||
allows us to compare call graph resolution from static analysis with actual data -- that
|
||||
is, can we statically determine the target of each actual call correctly.
|
||||
|
||||
If there is 100% code coverage from the Python execution, it would also be possible to
|
||||
look at the precision of the call graph resolutions -- that is, do we expect a function to
|
||||
be able to be called in a place where it is not? Currently not something we're looking at.
|
||||
"""
|
||||
|
||||
# read: https://eli.thegreenplace.net/2012/03/23/python-internals-how-callables-work/
|
||||
|
||||
# TODO: Know that a call to a C-function was made. See
|
||||
# https://docs.python.org/3/library/bdb.html#bdb.Bdb.trace_dispatch. Maybe use `lxml` as
|
||||
# test
|
||||
|
||||
# For inspiration, look at these projects:
|
||||
# - https://github.com/joerick/pyinstrument (capture call-stack every <n> ms for profiling)
|
||||
# - https://github.com/gak/pycallgraph (display call-graph with graphviz after python execution)
|
||||
|
||||
import argparse
|
||||
import bdb
|
||||
from io import StringIO
|
||||
import sys
|
||||
import os
|
||||
import dis
|
||||
import dataclasses
|
||||
import csv
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
# Copy-Paste and uncomment for interactive ipython sessions
|
||||
# import IPython; IPython.embed(); sys.exit()
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class Call:
    """A call site observed while tracing.

    Instances are frozen (and therefore hashable) so they can be stored in a set.

    Attributes:
        filename: file containing the call, as returned by ``debugger.canonic``.
        linenum: ``f_lineno`` of the calling frame.
        inst_index: ``f_lasti`` of the calling frame. Together with ``linenum`` this
            is what tells apart several calls made from the same line.
    """

    filename: str
    linenum: int
    inst_index: int

    @classmethod
    def from_frame(cls, frame, debugger: bdb.Bdb):
        """Build a ``Call`` from the frame that performed the call.

        Args:
            frame: the *calling* frame; its ``f_code``, ``f_lineno`` and ``f_lasti``
                are read.
            debugger: used only for ``canonic`` to normalise the filename.
        """
        code = frame.f_code

        # Uncomment to see the bytecode
        # b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti)
        # print(b.dis(), file=sys.__stderr__)

        return cls(
            filename=debugger.canonic(code.co_filename),
            linenum=frame.f_lineno,
            inst_index=frame.f_lasti,
        )
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class Callee:
    """A callee (Function/Lambda/???)

    Should (hopefully) be uniquely identified by its name and location (filename + line
    number).

    Attributes:
        funcname: ``co_name`` of the callee's code object.
        filename: file defining the callee, as returned by ``debugger.canonic``.
        linenum: ``f_lineno`` of the callee's frame at the time it is recorded.
    """

    funcname: str
    filename: str
    linenum: int

    @classmethod
    def from_frame(cls, frame, debugger: bdb.Bdb):
        """Build a ``Callee`` from the frame of the function being entered.

        Args:
            frame: the *callee's* frame; its ``f_code`` and ``f_lineno`` are read.
            debugger: used only for ``canonic`` to normalise the filename.
        """
        code = frame.f_code
        return cls(
            funcname=code.co_name,
            filename=debugger.canonic(code.co_filename),
            linenum=frame.f_lineno,
        )
|
||||
|
||||
|
||||
class CallGraphTracer(bdb.Bdb):
    """Tracer that records calls being made

    It would seem obvious that this should have extended `trace` library
    (https://docs.python.org/3/library/trace.html), but that part is not extensible --
    however, the basic debugger (bdb) is, and provides maybe a bit more help than just
    using `sys.settrace` directly.

    Attributes:
        recorded_calls: set of ``(Call, Callee)`` pairs seen so far; being a set,
            a repeated call from the same site to the same callee is stored once.
    """

    recorded_calls: set

    def __init__(self):
        self.recorded_calls = set()
        super().__init__()

    def user_call(self, frame, argument_list):
        """Record the (call site, callee) pair for a function that is being entered.

        This overrides the `bdb.Bdb.user_call` hook.

        Args:
            frame: frame of the function being entered (the callee).
            argument_list: passed by `bdb`; not used here.
        """
        caller_frame = frame.f_back
        if caller_frame is None:
            # There is no Python-level caller to attribute this call to, so there is
            # nothing meaningful to record (and `Call.from_frame` would fail on None).
            return

        call = Call.from_frame(caller_frame, self)
        callee = Callee.from_frame(frame, self)

        self.recorded_calls.add((call, callee))
|
||||
|
||||
|
||||
################################################################################
|
||||
# Export
|
||||
################################################################################
|
||||
|
||||
|
||||
class Exporter:
    """Base class for writing recorded ``(call, callee)`` pairs to a file."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` to `outfile_path`. Subclasses must override this.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError()

    @staticmethod
    def dataclass_to_dict(obj):
        """Return the fields of dataclass instance `obj` as a flat dict.

        Every key is prefixed with the lower-cased class name, so a ``Call`` with a
        ``filename`` field yields the key ``call_filename``.
        """
        d = dataclasses.asdict(obj)
        prefix = obj.__class__.__name__.lower()
        return {f"{prefix}_{key}": val for (key, val) in d.items()}

    @staticmethod
    def _sorted_rows(recorded_calls):
        """Return one merged dict per ``(call, callee)`` pair, in a stable order.

        The tracer collects pairs in a set, whose iteration order is arbitrary;
        sorting by field values makes the exported files reproducible between runs.
        """
        ordered = sorted(
            recorded_calls,
            key=lambda pair: (dataclasses.astuple(pair[0]), dataclasses.astuple(pair[1])),
        )
        return [
            {**Exporter.dataclass_to_dict(call), **Exporter.dataclass_to_dict(callee)}
            for (call, callee) in ordered
        ]


class CSVExporter(Exporter):
    """Exporter producing a CSV file with one row per recorded call."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` as CSV to `outfile_path` and print a confirmation.

        The header is taken from the keys of the first row. When there are no
        recorded calls the file is created empty, without a header.
        """
        rows = Exporter._sorted_rows(recorded_calls)

        # UTF-8 for consistency with XMLExporter, independent of the locale default.
        with open(outfile_path, 'w', newline='', encoding='utf-8') as csv_file:
            if rows:
                writer = csv.DictWriter(csv_file, fieldnames=list(rows[0].keys()))
                writer.writeheader()
                writer.writerows(rows)

        print(f'output written to {outfile_path}')


class XMLExporter(Exporter):
    """Exporter producing an XML file with one `recorded_call` element per call."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` as XML to `outfile_path`.

        The document has a single `root` element; each recorded call becomes a
        `recorded_call` child whose attributes are the prefixed dataclass fields.
        """
        root = ET.Element('root')

        for row in Exporter._sorted_rows(recorded_calls):
            # this xml library only supports serializing attributes that have string values
            ET.SubElement(root, 'recorded_call', {k: str(v) for k, v in row.items()})

        ET.ElementTree(root).write(outfile_path, encoding='utf-8')
|
||||
|
||||
|
||||
################################################################################
|
||||
# __main__
|
||||
################################################################################
|
||||
|
||||
|
||||
if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    parser.add_argument('--csv', help='write the recorded calls to this CSV file')
    parser.add_argument('--xml', help='write the recorded calls to this XML file')

    parser.add_argument('progname', help='file to run as main program')
    parser.add_argument('arguments', nargs=argparse.REMAINDER,
                        help='arguments to the program')

    opts = parser.parse_args()

    # The details of setting up the program to be run are very much inspired by `trace`
    # from the standard library
    sys.argv = [opts.progname, *opts.arguments]
    sys.path[0] = os.path.dirname(opts.progname)

    # Read the source as bytes so that `compile` determines the source encoding itself,
    # instead of decoding with whatever the locale's default encoding happens to be.
    with open(opts.progname, 'rb') as fp:
        code = compile(fp.read(), opts.progname, 'exec')

    # try to emulate __main__ namespace as much as possible
    globs = {
        '__file__': opts.progname,
        '__name__': '__main__',
        '__package__': None,
        '__cached__': None,
    }

    real_stdout = sys.stdout
    captured_stdout = StringIO()

    cgt = CallGraphTracer()

    # Capture what the traced program prints so it does not interleave with our own
    # output; always put the real stdout back, even if the traced program raises.
    sys.stdout = captured_stdout
    try:
        cgt.run(code, globs, globs)
    finally:
        sys.stdout = real_stdout

    # Honour every requested output format; fall back to printing when none was given.
    if opts.csv:
        CSVExporter.export(cgt.recorded_calls, opts.csv)
    if opts.xml:
        XMLExporter.export(cgt.recorded_calls, opts.xml)
    if not (opts.csv or opts.xml):
        for (call, callee) in cgt.recorded_calls:
            print(f'{call} -> {callee}')

    print('--- captured stdout ---')
    print(captured_stdout.getvalue(), end='')
|
|
@ -0,0 +1,10 @@
|
|||
def foo():  # NOTE: keep this file's line numbers stable; the example trace refers to them
    print('foo')

def bar():
    print('bar')

foo()
bar()

foo(); bar()  # two calls on one line: same line number, different instruction index
|
|
@ -0,0 +1,6 @@
|
|||
<root>
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="7" call_inst_index="18" callee_funcname="foo" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="1" />
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="8" call_inst_index="24" callee_funcname="bar" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="4" />
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="30" callee_funcname="foo" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="1" />
|
||||
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="36" callee_funcname="bar" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="4" />
|
||||
</root>
|
|
@ -0,0 +1,9 @@
|
|||
import RecordedCalls
|
||||
|
||||
// Selects the uniquely identified recorded calls for which some `CallableValue`
// whose scope is the recorded callee also has the recorded call among its calls,
// i.e. the `call` --> `callee` edge observed at run time is also found statically.
from ValidRecordedCall rc, Call call, Function callee, CallableValue calleeValue
where
  call = rc.getCall() and
  callee = rc.getCallee() and
  calleeValue.getScope() = callee and
  calleeValue.getACall() = call.getAFlowNode()
select call, "-->", callee
|
|
@ -0,0 +1,36 @@
|
|||
import python
|
||||
|
||||
/**
 * A `recorded_call` XML element, describing one call (call site and callee) that was
 * recorded in a trace file.
 */
class RecordedCall extends XMLElement {
  RecordedCall() { this.hasName("recorded_call") }

  /** Gets the value of the `call_filename` attribute. */
  string call_filename() { result = this.getAttributeValue("call_filename") }

  /** Gets the value of the `call_linenum` attribute, as an integer. */
  int call_linenum() { result = this.getAttributeValue("call_linenum").toInt() }

  /** Gets the value of the `call_inst_index` attribute, as an integer. */
  int call_inst_index() { result = this.getAttributeValue("call_inst_index").toInt() }

  /**
   * Gets a call whose location is in the recorded file and starts on the recorded line.
   *
   * Only file and start line are compared, so several calls on one line all match.
   */
  Call getCall() {
    // TODO: handle calls spanning multiple lines
    result.getLocation().hasLocationInfo(this.call_filename(), this.call_linenum(), _, _, _)
  }

  /** Gets the value of the `callee_filename` attribute. */
  string callee_filename() { result = this.getAttributeValue("callee_filename") }

  /** Gets the value of the `callee_linenum` attribute, as an integer. */
  int callee_linenum() { result = this.getAttributeValue("callee_linenum").toInt() }

  /** Gets the value of the `callee_funcname` attribute. */
  string callee_funcname() { result = this.getAttributeValue("callee_funcname") }

  /**
   * Gets a function whose location is in the recorded file and starts on the recorded
   * line. The recorded function name is not used for matching.
   */
  Function getCallee() {
    result.getLocation().hasLocationInfo(this.callee_filename(), this.callee_linenum(), _, _, _)
  }
}
|
||||
|
||||
/**
 * Class of recorded calls where we can uniquely identify both the `call` and the `callee`,
 * that is, where `getCall()` and `getCallee()` each have exactly one result.
 */
class ValidRecordedCall extends RecordedCall {
  ValidRecordedCall() {
    strictcount(this.getCall()) = 1 and
    strictcount(this.getCallee()) = 1
  }
}
|
|
@ -0,0 +1,7 @@
|
|||
import RecordedCalls
|
||||
|
||||
// Lists the recorded calls that are not a `ValidRecordedCall`, together with the raw
// attribute values from the trace, to help diagnose why they could not be matched.
from RecordedCall rc
where not rc instanceof ValidRecordedCall
select "Could not uniquely identify this recorded call (either call or callee was not uniquely identified)",
  rc.call_filename(), rc.call_linenum(), rc.call_inst_index(), "-->", rc.callee_filename(),
  rc.callee_linenum(), rc.callee_funcname()
|
|
@ -0,0 +1,4 @@
|
|||
name: codeql-python-recorded-call-graph-metrics
|
||||
version: 0.0.1
|
||||
libraryPathDependencies: codeql-python
|
||||
extractor: python
|
|
@ -0,0 +1,23 @@
|
|||
#!/bin/bash
#
# Trace the example program, then (re)create the CodeQL database "$DB" containing both
# the example Python sources and the XML trace produced for them.

set -e
set -x

# Every path below is relative to the directory containing this script, so switch to
# it first; this makes the script work no matter where it is invoked from.
cd "$(dirname "${BASH_SOURCE[0]}")"

DB="cg-trace-example-db"
SRC="example/"
XMLDIR="$SRC"
PYTHON_EXTRACTOR=$(codeql resolve extractor --language=python)

# SRC and XMLDIR end in a slash, so the file name is appended directly.
./cg_trace.py --xml "${XMLDIR}simple.xml" "${SRC}simple.py"

# Start from scratch; a stale database would otherwise be reused.
rm -rf "$DB"

codeql database init --source-root="$SRC" --language=python "$DB"
codeql database trace-command --working-dir="$SRC" "$DB" "$PYTHON_EXTRACTOR/tools/autobuild.sh"
codeql database index-files --language xml --include-extension .xml --working-dir="$XMLDIR" "$DB"
codeql database finalize "$DB"

set +x
echo "Created database '$DB'"
|
Загрузка…
Ссылка в новой задаче