Merge pull request #3911 from RasmusWL/python-call-graph-tracing

Approved by tausbn
This commit is contained in:
semmle-qlci 2020-07-14 15:33:45 +01:00 коммит произвёл GitHub
Родитель f8c03dcae6 f1601d643a
Коммит 0bee0687cb
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
9 изменённых файлов: 334 добавлений и 0 удалений

Просмотреть файл

@ -0,0 +1,17 @@
# Recorded Call Graph Metrics
also known as _call graph tracing_.
Execute a Python program and, for each call being made, record the call and the callee. This allows us to compare call graph resolution from static analysis with actual data -- that is, can we statically determine the target of each actual call correctly?
This is still in the early stages, and currently only supports a very minimal working example (to show that this approach might work).
The next hurdle is being able to handle multiple calls on the same line, such as
- `foo(); bar()`
- `foo(bar())`
- `foo().bar()`
## How do I give it a spin?
Run the `recreate-db.sh` script to create the database `cg-trace-example-db`, which will include the `example/simple.xml` trace from executing the `example/simple.py` code. Then run the queries inside the `ql/` directory.

Просмотреть файл

@ -0,0 +1,222 @@
#!/usr/bin/env python3
"""Call Graph tracing.
Execute a python program and for each call being made, record the call and callee. This
allows us to compare call graph resolution from static analysis with actual data -- that
is, can we statically determine the target of each actual call correctly.
If there is 100% code coverage from the Python execution, it would also be possible to
look at the precision of the call graph resolutions -- that is, do we expect a function to
be able to be called in a place where it is not? Currently not something we're looking at.
"""
# read: https://eli.thegreenplace.net/2012/03/23/python-internals-how-callables-work/
# TODO: Detect when a call to a C function was made. See
# https://docs.python.org/3/library/bdb.html#bdb.Bdb.trace_dispatch. Maybe use `lxml` as
# a test case.
# For inspiration, look at these projects:
# - https://github.com/joerick/pyinstrument (capture call-stack every <n> ms for profiling)
# - https://github.com/gak/pycallgraph (display call-graph with graphviz after python execution)
import argparse
import bdb
from io import StringIO
import sys
import os
import dis
import dataclasses
import csv
import xml.etree.ElementTree as ET
# Copy-Paste and uncomment for interactive ipython sessions
# import IPython; IPython.embed(); sys.exit()
@dataclasses.dataclass(frozen=True)
class Call:
    """A call site observed while tracing.

    A call site is described by the file and line of the calling frame, together with
    the index of the last bytecode instruction that frame executed (`f_lasti`).
    """

    filename: str
    linenum: int
    inst_index: int

    @classmethod
    def from_frame(cls, frame, debugger: bdb.Bdb):
        """Build a `Call` from the frame that is making the call.

        The filename is passed through `debugger.canonic` so that the same file is
        always recorded under the same name.
        """
        # Debugging aid -- uncomment to dump the caller's bytecode:
        # b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti)
        # print(b.dis(), file=sys.__stderr__)
        canonical_path = debugger.canonic(frame.f_code.co_filename)
        return cls(canonical_path, frame.f_lineno, frame.f_lasti)
@dataclasses.dataclass(frozen=True)
class Callee:
    """A callee (Function/Lambda/???)

    should (hopefully) be uniquely identified by its name and location (filename+line
    number of the first line of its code object).
    """

    funcname: str
    filename: str
    linenum: int

    @classmethod
    def from_frame(cls, frame, debugger: bdb.Bdb):
        """Build a `Callee` from the frame that has just been entered.

        The filename is passed through `debugger.canonic` so that the same file is
        always recorded under the same name.
        """
        code = frame.f_code
        return cls(
            funcname=code.co_name,
            filename=debugger.canonic(code.co_filename),
            # Use the first line of the code object rather than `frame.f_lineno`:
            # `f_lineno` is the line currently being executed, so a frame that is
            # re-entered part-way through (e.g. a generator being resumed) would be
            # recorded as a different callee for every resumption point.
            linenum=code.co_firstlineno,
        )
class CallGraphTracer(bdb.Bdb):
    """Tracer that records every (call, callee) pair it observes.

    Extending the `trace` library (https://docs.python.org/3/library/trace.html) would
    seem like the obvious choice, but that part is not extensible. The basic debugger
    (bdb) is, and it provides a bit more help than using `sys.settrace` directly.
    """

    recorded_calls: set

    def __init__(self):
        super().__init__()
        self.recorded_calls = set()

    def user_call(self, frame, argument_list):
        """Record the call that entered `frame`.

        The call site is taken from the calling frame (`frame.f_back`), the callee
        from `frame` itself.
        """
        caller_frame = frame.f_back
        edge = (Call.from_frame(caller_frame, self), Callee.from_frame(frame, self))
        # _print(f'{edge[0]} -> {edge[1]}')
        self.recorded_calls.add(edge)
################################################################################
# Export
################################################################################
class Exporter:
    """Base class for the recorded-call exporters."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` to `outfile_path`; subclasses must override this."""
        raise NotImplementedError()

    @staticmethod
    def dataclass_to_dict(obj):
        """Return the fields of dataclass instance `obj` as a dict.

        Every key is prefixed with the lower-cased class name, so a field `linenum`
        of a `Call` ends up under the key `call_linenum`.
        """
        prefix = type(obj).__name__.lower()
        prefixed = {}
        for key, val in dataclasses.asdict(obj).items():
            prefixed[f"{prefix}_{key}"] = val
        return prefixed
class CSVExporter(Exporter):
    """Exporter that writes one CSV row per recorded (call, callee) pair."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` to the CSV file `outfile_path`.

        The column names are taken from the first row; when there are no recorded
        calls the file is created but left empty (no header).
        """
        with open(outfile_path, 'w', newline='') as csv_file:
            rows = (
                {
                    **Exporter.dataclass_to_dict(call),
                    **Exporter.dataclass_to_dict(callee),
                }
                for call, callee in recorded_calls
            )
            first_row = next(rows, None)
            if first_row is not None:
                writer = csv.DictWriter(csv_file, fieldnames=first_row.keys())
                writer.writeheader()
                writer.writerow(first_row)
                writer.writerows(rows)

        print(f'output written to {outfile_path}')
# embed(); sys.exit()
class XMLExporter(Exporter):
    """Exporter that writes one `recorded_call` XML element per (call, callee) pair."""

    @staticmethod
    def export(recorded_calls, outfile_path):
        """Write `recorded_calls` to the XML file `outfile_path`.

        All fields of the call and the callee become attributes of a
        `recorded_call` element below a single `root` element.
        """
        root = ET.Element('root')

        for call, callee in recorded_calls:
            merged = dict(Exporter.dataclass_to_dict(call))
            merged.update(Exporter.dataclass_to_dict(callee))
            # this xml library only supports serializing attributes that have string
            # values, so everything is converted up front
            ET.SubElement(
                root,
                'recorded_call',
                {name: str(value) for name, value in merged.items()},
            )

        ET.ElementTree(root).write(outfile_path, encoding='utf-8')
################################################################################
# __main__
################################################################################
if __name__ == "__main__":
    # Command line: [--csv FILE | --xml FILE] progname [arguments ...]
    # When both --csv and --xml are given, --csv wins.
    parser = argparse.ArgumentParser()

    parser.add_argument('--csv')
    parser.add_argument('--xml')

    parser.add_argument('progname', help='file to run as main program')
    parser.add_argument('arguments', nargs=argparse.REMAINDER,
                        help='arguments to the program')

    opts = parser.parse_args()

    # These details of setting up the program to be run is very much inspired by `trace`
    # from the standard library
    sys.argv = [opts.progname, *opts.arguments]
    sys.path[0] = os.path.dirname(opts.progname)

    # Read the program as bytes: `compile` then detects the source encoding itself
    # (encoding declaration or the UTF-8 default) instead of depending on the locale
    # encoding that a text-mode `open` would use.
    with open(opts.progname, 'rb') as fp:
        code = compile(fp.read(), opts.progname, 'exec')

    # try to emulate __main__ namespace as much as possible
    globs = {
        '__file__': opts.progname,
        '__name__': '__main__',
        '__package__': None,
        '__cached__': None,
    }

    # Capture whatever the traced program prints, so it does not get mixed up with
    # the tracer's own output.
    real_stdout = sys.stdout
    captured_stdout = StringIO()
    cgt = CallGraphTracer()

    sys.stdout = captured_stdout
    try:
        cgt.run(code, globs, globs)
    finally:
        # Always put the real stdout back, also when the traced program raises.
        sys.stdout = real_stdout

    if opts.csv:
        CSVExporter.export(cgt.recorded_calls, opts.csv)
    elif opts.xml:
        XMLExporter.export(cgt.recorded_calls, opts.xml)
    else:
        # No export requested: dump the recorded calls to stdout
        for (call, callee) in cgt.recorded_calls:
            print(f'{call} -> {callee}')

    print('--- captured stdout ---')
    print(captured_stdout.getvalue(), end='')

Просмотреть файл

@ -0,0 +1,10 @@
def foo():  # first traced callee; only prints its own name
print('foo')
def bar():  # second traced callee; only prints its own name
print('bar')
foo()  # one call on its own line
bar()  # one call on its own line
foo(); bar()  # two calls on the same line

Просмотреть файл

@ -0,0 +1,6 @@
<root>
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="7" call_inst_index="18" callee_funcname="foo" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="1" />
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="8" call_inst_index="24" callee_funcname="bar" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="4" />
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="30" callee_funcname="foo" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="1" />
<recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="36" callee_funcname="bar" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="4" />
</root>

Просмотреть файл

@ -0,0 +1,9 @@
import RecordedCalls
// Select each uniquely identified recorded call (`ValidRecordedCall`) for which there is
// also a `CallableValue` whose scope is the recorded callee and which has the recorded
// call's flow node among its calls.
// NOTE(review): presumably this means the static analysis resolves the call to the same
// target that was observed at runtime -- confirm against the `CallableValue` library.
from ValidRecordedCall rc, Call call, Function callee, CallableValue calleeValue
where
call = rc.getCall() and
callee = rc.getCallee() and
calleeValue.getScope() = callee and
calleeValue.getACall() = call.getAFlowNode()
select call, "-->", callee

Просмотреть файл

@ -0,0 +1,36 @@
import python
/**
 * An XML element named `recorded_call`, exposing its attributes and the
 * `Call`/`Function` entities whose locations match them.
 */
class RecordedCall extends XMLElement {
RecordedCall() { this.hasName("recorded_call") }
/** Gets the value of the `call_filename` attribute. */
string call_filename() { result = this.getAttributeValue("call_filename") }
/** Gets the value of the `call_linenum` attribute, as an integer. */
int call_linenum() { result = this.getAttributeValue("call_linenum").toInt() }
/** Gets the value of the `call_inst_index` attribute, as an integer. */
int call_inst_index() { result = this.getAttributeValue("call_inst_index").toInt() }
/**
 * Gets a `Call` whose location starts on the recorded line of the recorded file.
 * Only file and start line are compared, so several calls on one line all match.
 */
Call getCall() {
// TODO: handle calls spanning multiple lines
result.getLocation().hasLocationInfo(this.call_filename(), this.call_linenum(), _, _, _)
}
/** Gets the value of the `callee_filename` attribute. */
string callee_filename() { result = this.getAttributeValue("callee_filename") }
/** Gets the value of the `callee_linenum` attribute, as an integer. */
int callee_linenum() { result = this.getAttributeValue("callee_linenum").toInt() }
/** Gets the value of the `callee_funcname` attribute. */
string callee_funcname() { result = this.getAttributeValue("callee_funcname") }
/**
 * Gets a `Function` whose location starts on the recorded line of the recorded file.
 * The recorded function name (`callee_funcname`) is not used for matching.
 */
Function getCallee() {
result.getLocation().hasLocationInfo(this.callee_filename(), this.callee_linenum(), _, _, _)
}
}
/**
 * Class of recorded calls where we can uniquely identify both the `call` and the `callee`,
 * that is, where `getCall()` and `getCallee()` each have exactly one result.
 */
class ValidRecordedCall extends RecordedCall {
ValidRecordedCall() {
// exactly one `Call` matches the recorded call location
strictcount(this.getCall()) = 1 and
// exactly one `Function` matches the recorded callee location
strictcount(this.getCallee()) = 1
}
}

Просмотреть файл

@ -0,0 +1,7 @@
import RecordedCalls
// Report every recorded call that is not a `ValidRecordedCall`, together with the raw
// attribute values from the trace, so that the failing entries can be inspected.
from RecordedCall rc
where not rc instanceof ValidRecordedCall
select "Could not uniquely identify this recorded call (either call or callee was not uniquely identified)",
rc.call_filename(), rc.call_linenum(), rc.call_inst_index(), "-->", rc.callee_filename(),
rc.callee_linenum(), rc.callee_funcname()

Просмотреть файл

@ -0,0 +1,4 @@
name: codeql-python-recorded-call-graph-metrics
version: 0.0.1
libraryPathDependencies: codeql-python
extractor: python

Просмотреть файл

@ -0,0 +1,23 @@
#!/bin/bash
#
# Re-create the example database: trace example/simple.py into example/simple.xml,
# then build a CodeQL database from the example directory that also indexes the
# XML trace.

# Stop at the first failing command and echo every command while it runs.
set -ex

DB='cg-trace-example-db'
SRC='example/'
XMLDIR="${SRC}"
PYTHON_EXTRACTOR="$(codeql resolve extractor --language=python)"

# Record the calls made by the example program.
./cg_trace.py --xml "${SRC}simple.xml" "${SRC}simple.py"

# Start from a clean slate; the database is rebuilt from scratch below.
rm -rf "${DB}"

codeql database init --source-root="${SRC}" --language=python "${DB}"
codeql database trace-command --working-dir="${SRC}" "${DB}" "${PYTHON_EXTRACTOR}/tools/autobuild.sh"
codeql database index-files --language xml --include-extension .xml --working-dir="${XMLDIR}" "${DB}"
codeql database finalize "${DB}"

set +x
printf "Created database '%s'\n" "${DB}"