Merge pull request #3911 from RasmusWL/python-call-graph-tracing

Approved by tausbn
2020-07-14 15:33:45 +01:00 · 2020-07-14 15:33:45 +01:00 · 0bee0687cb
--- a/python/tools/recorded-call-graph-metrics/README.md
+++ b/python/tools/recorded-call-graph-metrics/README.md
@ -0,0 +1,17 @@
+# Recorded Call Graph Metrics
+
+also known as _call graph tracing_.
+
+Execute a Python program and, for each call being made, record the call and its callee. This allows us to compare call graph resolution from static analysis with actual data -- that is, can we statically determine the correct target of each actual call?
+
+This is still in the early stages, and currently only supports a very minimal working example (to show that this approach might work).
+
+The next hurdle is being able to handle multiple calls on the same line, such as
+
+- `foo(); bar()`
+- `foo(bar())`
+- `foo().bar()`
+
+## How do I give it a spin?
+
+Run the `recreate-db.sh` script to create the database `cg-trace-example-db`, which will include the `example/simple.xml` trace from executing the `example/simple.py` code. Then run the queries inside the `ql/` directory.
--- a/python/tools/recorded-call-graph-metrics/cg_trace.py
+++ b/python/tools/recorded-call-graph-metrics/cg_trace.py
@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+
+"""Call Graph tracing.
+
+Execute a Python program and, for each call being made, record the call and its callee.
+This allows us to compare call graph resolution from static analysis with actual data --
+that is, can we statically determine the correct target of each actual call?
+
+If there is 100% code coverage from the Python execution, it would also be possible to
+look at the precision of the call graph resolutions -- that is, does static analysis claim
+that a function can be called from a place where it never actually is? This is currently
+not something we're looking at.
+"""
+
+# read: https://eli.thegreenplace.net/2012/03/23/python-internals-how-callables-work/
+
+# TODO: Know that a call to a C-function was made. See
+# https://docs.python.org/3/library/bdb.html#bdb.Bdb.trace_dispatch. Maybe use `lxml` as
+# test
+
+# For inspiration, look at these projects:
+# - https://github.com/joerick/pyinstrument (capture call-stack every <n> ms for profiling)
+# - https://github.com/gak/pycallgraph (display call-graph with graphviz after python execution)
+
+import argparse
+import bdb
+from io import StringIO
+import sys
+import os
+import dis
+import dataclasses
+import csv
+import xml.etree.ElementTree as ET
+
+# Copy-Paste and uncomment for interactive ipython sessions
+# import IPython; IPython.embed(); sys.exit()
+
+
+@dataclasses.dataclass(frozen=True)
+class Call():
+    """A call
+    """
+    filename: str
+    linenum: int
+    inst_index: int
+
+    @classmethod
+    def from_frame(cls, frame, debugger: bdb.Bdb):
+        code = frame.f_code
+
+        # Uncomment to see the bytecode
+        # b = dis.Bytecode(frame.f_code, current_offset=frame.f_lasti)
+        # print(b.dis(), file=sys.__stderr__)
+
+        return cls(
+            filename = debugger.canonic(code.co_filename),
+            linenum = frame.f_lineno,
+            inst_index = frame.f_lasti,
+        )
+
+
+@dataclasses.dataclass(frozen=True)
+class Callee():
+    """A callee (Function/Lambda/???)
+
+    should (hopefully) be uniquely identified by its name and location (filename+line
+    number)
+    """
+    funcname: str
+    filename: str
+    linenum: int
+
+    @classmethod
+    def from_frame(cls, frame, debugger: bdb.Bdb):
+        code = frame.f_code
+        return cls(
+            funcname = code.co_name,
+            filename = debugger.canonic(code.co_filename),
+            linenum = frame.f_lineno,
+        )
+
+
+class CallGraphTracer(bdb.Bdb):
+    """Tracer that records calls being made
+
+    It would seem obvious that this should have extended `trace` library
+    (https://docs.python.org/3/library/trace.html), but that part is not extensible --
+    however, the basic debugger (bdb) is, and provides maybe a bit more help than just
+    using `sys.settrace` directly.
+    """
+
+    recorded_calls: set
+
+    def __init__(self):
+        self.recorded_calls = set()
+        super().__init__()
+
+    def user_call(self, frame, argument_list):
+        call = Call.from_frame(frame.f_back, self)
+        callee = Callee.from_frame(frame, self)
+
+        # _print(f'{call}  -> {callee}')
+        self.recorded_calls.add((call, callee))
+
+
+################################################################################
+# Export
+################################################################################
+
+
+class Exporter:
+
+    @staticmethod
+    def export(recorded_calls, outfile_path):
+        raise NotImplementedError()
+
+    @staticmethod
+    def dataclass_to_dict(obj):
+        d = dataclasses.asdict(obj)
+        prefix = obj.__class__.__name__.lower()
+        return {f"{prefix}_{key}": val for (key, val) in d.items()}
+
+
+class CSVExporter(Exporter):
+
+    @staticmethod
+    def export(recorded_calls, outfile_path):
+        with open(outfile_path, 'w', newline='') as csv_file:
+            writer = None
+            for (call, callee) in recorded_calls:
+                data = {
+                    **Exporter.dataclass_to_dict(call),
+                    **Exporter.dataclass_to_dict(callee)
+                }
+
+                if writer is None:
+                    writer = csv.DictWriter(csv_file, fieldnames=data.keys())
+                    writer.writeheader()
+
+                writer.writerow(data)
+
+
+        print(f'output written to {outfile_path}')
+
+        # embed(); sys.exit()
+
+
+class XMLExporter(Exporter):
+
+    @staticmethod
+    def export(recorded_calls, outfile_path):
+
+        root = ET.Element('root')
+
+        for (call, callee) in recorded_calls:
+            data = {
+                **Exporter.dataclass_to_dict(call),
+                **Exporter.dataclass_to_dict(callee)
+            }
+
+            rc = ET.SubElement(root, 'recorded_call')
+            # this xml library only supports serializing attributes that have string values
+            rc.attrib = {k: str(v) for k, v in data.items()}
+
+        tree = ET.ElementTree(root)
+        tree.write(outfile_path, encoding='utf-8')
+
+
+################################################################################
+# __main__
+################################################################################
+
+
+if __name__ == "__main__":
+
+
+    parser = argparse.ArgumentParser()
+
+
+    parser.add_argument('--csv')
+    parser.add_argument('--xml')
+
+    parser.add_argument('progname', help='file to run as main program')
+    parser.add_argument('arguments', nargs=argparse.REMAINDER,
+            help='arguments to the program')
+
+    opts = parser.parse_args()
+
+    # These details of setting up the program to be run is very much inspired by `trace`
+    # from the standard library
+    sys.argv = [opts.progname, *opts.arguments]
+    sys.path[0] = os.path.dirname(opts.progname)
+
+    with open(opts.progname) as fp:
+        code = compile(fp.read(), opts.progname, 'exec')
+
+    # try to emulate __main__ namespace as much as possible
+    globs = {
+        '__file__': opts.progname,
+        '__name__': '__main__',
+        '__package__': None,
+        '__cached__': None,
+    }
+
+    real_stdout = sys.stdout
+    real_stderr = sys.stderr
+    captured_stdout = StringIO()
+
+    sys.stdout = captured_stdout
+    cgt = CallGraphTracer()
+    cgt.run(code, globs, globs)
+    sys.stdout = real_stdout
+
+    if opts.csv:
+        CSVExporter.export(cgt.recorded_calls, opts.csv)
+    elif opts.xml:
+        XMLExporter.export(cgt.recorded_calls, opts.xml)
+    else:
+        for (call, callee) in cgt.recorded_calls:
+            print(f'{call}  -> {callee}')
+
+    print('--- captured stdout ---')
+    print(captured_stdout.getvalue(), end='')
--- a/python/tools/recorded-call-graph-metrics/example/simple.py
+++ b/python/tools/recorded-call-graph-metrics/example/simple.py
@ -0,0 +1,10 @@
+def foo():  # NOTE: example/simple.xml refers to this file by line number; keep the layout unchanged
+    print('foo')
+
+def bar():
+    print('bar')
+
+foo()
+bar()
+
+foo(); bar()  # two calls on one line: same line number in the trace, only the instruction index differs
--- a/python/tools/recorded-call-graph-metrics/example/simple.xml
+++ b/python/tools/recorded-call-graph-metrics/example/simple.xml
@ -0,0 +1,6 @@
+<root>
+    <recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="7" call_inst_index="18" callee_funcname="foo" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="1" />
+    <recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="8" call_inst_index="24" callee_funcname="bar" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="4" />
+    <recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="30" callee_funcname="foo" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="1" />
+    <recorded_call call_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" call_linenum="10" call_inst_index="36" callee_funcname="bar" callee_filename="/home/rasmus/code/ql/python/tools/recorded-call-graph-metrics/example/simple.py" callee_linenum="4" />
+</root>
--- a/python/tools/recorded-call-graph-metrics/ql/PointsToFound.ql
+++ b/python/tools/recorded-call-graph-metrics/ql/PointsToFound.ql
@ -0,0 +1,9 @@
+import RecordedCalls
+
+// Selects the uniquely identified recorded calls where static analysis agrees with the
+// recording: some `CallableValue` whose scope is the recorded callee has a call at a
+// flow node of the recorded call.
+from ValidRecordedCall rc, Call call, Function callee, CallableValue calleeValue
+where
+  call = rc.getCall() and
+  callee = rc.getCallee() and
+  calleeValue.getScope() = callee and
+  calleeValue.getACall() = call.getAFlowNode()
+select call, "-->", callee
--- a/python/tools/recorded-call-graph-metrics/ql/RecordedCalls.qll
+++ b/python/tools/recorded-call-graph-metrics/ql/RecordedCalls.qll
@ -0,0 +1,36 @@
+import python
+
+/**
+ * A `recorded_call` XML element, describing one recorded (call, callee) pair through
+ * its `call_*` and `callee_*` attributes.
+ */
+class RecordedCall extends XMLElement {
+  RecordedCall() { this.hasName("recorded_call") }
+
+  /** Gets the value of the `call_filename` attribute. */
+  string call_filename() { result = this.getAttributeValue("call_filename") }
+
+  /** Gets the value of the `call_linenum` attribute, as an integer. */
+  int call_linenum() { result = this.getAttributeValue("call_linenum").toInt() }
+
+  /** Gets the value of the `call_inst_index` attribute, as an integer. */
+  int call_inst_index() { result = this.getAttributeValue("call_inst_index").toInt() }
+
+  /**
+   * Gets a call located in file `call_filename()` that starts on line `call_linenum()`.
+   *
+   * Only file and line are matched (`call_inst_index()` is not used), so there can be
+   * several results when a line contains more than one call.
+   */
+  Call getCall() {
+    // TODO: handle calls spanning multiple lines
+    result.getLocation().hasLocationInfo(this.call_filename(), this.call_linenum(), _, _, _)
+  }
+
+  /** Gets the value of the `callee_filename` attribute. */
+  string callee_filename() { result = this.getAttributeValue("callee_filename") }
+
+  /** Gets the value of the `callee_linenum` attribute, as an integer. */
+  int callee_linenum() { result = this.getAttributeValue("callee_linenum").toInt() }
+
+  /** Gets the value of the `callee_funcname` attribute. */
+  string callee_funcname() { result = this.getAttributeValue("callee_funcname") }
+
+  /**
+   * Gets a function located in file `callee_filename()` that starts on line
+   * `callee_linenum()`. Note that `callee_funcname()` is not used for the match.
+   */
+  Function getCallee() {
+    result.getLocation().hasLocationInfo(this.callee_filename(), this.callee_linenum(), _, _, _)
+  }
+}
+
+/**
+ * Class of recorded calls where we can uniquely identify both the `call` and the `callee`.
+ *
+ * That is, `getCall()` and `getCallee()` each have exactly one result.
+ */
+class ValidRecordedCall extends RecordedCall {
+  ValidRecordedCall() {
+    strictcount(this.getCall()) = 1 and
+    strictcount(this.getCallee()) = 1
+  }
+}
--- a/python/tools/recorded-call-graph-metrics/ql/UnidentifiedRecordedCalls.ql
+++ b/python/tools/recorded-call-graph-metrics/ql/UnidentifiedRecordedCalls.ql
@ -0,0 +1,7 @@
+import RecordedCalls
+
+// Lists the recorded calls that are not a `ValidRecordedCall`, together with the raw
+// attribute values from the trace, to help figure out why they could not be matched.
+from RecordedCall rc
+where not rc instanceof ValidRecordedCall
+select "Could not uniquely identify this recorded call (either call or callee was not uniquely identified)",
+  rc.call_filename(), rc.call_linenum(), rc.call_inst_index(), "-->", rc.callee_filename(),
+  rc.callee_linenum(), rc.callee_funcname()
--- a/python/tools/recorded-call-graph-metrics/ql/qlpack.yml
+++ b/python/tools/recorded-call-graph-metrics/ql/qlpack.yml
@ -0,0 +1,4 @@
+name: codeql-python-recorded-call-graph-metrics
+version: 0.0.1
+libraryPathDependencies: codeql-python
+extractor: python
--- a/python/tools/recorded-call-graph-metrics/recreate-db.sh
+++ b/python/tools/recorded-call-graph-metrics/recreate-db.sh
@ -0,0 +1,23 @@
+#!/bin/bash
+
+set -e
+set -x
+
+DB="cg-trace-example-db"
+SRC="example/"
+XMLDIR="$SRC"
+PYTHON_EXTRACTOR=$(codeql resolve extractor --language=python)
+
+
+./cg_trace.py --xml example/simple.xml example/simple.py
+
+rm -rf "$DB"
+
+
+codeql database init --source-root="$SRC" --language=python "$DB"
+codeql database trace-command --working-dir="$SRC" "$DB" "$PYTHON_EXTRACTOR/tools/autobuild.sh"
+codeql database index-files --language xml --include-extension .xml --working-dir="$XMLDIR" "$DB"
+codeql database finalize "$DB"
+
+set +x
+echo "Created database '$DB'"