Technical Report Number 2 (#248)
The overall output of my work in 2017.
This commit is contained in:
Родитель
57fbaf62e4
Коммит
3febc2c49e
|
@ -0,0 +1,29 @@
|
|||
# Ignore LaTeX (and Latexmk) files
|
||||
*.aux
|
||||
*.lof
|
||||
*.log
|
||||
*.lot
|
||||
*.fls
|
||||
*.out
|
||||
*.toc
|
||||
*.fmt
|
||||
*.fot
|
||||
*.cb
|
||||
*.cb2
|
||||
*.dvi
|
||||
*.bbl
|
||||
*.bcf
|
||||
*.blg
|
||||
*-blx.aux
|
||||
*-blx.bib
|
||||
*.run.xml
|
||||
*.fdb_latexmk
|
||||
*.synctex
|
||||
*.synctex(busy)
|
||||
*.synctex.gz
|
||||
*.synctex.gz(busy)
|
||||
*.pdfsync
|
||||
**/auto/*
|
||||
.R*
|
||||
*.ent
|
||||
tr02.pdf
|
|
@ -0,0 +1,32 @@
|
|||
# This Makefile uses latexmk as far as possible
|
||||
# because it understands LaTeX files better than
|
||||
# a hand-coded makefile. latexmk should be in any
|
||||
# modern Tex Live distribution.
|
||||
# Info: http://mg.readthedocs.io/latexmk.html
|
||||
|
||||
.PHONY: tr02.pdf all clean cleanall pres
|
||||
|
||||
all : tr02.pdf
|
||||
|
||||
pres : $(addprefix scripts/,modifications_pres.pdf overheads_pres.pdf)
|
||||
|
||||
clean :
|
||||
latexmk -c
|
||||
|
||||
cleanall :
|
||||
latexmk -C
|
||||
$(MAKE) -C scripts clean
|
||||
|
||||
GEN_TEX_SCRIPTS=$(addprefix scripts/,bm_results_tab.tex bm_results_macros.tex)
|
||||
GEN_GRAPHS=$(addprefix scripts/,modifications.pdf overheads.pdf)
|
||||
GEN_RESULTS=$(GEN_TEX_SCRIPTS) $(GEN_GRAPHS)
|
||||
|
||||
tr02.pdf : $(GEN_RESULTS)
|
||||
latexmk -pdf -use-make tr02.tex
|
||||
|
||||
scripts/%.pdf : scripts
|
||||
$(MAKE) -C scripts $(@F)
|
||||
scripts/%.tex : scripts
|
||||
$(MAKE) -C scripts $(@F)
|
||||
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
# Putting the Checks into Checked C
|
||||
## Checked C Technical Report Number 2
|
||||
|
||||
This document is current as of October 2017, and omits a description of null-terminated pointers.
|
||||
|
||||
### Building
|
||||
|
||||
To build this document fully, you'll need to install [R](https://www.r-project.org) and put `Rscript` on your path, as well as LaTeX.
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
Checked C is an extension to C that aims to provide a route for
|
||||
programmers to upgrade their existing C programs to a safer language
|
||||
without losing the low-level control they enjoy. Checked C currently
|
||||
only addresses unsafe code with spatial memory violations such as
|
||||
buffer overruns and out-of-bounds memory accesses. \\[1em]
|
||||
|
||||
Checked C addresses these memory safety problems by adding new pointer
|
||||
and array types to C, and a method for annotating pointers with
|
||||
expressions denoting the bounds of the objects they reference in
|
||||
memory. To ensure memory safety, the Checked C compiler uses a mixture
|
||||
of static and dynamic checks over these additions to the C language. \\[1em]
|
||||
|
||||
This Technical Report concerns these Dynamic Checks, and the
|
||||
algorithms and infrastructure required to support them, including: the
|
||||
soundness property Checked C is aiming to preserve, propagation rules
|
||||
for bounds information, and the code generation algorithm for the
|
||||
checks themselves. This report includes an evaluation of these dynamic
|
||||
bounds checks, their overhead, and their interaction with a
|
||||
state-of-the-art optimizer.
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "tr02"
|
||||
%%% End:
|
|
@ -0,0 +1,77 @@
|
|||
\chapter{C Syntax}
|
||||
\label{app:c-syntax}
|
||||
|
||||
This appendix contains a definition of the relevant parts of C Syntax
|
||||
for this report. This is based on the C11 Specification, but written
|
||||
with mathematical symbols rather than ASCII text.
|
||||
|
||||
\begin{align*}
|
||||
\text{LValue Expressions} \\*
|
||||
e_l \gis {} & x \tag*{Variables} \\
|
||||
\gor {} & \deref e_v \tag*{Dereference} \\
|
||||
\gor {} & e_v[e_v] \tag*{Index} \\
|
||||
\gor {} & e_l \member m \tag*{Struct Member Access} \\
|
||||
\gor {} & e_v \arrow m \tag*{Pointer Member Access} \\[1em]
|
||||
%
|
||||
\text{Value Expressions} \\*
|
||||
e_v \gis {} & \addrof e_l \tag*{Address-of} \\
|
||||
\gor {} & e_l \assign e_v \tag*{Assignment} \\
|
||||
\gor {} & e_l \compassign e_v \tag*{Compound Assignment} \\
|
||||
\gor {} & e_l \inc \mathrel{\mid} \inc e_l \tag*{Increment} \\
|
||||
\gor {} & e_l \dec \mathrel{\mid} \dec e_l \tag*{Decrement} \\
|
||||
\gor {} & e_v \member m \tag*{Struct Member Access} \\
|
||||
\gor {} & e_v , e_v \tag*{Comma Expression} \\
|
||||
\gor {} & e_v \binop e_v \tag*{Binary Operation} \\
|
||||
\gor {} & e_v \mathrel{R} e_v \tag*{Binary Relational Operation} \\
|
||||
\gor {} & \unop e_v \tag*{Unary Operation} \\
|
||||
\gor {} & e_v(\overline{e_v}) \tag*{Function Call} \\
|
||||
\gor {} & (t)e_v \tag*{Cast} \\
|
||||
\gor {} & e_v \mathrel{?} e_v : e_v \tag*{Conditional Operator} \\
|
||||
\gor {} & l \tag*{Literal Value} \\
|
||||
\gor {} & e_l \tag*{Lvalue \& Array Conversion} \displaybreak\\[1em]
|
||||
%
|
||||
\text{Binary Operators} \\*
|
||||
\binop \gis {} & + \tag*{Addition} \\
|
||||
\gor {} & - \tag*{Subtraction} \\
|
||||
\gor {} & * \tag*{Multiplication} \\
|
||||
\gor {} & / \tag*{Division} \\
|
||||
\gor {} & \% \tag*{Modulus} \\
|
||||
\gor {} & \& \tag*{Bitwise AND} \\
|
||||
\gor {} & \mid \tag*{Bitwise OR} \\
|
||||
\gor {} & \wedge \tag*{Bitwise XOR} \\
|
||||
\gor {} & \gg \tag*{Bitwise Shift Right} \\
|
||||
\gor {} & \ll \tag*{Bitwise Shift Left} \\[1em]
|
||||
%
|
||||
\text{Relational Operators} \\*
|
||||
R \gis {} & \&\& \tag*{Logical AND} \\
|
||||
\gor {} & \mid\mid \tag*{Logical OR} \\
|
||||
\gor {} & \mathord{==} \tag*{Equality} \\
|
||||
\gor {} & \mathord{!=} \tag*{Negated Equality} \\
|
||||
\gor {} & \mathord{>} \mathrel{\mid}
|
||||
\mathord{\ge} \mathrel{\mid}
|
||||
\mathord{<} \mathrel{\mid}
|
||||
\mathord{\le} \tag*{Comparison} \\[1em]
|
||||
%
|
||||
\text{Unary Operators} \\*
|
||||
\unop \gis {} & + \tag*{Unary Plus} \\
|
||||
\gor {} & - \tag*{Numerical Negation} \\
|
||||
\gor {} & \sim \tag*{Bitwise Negation} \\
|
||||
\gor {} & ! \tag*{Logical Negation} \\[1em]
|
||||
%
|
||||
\text{Literals} \\*
|
||||
l \gis {} & \mathtt{NULL} \tag*{Null Pointer Literal} \\
|
||||
\gor {} & \mathrm{integers} \tag*{Integer Literals} \\
|
||||
\gor {} & \mathrm{floats} \tag*{Float Literals} \\
|
||||
\gor {} & \{ l , \dots \} \tag*{Array \& Struct Literals} \\[1em]
|
||||
%
|
||||
\text{Types} \\*
|
||||
t \gis {} & \dots \\[1em]
|
||||
%
|
||||
\text{Struct Members} \\*
|
||||
m \gis {} & \dots \\
|
||||
\end{align*}
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "tr02"
|
||||
%%% End:
|
|
@ -0,0 +1,339 @@
|
|||
\ProvidesPackage{checked_c}
|
||||
|
||||
%
|
||||
% My Package for Checked C documents.
|
||||
%
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
% Dependencies
|
||||
%
|
||||
\RequirePackage{etoolbox}
|
||||
|
||||
\RequirePackage[T1]{fontenc}
|
||||
\RequirePackage{microtype}
|
||||
|
||||
\RequirePackage{hyperref}
|
||||
|
||||
\RequirePackage{fancyhdr}
|
||||
\RequirePackage{paralist}
|
||||
|
||||
\RequirePackage{listings}
|
||||
\RequirePackage{listing-label}
|
||||
|
||||
\RequirePackage{color}
|
||||
\RequirePackage{xcolor}
|
||||
|
||||
\RequirePackage{longtable}
|
||||
\RequirePackage{booktabs}
|
||||
\RequirePackage{caption}
|
||||
|
||||
\RequirePackage{mathtools}
|
||||
\RequirePackage{amssymb}
|
||||
\RequirePackage{amsfonts}
|
||||
|
||||
\RequirePackage{endnotes}
|
||||
\RequirePackage{hyperendnotes}
|
||||
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
% Package Options
|
||||
%
|
||||
|
||||
% [draft] option switches off errors for \cn and \note{}
|
||||
\newtoggle{final}
|
||||
\toggletrue{final}
|
||||
|
||||
\DeclareOption{draft}{\togglefalse{final}}
|
||||
|
||||
\ProcessOptions
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
% Typography Setup
|
||||
%
|
||||
|
||||
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
|
||||
|
||||
\hypersetup{%
|
||||
unicode=true,
|
||||
pdfborder={0 0 0},
|
||||
bookmarksnumbered,
|
||||
hyperfigures,
|
||||
pdfcreator={},
|
||||
pdfproducer={},
|
||||
pdfnewwindow
|
||||
}
|
||||
\urlstyle{tt}
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
% Page Layout
|
||||
%
|
||||
|
||||
% Paragraphs
|
||||
\setlength{\parindent}{0pt}
|
||||
\setlength{\parskip}{4pt plus 2pt minus 1pt}
|
||||
|
||||
|
||||
|
||||
% Headers
|
||||
\AtBeginDocument{%
|
||||
\pagestyle{fancy}
|
||||
\fancyhead[RO]{\textit{\leftmark}}
|
||||
\fancyhead[LO]{}
|
||||
}
|
||||
|
||||
|
||||
% TOC
|
||||
\setcounter{tocdepth}{1}
|
||||
|
||||
|
||||
% Figures
|
||||
\setlength{\captionmargin}{1em}
|
||||
\numberwithin{figure}{chapter}
|
||||
\AtBeginDocument{
|
||||
\numberwithin{lstlisting}{chapter}
|
||||
}
|
||||
|
||||
% References
|
||||
\renewcommand*{\sectionautorefname}{Section}
|
||||
\renewcommand*{\chapterautorefname}{Chapter}
|
||||
|
||||
% Quotations
|
||||
\def\signed #1{{\leavevmode\unskip\nobreak\hfil\penalty50\hskip2em
|
||||
\hbox{}\nobreak\hfill --- #1%
|
||||
\parfillskip=0pt \finalhyphendemerits=0 \endgraf}}
|
||||
|
||||
% Bibliography
|
||||
\bibliographystyle{plain}
|
||||
\patchcmd{\thebibliography}{%
|
||||
\chapter*{\bibname}\@mkboth{\MakeUppercase\bibname}{\MakeUppercase\bibname}}{%
|
||||
\chapter{\bibname}}{}{}
|
||||
|
||||
% Layout Magic
|
||||
\clubpenalty=10000
|
||||
|
||||
% Endnotes
|
||||
\def\footnote{\endnote}
|
||||
\def\enotesize{\normalsize}
|
||||
\def\enoteheading{\chapter{End Notes}}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
% Helpers for TODOs
|
||||
%
|
||||
\iftoggle{final}{%
|
||||
\newcommand{\note}[1]{\PackageError{checked_c}{Note in Final Version of Document}{}}
|
||||
\newcommand{\cn}[0]{\PackageError{checked_c}{Citation Needed for Final Version of Document}{}}
|
||||
}{
|
||||
\newcommand{\note}[1]{\textit{\textcolor{orange}{Note: #1}}}
|
||||
\newcommand{\cn}[0]{\textsuperscript{[\textcolor{blue}{citation needed}]}}
|
||||
}
|
||||
|
||||
|
||||
%
|
||||
% Colours
|
||||
%
|
||||
|
||||
\colorlet{c-green}{green!60!black}
|
||||
\colorlet{c-blue}{blue!80!black}
|
||||
\colorlet{c-teal}{green!50!blue}
|
||||
\colorlet{c-orange}{orange!90!black}
|
||||
\colorlet{c-black}{black}
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
% C Grammar (for math mode)
|
||||
%
|
||||
|
||||
% Some symbols for C grammar
|
||||
\newcommand{\binop}[0]{\oplus}
|
||||
\newcommand{\unop}[0]{\otimes}
|
||||
\newcommand{\addrof}[0]{\mathord{\&}}
|
||||
\newcommand{\deref}[0]{\mathord{*}}
|
||||
\newcommand{\member}[0]{\mathord{.}}
|
||||
\newcommand{\arrow}[0]{\mathord{\rightarrow}}
|
||||
\newcommand{\assign}[0]{\mathop{=}}
|
||||
\newcommand{\compassign}[0]{\mathop{\mathrel{\binop}=}}
|
||||
\newcommand{\inc}[0]{\mathord{++}}
|
||||
\newcommand{\dec}[0]{\mathord{-{}-}}
|
||||
|
||||
% Helpers for Propagation Rules (in math mode)
|
||||
\DeclareMathOperator{\cast}{cast}
|
||||
\DeclareMathOperator{\typeof}{type}
|
||||
\DeclareMathOperator{\arr}{array}
|
||||
\newcommand{\lbounds}[0]{\textsl{\color{c-green}lvalue-bounds}}
|
||||
\newcommand{\vbounds}[0]{\textsl{\color{c-green}value-bounds}}
|
||||
\newcommand{\ltbounds}[0]{\textsl{\color{c-green}lvalue-target-bounds}}
|
||||
\newcommand{\abounds}[0]{\textsl{\color{c-green}array-bounds}}
|
||||
\newcommand{\sbounds}[0]{\textsl{\color{c-green}single-bounds}}
|
||||
\newcommand{\mbounds}[0]{\textrm{\color{c-blue}bounds}}
|
||||
\newcommand{\srbounds}[0]{\textsl{\color{c-green}struct-rel-bounds}}
|
||||
\newcommand{\dbounds}[0]{\textsl{\color{c-green}declared-bounds}}
|
||||
\newcommand{\rbounds}[0]{\textsl{\color{c-green}declared-return-bounds}}
|
||||
\newcommand{\gis}[0]{\mathrel{\Coloneqq}}
|
||||
\newcommand{\gor}[0]{\mathrel{\mid}}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
% Listings Extensions
|
||||
%
|
||||
|
||||
% Corrections and Extensions to Listing's C Languages
|
||||
\lstloadlanguages{C}
|
||||
\lstdefinelanguage[ISO]{C}[]{C}%
|
||||
{morekeywords={restrict,inline,size_t,alignas,alignof,bool,complex,imaginary,noreturn,static_assert,thread_local}}
|
||||
\lstdefinelanguage[Checked]{C}[ISO]{C}%
|
||||
{morekeywords={[2]unchecked,_Unchecked,checked,_Checked,%
|
||||
array_ptr,_Array_ptr,ptr,_Ptr,%
|
||||
nt_array_ptr,_Nt_array_ptr,nt_checked,_Nt_checked,%
|
||||
count,byte_count,bounds,%
|
||||
dynamic_check,_Dynamic_check,%
|
||||
dynamic_bounds_cast,_Dynamic_bounds_cast,%
|
||||
assume_bounds_cast,_Assume_bounds_cast},
|
||||
}
|
||||
|
||||
\makeatletter
|
||||
% Simple Language for LLVM IR
|
||||
\lstdefinelanguage{LLVM}{
|
||||
sensitive=false,
|
||||
alsoletter=.\%,
|
||||
comment=[l]{;},
|
||||
string=[b]{"},
|
||||
keywordsprefix=@llvm,
|
||||
keywords=[1]{ icmp, br, and, bitcast, call, unreachable },
|
||||
keywords=[2]{ null, ne, eq, ule, ult, to },
|
||||
keywords=[3]{ i1, i32, void, label, metadata},
|
||||
}
|
||||
\makeatother
|
||||
|
||||
% Styles
|
||||
\lstdefinestyle{display}{
|
||||
basicstyle=\small\ttfamily,
|
||||
keywordstyle=\color{c-teal},
|
||||
keywordstyle={[2]\color{c-green}},
|
||||
commentstyle=\color{c-orange},
|
||||
identifierstyle=\color{c-blue},
|
||||
stringstyle=\color{c-orange},
|
||||
showstringspaces=false,
|
||||
escapechar=~,
|
||||
tabsize=2,
|
||||
captionpos=b,
|
||||
flexiblecolumns,
|
||||
keepspaces,
|
||||
xleftmargin=25pt,
|
||||
xrightmargin=25pt,
|
||||
breaklines=true,
|
||||
breakatwhitespace=true,
|
||||
}
|
||||
|
||||
% Defaults
|
||||
\lstset{%
|
||||
language=[Checked]C,%
|
||||
style=display,%
|
||||
}
|
||||
|
||||
% Code Environments
|
||||
%TC:envir code [ignore] xall
|
||||
\lstnewenvironment{code}[1][]%
|
||||
{\begingroup\lstset{style=display}\lstset{#1}}%
|
||||
{\endgroup}
|
||||
|
||||
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
% C and Checked C expressions in prose/maths
|
||||
%
|
||||
|
||||
\newcommand{\ensuremathtext}[1]{\ifmmode\text{#1}\else#1\fi}
|
||||
|
||||
% Inline Styling
|
||||
\newcommand{\basestyle}{\ttfamily}
|
||||
\newcommand{\exprstyle}{\basestyle\color{c-black}}
|
||||
\newcommand{\mvstyle}{\basestyle\itshape}
|
||||
\newcommand{\kwstyle}{\basestyle\color{c-teal}}
|
||||
\newcommand{\tystyle}{\basestyle\color{c-green}}
|
||||
|
||||
% Metavars, keywords, types
|
||||
\newcommand{\expr}[1]{\ensuremathtext{\begingroup\exprstyle#1\endgroup}}
|
||||
\newcommand{\mv}[1]{\ensuremathtext{\begingroup\mvstyle#1\endgroup}}
|
||||
\newcommand{\kw}[1]{\ensuremathtext{\begingroup\kwstyle#1\endgroup}}
|
||||
\newcommand{\ty}[1]{\ensuremathtext{\begingroup\tystyle#1\endgroup}}
|
||||
|
||||
|
||||
% Keywords
|
||||
\newcommand{\sizeof}[1]{\expr{\kw{sizeof}(#1)}}
|
||||
\newcommand{\NULL}{\kw{NULL}}
|
||||
\newcommand{\calloc}[2]{\expr{\kw{calloc}(#1, #2)}}
|
||||
|
||||
\newcommand{\kwchecked}[0]{\ty{checked}}
|
||||
\newcommand{\kwunchecked}[0]{\ty{unchecked}}
|
||||
\newcommand{\kwdynamiccheck}[0]{\ty{dynamic\_check}}
|
||||
\newcommand{\kwbounds}[0]{\ty{bounds}}
|
||||
\newcommand{\kwbytecount}[0]{\ty{byte\_count}}
|
||||
\newcommand{\kwcount}[0]{\ty{count}}
|
||||
\newcommand{\kwboundschecked}[0]{\ty{BOUNDS\_CHECKED}}
|
||||
\newcommand{\kwassumecast}[0]{\ty{assume\_bounds\_cast}}
|
||||
\newcommand{\kwdynamiccast}[0]{\ty{dynamic\_bounds\_cast}}
|
||||
|
||||
|
||||
%
|
||||
% type macros
|
||||
%
|
||||
\newcommand{\void}{\kw{void}}
|
||||
|
||||
% array_ptr type macros
|
||||
%
|
||||
\newcommand{\Arrayptr}[1]{\expr{\ty{array\_ptr}<#1>}}
|
||||
\newcommand{\ArrayptrT}{\Arrayptr{\mv{T}}}
|
||||
|
||||
% ptr type macros
|
||||
%
|
||||
\newcommand{\Ptr}[1]{\expr{\ty{ptr}<#1>}}
|
||||
\newcommand{\PtrT}{\Ptr{\mv{T}}}
|
||||
|
||||
% checked array macros
|
||||
%
|
||||
\newcommand{\Checkedarr}[2]{\expr{#1~\ty{checked}[#2]}}
|
||||
\newcommand{\CheckedarrT}[1]{\Checkedarr{\mv{T}}{#1}}
|
||||
|
||||
% nt_array_ptr type macros
|
||||
%
|
||||
\newcommand{\Ntarrayptr}[1]{\expr{\ty{nt\_array\_ptr}<#1>}}
|
||||
\newcommand{\NtarrayptrT}{\Ntarrayptr{\mv{T}}}
|
||||
|
||||
|
||||
% nt_checked array macros
|
||||
%
|
||||
\newcommand{\Ntcheckedarr}[2]{\expr{#1~\ty{nt\_checked}[#2]}}
|
||||
\newcommand{\NtcheckedarrT}[1]{\Ntcheckedarr{\mv{T}}{#1}}
|
||||
|
||||
|
||||
% unchecked pointers
|
||||
%
|
||||
\newcommand{\uncheckedptr}[1]{\kw{#1*}}
|
||||
\newcommand{\uncheckedptrT}{\uncheckedptr{\mv{T}}}
|
||||
\newcommand{\uncheckedarr}[2]{\expr{#1[#2]}}
|
||||
\newcommand{\uncheckedarrT}[1]{\uncheckedarr{\mv{T}}{#1}}
|
||||
|
||||
%
|
||||
% bounds expression macros
|
||||
%
|
||||
\newcommand{\bounds}[2]{\kw{bounds(\expr{#1}, \expr{#2})}}
|
||||
\newcommand{\boundscount}[1]{\kw{count(\expr{#1})}}
|
||||
\newcommand{\boundsbytecount}[1]{\kw{byte\_count(\expr{#1})}}
|
||||
\newcommand{\boundsany}{\kw{bounds(\expr{any})}}
|
||||
\newcommand{\boundsunknown}{\kw{bounds(\expr{unknown})}}
|
||||
|
||||
%
|
||||
% bounds cast macros
|
||||
%
|
||||
\newcommand{\dynamiccast}[4]{\expr{\kwdynamiccast<#1>(#2, #3, #4)}}
|
||||
\newcommand{\assumecast}[4]{\expr{\kwassumecast<#1>(#2, #3, #4)}}
|
||||
|
||||
%
|
||||
% bounds declaration macros
|
||||
%
|
||||
\newcommand{\boundsdecl}[2]{\expr{#1~:~#2}}
|
||||
%
|
||||
% computed bounds for expressions
|
||||
%
|
||||
\newcommand{\boundsinfer}[2]{\expr{#1}~$\vdash$~\expr{#2}}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,319 @@
|
|||
\chapter{Checks}
|
||||
\label{sec:checks}
|
||||
|
||||
This section describes how we have implemented our prototype Checked C
|
||||
compiler. Our compiler, based on the Clang/LLVM toolchain, is
|
||||
available online at \url{https://github.com/Microsoft/checkedc-clang}.
|
||||
|
||||
\section{Design Requirements}
|
||||
|
||||
The Checked C project has certain aims for the language which
|
||||
influence the design of its checks, for instance where we rely on
|
||||
static checks and where we rely on dynamic checks.
|
||||
|
||||
The main aim of Checked C is that we can upgrade code incrementally
|
||||
from C to Checked C, which means, among other things, that we must
|
||||
be layout compatible with existing C code. We may not, for instance,
|
||||
change the pointer representation, as this would break the data layout
|
||||
of existing code and change the calling convention.
|
||||
|
||||
A second aim is that we should keep any overhead of dynamic checks
|
||||
only to where the programmer accesses memory and they are required for
|
||||
soundness. This means that any overhead should be understandable and
|
||||
predictable. On the other hand we will have to perform checks
|
||||
statically at compile-time on pointer assignments and function calls
|
||||
to ensure soundness.
|
||||
|
||||
A final aim is that Checked C should be able to be used wherever C
|
||||
can currently be used, including embedded devices. This means we do
|
||||
not require that programs are linked to runtime libraries which
|
||||
implement the checks, so that executable size remains small.
|
||||
|
||||
\section{Static Checks}
|
||||
\label{sec:static-checks}
|
||||
|
||||
At assignments into variables with bounds, Checked C requires that the
|
||||
bounds of the value being assigned into the variable imply the bounds
|
||||
of the variable. In particular, this means that the bounds of a
|
||||
variable are a (non-strict) sub-range of the bounds of the expression
|
||||
being assigned into it. This ensures that any future memory accesses
|
||||
via the newly-assigned variable cannot access memory outside the
|
||||
bounds of the original expression.
|
||||
|
||||
C's semantics for function calls behave as if the programmer has
|
||||
assigned the argument expressions to the parameter declarations, and
|
||||
so are covered by this same static checking, only without any implied
|
||||
mutual ordering, and bounds on function parameters can reference other
|
||||
declared parameters.
|
||||
|
||||
We also require checks on updates to variables and members involved in
|
||||
bounds checks, such that they continue to denote the bounds of these
|
||||
objects in memory. For example, increasing the value in a length
|
||||
member of a struct also containing an pointer to an array of that
|
||||
length, without a corresponding \lstinline|realloc| or other knowledge
|
||||
that the bounds of the array are actually longer, could lead to an
|
||||
out-of-bounds memory access, even if this operation is performed
|
||||
within a checked region.
|
||||
|
||||
Lastly, a static check is required when casting an expression to a
|
||||
\PtrT type. This check ensures that the value being cast is within the
|
||||
implied bounds of the \PtrT type so that this checked singleton
|
||||
pointer can be dereferenced without needing to check its range.
|
||||
|
||||
In all these cases, where the compiler cannot prove these facts, it
|
||||
may use the explicit dynamically-checked cast operators described
|
||||
in~\autoref{sec:dynamiccastops} to prove these facts dynamically
|
||||
rather than statically.
|
||||
|
||||
Further discussion of these static checks, and how exactly we
|
||||
``prove'' anything about C programs to the compiler will be described
|
||||
in a future Checked C Technical Report.
|
||||
|
||||
|
||||
\section{Dynamic Checks}
|
||||
\label{sec:dynamic-checks}
|
||||
|
||||
As mentioned previously, we are only interested in inserting dynamic
|
||||
checks in expressions that actually access memory. We can also only
|
||||
insert dynamic checks where we have bounds expressions for the pointer
|
||||
we're accessing memory through.
|
||||
|
||||
As described in~\autoref{sec:cs-value-semantics}, all C memory reads
|
||||
are via lvalue conversions, and all writes are via assignment or
|
||||
increment operators. This means we can add the bounds checks during
|
||||
the evaluation of the lvalue sub-expressions of these operators, if the
|
||||
accesses dereference checked pointers. This sidesteps generating
|
||||
dynamic checks on the lvalue sub-expression of the C $\addrof$
|
||||
operator, which does not access memory (it only computes a pointer
|
||||
value).
|
||||
|
||||
In particular, we will add dynamic checks into the evaluation of the
|
||||
lvalue expression in lvalue conversions; on the left of the assignment
|
||||
and compound assignment operators; and on the evaluation of the lvalue
|
||||
expression in increment and decrement operators. We also add dynamic checks to
|
||||
the base expression in a struct member access, if the base expression
|
||||
is an lvalue, to ensure each level of access into the struct is within
|
||||
bounds, as required by our narrowing rules in~\autoref{sec:narrowing}.
|
||||
|
||||
We currently use a lazy propagation algorithm which will only
|
||||
propagate bounds information from declarations to uses which require
|
||||
the bounds for either static or dynamic checks. This avoids computing
|
||||
bounds for unchecked memory accesses.
|
||||
|
||||
\subsection{Generating Dynamic Checks}
|
||||
\label{sec:dynamic-checks-impl}
|
||||
|
||||
The Clang/LLVM toolchain consists of a C compiler (Clang itself), a
|
||||
general-purpose optimizer and code generator, and various other
|
||||
compiler tools. The central part of the compiler is LLVM's
|
||||
Intermediate Representation (LLVM IR), a typed, high-level assembly
|
||||
language which abstracts away many machine and compilation details
|
||||
while still providing a language that is at an abstraction level
|
||||
closer to assembly languages.
|
||||
|
||||
The architecture of the Clang/LLVM toolchain is fairly conventional.
|
||||
Our fork of Clang parses Checked C, performs Checked C-specific static
|
||||
checks, and then generates unoptimized LLVM IR. LLVM then analyses and
|
||||
optimizes LLVM IR, before generating platform-specific assembly. Our
|
||||
fork of the Clang/LLVM toolchain only includes changes to Clang, we
|
||||
have not needed to change LLVM.
|
||||
|
||||
Clang's LLVM IR Generator is actually made up of five mutually
|
||||
recursive code generators which process the Checked C syntax tree. In
|
||||
terms of the C semantics above, four of these code generators generate
|
||||
code for value expressions---namely scalar values (such as integers,
|
||||
characters and floats), aggregate values (such as arrays and structs),
|
||||
vector values (for vectorized code), and complex number values. The
|
||||
code generator we are interested in, however, is the fifth, which is
|
||||
responsible for generating the LLVM IR for lvalue expressions.
|
||||
|
||||
In C, we already know the uses of C lvalues that access memory
|
||||
(described in~\autoref{sec:dynamic-checks}), so we extend the code
|
||||
generation of LLVM for these lvalues to insert the required dynamic
|
||||
checks. There are three kinds of checks we currently insert, non-null
|
||||
checks (required for any checked pointer); bounds checks (required
|
||||
only for lvalues that access memory through a checked array pointer);
|
||||
and dynamic cast checks (required for casts
|
||||
from~\autoref{sec:dynamiccastops}). If a pointer requires both a
|
||||
non-null check and a bounds check, the non-null check is performed
|
||||
first.
|
||||
|
||||
\paragraph{LLVM IR} The following is a quick introduction to LLVM
|
||||
IR~\cite{LLVMLangRef}. LLVM IR is typed, with instructions containing
|
||||
explicit argument type annotations. The two types we care about here
|
||||
are LLVM's pointer type, denoted \lstinline[language=LLVM]|T*|, and
|
||||
LLVM's boolean type, denoted \lstinline[language=LLVM]|i1|. Both
|
||||
Checked C's checked pointers, and C's unchecked pointers translate to
|
||||
LLVM pointers. LLVM pointers are a different type to LLVM's integers,
|
||||
but can be compared using the same integer comparison operators.
|
||||
|
||||
LLVM's integer comparison instruction, \lstinline[language=LLVM]|icmp|
|
||||
is parameterized by the kind of comparison, here we use
|
||||
\lstinline[language=LLVM]|eq| (equal), \lstinline[language=LLVM]|ne|
|
||||
(not equal), \lstinline[language=LLVM]|ult| (unsigned less than), and
|
||||
\lstinline[language=LLVM]|ule| (unsigned less than or equal to).
|
||||
Unlike in C, LLVM's integer comparisons are fully defined for
|
||||
pointers, even if the pointers point to separate objects.
|
||||
|
||||
LLVM has a high-level \lstinline[language=LLVM]|call| instruction for
|
||||
calling other functions and intrinsics. LLVM's conditional branch
|
||||
instruction, \lstinline[language=LLVM]|br| always takes a boolean
|
||||
condition, and two labels, the first for if the condition is true, the
|
||||
second for when it is false.
|
||||
|
||||
\paragraph{Non-null checks} As shown in~\autoref{llvm:nonnull}, these
|
||||
have fairly conventional generated code, involving comparing the
|
||||
computed pointer to \lstinline[language=LLVM]|null| (the LLVM keyword
|
||||
for the null pointer), branching on the result. Importantly this
|
||||
branch has to happen before we access memory using the computed value.
|
||||
|
||||
One optimization we perform with non-null checks is for pointer member
|
||||
expressions, $e_v \arrow m$. We could check that the pointer computed
|
||||
for the member, $m$, is non-null, but this computed pointer will be
|
||||
different for each member in the struct, meaning different non-null
|
||||
checks per-pointer, that will potentially not be optimized away.
|
||||
Instead, we do a non-null check on the struct base, $e_v$, which
|
||||
increases check redundancy, allowing more optimization, without
|
||||
affecting soundness.\endnote{In C, the \NULL{} pointer does not point
|
||||
to any object, and you may not use a pointer into one object to
|
||||
compute the pointer into another object unless the former contains the
|
||||
latter object. Therefore computing a pointer offset from \NULL{} is
|
||||
very much undefined behaviour. LLVM matches this by allowing computed
|
||||
offsets from \lstinline[language=LLVM]|null| to be optimized to
|
||||
\lstinline[language=LLVM]|null|.}
|
||||
|
||||
\begin{code}[language=LLVM,float=ht,label=llvm:nonnull,
|
||||
caption={Example Non-null check generated by the Checked C compiler.}]
|
||||
; ... function prefix
|
||||
%ptr = ; ... computes pointer value
|
||||
%ptr_non_null = icmp ne <ty>* %ptr, null
|
||||
br i1 %ptr.non_null label %success, label %fail
|
||||
|
||||
%success:
|
||||
; ... function continues
|
||||
; uses of %ptr are valid
|
||||
|
||||
%fail:
|
||||
call void @llvm.trap()
|
||||
unreachable
|
||||
\end{code}
|
||||
|
||||
\paragraph{Bounds Checks} As shown in~\autoref{llvm:bounds}, these are
|
||||
very similar to the non-null checks. After we have computed the
|
||||
pointer value (which has usually happened before a non-null check not
|
||||
shown in the example), we compute LLVM values for the upper bound and
|
||||
lower bound directly from the parts of the bounds expression as
|
||||
propagated by the algorithm in~\autoref{sec:propagation-rules}.
|
||||
Importantly, the pointer can be equal to the lower bound, but it has
|
||||
to be strictly less than the upper bound, to agree with the invariants
|
||||
in~\autoref{sec:canonical-forms}.
|
||||
|
||||
\begin{code}[language=LLVM,float=ht,label=llvm:bounds,
|
||||
caption={Example Bounds check generated by the Checked C compiler.}]
|
||||
; ... function prefix
|
||||
%ptr = ; ... computes pointer value
|
||||
%ptr_lb = ; ... computes pointer lower bound
|
||||
%ptr_ub = ; ... computes pointer upper bound
|
||||
%ptr_in_lb = icmp ule <ty>* %ptr_lb, %ptr ; lower check
|
||||
%ptr_in_ub = icmp ult <ty>* %ptr, %ptr_ub ; upper check
|
||||
%ptr_in_bounds = and i1 %ptr_in_lb, %ptr_in_ub
|
||||
br i1 %ptr_in_bounds label %success, label %fail
|
||||
|
||||
%success:
|
||||
; ... function continues
|
||||
; loads and stores to %ptr are valid
|
||||
|
||||
%fail:
|
||||
call void @llvm.trap()
|
||||
unreachable
|
||||
\end{code}
|
||||
|
||||
\paragraph{Dynamic Cast Checks} In this case the checks are a good
|
||||
deal more complicated. If the value being cast is \NULL{}, then there
|
||||
is no need to check the bounds, as the null pointer is within any
|
||||
bounds. Otherwise, the code ensures that the cast bounds are a
|
||||
sub-range of the bounds of the pointer. If either check succeeds, the
|
||||
pointer is cast to the new LLVM type, and execution continues.
|
||||
|
||||
\begin{code}[language=LLVM,float=ht,label=llvm:cast,
|
||||
caption={Example Bounds Cast Check generated by the Checked C compiler.}]
|
||||
; ... function prefix
|
||||
%ptr = ; ... computes pointer value
|
||||
%ptr_null = icmp eq <ty>* %ptr, null
|
||||
br i1 %ptr_null, label %success, label %subrange_check
|
||||
|
||||
%subrange_check:
|
||||
%ptr_lb = ; ... computes pointer lower bound
|
||||
%ptr_ub = ; ... computes pointer upper bound
|
||||
%cast_lb = ; ... computes cast lower bound
|
||||
%cast_ub = ; ... computes cast upper bound
|
||||
%cast_in_lb = icmp ule <ty>* %ptr_lb, %cast_lb ; lower check
|
||||
%cast_in_ub = icmp ule <ty>* %cast_ub, %ptr_ub ; upper check
|
||||
%cast_subrange = and i1 %cast_in_lb, %cast_in_ub
|
||||
br i1 %cast_subrange, label %success, label %fail
|
||||
|
||||
%success:
|
||||
%cast_ptr = bitcast <ty>* %ptr to <ty>*
|
||||
; ... function continues
|
||||
; loads and stores to %cast_ptr are valid
|
||||
|
||||
%fail:
|
||||
call void @llvm.trap()
|
||||
unreachable
|
||||
\end{code}
|
||||
|
||||
\paragraph{Trap Blocks} In all these checks we generate ``trap
|
||||
blocks'', which are the LLVM Basic Blocks that generate and signal the
|
||||
run-time exception for when the check fails. These consist of two
|
||||
instructions, a call to the LLVM trap intrinsic,
|
||||
\lstinline[language=LLVM]|@llvm.trap()|, which the LLVM code generator
|
||||
knows how to turn into platform-specific trap instructions; and an
|
||||
\lstinline[language=LLVM]|unreachable| instruction so that the LLVM
|
||||
static analyser knows that execution does not continue after this
|
||||
branch.
|
||||
|
||||
We insert trap blocks at the end of the function so that the
|
||||
successful control flow is emphasized over the failure control flow.
|
||||
We also insert one of these per dynamic check (rather than all the
|
||||
dynamic checks sharing a single trap block per-function) so that
|
||||
debugging when optimizations are disabled is far easier. The Clang
|
||||
code generator automatically takes care of generating debug
|
||||
information for our generated code.
|
||||
|
||||
\paragraph{Gotchas} As we use the scalar value code generator to
|
||||
generate the values for the upper and lower bounds checks, we have to
|
||||
be very careful that bounds checks are not recursive. A program is
|
||||
allowed to dereference checked pointers to compute bounds, however it
|
||||
is not allowed to dereference itself to compute its own bounds---doing
|
||||
so would lead to an infinite loop in check generation. We can almost
|
||||
entirely prevent this by only allowing bounds expressions to reference
|
||||
prior declarations, but care is needed around potential pointer
|
||||
aliasing.
|
||||
|
||||
|
||||
\subsection{Operating System Support For Dynamic Checks}
|
||||
|
||||
We include no OS-specific behaviour for our dynamic checks, despite
|
||||
the fact that modern operating systems provide coarse-grained fault
|
||||
mechanisms for some amount of memory safety.
|
||||
|
||||
For instance, Linux, Mac OS X, and Windows will all signal a run-time
|
||||
error if a program attempts to access unmapped memory, including the
|
||||
first page at the \NULL{} address. Our Checked C compiler could rely on this mechanism for performing some of the above dynamic checks, but does not.
|
||||
|
||||
Our compiler avoids these mechanisms for two reasons. The first is
|
||||
that we want to be able to deploy Checked C anywhere, including
|
||||
platforms without a full operating system to perform this check on our
|
||||
behalf. Secondly, the POSIX standard includes a mechanism for allowing
|
||||
the program to override the handling behaviour of or to ignore this
|
||||
run-time error. Doing so could affect program soundness.
|
||||
|
||||
We definitely cannot use this mechanism to implement bounds checks, as
|
||||
the ranges of mapped memory are too coarse-grained for our propagation
|
||||
and narrowing algorithm.
|
||||
|
||||
|
||||
%%% Local Variables:
|
||||
%%% TeX-master: "tr02"
|
||||
%%% mode: latex
|
||||
%%% End:
|
|
@ -0,0 +1,25 @@
|
|||
\chapter{Conclusion}
|
||||
\label{sec:conclusion}
|
||||
|
||||
This report has described how I implemented the dynamic checks in the
|
||||
Checked C compiler. I have presented a flow-insensitive algorithm for
|
||||
propagating bounds information through expressions, such that these
|
||||
bounds can be checked when memory is accessed, and has shown one
|
||||
possible compilation of these bounds expressions into dynamic checks.
|
||||
|
||||
I have also shown that, for a set of pointer-intensive benchmarks,
|
||||
though the programmer will have to modify on average
|
||||
\ResultLinesModifiedMean of each benchmark in order to have the
|
||||
soundness Checked C guarantees, this gives a run-time overhead of on
|
||||
average \ResultRunTimeMean and at most \ResultRunTimeMax.
|
||||
|
||||
This work is an important step towards proving the viability and
|
||||
usefulness of Checked C.
|
||||
|
||||
|
||||
%\section{Future Work}
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "tr02"
|
||||
%%% End:
|
|
@ -0,0 +1,186 @@
|
|||
\chapter{Preliminary Evaluation}
|
||||
\label{sec:evaluation}
|
||||
|
||||
I converted two existing C benchmark suites as an initial evaluation
|
||||
of the consequences of porting code to Checked C (with assistance from
|
||||
David Tarditi and researchers at Samsung Research). We quantify both
|
||||
the changes required for the code to become checked, and the overhead
|
||||
imposed on compilation, running time, and executable size.
|
||||
|
||||
\section{Benchmarks}
|
||||
|
||||
\begin{table}[ht]
|
||||
\input{tables/bmdesc}
|
||||
\end{table}
|
||||
|
||||
I chose the Olden \cite{Rogers1995Olden} and Ptrdist
|
||||
\cite{Austin1994Ptrdist} benchmark suites, described in
|
||||
Table~\ref{tab:bmdesc}, because they are specifically designed to test
|
||||
pointer-intensive applications, and they are the same benchmarks used
|
||||
to evaluate both Deputy~\cite{Feng2006} and CCured~\cite{Necula2005}.
|
||||
|
||||
We evaluate Checked C using these benchmarks in two ways. First, we
|
||||
quantify the number and type of source code changes required to
|
||||
convert these benchmarks from C to Checked C. Second, we quantify the
|
||||
overhead of the run-time checks on benchmark run time, compile time,
|
||||
and executable size. The evaluation results are presented in
|
||||
Table~\ref{tab:bmresults}.
|
||||
|
||||
\paragraph{Experimental Setup} These were produced using a 12-Core
|
||||
Intel Xeon X5650 2.66GHz, with 24GB of RAM, running Red Hat Enterprise
|
||||
Linux 6. All compilation and benchmarking was done without
|
||||
parallelism. We ran each benchmark 21 times with and without the
|
||||
Checked C changes using the test sizes from the LLVM versions of these
|
||||
benchmarks. We report the median; we observed little variance.
|
||||
|
||||
\paragraph{Excluded Benchmarks} We were unable to convert the
|
||||
\emph{voronoi} benchmark from the Olden suite due to bounds
|
||||
propagation limitations detailed in~\autoref{sec:prop-limitations}. We
|
||||
were unable to convert the \emph{bc} from the Ptrdist suite due to
|
||||
lack of time. They are excluded from any conversion results.
|
||||
|
||||
\begin{table}[ht]
|
||||
\input{tables/bmresults}
|
||||
\end{table}
|
||||
|
||||
\section{Code Modifications}
|
||||
\label{sec:eval-code-changes}
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{scripts/modifications}
|
||||
\caption{Code Modifications}
|
||||
\label{fig:cm-plot}
|
||||
\end{figure}
|
||||
|
||||
On average, the changes modified \ResultLinesModifiedMean of benchmark
|
||||
lines of code. Most of these changes were in declarations,
|
||||
initializers, and type definitions rather than in the program logic.
|
||||
In the evaluation of Deputy~\cite{Condit2007}, the reported figure of
|
||||
lines changed ranges between 0.5\% and 11\% for the same benchmarks,
|
||||
showing they have a lower annotation burden than Checked C.
|
||||
|
||||
We modified the benchmarks to use checked blocks and the top-level
|
||||
checked pragma. We placed code that could not be checked because it
|
||||
used unchecked pointers, or assignments where our static analysis
|
||||
could not currently verify that the assignment was valid, in unchecked
|
||||
blocks. On average, about \ResultLinesUncheckedMean of the code
|
||||
remained unchecked after conversion, with a minimum and maximum of
|
||||
\ResultLinesUncheckedMin and \ResultLinesUncheckedMax. The causes were
|
||||
almost entirely variable argument functions such as
|
||||
\lstinline|printf|.
|
||||
|
||||
We manually inspected changes and divided them into \emph{easy}
|
||||
changes and \emph{hard} changes. Easy changes include:
|
||||
|
||||
\begin{itemize}
|
||||
\item replacing included headers with their checked versions;
|
||||
\item converting a \uncheckedptrT{} to a \PtrT{};
|
||||
\item adding the \kwchecked{} keyword to an array declaration;
|
||||
\item introducing a \kwchecked{} or \kwunchecked{} region;
|
||||
\item adding an initializer; and
|
||||
\item replacing a call to \lstinline|malloc| with a call to
|
||||
\lstinline|calloc|.
|
||||
\end{itemize}
|
||||
|
||||
Hard changes are all other changes, including changing a
|
||||
\uncheckedptrT{} to a \ArrayptrT{} and adding a bounds declaration,
|
||||
adding structs, struct members, and local variables to represent
|
||||
run-time bounds information, and code modernization.
|
||||
|
||||
We distinguish between the two because we believe easy changes can be
|
||||
automated (as with our automated \PtrT{} conversion tool) or made
|
||||
unnecessary in the future by relaxing requirements such as the
|
||||
additions of initializers.
|
||||
|
||||
In all of our benchmarks, we found the majority of changes were easy.
|
||||
In six of the benchmarks, the only hard changes were adding bounds
|
||||
annotations relating to the parameters of \lstinline|main|. In yacr2
|
||||
there are a lot of bounds declarations that are all exactly the same
|
||||
where global variables are passed as arguments, inflating the number
|
||||
of ``hard'' changes.
|
||||
|
||||
\paragraph{Layout Changes} In three benchmarks---em3d, mst, and
|
||||
yacr2---we had to add intermediate structs so that we could represent
|
||||
the bounds on \ArrayptrT{}s nested inside arrays. In mst we also had
|
||||
to add a member to a struct to represent the bounds on an
|
||||
\ArrayptrT{}. In the first case, this is because we cannot represent
|
||||
the bounds on nested \ArrayptrT{}s, in the second case this is because
|
||||
we only allow bounds on members to reference other members in the same
|
||||
struct. In em3d and anagram we also added local temporary variables to
|
||||
represent bounds information.
|
||||
|
||||
\section{Performance Results}
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{scripts/overheads}
|
||||
\caption{Performance Overheads}
|
||||
\label{fig:oh-plot}
|
||||
\end{figure}
|
||||
|
||||
An important concern about run-time checking for C is the effect on
|
||||
performance and compile time. The average run-time overhead introduced
|
||||
by added dynamic checks was \ResultRunTimeMean. In more than half of
|
||||
the benchmarks the overhead was less than 1\%. We believe this to be
|
||||
an acceptably low overhead that better static analysis may reduce even
|
||||
further.
|
||||
|
||||
In all but two benchmarks---treadd and ft---the added overhead matches
|
||||
(meaning performance is within 2\% of) or betters that of Deputy. For
|
||||
yacr2 and em3d, Checked C does substantially better than Deputy, whose
|
||||
overheads are 98\% and 56\%, respectively. Checked C's overhead
|
||||
betters or matches that reported by CCured in every case but ft.
|
||||
|
||||
On average, the compile-time overhead added by using Checked C is
|
||||
\ResultCompileTimeMean. The maximum overhead is \ResultCompileTimeMax,
|
||||
and the minimum is \ResultCompileTimeMin faster than compiling with C.
|
||||
We have spent no time at all optimising compile-time for Checked C. In
|
||||
particular, treeadd is a major outlier because the program is so
|
||||
short.
|
||||
|
||||
We also evaluated code size overhead, by looking at the change in the
|
||||
size of \lstinline|.text| section of the executable. This excludes data
|
||||
that might be stripped, like debugging information. Across the
|
||||
benchmarks, there is an average \ResultExecutableSizeMean code size
|
||||
overhead from the introduction of dynamic checks. Ten of the programs
|
||||
have a code size increase of less than 10\%.
|
||||
|
||||
\section{Evaluation}
|
||||
|
||||
All considered, we offer equivalent or better performance than either
|
||||
CCured or Deputy provide, at the outlay of having to convert a larger
|
||||
amount of code. We believe this to be a good trade-off, partially
|
||||
because we feel most of these changes can be automated, and partially
|
||||
because we provide the programmer greater control over how they
|
||||
express their bounds in terms of other program data. The latter in
|
||||
particular allows the programmer to work with their optimizer to
|
||||
ensure run-time overhead, especially in tight loops, remains low.
|
||||
|
||||
\subsection{Interaction with the LLVM Optimizer}
|
||||
|
||||
Currently we have done no specific work within our compiler to elide
|
||||
or optimize our dynamic checks. Any checks that Clang/LLVM currently
|
||||
removes, it does so using its existing optimization passes.
|
||||
|
||||
We expected the LLVM optimizer to be a lot worse at optimizing dynamic
|
||||
checks than it turned out to be. In only two of our benchmarks did we
|
||||
have to hand-optimize these programs to reduce the overhead of dynamic
|
||||
checking to approximately 0\%. This was accomplished by a mixture of
|
||||
being more specific with annotated bounds (including introducing
|
||||
temporary variables) and hoisting checks from inner loops using
|
||||
explicit dynamic checks.
|
||||
|
||||
Expressions that caused most difficulty for removing dynamic checks
|
||||
was code involving (at least) two pointer dereferences, as this incurs
|
||||
an unavoidable non-null check on the inner pointer, even if the outer
|
||||
pointer can be proven to be non-null. This can cause overhead in loops
|
||||
iterating through, for example, linked lists or graphs. We are working
|
||||
on a proposal to add nullness (and non-null) annotations to pointers
|
||||
in Checked C, which would allow for these inner non-null checks to be
|
||||
elided in many cases.
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "tr02"
|
||||
%%% End:
|
|
@ -0,0 +1,387 @@
|
|||
\chapter{Example}
|
||||
\label{sec:example}
|
||||
|
||||
\begin{figure}[th]
|
||||
\centering
|
||||
\begin{bytefield}[bitwidth=4em]{4}
|
||||
\begin{rightwordgroup}{Length Bytes}
|
||||
\bitbox{1}{Length} & \bitbox[tlb]{3}{Payload \ldots}
|
||||
\end{rightwordgroup}
|
||||
\end{bytefield}
|
||||
\caption{Echo Request Format}
|
||||
\label{fig:echoformat}
|
||||
\end{figure}
|
||||
|
||||
We are going to walk through converting a C program into Checked C.
|
||||
The program is part of a server that receives a request, in the format
|
||||
shown in~\autoref{fig:echoformat}. It will then parse the message, and
|
||||
respond with a copy of the message, in the same format. The original C
|
||||
code will contain a vulnerability very similar to that in
|
||||
Heartbleed~\cite{Heartbleed2014}, which we will show that the Checked
|
||||
C version prevents.
|
||||
|
||||
\section{Unchecked Program}
|
||||
|
||||
\begin{code}[label=ex1:unchecked,float=t,caption={Unchecked Example}]
|
||||
typedef struct req_t { // Request Struct
|
||||
~\lstleftlabel{ex1:len}~ size_t length; // user-provided
|
||||
~\lstleftlabel{ex1:pllen}~ size_t payload_len; // parser-provided
|
||||
~\lstleftlabel{ex1:pl}~ char *payload;
|
||||
// ...
|
||||
} req_t;
|
||||
|
||||
typedef struct resp_t { // Response Struct
|
||||
size_t payload_len;
|
||||
char *payload;
|
||||
// ...
|
||||
} resp_t;
|
||||
|
||||
bool echo(req_t *req ~\lstlabel{ex1:req}~, resp_t *resp ~\lstlabel{ex1:resp}~) { // Handler
|
||||
char *resp_data = malloc(req->length); ~\lstlabel{ex1:malloc}~
|
||||
|
||||
resp->payload_len = req->length; ~\lstlabel{ex1:bleed1}~
|
||||
resp->payload = resp_data;
|
||||
|
||||
// memcpy(resp->payload, req->payload, req->length); ~\lstlabel{ex1:memcpy}~
|
||||
for (size_t i = 0; i < req->length ~\lstlabel{ex1:bleed2}~; i++) {
|
||||
resp->payload[i] = req->payload[i] ~\lstlabel{ex1:overflow}~;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
\end{code}
|
||||
|
||||
The C function is shown in~\autoref{ex1:unchecked}. Our function,
|
||||
\lstinline|echo|, is modelled as a response handler, which is provided
|
||||
with a complete request struct~\lstref{ex1:req} containing data from
|
||||
the request parser, and an incomplete response
|
||||
struct~\lstref{ex1:resp} which the handler will construct the response
|
||||
data into.
|
||||
|
||||
This example has two vulnerabilities:
|
||||
\begin{enumerate}
|
||||
\item The first is based on Heartbleed~\cite{Heartbleed2014}, a
|
||||
notorious OpenSSL vulnerability disclosed in April 2014.
|
||||
|
||||
In the request, the user is providing a payload~\lstref{ex1:pl} to be
|
||||
echoed back to them, and its length~\lstref{ex1:len}. The correct
|
||||
behaviour of the handler is to copy the data from the payload buffer
|
||||
into a buffer in the response~\lstref{ex1:resp}.
|
||||
|
||||
There is no requirement that the length provided in the request is
|
||||
correct, and if the programmer uses this user-provided length, as they
|
||||
have at~\lstref{ex1:malloc}, \lstref{ex1:bleed1},
|
||||
and~\lstref{ex1:bleed2}, then they may read beyond the bounds of the
|
||||
payload buffer at~\lstref{ex1:overflow}. If the programmer had used
|
||||
the length provided by their parser~\lstref{ex1:pllen}, then this
|
||||
buffer overflow could have been avoided.
|
||||
|
||||
In the case of Heartbleed, this buffer overflow allowed attackers to
|
||||
view arbitrary segments of server memory.
|
||||
|
||||
\item The second vulnerability is a common error made by C
|
||||
programmers, that of not checking function return values. If the call
|
||||
to \lstinline|malloc| at~\lstref{ex1:malloc} returns \NULL, then the
|
||||
allocation failed, and you may not use that pointer to access memory.
|
||||
|
||||
\end{enumerate}
|
||||
|
||||
We have inlined the call to \lstinline|memcpy| at~\lstref{ex1:memcpy}
|
||||
to show where a loop like this would dynamically fail. The behaviour
|
||||
is slightly different if we do not inline \lstinline|memcpy|, and will
|
||||
be described later in~\autoref{sec:check-funct-calls}.
|
||||
|
||||
\section{Conversion to Checked C}
|
||||
|
||||
In order to make this program safer, we will convert it into Checked
|
||||
C. This will protect us from both the vulnerabilities that it includes
|
||||
- the out-of-bounds array access, and accessing memory via the null
|
||||
pointer returned from \lstinline|malloc|.
|
||||
|
||||
\begin{code}[label=ex2:checked,float=t,caption={Checked Example (Converted from \autoref{ex1:unchecked})}]
|
||||
typedef struct req_t { // Request Struct
|
||||
size_t length; // user-provided
|
||||
size_t payload_len; // parser-provided
|
||||
~\lstleftlabel{ex2:reqdecl}~ array_ptr<char> payload : count(payload_len);
|
||||
// ...
|
||||
} req_t;
|
||||
|
||||
typedef struct resp_t { // Response Struct
|
||||
size_t payload_len;
|
||||
~\lstleftlabel{ex2:respdecl}~ array_ptr<char> payload : count(payload_len);
|
||||
// ...
|
||||
} resp_t;
|
||||
|
||||
bool echo(ptr<req_t> req ~\lstlabel{ex2:reqptr}~, ptr<resp_t> resp ~\lstlabel{ex2:respptr}~) { // Handler
|
||||
~\lstleftlabel{ex2:bufdecl}~ array_ptr<char> resp_data : count(req->length) = malloc(req->length);
|
||||
|
||||
resp->payload_len = req->length;
|
||||
resp->payload = resp_data;
|
||||
|
||||
// memcpy(resp->payload, req->payload, req->length);
|
||||
for (size_t i = 0; i < req->length; i++) {
|
||||
resp->payload[i] = req->payload[i];
|
||||
}
|
||||
return true;
|
||||
}
|
||||
\end{code}
|
||||
|
||||
This conversion will proceed in two steps:
|
||||
\begin{enumerate}
|
||||
\item First, we will convert any declarations from their existing
|
||||
unchecked C types to their corresponding Checked C types, which relies
|
||||
on analysing how these declarations are used.
|
||||
|
||||
In~\autoref{ex2:checked}, we start by converting the types
|
||||
at~\lstref{ex2:reqdecl}, \lstref{ex2:respdecl},
|
||||
and~\lstref{ex2:bufdecl} from unchecked pointers into checked array
|
||||
pointers, as these are used as arrays. We also
|
||||
change~\lstref{ex2:reqptr} and~\lstref{ex2:respptr} to checked
|
||||
singleton pointers, as we know these pointers are not used as arrays,
|
||||
they are directly accessed.
|
||||
|
||||
\item Second, we will annotate any
|
||||
array pointers with a description of their bounds in order to allow us
|
||||
to enforce these bounds at compile- or run-time.
|
||||
|
||||
From the specification of the message format, we know that the payload
|
||||
is variable-length, the length field is not trustworthy, but we should
|
||||
relate the parser-provided length in the request (or the
|
||||
handler-provided length in the response) to the payload buffer. This
|
||||
is done with the bounds declarations at~\lstref{ex2:reqdecl}
|
||||
and~\lstref{ex2:respdecl}, which reference the other members of the
|
||||
struct. We also annotate the local variable used as a temporary for
|
||||
the result of malloc, at~\lstref{ex2:bufdecl}, with its intended size.
|
||||
\end{enumerate}
|
||||
|
||||
\section{Compiler-inserted Dynamic Checks}
|
||||
|
||||
In order to preserve the soundness condition proposed
|
||||
in~\autoref{sec:soundness}, the Checked C compiler must insert dynamic
|
||||
checks which ensure these invariants hold, if it cannot statically
|
||||
prove these invariants hold at compile-time. To show how this works,
|
||||
\autoref{ex3:checked-explicit} is the same code
|
||||
as~\autoref{ex2:checked}, but with all implicit dynamic checks
|
||||
explicitly shown with the \kwdynamiccheck{} operator, and without the
|
||||
compiler attempting to prove any of them.
|
||||
|
||||
\begin{code}[label=ex3:checked-explicit,float=t,caption={Checked Example with Explicit Checks (Based on \autoref{ex2:checked})}]
|
||||
typedef struct req_t { // Request Struct
|
||||
size_t length; // user-provided
|
||||
size_t payload_len; // parser-provided
|
||||
array_ptr<char> payload : count(payload_len);
|
||||
// ...
|
||||
} req_t;
|
||||
|
||||
typedef struct resp_t { // Response Struct
|
||||
size_t payload_len;
|
||||
array_ptr<char> payload : count(payload_len);
|
||||
// ...
|
||||
} resp_t;
|
||||
|
||||
bool echo(ptr<req_t> req, ptr<resp_t> resp) { // Handler
|
||||
~\lstleftlabel{ex3:reqck}~ dynamic_check(req != NULL);
|
||||
array_ptr<char> resp_data : count(req->length) = malloc(req->length);
|
||||
|
||||
~\lstleftlabel{ex3:respck}~ dynamic_check(resp != NULL);
|
||||
resp->payload_len = req->length;
|
||||
dynamic_check(resp != NULL);
|
||||
resp->payload = resp_data;
|
||||
|
||||
for (size_t i = 0; dynamic_check(req != NULL), i < req->length; i++) {
|
||||
dynamic_check(req != NULL);
|
||||
dynamic_check(req->payload != NULL);
|
||||
~\lstleftlabel{ex3:reqpllb}~ dynamic_check(req->payload <= &(req->payload[i]));
|
||||
~\lstleftlabel{ex3:reqplub}~ dynamic_check(&(req->payload[i]) < (req->payload + req->payload_len));
|
||||
dynamic_check(resp != NULL);
|
||||
~\lstleftlabel{ex3:respplck}~ dynamic_check(resp->payload != NULL);
|
||||
~\lstleftlabel{ex3:resppllb}~ dynamic_check(resp->payload <= &(resp->payload[i]));
|
||||
~\lstleftlabel{ex3:respplub}~ dynamic_check(&(resp->payload[i]) < (resp->payload + resp->payload_len));
|
||||
resp->payload[i] = req->payload[i];
|
||||
}
|
||||
return true;
|
||||
}
|
||||
\end{code}
|
||||
|
||||
Note first the checks at~\lstref{ex3:reqck} and~\lstref{ex3:respck}
|
||||
which ensure the checked singleton pointers are not \NULL. We also
|
||||
check the checked array pointer at~\lstref{ex3:respplck}, which
|
||||
ensures we do not access memory using the pointer returned from
|
||||
\lstinline|malloc| if it is \NULL.
|
||||
|
||||
At~\lstref{ex3:reqpllb} and~\lstref{ex3:reqplub} we check the bounds
|
||||
of the access to the request payload against its bounds. Here the
|
||||
bounds are computed using information from the same struct that the
|
||||
pointer is a member of. We check the invariants both for the read of
|
||||
the request payload, and (at~\lstref{ex3:resppllb}
|
||||
and~\lstref{ex3:respplub}) for the write into the response payload.
|
||||
|
||||
Returning to our bugs, \lstinline|malloc| returning \NULL{} will be
|
||||
caught by~\lstref{ex3:respplck}, and attempting to read the request
|
||||
payload out-of-bounds will be caught by~\lstref{ex3:reqplub}.
|
||||
|
||||
However, to claim that the program in~\autoref{ex3:checked-explicit}
|
||||
is now bug-free would be incorrect, unless the programmer wishes for
|
||||
the program to crash on bad inputs.
|
||||
|
||||
\section{Optimized Checks}
|
||||
|
||||
Performing those checks in this inner loop is also not particularly
|
||||
practical, as these checks are slow. If a compiler can prove it is
|
||||
safe, they are free to move, optimize, or delete these dynamic checks
|
||||
if they can prove the program will remain sound.
|
||||
\autoref{ex4:checked-optimised} shows a version
|
||||
of~\autoref{ex3:checked-explicit} where some checks have been hoisted
|
||||
and some have been removed.
|
||||
|
||||
|
||||
\begin{code}[label=ex4:checked-optimised,float=t,caption={Optimised Checked Example with Explicit Checks (Based on \autoref{ex3:checked-explicit})}]
|
||||
typedef struct req_t { // Request Struct
|
||||
size_t length; // user-provided
|
||||
size_t payload_len; // parser-provided
|
||||
array_ptr<char> payload : count(payload_len);
|
||||
// ...
|
||||
} req_t;
|
||||
|
||||
typedef struct resp_t { // Response Struct
|
||||
size_t payload_len;
|
||||
array_ptr<char> payload : count(payload_len);
|
||||
// ...
|
||||
} resp_t;
|
||||
|
||||
bool echo(ptr<req_t> req, ptr<resp_t> resp) { // Handler
|
||||
dynamic_check(req != NULL);
|
||||
array_ptr<char> resp_data : count(req->length) = malloc(req->length);
|
||||
|
||||
dynamic_check(resp != NULL);
|
||||
resp->payload_len = req->length;
|
||||
resp->payload = resp_data;
|
||||
|
||||
~\lstleftlabel{ex4:reqplck}~ dynamic_check(req->payload != NULL);
|
||||
~\lstleftlabel{ex4:respplck}~ dynamic_check(resp->payload != NULL);
|
||||
for (size_t i = 0; i < req->length; i++) {
|
||||
~\lstleftlabel{ex4:reqplub}~ dynamic_check(i < req->payload_len);
|
||||
resp->payload[i] = req->payload[i];
|
||||
}
|
||||
return true;
|
||||
}
|
||||
\end{code}
|
||||
|
||||
In this example, the compiler has deleted four non-null checks, it has
|
||||
hoisted two loop-invariant non-null checks out of the inner loop, and
|
||||
it has deleted 3 bounds checks that can be proven to be unneeded using
|
||||
information about the range of \lstinline|i| within the loop.
|
||||
|
||||
The remaining check inside the loop has been simplified according to
|
||||
the semantics of C, and is exactly the check that fails if an attacker
|
||||
provides a length that is larger than their payload length. In the
|
||||
unchecked code, this is the condition that would cause a buffer
|
||||
overflow.
|
||||
|
||||
We currently make no guarantees that we are able to perform any of
|
||||
these check elisions or optimizations, but it is our aim to be able to
|
||||
eventually.
|
||||
|
||||
\section{Checked Function Calls}
|
||||
\label{sec:check-funct-calls}
|
||||
|
||||
Had the programmer not written their own copy loop, and instead used
|
||||
\lstinline|memcpy|, their results would have overall been the same,
|
||||
but static analysis would have told them slightly sooner about their
|
||||
error.
|
||||
|
||||
In Checked C we ship a copy of the C standard library declarations, annotated with interoperation types so that they may be used in Checked contexts. The declaration of \lstinline|memcpy| used in checked scopes is shown in~\autoref{ex5:memcpy}.
|
||||
|
||||
\begin{code}[label=ex5:memcpy,float=ht,caption={Checked Type of \texttt{memcpy}}]
|
||||
void *memcpy(void * restrict dest : byte_count(n),
|
||||
const void * restrict src : byte_count(n),
|
||||
size_t n)
|
||||
: bounds(dest, dest + n);
|
||||
\end{code}
|
||||
|
||||
In this case, the compiler will not insert dynamic checks if the
|
||||
programmer calls \lstinline|memcpy|, as has been commented out
|
||||
in~\autoref{ex2:checked}, because none of the checked pointer
|
||||
arguments are being used to access memory at the call (the non-null
|
||||
checks of \lstinline|resp| and \lstinline|req| will remain as these
|
||||
are accessed to find the value of the pointers in the call
|
||||
expression).
|
||||
|
||||
Instead the compiler must statically prove that the bounds calculated
|
||||
for the \lstinline|resp->payload| argument are at least as wide as the
|
||||
declared bounds of the \lstinline|dest| parameter, and also for
|
||||
\lstinline|req->payload| and \lstinline|src|. In this case, both
|
||||
argument values are of \lstinline|char*| type, so
|
||||
\lstinline|byte_count(x)| expressions are equivalent to
|
||||
\lstinline|count(x)| expressions.
|
||||
|
||||
Using congruence, the compiler should be able to prove this property
|
||||
for \lstinline|dest|, but it will be unable to for \lstinline|src|, as
|
||||
it has no information to relate \lstinline|req->payload_len| and
|
||||
\lstinline|req->length|. This will cause a compile-time error.
|
||||
|
||||
This example is exactly what the dynamic cast operators
|
||||
from~\autoref{sec:dynamiccastops} are for. Providing the argument
|
||||
\lstinline|dynamic_cast<array_ptr<char>>(req->payload, req->payload, req->payload + req->length)|
|
||||
for the \lstinline|dest| parameter, which tells the compiler that the
|
||||
first argument has bounds $[\texttt{req} \arrow \texttt{payload},$
|
||||
$\texttt{req} \arrow \texttt{payload} + \texttt{req} \arrow
|
||||
\texttt{length} )$, would satisfy the static check (preventing the
|
||||
compiler error), but this would also insert a dynamic check before the
|
||||
call to make sure that these bounds are larger than the original
|
||||
bounds of \lstinline|req->payload|. If a user submits a packet where
|
||||
the length field is larger than the payload length, then this check
|
||||
would fail before the call to \lstinline|memcpy|.
|
||||
|
||||
\section{Corrected Checked C Program}
|
||||
|
||||
Returning to the notion of making sure this program is not-only memory
|
||||
safe but also bug-free, \autoref{ex6:corrected} shows the corrected
|
||||
program, where we have: validated the relationship of
|
||||
\lstinline|req->length| and \lstinline|req->payload_len|
|
||||
at~\lstref{ex6:valid1}, validated the return value of
|
||||
\lstinline|malloc| at~\lstref{ex6:valid2}, and replaced all further
|
||||
used of \lstinline|req->length| with \lstinline|req->payload_len|
|
||||
at~\lstref{ex6:rep0}, \lstref{ex6:rep1}, \lstref{ex6:rep2},
|
||||
and~\lstref{ex6:rep3}.
|
||||
|
||||
|
||||
\begin{code}[label=ex6:corrected,float=t,caption={Corrected Checked Example (Based on \autoref{ex2:checked})}]
|
||||
typedef struct req_t { // Request Struct
|
||||
size_t length; // user-provided
|
||||
size_t payload_len; // parser-provided
|
||||
array_ptr<char> payload : count(payload_len);
|
||||
// ...
|
||||
} req_t;
|
||||
|
||||
typedef struct resp_t { // Response Struct
|
||||
size_t payload_len;
|
||||
array_ptr<char> payload : count(payload_len);
|
||||
// ...
|
||||
} resp_t;
|
||||
|
||||
bool echo(ptr<req_t> req, ptr<resp_t> resp) { // Handler
|
||||
~\lstleftlabel{ex6:valid1}~ if (req->payload_len < req->length)
|
||||
return false;
|
||||
|
||||
array_ptr<char> resp_data : count(req->payload_len ~\lstlabel{ex6:rep0}~) = malloc(req->payload_len ~\lstlabel{ex6:rep1}~);
|
||||
~\lstleftlabel{ex6:valid2}~ if (resp_data == NULL)
|
||||
return false;
|
||||
|
||||
resp->payload_len = req->payload_len; ~\lstlabel{ex6:rep2}~
|
||||
resp->payload = resp_data;
|
||||
|
||||
for (size_t i = 0; i < req->payload_len ~\lstlabel{ex6:rep3}~; i++) {
|
||||
resp->payload[i] = req->payload[i];
|
||||
}
|
||||
return true;
|
||||
}
|
||||
\end{code}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "tr02"
|
||||
%%% End:
|
|
@ -0,0 +1,29 @@
|
|||
\begin{align*}
|
||||
\text{LValue Expressions} \qquad
|
||||
e_l \Coloneqq {} & x \tag*{Variables} \\
|
||||
\mid {} & \deref e_v \tag*{Dereference} \\
|
||||
\mid {} & e_v[e_v] \tag*{Index} \\
|
||||
\mid {} & e_l \member m \tag*{Struct Member Access} \\
|
||||
\mid {} & e_v \arrow m \tag*{Pointer Member Access} \\[1em]
|
||||
\text{Value Expressions} \qquad
|
||||
e_v \Coloneqq {} & \addrof e_l \tag*{Address-of} \\
|
||||
\mid {} & e_l \assign e_v \tag*{Assignment} \\
|
||||
\mid {} & e_l \compassign e_v & \tag*{Compound Assignment} \\
|
||||
\mid {} & e_l \inc \mathrel{\mid} \inc e_l \tag*{Increment} \\
|
||||
\mid {} & e_l \dec \mathrel{\mid} \dec e_l \tag*{Decrement} \\
|
||||
\mid {} & e_v \member m \tag*{Struct Member Access} \\
|
||||
\mid {} & \hdots \tag*{Other Value Expressions} \\
|
||||
\mid {} & e_l \tag*{Lvalue \& Array Conversion}
|
||||
\end{align*}%
|
||||
\caption[Lvalue expressions and their usage in C]{Lvalue expressions
|
||||
and their usage in C. A more complete description of C's expression
|
||||
syntax, including a definition of $\binop$ (binary operators), is
|
||||
contained in~\autoref{app:c-syntax}. Note that a Struct Member Access
|
||||
can be an lvalue or a value depending on whether the sub-expression is
|
||||
an lvalue or a value (respectively).}
|
||||
\label{fig:lvalue-expressions}
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "../tr02"
|
||||
%%% End:
|
|
@ -0,0 +1,157 @@
|
|||
% From Ulrich Dirr via https://tex.stackexchange.com/a/8474/45937
|
||||
%%% hyperendnotes.sty
|
||||
\makeatletter
|
||||
\newif\ifenotelinks
|
||||
\newcounter{Hendnote}
|
||||
% Redefining portions of endnotes-package:
|
||||
\let\savedhref\href
|
||||
\let\savedurl\url
|
||||
\def\endnotemark{%
|
||||
\@ifnextchar[\@xendnotemark{%
|
||||
\stepcounter{endnote}%
|
||||
\protected@xdef\@theenmark{\theendnote}%
|
||||
\protected@xdef\@theenvalue{\number\c@endnote}%
|
||||
\@endnotemark
|
||||
}%
|
||||
}%
|
||||
\def\@xendnotemark[#1]{%
|
||||
\begingroup\c@endnote#1\relax
|
||||
\unrestored@protected@xdef\@theenmark{\theendnote}%
|
||||
\unrestored@protected@xdef\@theenvalue{\number\c@endnote}%
|
||||
\endgroup
|
||||
\@endnotemark
|
||||
}%
|
||||
\def\endnotetext{%
|
||||
\@ifnextchar[\@xendnotenext{%
|
||||
\protected@xdef\@theenmark{\theendnote}%
|
||||
\protected@xdef\@theenvalue{\number\c@endnote}%
|
||||
\@endnotetext
|
||||
}%
|
||||
}%
|
||||
\def\@xendnotenext[#1]{%
|
||||
\begingroup
|
||||
\c@endnote=#1\relax
|
||||
\unrestored@protected@xdef\@theenmark{\theendnote}%
|
||||
\unrestored@protected@xdef\@theenvalue{\number\c@endnote}%
|
||||
\endgroup
|
||||
\@endnotetext
|
||||
}%
|
||||
\def\endnote{%
|
||||
\@ifnextchar[\@xendnote{%
|
||||
\stepcounter{endnote}%
|
||||
\protected@xdef\@theenmark{\theendnote}%
|
||||
\protected@xdef\@theenvalue{\number\c@endnote}%
|
||||
\@endnotemark\@endnotetext
|
||||
}%
|
||||
}%
|
||||
\def\@xendnote[#1]{%
|
||||
\begingroup
|
||||
\c@endnote=#1\relax
|
||||
\unrestored@protected@xdef\@theenmark{\theendnote}%
|
||||
\unrestored@protected@xdef\@theenvalue{\number\c@endnote}%
|
||||
\show\@theenvalue
|
||||
\endgroup
|
||||
\@endnotemark\@endnotetext
|
||||
}%
|
||||
\def\@endnotemark{%
|
||||
\leavevmode
|
||||
\ifhmode
|
||||
\edef\@x@sf{\the\spacefactor}\nobreak
|
||||
\fi
|
||||
\ifenotelinks
|
||||
\expandafter\@firstofone
|
||||
\else
|
||||
\expandafter\@gobble
|
||||
\fi
|
||||
{%
|
||||
\Hy@raisedlink{%
|
||||
\hyper@@anchor{Hendnotepage.\@theenvalue}{\empty}%
|
||||
}%
|
||||
}%
|
||||
\hyper@linkstart{link}{Hendnote.\@theenvalue}%
|
||||
\makeenmark
|
||||
\hyper@linkend
|
||||
\ifhmode
|
||||
\spacefactor\@x@sf
|
||||
\fi
|
||||
\relax
|
||||
}%
|
||||
\long\def\@endnotetext#1{%
|
||||
\if@enotesopen
|
||||
\else
|
||||
\@openenotes
|
||||
\fi
|
||||
\immediate\write\@enotes{%
|
||||
\@doanenote{\@theenmark}{\@theenvalue}%
|
||||
}%
|
||||
\begingroup
|
||||
\def\next{#1}%
|
||||
\newlinechar='40
|
||||
\immediate\write\@enotes{\meaning\next}%
|
||||
\endgroup
|
||||
\immediate\write\@enotes{%
|
||||
\@endanenote
|
||||
}%
|
||||
}%
|
||||
\def\theendnotes{%
|
||||
\immediate\closeout\@enotes
|
||||
\global\@enotesopenfalse
|
||||
\begingroup
|
||||
\makeatletter
|
||||
\edef\@tempa{`\string>}%
|
||||
\ifnum\catcode\@tempa=12
|
||||
\let\@ResetGT\relax
|
||||
\else
|
||||
\edef\@ResetGT{\noexpand\catcode\@tempa=\the\catcode\@tempa}%
|
||||
\@makeother\>%
|
||||
\fi
|
||||
\def\@doanenote##1##2##3>{%
|
||||
\def\@theenmark{##1}%
|
||||
\def\@theenvalue{##2}%
|
||||
\par
|
||||
\smallskip %<-small vertical gap between endnotes
|
||||
\begingroup
|
||||
\def\href{\expandafter\savedhref}%
|
||||
\def\url{\expandafter\savedurl}%
|
||||
\@ResetGT
|
||||
\edef\@currentlabel{\csname p@endnote\endcsname\@theenmark}%
|
||||
\enoteformat
|
||||
}%
|
||||
\def\@endanenote{%
|
||||
\par\endgroup
|
||||
}%
|
||||
% Redefine, how numbers are formatted in the endnotes-section:
|
||||
\renewcommand*\@makeenmark{%
|
||||
\hbox{\normalfont\@theenmark.~}%
|
||||
}%
|
||||
% header of endnotes-section
|
||||
\enoteheading
|
||||
% font-size of endnotes
|
||||
\enotesize
|
||||
\input{\jobname.ent}%
|
||||
\endgroup
|
||||
}%
|
||||
\def\enoteformat{%
|
||||
\rightskip\z@
|
||||
\leftskip1.8em
|
||||
\parindent\z@
|
||||
\leavevmode\llap{%
|
||||
\setcounter{Hendnote}{\@theenvalue}%
|
||||
\addtocounter{Hendnote}{-1}%
|
||||
\refstepcounter{Hendnote}%
|
||||
\ifenotelinks
|
||||
\expandafter\@secondoftwo
|
||||
\else
|
||||
\expandafter\@firstoftwo
|
||||
\fi
|
||||
{\@firstofone}%
|
||||
{\hyperlink{Hendnotepage.\@theenvalue}}%
|
||||
{\makeenmark}%
|
||||
}%
|
||||
}%
|
||||
% stop redefining portions of endnotes-package:
|
||||
\makeatother
|
||||
% Toggle switch in order to turn on/off back-links in the
|
||||
% endnote-section:
|
||||
\enotelinkstrue
|
||||
%\enotelinksfalse
|
|
@ -0,0 +1,132 @@
|
|||
\chapter{Introduction}
|
||||
\label{sec:introduction}
|
||||
|
||||
The C programming language~\cite{Ritchie1988,ISO2011} grants
|
||||
programmers low-level control of their programs rivalled only by
|
||||
directly writing assembly by hand. This is both its greatest strength,
|
||||
and its greatest weakness~\cite{krebbers:n1637}. C grants control of
|
||||
memory safety (and many other kinds of safety) directly to the
|
||||
programmer, but unfortunately programmers are human and make mistakes,
|
||||
and these mistakes can be devastating.
|
||||
|
||||
We classify memory safety mistakes into one of two categories.
|
||||
\emph{Temporal safety} errors happen when a programmer attempts to use
|
||||
an object outside of its defined storage duration. \emph{Spatial
|
||||
safety} errors happen when a programmer attempts to use a pointer to
|
||||
an object to access memory beyond the extents of that particular
|
||||
object. A \emph{buffer overrun} is a kind of spatial safety violation
|
||||
where a program reads memory surrounding an object in memory, usually
|
||||
because the programmer calculated its size incorrectly.
|
||||
|
||||
During the period 2012--2016, buffer overruns were the source of 9.7\%
|
||||
to 18.4\% of CVEs reported in the NIST vulnerability
|
||||
database~\cite{nvdb}. During that time, buffer overruns were the
|
||||
leading single cause of CVEs.
|
||||
|
||||
Programmers are still unwilling to give up this low-level control of
|
||||
their programs, despite protections that type-safe languages may
|
||||
provide~\cite{Kell2017}. Fortunately we can build on prior
|
||||
research~\cite{Jim2002,Feng2006} into language safety to devise a
|
||||
safer C which does not give up this low-level control, without the
|
||||
drawbacks of previous approaches.
|
||||
|
||||
The Checked C project is a new effort towards achieving a safer C. It
|
||||
does this by extending the language with new types and constructs for
|
||||
expressing invariants about objects in memory, and then enforcing
|
||||
these invariants within compiled programs using both static and
|
||||
dynamic checks.
|
||||
|
||||
Checked C currently only addresses spatial memory safety, and
|
||||
therefore only provides constructs for reasoning about the bounds of
|
||||
objects in memory. The language extension does not yet address any
|
||||
temporal memory safety guarantees. The Checked C specification is
|
||||
available online at
|
||||
\url{https://github.com/microsoft/checkedc/releases}.
|
||||
|
||||
This report describes my work on developing and implementing the
|
||||
dynamic checks for spatial memory safety and the infrastructure they
|
||||
require.
|
||||
|
||||
\section{Contributions}
|
||||
|
||||
I joined the project at the time the Checked C Specification v0.6 was
|
||||
released, in January 2017. At that point, the Checked C compiler could
|
||||
parse and represent its new types, and bounds expressions, but it did
|
||||
minimal verification of these bounds, and could not generate dynamic
|
||||
checks as required by the specification.
|
||||
|
||||
\begin{minipage}{\textwidth}
|
||||
The Checked C Specification v0.6~\cite{CheckedCv06} has the following
|
||||
to say about dynamic bounds checks:
|
||||
\begin{quotation}
|
||||
Given \expr{*\mv{e1}}, where \mv{e1} is an expression of type
|
||||
\Arrayptr{\mv{T}}, the compiler determines the bounds for \mv{e1}
|
||||
following the rules in Section~4.2. Special rules are followed in
|
||||
\kw{bundled} blocks to determine the bounds for \mv{e1}. The
|
||||
compiler inserts checks before the memory pointed to by \mv{e1} is
|
||||
read or written that \mv{e1} is non-null and that the value of \mv{e1}
|
||||
is in bounds.
|
||||
|
||||
If \boundsinfer{\mv{e1}}{\bounds{\mv{e2}}{\mv{e3}}}, the compiler
|
||||
inserts a runtime check that \texttt{\mv{e2} <= \mv{e1} \&\& \mv{e1} <
|
||||
\mv{e3}}. If the runtime check fails, the program will be terminated
|
||||
by the runtime system or in, systems that support it, a runtime
|
||||
exception will be raised. If
|
||||
\boundsinfer{\mv{e1}}{\boundscount{\mv{e2}}}, this is expanded to
|
||||
\boundsinfer{\mv{e1}}{\bounds{\mv{e1}}{\mv{e1} + \mv{e2}}} before
|
||||
inserting checks. Of course a temporary variable would be used to hold
|
||||
the value of \mv{e1}.
|
||||
\signed{Extending C with bounds safety~\cite[Section 3.10]{CheckedCv06}}
|
||||
\end{quotation}
|
||||
|
||||
While this describes the requirements of the dynamic bounds checks and
|
||||
the propagation algorithm that computes the bounds to check against,
|
||||
it is informal and vague.
|
||||
\end{minipage}
|
||||
|
||||
My contributions to the Checked C project are:
|
||||
|
||||
\begin{itemize}
|
||||
|
||||
\item I was the first to clearly describe exactly where dynamic checks
|
||||
are required in terms of the C semantics (\autoref{sec:dynamic-checks}).
|
||||
|
||||
\item I devised the flow-insensitive algorithm described
|
||||
in~\autoref{sec:propagation} which computes bounds required for checks
|
||||
using lvalues and values, which corresponds closely to the semantics
|
||||
of C expressions, values and lvalues.
|
||||
|
||||
\item As part of devising this algorithm, I devised a coherent method
|
||||
for dealing with C structs and arrays, and their arbitrary
|
||||
composition, which keep checks sound and prevent some type errors,
|
||||
while also keeping run-time overhead low. This is called
|
||||
``narrowing'', and is described in~\autoref{sec:narrowing}.
|
||||
|
||||
\item I extended the existing clang code generator to generate the code to
|
||||
compute the required bounds and perform the required checks in LLVM
|
||||
IR (\autoref{sec:dynamic-checks-impl}).
|
||||
|
||||
\item I proposed and performed the first evaluation of the overhead of
|
||||
Checked C's dynamic bounds checks. The main result is that, for the
|
||||
pointer-intensive benchmarks we chose, the run-time overhead is on
|
||||
average \ResultRunTimeMean, and at most \ResultRunTimeMax, with an
|
||||
average executable size overhead of \ResultExecutableSizeMean and at
|
||||
most \ResultExecutableSizeMax. Further results are described
|
||||
in~\autoref{sec:evaluation}.
|
||||
|
||||
% \item I developed the first method for converting real-world C code
|
||||
% into Checked C, as described in \autoref{app:conversion}.
|
||||
|
||||
\end{itemize}
|
||||
|
||||
This report also provides a quicker overview of Checked C than the
|
||||
specification (\autoref{sec:checkedc}), and a worked example
|
||||
(\autoref{sec:example}) to illustrate how the compiler works in broad
|
||||
brushstrokes.
|
||||
|
||||
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "tr02"
|
||||
%%% End:
|
|
@ -0,0 +1,40 @@
|
|||
%% Source Code Notes
|
||||
\ProvidesPackage{listing-label}
|
||||
\RequirePackage{tikz}
|
||||
|
||||
% Counter, numbered by section
|
||||
\newcounter{decoratedlabel}[section]
|
||||
% Use this in listing, to label something right there. You'll need a way to
|
||||
% escape into normal latex in listings, I use \lstset{escapechar=@} so
|
||||
% this command goes in @\lstlabel{...}@. You'll probably also want
|
||||
% \lstset{flexiblecolumns,keepspaces} because it looks nicer.
|
||||
\newcommand{\decorate}[1]{\refstepcounter{decoratedlabel}\label{#1}%
|
||||
\protect\decoratedlabelpicture{\thedecoratedlabel}}
|
||||
% Use this in listing (again once escaped to latex), to label
|
||||
% something in the left margin (must be at start of a line, takes up 0
|
||||
% horizontal space in the listing line)
|
||||
\newcommand{\decorateleft}[1]{\refstepcounter{decoratedlabel}\label{#1}%
|
||||
\makebox[0pt]{\decoratedlabelpicture{\thedecoratedlabel}\hspace{2em}}}
|
||||
% Use this in text to refer to \lstlabel. If you use `hyperref` they
|
||||
% get linked together!
|
||||
\newcommand{\decref}[1]{\decoratedlabelpicture{\ref{#1}}}
|
||||
% Use letters, because we might have a lot of them
|
||||
\renewcommand{\thedecoratedlabel}{\Alph{decoratedlabel}}
|
||||
% Put a circle around the letter, of course.
|
||||
\newcommand*\decoratedlabelpicture[1]{%
|
||||
\tikz[baseline=(char.base)]{
|
||||
\node[shape=circle,draw,inner sep=1pt,minimum size=1.05em] (char) {\scriptsize\sffamily#1};
|
||||
}}
|
||||
%
|
||||
% If you want the label numbers to be per-listing, then uncomment the
|
||||
% following code. They are currently per-section (see above)
|
||||
%
|
||||
% \AtBeginDocument{
|
||||
% \makeatletter
|
||||
% \@addtoreset{decoratedlabel}{lstlisting}
|
||||
% \makeatother
|
||||
% }
|
||||
|
||||
\newcommand{\lstlabel}[1]{\decorate{#1}}
|
||||
\newcommand{\lstleftlabel}[1]{\decorateleft{#1}}
|
||||
\newcommand{\lstref}[1]{\decref{#1}}
|
|
@ -0,0 +1,268 @@
|
|||
\chapter{Overview of Checked C}
|
||||
\label{sec:checkedc}
|
||||
|
||||
Checked C is an extension to C that adds static and dynamic checking
|
||||
to prevent and detect common programming errors such as buffer
|
||||
overruns, out-of-bounds memory accesses, and incorrect type casts.
|
||||
Given that it is an extension to C, it aims to preserve backwards
|
||||
compatibility with C11~\cite{ISO2011}.
|
||||
|
||||
This report is concerned with how Checked C prevents buffer overruns
|
||||
and out-of-bounds memory accesses. In order to reason about these
|
||||
behaviours, Checked C adds new data types, a method to annotate
|
||||
variable declarations with a static description of their runtime
|
||||
bounds, checked regions, new casts, and a method for interoperation
|
||||
with and describing existing C code.
|
||||
|
||||
It is this static bounds description, which we call a ``bounds
|
||||
expression'', that is used both during static analysis and during code
|
||||
generation for the dynamic checks, and that we consider to be the main
|
||||
idea underlying Checked C's approach.
|
||||
|
||||
\section{New Data Types}
|
||||
|
||||
Checked C adds new types to C:
|
||||
|
||||
\begin{itemize}
|
||||
\item \PtrT, the checked pointer to singleton type,
|
||||
\item \ArrayptrT, the checked pointer to array type,
|
||||
\item \CheckedarrT{\mv{N}}, the checked array type,
|
||||
\end{itemize}
|
||||
|
||||
Both the checked pointer to singleton and the checked pointer to
|
||||
array types are intended to replace most uses of C's \uncheckedptrT,
|
||||
which we call the unchecked pointer type. The checked array type is
|
||||
intended to replace most uses of C's \uncheckedarrT{\mv{N}}, which we
|
||||
call the unchecked array type.
|
||||
|
||||
The checked pointer to singleton type, \PtrT, is intended to replace
|
||||
uses unchecked pointer types that are a
|
||||
pointer to a single value of type \mv{T}. For this reason, it is
|
||||
illegal to perform any pointer arithmetic on a value of type \PtrT,
|
||||
including array indexing. Checked pointers to singletons require no
|
||||
bounds checks, but may be null.
|
||||
|
||||
The checked pointer to array type, \ArrayptrT, is intended to replace
|
||||
uses of the equivalent unchecked pointer type, where it is used as a
|
||||
pointer to an array of values of type \mv{T}. These pointers may be
|
||||
null, may not overflow on pointer arithmetic, and require bounds
|
||||
checks when they are used to access memory.
|
||||
|
||||
The checked array type, \CheckedarrT{\mv{N}}, is intended to replace
|
||||
uses of the equivalent unchecked array type, \uncheckedarrT{\mv{N}}.
|
||||
In the same way that there is a duality between C's unchecked arrays
|
||||
and unchecked pointers, there is the same duality between Checked C's
|
||||
checked pointers to arrays and checked arrays.
|
||||
|
||||
We place two restrictions on the checked array type,
|
||||
\CheckedarrT{\mv{N}}. First, they may not be variable length, so
|
||||
\mv{N} must be a compile-time constant. Second, if an outer
|
||||
dimension of a multidimensional array is checked, all dimensions
|
||||
inside that dimension are also checked.
|
||||
|
||||
\section{Bounds Expressions}
|
||||
|
||||
In order to denote the bounds on a declaration or parameter, a bounds
|
||||
expression is associated with a checked array or checked array
|
||||
pointer, such as the bounds annotation at~\lstref{main:count}
|
||||
in~\autoref{lst:checked-main}.
|
||||
|
||||
\begin{code}[label=lst:checked-main,%
|
||||
caption={Declaration of \lstinline|main| in a Checked C program}]
|
||||
int main(int argc, array_ptr<char*> argv : count(argc) ~\lstlabel{main:count}~);
|
||||
\end{code}
|
||||
|
||||
\autoref{lst:checked-main}, though very short, shows several features
|
||||
of Checked C. The first is that Checked C's new types work exactly how
|
||||
C's regular types already do. The next feature is that we can annotate
|
||||
function parameters of pointer type with a bounds expression that
|
||||
provides a static description of that parameter's runtime bounds. The
|
||||
bounds expression at~\lstref{main:count} is only for the outer
|
||||
\ArrayptrT, it does not apply to the inner unchecked pointer.
|
||||
|
||||
The last feature is the bounds expression itself. There are three
|
||||
forms of bounds expressions in Checked C. The first, called a ``count
|
||||
expression'' is shown above --- here it denotes that the array has
|
||||
\lstinline|argc| elements. There is a more complicated bounds
|
||||
expression called a ``range expression'', which is written
|
||||
\bounds{\mv{lower}}{\mv{upper}} and denotes that the associated
|
||||
pointer is within the range $[\mv{lower}, \mv{upper})$. The last form
|
||||
of a bounds expression is written \boundsbytecount{\mv{n}} and means
|
||||
the array pointer points to the start of an array \mv{n} bytes long,
|
||||
regardless of the element size.
|
||||
|
||||
In~\autoref{lst:checked-main}, we could not declare \lstinline|argv|
|
||||
to have type \Checkedarr{\expr{\kw{char}*}}{\mv{argc}}, as
|
||||
\lstinline|argv| checked arrays may not be variable-length. It is also
|
||||
not \uncheckedarr{\Checkedarr{\kw{char}}{\mv{argc}}}{}, both as this
|
||||
declares the outer array as variable-length, and as this requires the
|
||||
inner array to be checked, but we have no information as to its length
|
||||
as it is a null-terminated string.\endnote{In fact, in C, no function
|
||||
parameter has array type, as you cannot pass an array (checked or not)
|
||||
to a function. Arrays are passed by passing a pointer to their first
|
||||
element, so C functions that are declared to have array-typed
|
||||
parameters actually have the equivalent pointer-typed parameter.
|
||||
Checked C does the same with checked arrays and checked pointers to
|
||||
arrays respectively.}
|
||||
|
||||
It is an error for a Checked C program to access memory via a null
|
||||
checked pointer, so these bounds only apply if the pointer is
|
||||
non-null.
|
||||
|
||||
\section{Explicit Cast Operators}
|
||||
\label{sec:dynamiccastops}
|
||||
|
||||
The logic of Checked C's bounds checks is reliant on C's expression
|
||||
semantics. Notably, this logic is undecidable, especially when it
|
||||
comes to deciding if one bounds expression implies another. We would
|
||||
prefer if compiling did not take infinitely long, so Checked C includes
|
||||
two operators that a programmer may use when the compiler requires,
|
||||
but cannot prove, that one bounds expression is a sub-range of
|
||||
another. These assert to the compiler that an expression has
|
||||
particular bounds, and are our ``bounds cast'' operators (in the same
|
||||
way a type cast operator asserts to the compiler that an expression
|
||||
has a particular type). \autoref{sec:static-checks} describes exactly
|
||||
when the compiler needs to prove these implications.
|
||||
|
||||
The dynamic bounds cast operator,
|
||||
\dynamiccast{\mv{T}}{\mv{e}}{\mv{lb}}{\mv{ub}}, performs a cast to
|
||||
type \mv{T} and a dynamic check to make sure that either \mv{e} is
|
||||
\NULL, or that the required bounds, $\left[\mv{lb}, \mv{ub}\right)$,
|
||||
are a sub-range of the bounds of \mv{e}. This check is explicit so that
|
||||
the programmer can see where there is an overhead in their program.
|
||||
Effectively this asserts a fact that will later be verified at run time.
|
||||
|
||||
The assume bounds cast operator,
|
||||
\assumecast{\mv{T}}{\mv{e}}{\mv{lb}}{\mv{ub}}, has the equivalent
|
||||
compile-time behaviour of the dynamic bounds cast operator, but
|
||||
performs no run-time check. This is used to assert to the compiler
|
||||
that \mv{e} really has bounds $[\mv{lb}, \mv{ub})$, without incurring
|
||||
run-time overhead. This asserts a fact that will never be verified,
|
||||
and is therefore a potential source of unsoundness.
|
||||
|
||||
\section{Checked Regions}
|
||||
\label{sec:checked-regions}
|
||||
|
||||
Checked C programs contain regions of unchecked and checked code. By
|
||||
default, all code is within an unchecked region. A programmer may
|
||||
annotate that a particular scope (i.e. a function or a block) is
|
||||
either a checked or unchecked region. A programmer may also use a
|
||||
pragma to say that any future scopes or declarations in the program
|
||||
file are part of a checked or unchecked region.
|
||||
|
||||
Within a checked region, the programmer may not use expressions or
|
||||
make declarations with unchecked types. In unchecked regions, a
|
||||
programmer may use checked types and they will still be checked
|
||||
against their bounds. Checked regions may call unchecked functions, as
|
||||
long as the latter's pointer parameter and return types are checked or
|
||||
have an interoperation type. Execution will also continue from a
|
||||
checked region into an unchecked region if a checked region contains
|
||||
an unchecked block (and vice versa).
|
||||
|
||||
\section{Interoperation Types}
|
||||
\label{sec:interoperation-types}
|
||||
|
||||
Checked C includes a method to annotate regular C declarations with
|
||||
bounds or ``interoperation'' types. This is so that Checked C code can
|
||||
call unchecked functions and still make assumptions about how pointer
|
||||
parameters will be used, or returned pointers may be used. In checked
|
||||
regions, these declarations are treated as if they have their checked
|
||||
types and bounds.
|
||||
|
||||
These interoperation types are primarily used for upgrading code
|
||||
gradually. Checked C provides a copy of the C11 standard library
|
||||
headers files with each declaration annotated with its Checked C
|
||||
interoperation type, in order to allow programmers to use the
|
||||
standard library from inside checked programs.
|
||||
|
||||
\section{Soundness}
|
||||
\label{sec:soundness}
|
||||
|
||||
The main aim of the Checked C compiler is to maintain a moderately
|
||||
complex soundness condition over checked regions of the program. A
|
||||
Checked C program must use either static checks, dynamic checks, or a
|
||||
mixture of both to enforce this soundness condition.
|
||||
|
||||
\paragraph{Well-Formedness} We define a Checked C checked region and
|
||||
its memory to be well-formed if:
|
||||
\begin{compactitem}
|
||||
\item The values of all bounds expressions for in-lifetime
|
||||
variables or their transitively reachable data are defined and a
|
||||
sub-range of their (potentially overlapping) objects in memory; and,
|
||||
\item All in-lifetime non-null pointers of type \mv{T} with bounds
|
||||
must point to an object in memory of type \mv{T}.
|
||||
\end{compactitem}
|
||||
|
||||
\paragraph{Soundness} A Checked C program is sound if, given a
|
||||
well-formed Checked C program and memory at the time when a checked
|
||||
region is entered:
|
||||
\begin{compactitem}
|
||||
\item during the evaluation of an expression in that checked region,
|
||||
any newly computed or declared bounds are also well-formed with
|
||||
respect to the region and program memory;\endnote{This requires that
|
||||
the Checked C code in that checked region does not perform any unsound
|
||||
casts, which is a hard property to formalize. Importantly we need to
|
||||
prevent casts between objects of two incompatible types, even if this
|
||||
is done by casting via a type that is compatible with both of these
|
||||
types, such as \protect\lstinline|void*|.} and
|
||||
\item accesses (reads or writes) to memory through a pointer in the
|
||||
checked region, ensure that the checked region and program memory
|
||||
remain well-formed.
|
||||
\end{compactitem}
|
||||
|
||||
\paragraph{Corollary} A corollary to this soundness is the fact that
|
||||
if a checked region completes evaluation, then we know that any
|
||||
reached memory was accurately described by its bounds information, and
|
||||
no checked pointers tried to access out-of-bounds memory. This implies
|
||||
a statement akin to the blame theorem from gradual
|
||||
typing~\cite{wadler09}, that, as long as all bounds-declarations
|
||||
are well-formed, any errors can be blamed on unchecked code.
|
||||
|
||||
\paragraph{Formalization} This condition is further explored and
|
||||
formalized in our paper draft~\cite{ruef2017draft}, where we provide a
|
||||
proof in the spirit of gradual typing, showing that (in a restricted
|
||||
core of the Checked C language) bounds errors in checked code can be
|
||||
entirely blamed on unchecked code.
|
||||
|
||||
\paragraph{Undefined Behaviour} Checked C is very clear that
|
||||
operations which do not retain this well-formedness condition should
|
||||
raise an error, rather than causing ``undefined behaviour'', which is
|
||||
what C specifies in similar circumstances. This is a deliberate effort
|
||||
to ensure soundness of compiled programs and to reduce programmer
|
||||
confusion, at the cost of some run-time overhead.
|
||||
|
||||
\section{Explicit Dynamic Checks}
|
||||
|
||||
Checked C also provides an explicit dynamic check operator,
|
||||
\kwdynamiccheck. This works similarly to C's \lstinline|assert|,
|
||||
taking a condition that must be true and signalling a run-time error
|
||||
if that condition evaluates to false. The main difference between the
|
||||
dynamic check built-in and assert is that dynamic check is never
|
||||
removed unless the compiler can prove the check will always pass,
|
||||
whereas an assert will be removed if a particular preprocessor macro
|
||||
is defined.
|
||||
|
||||
\section{Other Features}
|
||||
\label{sec:other-features}
|
||||
|
||||
Both the Checked C specification and our prototype compiler are
|
||||
currently incomplete. As far as the specification is concerned, we do
|
||||
not have a complete design for null-terminated arrays, which means we
|
||||
have no way of reasoning about C strings.
|
||||
|
||||
As far as the compiler is concerned, though we have a complete
|
||||
definition of how to deal with pointer alignment issues, our compiler
|
||||
has no support for these features at the moment. The specification
|
||||
also defines bundled blocks, which are for relaxing invariants inside
|
||||
a particular block (as long as the invariants are enforced again upon
|
||||
exiting that block). Bundled blocks are also currently unimplemented
|
||||
in our compiler.
|
||||
|
||||
I consider all these issues --- null-terminated arrays, pointer
|
||||
alignment issues, and bundled blocks --- to be out-of-scope of this
|
||||
report.
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "tr02"
|
||||
%%% End:
|
|
@ -0,0 +1,336 @@
|
|||
\chapter{Bounds Propagation}
|
||||
\label{sec:propagation}
|
||||
|
||||
We have covered bounds expressions on declarations
|
||||
in~\autoref{sec:checkedc}. The other side of this coin is that we have
|
||||
to perform propagation to calculate the bounds on pointers at access
|
||||
time. As pointer bounds expressions are currently flow-insensitive,
|
||||
this can be done with a set of context-free rules that I will sketch
|
||||
here, and show fully in~\autoref{app:propagation-rules}.
|
||||
|
||||
Our bounds propagation algorithm works in two steps. The first step
|
||||
normalizes existing bounds expressions into their most general form,
|
||||
and then the main propagation rules work over these canonical forms to
|
||||
compute the bounds we should check any pointer access with.
|
||||
|
||||
\section{Canonical Forms}
|
||||
\label{sec:canonical-forms}
|
||||
|
||||
We have been careful to define our bounds expressions such that they
|
||||
can be normalized into a single form, as shown in
|
||||
\autoref{tab:canonical}. This normalization makes bounds propagation
|
||||
easier, rather than having to ensure propagation works on every kind
|
||||
of bounds expression. This should also allow us to add new kinds of
|
||||
bounds expressions without having to change the propagation algorithm.
|
||||
|
||||
Checked array pointers with an explicit bounds expression are not the
|
||||
only kind of expressions we can compute bounds for. With this
|
||||
algorithm we may also need to compute the bounds of checked singleton
|
||||
pointers, and of the pointers derived from checked arrays (exactly
|
||||
when an array turns into a pointer we will cover shortly).
|
||||
|
||||
\begin{table}[ht]
|
||||
\input{tables/canonical}
|
||||
\end{table}
|
||||
|
||||
\section{C's Value Semantics}
|
||||
\label{sec:cs-value-semantics}
|
||||
|
||||
Before we can accurately cover our propagation rules, we need to cover
|
||||
a discussion of how C's lvalues and values work\endnote{What the C11
|
||||
Specification calls ``values'' are more commonly called ``r-values''
|
||||
in other language descriptions.}.
|
||||
|
||||
In C, there are two kinds of values. Lvalues describe the locations of
|
||||
objects in memory, and are used to both read from and write to memory.
|
||||
Integers, floats, and pointers are values.
|
||||
|
||||
Every expression in C evaluates to either an lvalue or a value. We
|
||||
will call an expression that evaluates to an lvalue an \emph{lvalue
|
||||
expression}; and likewise an expression that evaluates to a value a
|
||||
\emph{value expression}. Usually expressions in C require their
|
||||
sub-expressions to be value expressions. Some expressions require
|
||||
certain of their sub-expressions to be lvalue expressions.
|
||||
\autoref{fig:lvalue-expressions} shows all lvalue expression types and
|
||||
all places they may be used in C --- all other expressions are value
|
||||
expressions and have only value sub-expressions.
|
||||
|
||||
As our invariants are all about how programs access memory, and
|
||||
programs can only access memory using an lvalue expression, our bounds
|
||||
propagation rules are inextricably tied to the notion of lvalue
|
||||
expressions and value expressions.
|
||||
|
||||
\begin{figure}[ht]
|
||||
\input{grammars/lvalue-expressions}
|
||||
\end{figure}
|
||||
|
||||
The subtle nuance in lvalue expression evaluation is that lvalues may
|
||||
also appear in a value position in an expression. In this case, one of
|
||||
two things happens: if the lvalue has array type, the lvalue
|
||||
expression evaluates to a pointer to the start of that array. If the
|
||||
lvalue does not have array type, then the lvalue is read to find the
|
||||
underlying value stored at its location. These operations are
|
||||
respectively called an \emph{array conversion}, and an \emph{lvalue
|
||||
conversion}.
|
||||
|
||||
Thus C differentiates implicitly between lvalues, which denote
|
||||
locations, and the objects in those locations. The result of
|
||||
performing this lvalue conversion is the value in the location denoted
|
||||
by that lvalue, which we refer to as an \emph{lvalue target}.
|
||||
|
||||
\section{Propagation Rules}
|
||||
\label{sec:propagation-rules}
|
||||
|
||||
Our bounds propagation rules, at a broad level, will work with these
|
||||
three different evaluation models, separately (but mutually
|
||||
recursively) propagating bounds for value expressions, \emph{value
|
||||
bounds}, for lvalue expressions, \emph{lvalue bounds}, and for the
|
||||
value result of an lvalue conversion, \emph{lvalue target bounds}. The
|
||||
full rules are shown in~\autoref{app:propagation-rules}.
|
||||
|
||||
In general, the value bounds of an expression are the value bounds of
|
||||
the pointer sub-expression with bounds; the lvalue bounds of an
|
||||
expression correspond to the bounds on the storage location of that
|
||||
lvalue; and the lvalue target bounds of an expression correspond to
|
||||
the bounds declared for that lvalue.
|
||||
|
||||
During propagation we add two more kinds of bounds annotation:
|
||||
$\mbounds(\text{any})$, which is equivalent to an infinite range, and
|
||||
represents the bounds on \NULL{}; and $\mbounds(\text{unknown})$,
|
||||
which is equivalent to an empty range, and represents that we cannot
|
||||
calculate a static description of the run-time bounds.
|
||||
|
||||
\subsection{Lvalue Bounds}
|
||||
\label{sec:lvalue-prop-rules}
|
||||
|
||||
The propagation rules for lvalue bounds of an lvalue expression are
|
||||
shown fully in~\autoref{fig:lval-bounds-prop}. The intuition behind
|
||||
the bounds these rules calculate is that they are the bounds on the
|
||||
memory used for the lvalue itself, for instance on the stack. The
|
||||
bounds declarations are not used in lvalue bounds propagation.
|
||||
|
||||
\paragraph{Variables} A (non-array-typed) variable only
|
||||
needs enough space to store one value of the type it holds, so
|
||||
pointer-typed variables only uses enough space for one pointer. This
|
||||
is what $\sbounds(e_v)$ computes.
|
||||
|
||||
In the case of array-typed variables, the lvalue itself is an array,
|
||||
so the amount of space used can be deduced from the array size and
|
||||
type, if the array is constant-sized. This is what $\abounds(e_v, N)$
|
||||
computes. If the array is not constant-sized, then we cannot deduce the
|
||||
bounds statically.
|
||||
|
||||
\paragraph{Dereferencing and Indexing} In the case of pointer
|
||||
dereferencing or indexing, the location being accessed is the memory
|
||||
that the pointer references, which is exactly what the value bounds on
|
||||
that pointer describe. In the case of an indexing expression,
|
||||
$e_v[e_v]$, C makes no requirements on which sub-expression is the
|
||||
pointer, so we define the notion of the $base$ of an expression, which
|
||||
in this case refers to the pointer-typed sub-expression.
|
||||
|
||||
\paragraph{Member Accesses} In the case of struct and pointer memory
|
||||
accesses, the lvalue bounds we propagate match the location of the
|
||||
struct member in memory, including if it is an array or not. A further
|
||||
description of this behaviour is included in~\autoref{sec:narrowing}.
|
||||
|
||||
\subsection{Value Bounds}
|
||||
\label{sec:value-prop-rules}
|
||||
|
||||
The propagation rules for value bounds of expressions are shown
|
||||
in~\autoref{fig:val-bounds-prop}. The intuition behind the bounds
|
||||
these rules calculate is that these bounds expressions effectively
|
||||
propagate the lvalue target bounds up from any lvalue conversions.
|
||||
|
||||
\paragraph{Null Pointers} The null pointer, \NULL{}, always has
|
||||
$\mbounds(\text{any})$, as we allow any pointer with bounds also to be
|
||||
\NULL{}. We will check the pointer is non-null before dereferencing.
|
||||
|
||||
\paragraph{Literals} All other literals have unknown bounds, as they
|
||||
are not pointer literals.
|
||||
|
||||
\paragraph{Address-of Operator} The address-of operator converts an
|
||||
lvalue sub-expression into a pointer representing its memory location.
|
||||
Thus the bounds on this pointer must be the lvalue bounds of the
|
||||
lvalue.
|
||||
|
||||
\paragraph{Assignment} In C, assignment and compound assignment are
|
||||
expressions, not statements, so can appear within other expressions.
|
||||
The resultant value is the same as the value that is assigned into the
|
||||
lvalue, so in the case of assignment the bounds are the bounds of the
|
||||
right hand side, and in the case of compound assignment the bounds are
|
||||
the lvalue target bounds of the left hand side. Increment and
|
||||
decrement are just special cases of compound assignment.
|
||||
|
||||
\paragraph{Struct Member} In the case of struct member expressions, we
|
||||
narrow to the field of the struct as we do when the base expression is
|
||||
an lvalue expression. Our implementation does not yet cope with this,
|
||||
as we have no way of referring to the base value expression from which
|
||||
we want to calculate the bounds.
|
||||
|
||||
\paragraph{Function Calls} For function declarations, we allow the
|
||||
programmer to specify parameter and return bounds in terms of other
|
||||
parameters. Therefore, to calculate the bounds on a value returned
|
||||
from a function call, we need to substitute any argument values into
|
||||
the bounds expression of the return value.
|
||||
|
||||
\paragraph{Comma Expressions} As comma expressions evaluate to their
|
||||
final expression, we propagate the value bounds of the final
|
||||
expression.
|
||||
|
||||
\paragraph{Array and Lvalue Conversions} In the case of
|
||||
non-array-typed lvalues, because we will be reading the lvalue in
|
||||
order for it to become a value, the bounds on this conversion
|
||||
expression refer to the bounds on the object the lvalue denotes, i.e.
|
||||
the lvalue target bounds of that lvalue expression. In the case of
|
||||
array-typed lvalues, evaluation will perform an array conversion,
|
||||
converting the array into a pointer to its first element, which
|
||||
corresponds to the lvalue bounds on that array.
|
||||
|
||||
\paragraph{Binary Operators} Binary operator expressions returning a
|
||||
pointer type, such as performing pointer arithmetic, take the bounds
|
||||
of the base expression, i.e. the sub-expression that has pointer type.
|
||||
This ensures that for a pointer, $p$, $p + 3$ and $p - 2$ have the
|
||||
same bounds as $p$ does. The notion of ``base'' here is the same as
|
||||
with array indexing.
|
||||
|
||||
\paragraph{Relational and Logical Operators} Relational operators
|
||||
return a boolean result, i.e. an integer 0 or 1 corresponding to the
|
||||
comparison result. These do not denote anywhere in memory, so do not
|
||||
have bounds. The same applies to the unary negation operator.
|
||||
|
||||
\paragraph{Other Expressions} For other expressions, including
|
||||
arithmetic over integers (but not floats), we propagate bounds from
|
||||
sub-expressions to expressions. This allows us not to lose bounds
|
||||
information when performing bit-wise operations over pointer values
|
||||
(for pointer alignment, for example). The requirement we make is that
|
||||
if more than one sub-expression has bounds, we can only merge bounds
|
||||
if they are syntactically equal. We may later relax this to being
|
||||
syntactically equal modulo variable equality. In the case of float
|
||||
expressions, we do not believe performing pointer arithmetic with
|
||||
floating-point numbers makes any sense, so float expressions may not
|
||||
carry or propagate bounds information.
|
||||
|
||||
\subsection{Lvalue Target Bounds}
|
||||
\label{sec:lvalue-target-prop-rules}
|
||||
|
||||
The propagation rules for value bounds of expressions are shown
|
||||
in~\autoref{fig:lval-target-bounds-prop}. The intuition behind the
|
||||
bounds these rules calculate is that these bounds represent the extent
|
||||
in memory of the value returned when an lvalue conversion is
|
||||
performed.
|
||||
|
||||
\paragraph{Checked Singleton Pointers} The bounds of a \PtrT, \mv{p},
|
||||
start at their value, and include exactly enough space for a single
|
||||
value of their own type, \mv{T}. They are
|
||||
$\left[\mv{p}, \mv{p} + \sizeof{\mv{T}} \right)$.
|
||||
|
||||
\paragraph{Variables} The lvalue target bounds for a variable declared
|
||||
with bounds is exactly the bounds expression it was declared with.
|
||||
|
||||
\paragraph{Member Accesses} Struct and pointer member accesses both
|
||||
use the declared bounds on the member to compute their bounds
|
||||
expressions. In the case of struct member accesses, they use the value
|
||||
in the same struct of any other members referenced in the bounds
|
||||
expression. In the case of pointer member accesses, any members
|
||||
referenced in the bounds expression are also computed using a pointer
|
||||
member access of the same pointer.
|
||||
|
||||
For instance, given a struct, \mv{s}, with two members, \mv{m1} and
|
||||
\mv{m2}, where \mv{m2} has declared bounds of \boundscount{\mv{m1}},
|
||||
the lvalue target bounds of $\mv{s} \member \mv{m2}$ are
|
||||
$\left[\mv{s}\member\mv{m2}, \mv{s}\member\mv{m2} +
|
||||
\mv{s}\member\mv{m1} \right)$. If we had a pointer, \mv{p}, to the
|
||||
same type of struct, the lvalue target bounds of
|
||||
$\mv{p} \arrow \mv{m2}$ would be
|
||||
$\left[\mv{p}\arrow\mv{m2}, \mv{p}\arrow\mv{m2} + \mv{p}\arrow\mv{m1}
|
||||
\right)$. In the same way, any variables referenced in expressions on
|
||||
struct members are assumed to refer to other members of the same
|
||||
struct.
|
||||
|
||||
\paragraph{Dereferencing and Indexing} We cannot propagate lvalue
|
||||
target bounds for a pointer that is stored at another pointer (unlike
|
||||
multidimensional arrays). This is because we have no way of expressing
|
||||
where the bounds of the inner pointer are stored or should be
|
||||
computed, unlike in a struct where there is potentially storage space.
|
||||
|
||||
\section{Narrowing}
|
||||
\label{sec:narrowing}
|
||||
|
||||
One important part of our approach is how we choose to ``narrow''
|
||||
bounds when accessing members of structs or elements of arrays. This
|
||||
means potentially inferring a sub-range of the aggregate object's
|
||||
bounds for accesses to inner parts of that object.
|
||||
|
||||
It is this narrowing that ensures the second part of the
|
||||
well-formedness condition in~\autoref{sec:soundness}, that all
|
||||
in-lifetime non-null pointers of type \mv{T} with bounds must point to
|
||||
an object in memory of type \mv{T}.
|
||||
|
||||
In the case of structs (and unions), we have to narrow to the bounds
|
||||
sub-range for a particular member, because different members have
|
||||
different types. This is what the functions like $\srbounds(e_v, m)$
|
||||
do, narrowing the bounds inferred to the bounds of the field $m$
|
||||
within $e_v$.
|
||||
|
||||
In the case of arrays, our semantics allow us two implementations. The
|
||||
first is that we do narrow through each dimension of a
|
||||
multidimensional array, to ensure an access, \lstinline|a[i][j]|, to a
|
||||
5 by 5 array, is within the 5 elements starting at
|
||||
\lstinline|a[i][0]|. Our compiler actually chooses the other
|
||||
alternative, ensuring that any access to \lstinline|a[x][y]| is within
|
||||
the 25 elements starting at \lstinline|a[0][0]|. This maintains
|
||||
well-formedness as arrays all contain elements of the same type, and
|
||||
should allow the compiler to remove some bounds checks from tight
|
||||
inner loops.
|
||||
|
||||
\section{Propagation Limitations}
|
||||
\label{sec:prop-limitations}
|
||||
|
||||
We have several limitations with the current propagation algorithm,
|
||||
the most major of which relates to ``modifying expressions'', as
|
||||
defined in~\cite[Section 3.4]{CheckedCv06}, which pose a problem to
|
||||
the flow-insensitive algorithm. In particular, our algorithm does not
|
||||
introduce temporaries for expressions which are modifying, instead
|
||||
duplicating these expressions into the bounds expressions, and then
|
||||
computing based off the duplicates. This means if we generate directly
|
||||
from the bounds expressions, each modifying expression could be
|
||||
generated in code several times.
|
||||
|
||||
For instance, say I have an array, $a$ of structs, each with a member
|
||||
$m1$ that has bounds defined to be \boundscount{\mv{m2}}, where $m2$
|
||||
is another member of the same struct. In this case, an access like
|
||||
\lstinline|a[i++].m1[0]| will have the bounds inferred of
|
||||
\bounds{a[i++].m1}{a[i++].m1 + a[i++].m2}. If we generate dynamic
|
||||
checks directly from bounds expressions, we have now incremented
|
||||
\lstinline|i| four times (once for the pointer itself, once for the
|
||||
lower bound, twice for the upper bound), which changes the meaning of
|
||||
the program.
|
||||
|
||||
This also is a problem if the programmer calls a function that returns
|
||||
a struct, and directly (without assigning into a temporary) accesses a
|
||||
member of the returned struct. We have no way of referring to the
|
||||
place in memory of the returned struct.
|
||||
|
||||
In both of these cases, the programmer can manually insert a temporary
|
||||
variable to avoid these limitations.
|
||||
|
||||
We are planning to solve this issue with a set of dynamically-scoped
|
||||
keywords that will refer to the current expression value, the current
|
||||
struct base pointer, and the current return value, which we would then
|
||||
treat as temporaries during code generation. While it makes the code
|
||||
generation more complex, it means we can infer the bounds in more
|
||||
places.
|
||||
|
||||
We also totally lack a way of handling ternary expressions during
|
||||
bounds propagation. Arguably we can generate new bounds expressions
|
||||
with ternary expressions in both the upper and the lower expressions,
|
||||
but this will make any static analysis much harder in the long term.
|
||||
We may introduce a new form of bounds expression to handle these cases
|
||||
explicitly.
|
||||
|
||||
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "tr02"
|
||||
%%% End:
|
|
@ -0,0 +1,100 @@
|
|||
\chapter{Propagation Rules}
|
||||
\label{app:propagation-rules}
|
||||
|
||||
These are the algebraic definitions of our propagation algorithm to go
|
||||
with the descriptions in~\autoref{sec:propagation-rules}. Certain
|
||||
helper functions are explained in the text, rather than
|
||||
algebraically.
|
||||
|
||||
\begin{figure}[ht]
|
||||
\begin{align*}
|
||||
% Lvalue Bounds
|
||||
\lbounds(e_l) &\in \mbounds(e_v, e_v) \tag*{Lvalue Bounds} \\[1ex]
|
||||
\lbounds(x) &= \abounds(x, \mv{N}) \tag*{Variables} \\
|
||||
& \qquad \text{when } \typeof(x) = \arr(\mv{T}, \mv{N}) \\
|
||||
&= \sbounds(\addrof x) \text{ otherwise.} \\[1ex]
|
||||
\lbounds(\deref e_v) &= \vbounds(e_v) \tag*{Dereference} \\[1ex]
|
||||
\lbounds(e_1[e_2]) &= \vbounds(e_{base}) \tag*{Index} \\
|
||||
& \qquad \text{where } e_{base} = base(e_1, e_2) \\[1ex]
|
||||
\lbounds(e_l \member m) &= \abounds(e_l \member m, \mv{N}) \tag*{Struct Member Access} \\
|
||||
&\qquad \text{when } \typeof(e_l \member m) = \arr(\mv{T}, \mv{N}) \\
|
||||
&= \sbounds(\addrof (e_l \member m)) \text{ otherwise.} \\[1ex]
|
||||
\lbounds(e_v \arrow m) &= \abounds(e_v \arrow m, \mv{N}) \tag*{Pointer Member Access} \\
|
||||
&\qquad \text{when } \typeof(e_v \arrow m) = \arr(\mv{T}, \mv{N}) \\
|
||||
&= \sbounds(\addrof (e_v \arrow m)) \text{ otherwise.} \\[1em]
|
||||
%% Bounds Helpers/Sugar
|
||||
\sbounds(e_v) &= \mbounds(e_v, e_v + 1) \tag*{Single Object Bounds} \\
|
||||
\abounds(e_l, N) &= \mbounds(e_l, e_l + N) \tag*{Constant-sized Array Bounds} \\
|
||||
& \qquad \text{when } N \text{ is constant.} \\
|
||||
&= \mbounds(\text{unknown}) \text{ otherwise.}
|
||||
\end{align*}
|
||||
\caption[Lvalue Bounds Propagation]{Lvalue Bounds Propagation Rules (Mutually
|
||||
Recursive with~\autoref{fig:val-bounds-prop})}
|
||||
\label{fig:lval-bounds-prop}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\begin{figure}[ht]
|
||||
\begin{align*}
|
||||
% Value Bounds
|
||||
\vbounds(e_v) &\in \mbounds(e_v, e_v) \tag*{Value Bounds} \\[1ex]
|
||||
\vbounds(\mathtt{NULL}) &= \mbounds(\text{any}) \tag*{Null Pointer Literal} \\[1ex]
|
||||
\vbounds(l) &= \mbounds(\text{unknown}) \tag*{Other Literals} \\[1ex]
|
||||
\vbounds(\addrof e_l) &= \lbounds(e_l) \tag*{Address-of} \\[1ex]
|
||||
\vbounds(e_l \assign e_v) &= \vbounds(e_v) \tag*{Assignment} \\[1ex]
|
||||
\vbounds(e_l \compassign e_v) &= \ltbounds(e_l) \tag*{Compound Assignment} \\[1ex]
|
||||
\vbounds(e_l \inc) &= \ltbounds(e_l) \tag*{Increment \& Decrement} \\
|
||||
&\qquad \text{Likewise for all Increment \& Decrement operators} \\[1ex]
|
||||
\vbounds(e_v\member m) &= \srbounds(e_v, m) \tag*{Struct Member Operator} \\[1ex]
|
||||
\vbounds(e_f(\overline{e_{arg}})) &= \rbounds(e_f) \left[ \textrm{params}(f) \middle/ \overline{e_{arg}}\right] \tag*{Function Call}\\[1ex]
|
||||
\vbounds(e_{v1} , e_{v2}) &= \vbounds(e_{v2}) \tag*{Comma Expression} \\[1ex]
|
||||
\vbounds(e_l) &= \lbounds(e_l) \tag*{Array Conversion} \\
|
||||
&\qquad \text{when } \typeof(e_l) = \arr(\mv{T}, \mv{N}) \\
|
||||
&= \ltbounds(e_l) \text{ otherwise}
|
||||
\tag*{Lvalue Conversion} \\[1ex]
|
||||
\vbounds(e_1 \binop e_2) &= \vbounds(e_{base}) \tag*{Pointer Arithmetic} \\
|
||||
&\qquad \text{when } e_1 \binop e_2 \text{ has pointer type} \\
|
||||
&\qquad \text{where } e_{base} = base(e_1, e_2) \\[1ex]
|
||||
\vbounds(e_1 \mathrel{R} e_2) &= \mbounds(\text{unknown}) \tag*{Binary Relational Operators} \\[1ex]
|
||||
\vbounds(! e_v) &= \mbounds(\text{unknown}) \tag*{Logical Negation Operator} \\[1ex]
|
||||
\vbounds(e_v) &= \vbounds(e_{vi}) \tag*{Other Expressions} \\
|
||||
&\qquad
|
||||
\text{where } e_{vi}
|
||||
\text{ is the subexpression of } e_v
|
||||
\text{ with known bounds} \\
|
||||
&\qquad
|
||||
\text{If more than one subexpression of } e_v
|
||||
\text{ has known bounds,}\\
|
||||
&\qquad\quad
|
||||
\text{all calculated bounds must be equal.} \\[1ex]
|
||||
&= \mbounds(\text{unknown}) \text{ otherwise}\\[1em]
|
||||
\rbounds(f) &\in \mbounds(e_v, e_v) \tag*{Bounds on Function Return} \\[1em]
|
||||
\srbounds(e_v, m) &\in \mbounds(e_v', e_v') \tag*{Struct Relative Bounds}
|
||||
\end{align*}
|
||||
\caption[Value Bounds Propagation Rules]{Value Bounds Propagation Rules (Mutually
|
||||
Recursive with~\autoref{fig:lval-bounds-prop}
|
||||
and~\autoref{fig:lval-target-bounds-prop})}
|
||||
\label{fig:val-bounds-prop}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[ht]
|
||||
\begin{align*}
|
||||
%% Lvalue Target Bounds
|
||||
\ltbounds(e_l) &\in \mbounds(e_v, e_v) \tag*{Lvalue Target Bounds} \\[1ex]
|
||||
\ltbounds(e_l) &= \sbounds(e_l) \tag*{Checked Singleton Pointer Bounds}\\
|
||||
&\qquad \text{where } \typeof(e_l) = \PtrT \\[1ex]
|
||||
\ltbounds(x) &= \dbounds(x) \tag*{Variable Target Bounds}\\[1ex]
|
||||
\ltbounds(e_l \member m) &= \srbounds(e_l, m) \tag*{Struct Member Target Bounds}\\
|
||||
\ltbounds(e_v \arrow m) &= \srbounds(e_v, m) \tag*{Pointer Member Target Bounds}\\[1ex]
|
||||
\ltbounds(\_) &= \mbounds(\text{unknown}) \\[1em]
|
||||
\dbounds(x) &\in \mbounds(e_v, e_v) \tag*{Declared Bounds}
|
||||
\end{align*}
|
||||
\caption[Lvalue Target Bounds Propagation Rules]{Lvalue Target Bounds
|
||||
Propagation Rules}
|
||||
\label{fig:lval-target-bounds-prop}
|
||||
\end{figure}
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "tr02"
|
||||
%%% End:
|
|
@ -0,0 +1,83 @@
|
|||
\chapter{Related Work}
|
||||
\label{sec:related-work}
|
||||
|
||||
Attempting to make C memory safe is a well-trodden
|
||||
path~\cite{Szekeres:2013:SEW:2497621.2498101}. Approaches fall into
|
||||
four categories: better type systems; runtime representations of
|
||||
bounds; static analysis annotations; and instrumentation. We avoid
|
||||
mentioning approaches that propose new languages or operating-system
|
||||
integration of protections.
|
||||
|
||||
\paragraph{Better Type Systems} In this approach, the compiler adds
|
||||
new pointer types, perhaps including bounds information, with rules
|
||||
that are enforced at compile time either by the type-checker or by
|
||||
other static analyses. Any found errors prevent compilation from being
|
||||
successful.
|
||||
|
||||
The largest influence on Checked C's approach is
|
||||
Deputy~\cite{Condit2007}, a dependent-type system for C. This project
|
||||
used dependent types for two reasons, the first is to represent array
|
||||
bounds, the second is to ensure access to unions is via the correct
|
||||
member. Unfortunately, to use Deputy, the programmer had to rewrite
|
||||
their C program all at once, which is uneconomical and raises the
|
||||
effort required to get any safety payoff. With our unchecked blocks
|
||||
and interoperation types, we do not require the whole program to be
|
||||
translated at once. The programmer also has little to no control over
|
||||
the bounds expressions, preventing dynamic check optimizations.
|
||||
|
||||
CCured~\cite{Necula2005} takes a similar approach to ours, with new
|
||||
pointer types to represent various kinds of arrays. They couple this
|
||||
with an analysis of pointer usage which we have emulated in the
|
||||
conversion tool described in~\cite{ruef2017draft}.
|
||||
|
||||
\paragraph{Run-time Representations of Bounds} In this approach, the
|
||||
compiler uses extra memory to represent run-time bounds. This then
|
||||
allows the compiler to insert checks which ensure pointers are within
|
||||
bounds. They may also have to generate code to update bounds
|
||||
information as other data within the program changes.
|
||||
|
||||
For instance, the SoftBound~\cite{Nagarakatte2009} work has a shadow
|
||||
memory space where they store information about pointer bounds. In
|
||||
order to achieve low run-time overhead, this approach requires up to
|
||||
twice the memory of the original program, something that is not always
|
||||
available on all platforms. Other work in the same
|
||||
vein~\cite{Nagarakatte2015} uses hardware-based approaches that
|
||||
require Memory Management Unit support.
|
||||
|
||||
CCured~\cite{Necula2005} also uses run-time representations for its
|
||||
pointers, changing the in-memory layout, which we wanted to avoid. In
|
||||
particular, pointer values are now represented by up to three
|
||||
pointers, one for the pointer itself, and one each for its upper and
|
||||
lower bounds. CCured also uses a garbage collector to implement sound
|
||||
management of pointers it cannot infer information about, something we
|
||||
have avoided the runtime overhead of.
|
||||
|
||||
\paragraph{Static Analysis Annotations} In this approach, the
|
||||
programmer adds manual annotations to pointer declarations, similar to
|
||||
our bounds expressions. These are then checked by an external static
|
||||
analyser. The main difference is correct usage of these is enforced by
|
||||
separate static analysis, and is not built into the original compiler.
|
||||
|
||||
The main solution in this space is SAL~\cite{MicrosoftSAL}, which can
|
||||
detect, but not prevent bugs. Its annotations not only denote bounds,
|
||||
but also whether a pointer may be read from or written to. SAL
|
||||
includes annotations for function behaviour, such as locking, which we
|
||||
do not include.
|
||||
|
||||
\paragraph{Instrumentation} Lastly, we have some dynamic analysis
|
||||
systems that are not intended for production usage, but can detect
|
||||
incorrect usage of the C language. They are intended for usage in
|
||||
testing setups in conjunction with fuzzing systems that ensure the
|
||||
dynamic analysis can reach most program points.
|
||||
|
||||
The two most prominent of these are ASan~\cite{Serebryany2012} and
|
||||
UBSan~\cite{UBSan}. These can detect and prevent addressing and
|
||||
undefined behaviour issues in C dynamically, including out-of-bounds
|
||||
accesses and invalid pointer arithmetic.
|
||||
|
||||
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "tr02"
|
||||
%%% End:
|
|
@ -0,0 +1,18 @@
|
|||
|
||||
TEX_SCRIPTS=bm_results_tab.tex bm_results_macros.tex
|
||||
GRAPHS=modifications.pdf overheads.pdf
|
||||
RESULTS=$(TEX_SCRIPTS) $(GRAPHS)
|
||||
|
||||
.PHONY : clean all install
|
||||
all : $(RESULTS)
|
||||
|
||||
install : bm_res_install.r
|
||||
./bm_res_install.r
|
||||
|
||||
clean :
|
||||
rm $(RESULTS)
|
||||
|
||||
|
||||
$(RESULTS) : bm_res.r bm_res.csv
|
||||
./bm_res.r
|
||||
|
|
@ -0,0 +1,18 @@
|
|||
Suite,Name,LM,EM,LU,RT,CT,ES
|
||||
#
|
||||
Olden,bh,9.98,76.7,5.15,0.24,23.76,6.21
|
||||
Olden,bisort,21.76,84.3,6.98,0.0,7.34,3.78
|
||||
Olden,em3d,35.29,66.4,16.89,0.80,18.01,-0.37
|
||||
Olden,health,23.96,97.8,9.28,2.11,18.47,6.73
|
||||
Olden,mst,30.15,75.0,19.33,0.0,6.28,-4.97
|
||||
Olden,perimeter,9.77,92.3,5.16,0.0,4.88,0.76
|
||||
Olden,power,15.04,69.2,3.90,0.0,21.58,8.54
|
||||
Olden,treadd,17.22,92.3,20.42,8.25,83.10,6.96
|
||||
Olden,tsp,9.90,94.5,10.33,0.0,47.62,4.63
|
||||
# Olden,voronoi,NA,NA,NA,NA,NA,NA
|
||||
#
|
||||
Ptrdist,anagram,26.59,67.5,10.72,23.53,16.82,5.13
|
||||
# Ptrdist,bc,NA,NA,NA,NA,NA,NA
|
||||
Ptrdist,ft,18.70,98.5,6.35,25.92,16.54,11.3
|
||||
Ptrdist,ks,14.21,93.4,8.12,12.77,32.34,26.71
|
||||
Ptrdist,yacr2,14.51,51.5,16.17,49.32,38.37,24.50
|
|
|
@ -0,0 +1,183 @@
|
|||
#!/usr/bin/env Rscript
|
||||
|
||||
# You'll need some packages. Run the "bm_res_install.r" file in this dir.
|
||||
suppressPackageStartupMessages(library(ggplot2))
|
||||
suppressPackageStartupMessages(library(cowplot))
|
||||
suppressPackageStartupMessages(library(scales))
|
||||
|
||||
results <- read.csv("bm_res.csv", header=TRUE, na.strings = "NA", strip.white = TRUE, comment.char = "#")
|
||||
|
||||
# Convert percentages to proportion [0..1]
|
||||
results["LM."] = results["LM"]/100
|
||||
results["EM."] = results["EM"]/100
|
||||
results["LU."] = results["LU"]/100
|
||||
|
||||
# Convert % slowdowns to proportions [0..100] (1 = no change)
|
||||
results["RT."] = (results["RT"]/100)+1
|
||||
results["CT."] = (results["CT"]/100)+1
|
||||
results["ES."] = (results["ES"]/100)+1
|
||||
|
||||
gm_mean = function(x){
|
||||
xprime <- na.omit(x)
|
||||
prod(xprime, na.rm=TRUE)^(1/length(xprime))
|
||||
}
|
||||
|
||||
gmeans <- c()
|
||||
gmeans$LM. = gm_mean(results$LM.)
|
||||
gmeans$EM. = gm_mean(results$EM.)
|
||||
gmeans$LU. = gm_mean(results$LU.)
|
||||
gmeans$RT. = gm_mean(results$RT.)
|
||||
gmeans$CT. = gm_mean(results$CT.)
|
||||
gmeans$ES. = gm_mean(results$ES.)
|
||||
|
||||
#
|
||||
# Plots
|
||||
#
|
||||
|
||||
# Adds + / - (or no sign for zero) to percentages, to display 0.0 as "0%", 0.9 as "- 10%" and 1.1 as "+ 10%"
|
||||
slowdown_formatter = function(datum) {
|
||||
datum_slowdown <- round(datum - 1, digits=3)
|
||||
positive <- !is.na(datum_slowdown) & datum_slowdown > 0
|
||||
paste0(ifelse(positive, "+ ", ""), percent(datum_slowdown))
|
||||
}
|
||||
|
||||
plot_with_mean = function(name, data, key, mean, y_breaks, nudge=0, cat_label=NA) {
|
||||
if (is.na(cat_label)) {
|
||||
x_scale <- scale_x_discrete("", limits=rev(data$Name))
|
||||
} else {
|
||||
x_scale <- scale_x_discrete(cat_label, limits=rev(data$Name))
|
||||
}
|
||||
|
||||
ggplot(dat=data, aes_(x=quote(Name), y=key)) +
|
||||
x_scale +
|
||||
scale_y_continuous(name,
|
||||
labels=percent,
|
||||
limits=c(min(y_breaks), max(y_breaks)),
|
||||
breaks=y_breaks) +
|
||||
geom_col(aes(fill=Suite), na.rm = TRUE, position="dodge") +
|
||||
theme_gray() +
|
||||
theme(legend.position="none", axis.ticks.y = element_blank(), aspect.ratio = 1) +
|
||||
theme(panel.grid.major.y = element_blank()) +
|
||||
geom_hline(aes(), yintercept = mean, linetype="dotted") +
|
||||
geom_label(aes(Inf, mean, label=percent(mean), fill=NULL), vjust=1, hjust=ifelse(nudge<0,1,0), nudge_y=nudge, show.legend=FALSE) +
|
||||
coord_flip()
|
||||
}
|
||||
|
||||
plot_slowdown_with_mean = function(name, data, key, mean, y_breaks, nudge=0, cat_label=NA) {
|
||||
if (is.na(cat_label)) {
|
||||
x_scale <- scale_x_discrete("", limits=rev(data$Name))
|
||||
} else {
|
||||
x_scale <- scale_x_discrete(cat_label, limits=rev(data$Name))
|
||||
}
|
||||
|
||||
ggplot(dat=data, aes_(x=quote(Name), xend=quote(Name), yend=key, y=1.0)) +
|
||||
x_scale +
|
||||
scale_y_continuous(name,
|
||||
labels=slowdown_formatter,
|
||||
limits=c(min(y_breaks), max(y_breaks)),
|
||||
breaks=y_breaks) +
|
||||
geom_segment(aes(color=Suite), na.rm = TRUE, arrow=arrow(length=unit(0.015, "npc"), angle=90)) +
|
||||
theme_gray() +
|
||||
theme(panel.grid.major.y = element_blank()) +
|
||||
theme(legend.position="none", axis.ticks.y = element_blank(), aspect.ratio = 1) +
|
||||
geom_hline(aes(), yintercept = mean, linetype="dotted") +
|
||||
geom_label(aes(x=Inf, y=mean, label=slowdown_formatter(mean), fill=NULL), vjust=1, hjust=ifelse(nudge<0,1,0), nudge_y=nudge, show.legend=FALSE) +
|
||||
coord_flip()
|
||||
}
|
||||
|
||||
modifications = function() {
|
||||
pLM <- plot_with_mean("Lines Modified (%)", results, ~LM., gmeans$LM., seq(0,0.4,0.1), 0.005, cat_label="Benchmark")
|
||||
pEM <- plot_with_mean("Easy Modifications (%)", results, ~EM., gmeans$EM., seq(0,1.0,0.2), 0.01)
|
||||
pLU <- plot_with_mean("Lines Unchecked (%)", results, ~LU., gmeans$LU., seq(0,0.25,0.05), 0.005)
|
||||
plot_grid(pLM, pEM, pLU + theme(legend.position="right"), nrow=1, align="hv", axis="tb", rel_widths=c(1,1,1.25))
|
||||
}
|
||||
ggsave("modifications.pdf", modifications(), "pdf", scale=2, width=6, height=2, units="in")
|
||||
|
||||
# modifications_pres = function() {
|
||||
# pLM <- plot_with_mean("Lines Modified (%)", results, ~LM., gmeans$LM., seq(0,0.4,0.1), 0.005, cat_label="Benchmark")
|
||||
# pEM <- plot_with_mean("Easy Modifications (%)", results, ~EM., gmeans$EM., seq(0,1.0,0.2), 0.01)
|
||||
# pLU <- plot_with_mean("Lines Unchecked (%)", results, ~LU., gmeans$LU., seq(0,0.25,0.05), 0.005, cat_label="Benchmark")
|
||||
# pTopRow <- plot_grid(pLM, pEM, ncol=2, align="hv", axis="tb")
|
||||
# plot_grid(pTopRow, pLU + theme(legend.position="right"), ncol=1, rel_heights=c(1.2,1))
|
||||
# }
|
||||
# ggsave("modifications_pres.pdf", modifications_pres(), "pdf", scale=1, width=6, height=6, units="in")
|
||||
|
||||
|
||||
|
||||
overheads = function () {
|
||||
pRT <- plot_slowdown_with_mean("Runtime Slowdown (±%)", results, ~RT., gmeans$RT., seq(0.8,1.6,0.2), nudge=0.01, cat_label="Benchmark")
|
||||
pCT <- plot_slowdown_with_mean("Compile Time Slowdown (±%)", results, ~CT., gmeans$CT., seq(1,2.0,0.25), nudge=0.01)
|
||||
pES <- plot_slowdown_with_mean("Executable Size Change (±%)", results, ~ES., gmeans$ES., seq(0.90,1.3,0.1), nudge=0.01)
|
||||
plot_grid(pRT, pCT, pES + theme(legend.position="right"), nrow=1, align="hv", axis="tb", rel_widths=c(1,1,1.25))
|
||||
}
|
||||
ggsave("overheads.pdf", overheads(), "pdf", scale=2, width=6, height=2, units="in")
|
||||
|
||||
# overheads_pres = function() {
|
||||
# pRT <- plot_slowdown_with_mean("Runtime Slowdown (±%)", results, ~RT., gmeans$RT., seq(0.8,1.6,0.2), nudge=0.01, cat_label="Benchmark")
|
||||
# pCT <- plot_slowdown_with_mean("Compile Time Slowdown (±%)", results, ~CT., gmeans$CT., seq(1,2.0,0.25), nudge=0.01)
|
||||
# pES <- plot_slowdown_with_mean("Executable Size Change (±%)", results, ~ES., gmeans$ES., seq(0.90,1.3,0.2), nudge=0.01, cat_label="Benchmark")
|
||||
# pTopRow <- plot_grid(pRT, pCT, ncol=2, align="hv", axis="tb")
|
||||
# plot_grid(pTopRow, pES + theme(legend.position="right"), ncol=1, rel_heights=c(1.2,1))
|
||||
# }
|
||||
# ggsave("overheads_pres.pdf", overheads_pres(), "pdf", scale=1, width=6, height=6, units="in")
|
||||
|
||||
#
|
||||
# Tables
|
||||
#
|
||||
|
||||
table_percent = function(datum) {
|
||||
ifelse(is.na(datum), datum, sprintf("%.1f", datum*100))
|
||||
}
|
||||
|
||||
table_slowdown = function(datum) {
|
||||
datum_slowdown <- round(datum - 1, digits=3)
|
||||
positive <- datum_slowdown > 0
|
||||
negative <- datum_slowdown < 0
|
||||
|
||||
prefix <- ifelse(positive, "+ ", ifelse(negative, "- ", ""))
|
||||
ifelse(is.na(datum), datum, paste0(prefix, table_percent(abs(datum_slowdown))))
|
||||
}
|
||||
|
||||
split_at_anagram = function(datum) {
|
||||
paste0(ifelse(datum=="anagram", "\\addlinespace\n", ""), datum)
|
||||
}
|
||||
|
||||
|
||||
out_columns <- c("Name", "LM.", "EM.", "LU.", "RT.", "CT.", "ES.")
|
||||
geo_mean_label = "\\midrule\n\\multicolumn{1}{r}{Geo. Mean:}"
|
||||
geo_mean_row <- data.frame(geo_mean_label, gmeans$LM., gmeans$EM., gmeans$LU., gmeans$RT., gmeans$CT., gmeans$ES.)
|
||||
names(geo_mean_row) <- out_columns
|
||||
results_out <- rbind(results[out_columns], geo_mean_row)
|
||||
|
||||
results_out[c("LM.", "EM.", "LU.")] <- lapply(results_out[c("LM.", "EM.", "LU.")], table_percent)
|
||||
results_out[c("RT.", "CT.", "ES.")] <- lapply(results_out[c("RT.", "CT.", "ES.")], table_slowdown)
|
||||
results_out["Name"] = lapply(results_out["Name"], split_at_anagram)
|
||||
|
||||
write.table(results_out[out_columns], "bm_results_tab.tex", quote=FALSE, sep=" & ", eol=" \\\\\n", na="--", row.names = FALSE, col.names=FALSE)
|
||||
|
||||
stat_commands = function(kind, mean, max=NA, min=NA) {
|
||||
stat_percent = function(datum) {
|
||||
ifelse(is.na(datum), "??", sprintf("%.1f", datum*100))
|
||||
}
|
||||
|
||||
mean_cmd <- sprintf("\\newcommand{\\Result%sMean}{%s\\%%\\xspace}", kind, stat_percent(mean))
|
||||
max_cmd <- sprintf("\\newcommand{\\Result%sMax}{%s\\%%\\xspace}", kind, stat_percent(max))
|
||||
min_cmd <- sprintf("\\newcommand{\\Result%sMin}{%s\\%%\\xspace}", kind, stat_percent(min))
|
||||
paste0("\n", mean_cmd, "\n", max_cmd, "\n", min_cmd, "\n")
|
||||
}
|
||||
|
||||
sink("bm_results_macros.tex")
|
||||
cat("% Auto Generated by bm_res.r\n% Do Not Modify")
|
||||
cat(stat_commands("LinesModified", gmeans$LM., max(results$LM.,
|
||||
na.rm=TRUE), min(results$LM., na.rm=TRUE)))
|
||||
cat(stat_commands("EasyModifications", gmeans$EM., max(results$EM.,
|
||||
na.rm=TRUE), min(results$EM., na.rm=TRUE)))
|
||||
cat(stat_commands("LinesUnchecked", gmeans$LU., max(results$LU.,
|
||||
na.rm=TRUE), min(results$LU., na.rm=TRUE)))
|
||||
cat(stat_commands("RunTime", gmeans$RT.-1, max(results$RT.,
|
||||
na.rm=TRUE)-1, min(results$RT., na.rm=TRUE)-1))
|
||||
cat(stat_commands("CompileTime", gmeans$CT.-1, max(results$CT.,
|
||||
na.rm=TRUE)-1, min(results$CT., na.rm=TRUE)-1))
|
||||
cat(stat_commands("ExecutableSize", gmeans$ES.-1, max(results$ES.,
|
||||
na.rm=TRUE)-1, min(results$ES., na.rm=TRUE)-1))
|
||||
sink()
|
|
@ -0,0 +1,5 @@
|
|||
#!/usr/bin/env Rscript
|
||||
|
||||
# You'll need the following packages
|
||||
install.packages(c("tidyverse", "scales", "cowplot"), repos="https://ftp.osuosl.org/pub/cran/")
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
% Auto Generated by bm_res.r
|
||||
% Do Not Modify
|
||||
\newcommand{\ResultLinesModifiedMean}{17.5\%\xspace}
|
||||
\newcommand{\ResultLinesModifiedMax}{35.3\%\xspace}
|
||||
\newcommand{\ResultLinesModifiedMin}{9.8\%\xspace}
|
||||
|
||||
\newcommand{\ResultEasyModificationsMean}{80.1\%\xspace}
|
||||
\newcommand{\ResultEasyModificationsMax}{98.5\%\xspace}
|
||||
\newcommand{\ResultEasyModificationsMin}{51.5\%\xspace}
|
||||
|
||||
\newcommand{\ResultLinesUncheckedMean}{9.3\%\xspace}
|
||||
\newcommand{\ResultLinesUncheckedMax}{20.4\%\xspace}
|
||||
\newcommand{\ResultLinesUncheckedMin}{3.9\%\xspace}
|
||||
|
||||
\newcommand{\ResultRunTimeMean}{8.6\%\xspace}
|
||||
\newcommand{\ResultRunTimeMax}{49.3\%\xspace}
|
||||
\newcommand{\ResultRunTimeMin}{0.0\%\xspace}
|
||||
|
||||
\newcommand{\ResultCompileTimeMean}{24.3\%\xspace}
|
||||
\newcommand{\ResultCompileTimeMax}{83.1\%\xspace}
|
||||
\newcommand{\ResultCompileTimeMin}{4.9\%\xspace}
|
||||
|
||||
\newcommand{\ResultExecutableSizeMean}{7.4\%\xspace}
|
||||
\newcommand{\ResultExecutableSizeMax}{26.7\%\xspace}
|
||||
\newcommand{\ResultExecutableSizeMin}{-5.0\%\xspace}
|
|
@ -0,0 +1,16 @@
|
|||
bh & 10.0 & 76.7 & 5.2 & + 0.2 & + 23.8 & + 6.2 \\
|
||||
bisort & 21.8 & 84.3 & 7.0 & 0.0 & + 7.3 & + 3.8 \\
|
||||
em3d & 35.3 & 66.4 & 16.9 & + 0.8 & + 18.0 & - 0.4 \\
|
||||
health & 24.0 & 97.8 & 9.3 & + 2.1 & + 18.5 & + 6.7 \\
|
||||
mst & 30.1 & 75.0 & 19.3 & 0.0 & + 6.3 & - 5.0 \\
|
||||
perimeter & 9.8 & 92.3 & 5.2 & 0.0 & + 4.9 & + 0.8 \\
|
||||
power & 15.0 & 69.2 & 3.9 & 0.0 & + 21.6 & + 8.5 \\
|
||||
treadd & 17.2 & 92.3 & 20.4 & + 8.3 & + 83.1 & + 7.0 \\
|
||||
tsp & 9.9 & 94.5 & 10.3 & 0.0 & + 47.6 & + 4.6 \\
|
||||
\addlinespace
|
||||
anagram & 26.6 & 67.5 & 10.7 & + 23.5 & + 16.8 & + 5.1 \\
|
||||
ft & 18.7 & 98.5 & 6.3 & + 25.9 & + 16.5 & + 11.3 \\
|
||||
ks & 14.2 & 93.4 & 8.1 & + 12.8 & + 32.3 & + 26.7 \\
|
||||
yacr2 & 14.5 & 51.5 & 16.2 & + 49.3 & + 38.4 & + 24.5 \\
|
||||
\midrule
|
||||
\multicolumn{1}{r}{Geo. Mean:} & 17.5 & 80.1 & 9.3 & + 8.6 & + 24.3 & + 7.4 \\
|
Двоичный файл не отображается.
Двоичный файл не отображается.
|
@ -0,0 +1,35 @@
|
|||
% Table: A Description of our Benchmarks
|
||||
\centering
|
||||
\begin{tabular}{lrl}
|
||||
\toprule
|
||||
Name & \multicolumn{1}{c}{LoC} & Description \\
|
||||
\midrule
|
||||
bh & 1,162 & Barnes \& Hut N-body force computation algorithm \\
|
||||
bisort & 262 & Sorts using two disjoint bitonic sequences \\
|
||||
em3d & 476 & Simulates electromagnetic waves in 3D \\
|
||||
health & 338 & Simulates Colombian health-care system \\
|
||||
mst & 325 & Computes minimum spanning tree using linked lists \\
|
||||
perimeter & 399 & Computes perimeter of a set of quad-tree encoded images \\
|
||||
power & 452 & The Power System Optimization problem \\
|
||||
treadd & 180 & Computes the sum of values in a tree \\
|
||||
tsp & 415 & Estimates solution for the Traveling-salesman problem \\
|
||||
\emph{voronoi} & 814 & Computes voronoi diagram of a set of points \\
|
||||
\addlinespace
|
||||
anagram & 346 & Generates anagrams from a list of words \\
|
||||
\emph{bc} & 5,194 & An arbitrary precision calculator \\
|
||||
ft & 893 & Computes minimum spanning tree using Fibonacci heaps \\
|
||||
ks & 549 & Schweikert-Kernighan graph partitioning \\
|
||||
yacr2 & 2,529 & VLSI channel router \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\caption{Compiler Benchmarks. Top group is the Olden suite, bottom
|
||||
group is the Ptrdist suite. LoC includes all comments and blank lines
|
||||
in benchmark source files. Descriptions are from
|
||||
\cite{Rogers1995Olden,Austin1994Ptrdist}. We were unable to convert
|
||||
\emph{voronoi} from the Olden suite and \emph{bc} from the Ptrdist suite using the current version of
|
||||
Checked C.}
|
||||
\label{tab:bmdesc}
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "../tr02"
|
||||
%%% End:
|
|
@ -0,0 +1,33 @@
|
|||
% Table of benchmark results
|
||||
\centering
|
||||
\begin{tabular}{lrrrrrr}
|
||||
\toprule
|
||||
& \multicolumn{3}{c}{Code Changes}
|
||||
& \multicolumn{3}{c}{Observed Overheads} \\
|
||||
\addlinespace
|
||||
Benchmark
|
||||
& \multicolumn{1}{c}{\emph{LM} \%}
|
||||
& \multicolumn{1}{c}{\emph{EM} \%}
|
||||
& \multicolumn{1}{c}{\emph{LU} \%}
|
||||
& \multicolumn{1}{c}{\emph{RT} $\pm$\%}
|
||||
& \multicolumn{1}{c}{\emph{CT} $\pm$\%}
|
||||
& \multicolumn{1}{c}{\emph{ES} $\pm$\%} \\
|
||||
\midrule
|
||||
\input{scripts/bm_results_tab}
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\caption{Benchmark Results. Key: \emph{LM~\%}:
|
||||
Percentage of Source LoC Modified, including Additions; \emph{EM~\%}:
|
||||
Percentage of Code Modifications deemed to be Easy (see
|
||||
\ref{sec:eval-code-changes}); \emph{LU~\%}: Percentage of Lines
|
||||
remaining Unchecked; \emph{RT~$\pm$\%}: Percentage Change in Run Time;
|
||||
\emph{CT~$\pm$\%}: Percentage Change in Compile Time;
|
||||
\emph{ES~$\pm$\%}: Percentage Change in Executable Size
|
||||
(\texttt{.text} section only)}
|
||||
\label{tab:bmresults}
|
||||
|
||||
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "../tr02"
|
||||
%%% End:
|
|
@ -0,0 +1,29 @@
|
|||
\centering
|
||||
\begin{tabular}{lll}
|
||||
\toprule
|
||||
Bounds Kind & Declaration & Canonical Bounds \\
|
||||
\midrule
|
||||
Range & \boundsdecl{\ArrayptrT~p}{\bounds{\mv{l}}{\mv{u}}} &
|
||||
\bounds{\mv{l}}{\mv{u}} \\
|
||||
Count & \boundsdecl{\ArrayptrT~p}{\boundscount{\mv{n}}} &
|
||||
\bounds{p}{p + \mv{n}} \\
|
||||
Byte Count & \boundsdecl{\ArrayptrT~p}{\boundsbytecount{\mv{n}}} &
|
||||
\bounds{p}{((\Arrayptr{\kw{char}})p) + \mv{n}} \\
|
||||
Singleton & \expr{\PtrT~p} & \bounds{p}{p + 1} \\
|
||||
\addlinespace
|
||||
\emph{Array} & \expr{\mv{T}~a~\kwchecked[\mv{N}]} &
|
||||
\bounds{a}{a + \mv{N}} \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\caption{Canonical Bounds Expressions. In Canonical Bounds, the
|
||||
\expr{+} refers to C's pointer-integer addition operator, which adds
|
||||
integer multiples of the size of the pointer's referent type to the
|
||||
original pointer (hence the cast to \Arrayptr{\kw{char}} in the
|
||||
\kwbytecount{} case). The bounds on \emph{Array} are used when
|
||||
\expr{a} is converted from an array into a pointer, and \mv{N} must be
|
||||
constant.}
|
||||
\label{tab:canonical}
|
||||
%%% Local Variables:
|
||||
%%% mode: latex
|
||||
%%% TeX-master: "../tr02"
|
||||
%%% End:
|
|
@ -0,0 +1,73 @@
|
|||
\documentclass[10pt,openany,oneside]{book}
|
||||
\usepackage[left=1in,right=1in,top=1in,bottom=1in,headheight=14pt,headsep=14pt]{geometry}
|
||||
|
||||
\usepackage[draft]{checked_c}
|
||||
|
||||
\usepackage{bytefield}
|
||||
\usepackage{xspace}
|
||||
|
||||
\title{Putting the Checks into Checked C \\%
|
||||
Checked C Technical Report Number 2}
|
||||
|
||||
\author{Archibald Samuel Elliott}
|
||||
|
||||
\date{31 October 2017}
|
||||
|
||||
\hypersetup{%
|
||||
pdftitle={Putting the Checks into Checked C},
|
||||
pdfauthor={Archibald Samuel Elliott},
|
||||
pdfsubject={Checked C Technical Report Number 2}
|
||||
}
|
||||
|
||||
\allowdisplaybreaks
|
||||
|
||||
\input{scripts/bm_results_macros}
|
||||
|
||||
\begin{document}
|
||||
|
||||
\frontmatter
|
||||
\begin{titlepage}
|
||||
\begin{center}
|
||||
\vspace*{2in}
|
||||
{\huge Putting the Checks into Checked C \\}
|
||||
\vspace{0.5in}
|
||||
{Checked C Technical Report Number 2\medskip\\}
|
||||
{\makeatletter\@date\makeatother\\}
|
||||
\vspace{0.25in}
|
||||
{\large Archibald Samuel Elliott\medskip\\}
|
||||
{Paul G. Allen School,\\
|
||||
University of Washington\\}
|
||||
\vspace{1in}
|
||||
{\it Summary \par}
|
||||
\begin{minipage}{5in}
|
||||
\input{abstract}
|
||||
\end{minipage}
|
||||
\end{center}
|
||||
\end{titlepage}
|
||||
|
||||
\tableofcontents
|
||||
|
||||
\mainmatter
|
||||
\input{introduction}
|
||||
\input{overview}
|
||||
\input{example}
|
||||
\input{propagation}
|
||||
\input{checks}
|
||||
\input{evaluation}
|
||||
\input{related_work}
|
||||
\input{conclusion}
|
||||
|
||||
\appendix
|
||||
\addtocontents{toc}{%
|
||||
\protect\bigbreak
|
||||
{\large\bfseries \hfill Appendices\hfill }
|
||||
\protect\smallbreak
|
||||
}
|
||||
|
||||
\bibliography{checkedc}
|
||||
\theendnotes
|
||||
\input{c_syntax}
|
||||
\input{propagation_rules}
|
||||
|
||||
|
||||
\end{document}
|
Загрузка…
Ссылка в новой задаче