From caff1bb5ad8bd3f75a1ae7d4817dd56ebd0bfb5d Mon Sep 17 00:00:00 2001 From: Yahui Sun Date: Tue, 30 Jun 2020 14:46:22 -0500 Subject: [PATCH] Fix typos and spacing in spec and samples. (#406) --- samples/string-helpers.c | 16 +-- spec/bounds_safety/core-extensions.tex | 122 ++++++++-------- spec/bounds_safety/introduction.tex | 82 +++++------ spec/bounds_safety/variable-bounds.tex | 190 ++++++++++++------------- 4 files changed, 205 insertions(+), 205 deletions(-) diff --git a/samples/string-helpers.c b/samples/string-helpers.c index 824b69d..7c0dd9f 100644 --- a/samples/string-helpers.c +++ b/samples/string-helpers.c @@ -7,11 +7,11 @@ // clang -o string-helpers string-helper.c // On Windows use: // clang -o string-helpers.exe string-helpers.c -// +// // Then run the program with 6 string arguments, the 3rd of which should // be an integer For example: // string-helpers hello meet -125 fed brown red -// +// // Overview of using null-terminated arrays in Checked C: // // In Checked C, pointers to null-terminated data are represented using the @@ -52,7 +52,7 @@ // int f(nt_array_ptr p : count(n), int n) { // if (p[n]) { // read element at upper bound. // ... -// +// // If v != 0, the following code fails at runtime for an nt_array_ptr. It // always fails for an array_ptr. // @@ -67,13 +67,13 @@ // int set_zero(nt_array_ptr p : count(n), int n) { // p[n] = 0; // allowed for nt_array_ptr. Runtime error for array_ptr! // } -// +// // The following example illustrates how bounds differ from // the count: // // nt_array_ptr : count(5) = "hello"; // -// The string "hello" contains 6 characters, the last of which is +// The string "hello" contains 6 characters, the last of which is // a null terminator. The valid bounds is count(5). // // @@ -84,7 +84,7 @@ // 1. Nt_array_ptrs with no bounds declared have a default bounds of count(0). // 2. If you are using array subscripting to access the nt_array_ptr, // you'll need to widen the bounds as you determine the last character -// is not a null terminator. This is typically done by +// is not a null terminator. This is typically done by // declaring bounds involving a separate counter: // array_ptr p : count(0); // int i = 0; @@ -108,7 +108,7 @@ // for ( ; arr[i] !=0; i++) { // nt_array_ptr : count(i+1) tmp = p; // tmp[i] = ... -// +// // The function my_strlen show points 1 and 2. It is less // frequent for string functions to modify strings. The function squeeze, // which does do that, shows point 3. @@ -136,7 +136,7 @@ checked int my_strlen(nt_array_ptr p) { } // Delete all c from p (adapted from p. 47, K&R 2nd Edition) -// p implicltly has count(0). +// p implicitly has count(0). checked void squeeze(nt_array_ptr p, char c) { int i = 0, j = 0; // Create a temporary whose count of elements can diff --git a/spec/bounds_safety/core-extensions.tex b/spec/bounds_safety/core-extensions.tex index 04fe149..022597c 100644 --- a/spec/bounds_safety/core-extensions.tex +++ b/spec/bounds_safety/core-extensions.tex @@ -3,13 +3,13 @@ \chapter{Extensions to existing C concepts} \label{chapter:core-extensions} -This chapter describes extensions to existing C concepts. +This chapter describes extensions to existing C concepts. It covers new kinds of pointer types and array types, their meaning, and the operations on them. It introduces two new program scopes: \lstinline+checked+ and \lstinline+unchecked+ -blocks. It covers a generalization of assertions to +blocks. It covers a generalization of assertions to dynamic checks that are never removed (unless a compiler -proves them redundant) because bounds safety depends upon them. +proves them redundant) because bounds safety depends upon them. Finally, it describes changes to undefined behavior needed for bounds safety. @@ -22,7 +22,7 @@ capital letter are reserved for system use, as are identifiers that begin with two underscores \cite[Section 7.1.3]{ISO2011}. The following new keywords are introduced: \begin{lstlisting} -_Array_ptr _Checked _Dynamic_check _For_any _Itype_for_any _Nt_array_ptr _Nt_checked +_Array_ptr _Checked _Dynamic_check _For_any _Itype_for_any _Nt_array_ptr _Nt_checked _Opaque _Ptr _Reveal _Where _Unchecked \end{lstlisting} @@ -38,7 +38,7 @@ the implementations of standard header files need to be modified to not use identifiers that conflict with these keywords. The all-lowercase versions of the keywords are: \begin{lstlisting} -array_ptr checked dynamic_check for_any nt_array_ptr nt_checked +array_ptr checked dynamic_check for_any nt_array_ptr nt_checked opaque ptr reveal where unchecked \end{lstlisting} @@ -87,8 +87,8 @@ have an error or warning mode that flags unexpected uses of `\code{*}'. The same syntax as C++ template instantiations is used for building instances of these types because this syntax is well-known and -understood. The new pointer types are added to the syntax for {\it type specifiers} -\cite[Section 6.7.2]{ISO2011}. The parameters to these type constructors +understood. The new pointer types are added to the syntax for {\it type specifiers} +\cite[Section 6.7.2]{ISO2011}. The parameters to these type constructors must be types, which are described syntactically by {\it type names} \cite[Section 6.7.7]{ISO2011}. If Checked C is extended to C++, in the C++ extension these new types will be template types that have special meaning. @@ -125,10 +125,10 @@ int *const p = &x; const ptr q = &x; const array_ptr r = &x; \end{lstlisting} -In this case, the pointers cannot be modified after they are defined, but -the integer that they point to can be modified. The checked pointer declarations +In this case, the pointers cannot be modified after they are defined, but +the integer that they point to can be modified. The checked pointer declarations have simpler syntax than the unchecked pointer declaration, where \keyword{const} -must be placed after the \lstinline+*+. The checked pointer declarations are all analogous +must be placed after the \lstinline+*+. The checked pointer declarations are all analogous to \lstinline+const double y = 5.0;+ declaring a variable \lstinline+y+ that is \keyword{const}. In the declaration of \keyword{y}, \keyword{double} specifies a type. The \keyword{const} keyword is placed before it to declare that @@ -137,7 +137,7 @@ and \arrayptrint\ all specify types too. The checked pointer types follow the same rules that apply to modifiers for unchecked pointer types. For example, a variable that is a pointer to a -\keyword{const} value can be assigned a pointer to a non-const value. +\keyword{const} value can be assigned a pointer to a non-const value. The reverse is not allowed. Array pointers (\arrayptr) of function types are not allowed. Functions have @@ -166,7 +166,7 @@ int a checked[10]; All array references to checked array types are bounds checked. C has the rule that an ``array of \var{T}'' is converted implicitly to a ``pointer to \var{T}'' in a number of situations. This rule is extended -to convert a ``checked array of \var{T}'' to an ``\arrayptr\ +to convert a ``checked array of \var{T}'' to an ``\arrayptr\ to \var{T}''. In C, array types may be complete or incomplete. A complete array type @@ -186,7 +186,7 @@ that \lstinline{i >= 0} and \lstinline{i < 10}. Array references to multi-dimensional arrays must be uniformly bounds checked or not bounds checked. If any dimension is bounds checked, all dimensions must be checked. A programmer can simply declare that -the outer dimension of an array is checked. The checked property will be +the outer dimension of an array is checked. The checked property will be propagated to the inner dimensions: \begin{lstlisting} @@ -258,9 +258,9 @@ can be used to declare multi-dimensional arrays: typedef int arr_ty[10]; arr_ty x[10]; \end{lstlisting} -The checked property is propagated only to array declarators that are +The checked property is propagated only to array declarators that are nested directly within other array declarators. It is not propagated to -the bodies of type definitions. It is an error if an array type and a nested +the bodies of type definitions. It is an error if an array type and a nested array type from the use of a type name have different checked properties. Here are examples of correct and incorrect declarations: @@ -282,9 +282,9 @@ in memory that ends with a null terminator value. We add a new type to represent pointers to these kinds of arrays. We divide these arrays into two parts: a prefix with bounds and a sequence of additional elements that ends -with a null terminator. The initial elements of the +with a null terminator. The initial elements of the sequence can be read, provided that preceding elements -are not the null terminator. The bounds can be +are not the null terminator. The bounds can be {\em widened} based on the number of elements read. A programmer may declare bounds for an \ntarrayptr\ variable. If bounds @@ -293,12 +293,12 @@ at runtime. If no bounds are declared, the bounds are inferred. At variable declarations, the declared bounds are for arrays with a prefix of 0 elements. -At variable uses, +At variable uses, the bounds are determined using a program analysis that widens bounds based on control-flow. The bounds are then used to check memory accesses. A problem with using an analysis to determine bounds -is that the bounds are not written down in the program. +is that the bounds are not written down in the program. This may lead to unexpected failures if the bounds are narrower than expected and it may make it difficult to understand failures. @@ -314,10 +314,10 @@ if (*p == 'a') if (*(p + 1) == 'b') if (*(p + 2) == 'c') \end{lstlisting} - + The element type \var{T} of an \ntarrayptrT\ must be an integral or pointer type. \ntarrayptrT\ extends \arrayptrT: the -prefix with bounds is treated the same as if an \arrayptrT\ pointed +prefix with bounds is treated the same as if an \arrayptrT\ pointed to it. An \ntarrayptrT\ value can be converted to an \arrayptrT\ value with the same bounds. @@ -400,11 +400,11 @@ The following operations involving pointer-typed values are allowed: inequality. The pointers do not have to be the same kind of pointer. To support reasoning about program behavior, the result of comparing pointers to different objects must be defined. Checked pointers can also - be compared for equality or inequality with \code{0} and \void\ pointers, + be compared for equality or inequality with \code{0} and \void\ pointers, just like unchecked pointers. \item Pointers to objects of the same type can be compared relationally. Relational comparisons are the - \lstinline|<|, \lstinline|<=|, \lstinline|>|, \lstinline|>=| operators. The pointers do not have + \lstinline|<|, \lstinline|<=|, \lstinline|>|, \lstinline|>=| operators. The pointers do not have to be the same kind of pointer. For example, an \uncheckedptrT\ can be compared with an \arrayptrT . To support bounds checking and reasoning about program behavior, the @@ -480,7 +480,7 @@ Programmers must initialize these variables explicitly using C's initializer syn when they are declared. Initializing an automatic variable with a checked pointer type is simple: a programmer adds -\lstinline+= 0+ or \lstinline+x = NULL+ to the variable declarator. C's initializer syntax +\lstinline+= 0+ or \lstinline+x = NULL+ to the variable declarator. C's initializer syntax allows compact initializers for structure variables and array variables. A list of initialization values is specified using a brace-enclosed list of expressions. The initializer list can be partial list that describes @@ -526,13 +526,13 @@ scopes, the declared types of variables are allowed to be or use checked pointer types or checked array types; unchecked pointer types and unchecked array types are not allowed. Similarly, for functions, return types and parameter types are allowed to be or use checked pointer -types or checked array types. +types or checked array types. On the other hand, declarations in checked scopes can use unchecked pointer types and unchecked array types, provided that the declarations provide a bounds-safe interface. These are described in Section~\ref{section:function-bounds-safe-interfaces}. -Variables and functions used in checked scopes are +Variables and functions used in checked scopes are allowed to have or use checked pointer types, checked array types, or unchecked pointer types and unchecked array types with bounds declarations. @@ -551,7 +551,7 @@ existing C code. A checked block is introduced by placing \keyword{checked} before a compound block: \begin{lstlisting} -checked +checked { int a = 5; ptr pa = &a; @@ -573,25 +573,25 @@ checked keyword can be used. This example shows the use of an unchecked block: \begin{lstlisting} -checked +checked { int a = 5; - unchecked + unchecked { - int *upa = &a; + int *upa = &a; int b[5][5]; for (int i = 0; i < 5; i++) { for (int j = 0; j < 5; j++) { // not bounds checked b[i][j] = -1; } - } + } } ... } \end{lstlisting} -In a checked function definition, the body of the function is a +In a checked function definition, the body of the function is a checked block. A checked function definition is declared by placing the \keyword{checked} keyword before the definition. Here are examples of checked and unchecked function definitions: @@ -599,7 +599,7 @@ unchecked function definitions: \begin{lstlisting} // checked at the function level: no unchecked pointers can appear in // argument types, the return type, or the body of the function. -checked int f(ptr p) +checked int f(ptr p) { int a = 5; ptr pa = &a; @@ -635,10 +635,10 @@ referred to as K\&R-style function declarations because this is the only way functions were declared in the first edition of the C Programming Language book by Brian Kernigan and Dennis Ritchie. The second edition incorporated changes from the C ANSI Standard and introduced function prototypes.}, where the types -of the arguments to functions are not specified . These -functions are dangerous to use because there can be mismatches +of the arguments to functions are not specified. These +functions are dangerous to use because there can be mismatches between argument types and parameter types at function -calls. This can corrupt data or the call stack. In checked scopes, +calls. This can corrupt data or the call stack. In checked scopes, the use or declaration of functions without prototypes is not allowed. As we add different notions of checking to Checked C, we will use the @@ -656,7 +656,7 @@ It consists of a parenthesized type name followed by list of initializers in bra A compound literal has static storage if it is at the top level of a file and automatic storage if it occurs within a block. Here are examples of compound literals: \begin{lstlisting} -int *arr = (int []) { 0, 1, 2}; +int *arr = (int []) { 0, 1, 2 }; struct Point { int x; int y; @@ -714,7 +714,7 @@ checked void f(void) { // checking to nt_array_ptr. nt_array_ptr p = "hello"; - // "goodbye" has type char nt_checked[8]. The type is altered during + // "goodbye" has type char nt_checked[8]. The type is altered during // type checking to the pointer type array_ptr. array_ptr r = "goodbye"; @@ -803,11 +803,11 @@ struct inner_array { typedef struct inner_array ragged_2d_arr[]; -ragged_2d_arr data = +ragged_2d_arr data = { { 3, (double checked[]) { 1.0, 0.0, 0.0 } }, { 1, (double checked[]) { 5.0 } }, { 0, 0 }, - { 2, (double checked[]) { 6.0, 7.0 } } + { 2, (double checked[]) { 6.0, 7.0 } } }; struct BufferPair { @@ -841,7 +841,7 @@ nt_array_ptr more_colors nt_checked[] = { "white", "black", // A null-terminated array of pointers to null-terminated arrays // of strings. -nt_array_ptr> sentences nt_checked[4] = +nt_array_ptr> sentences nt_checked[4] = { (nt_array_ptr nt_checked[]) { "this", "is", "the", "first", "sentence", 0 }, (nt_array_ptr nt_checked[]) { "here", "is", "another", 0 }, @@ -869,16 +869,16 @@ The type to which a pointer and its bounds are relatively aligned is called the relative alignment type. By default, the relative alignment type for a pointer and its bounds is the referent type. However, the relative alignment type can be overridden by specifying it explicitly as -part of the bounds. This is described in +part of the bounds. This is described in Section~\ref{section:representing-relative-alignment}. -This can be used for bounds for the results of pointers casts and +This can be used for bounds for the results of pointers casts and for \arrayptrvoid\ pointers. The type \void\ has no defined size, so the default relative alignment is undefined for \void. We considered removing the entire concept of relative alignment from the -design to simplify the design. We decided against that because it would -increase the cost of bounds checking throughout the program. +design to simplify the design. We decided against that because it would +increase the cost of bounds checking throughout the program. Section~\ref{section:design-alternatives:always-unaligned} explains this choice in more detail. @@ -1036,7 +1036,7 @@ pointer arithmetic. To support this expansion, integer arithmetic operators are extended with the operators \plusovf, \minusovf, and \mulovf. The -operators interpet pointers as unsigned integers in some range 0 to +operators interpret pointers as unsigned integers in some range 0 to \code{UINTPTR_MAX}. An operator produces a runtime error if the value of its result cannot be represented by the result type: @@ -1044,7 +1044,7 @@ of its result cannot be represented by the result type: \item \plusovf\ takes an unsigned integer \var{i} and an integer \var{j} and produces an unsigned integer in the range 0 to - \code{UINTPTR_MAX}. Its result is the mathemetical value \var{i} + \var{j}. + \code{UINTPTR_MAX}. Its result is the mathematical value \var{i} + \var{j}. \item For subtraction, there are two forms: @@ -1057,7 +1057,7 @@ of its result cannot be represented by the result type: \item \lstinline|-|\textsubscript{ovf\_diff } takes two unsigned integers \var{i} and \var{j} and computes \var{i} - \var{j}, producing a signed integer of type - \code{ptrdiff_t}. Its result is the mathemetical value \var{i} - \var{j}. + \code{ptrdiff_t}. Its result is the mathematical value \var{i} - \var{j}. \end{itemize} \item \mulovf\ takes two integers \var{i} and \var{j} (both either @@ -1080,7 +1080,7 @@ signed or unsigned integer as \code{e2}. If \code{e2} is a signed integer, the expansion of \code{e1 + e2} must cast \sizeof{\var{T}} to a signed integer. We introduce a signed integer type \code{signed_size_t} that is large enough for - this cast. \code{e1 + e2} expands to \code{e1} \plusovf\ + this cast. \code{e1 + e2} expands to \code{e1} \plusovf\ \code{((signed_size_t)} \sizeof{\var{T}}\code{)} \mulovf\ \code{e2}. This cast is necessary because in C, multiplying a signed integer by an unsigned integer implicitly converts the signed integer to be an unsigned @@ -1110,7 +1110,7 @@ Otherwise, in a typical C environment, the program would fault at \lstinline+*p+. In the code \lstinline|if (p < hi) { p += 1; }| the check \lstinline+p < hi+ implies that the increment cannot overflow. The checks are also inexpensive on superscalar processors: -they can be placed so that they are statically predicted by +they can be placed so that they are statically predicted by processors to never fail. \section{Programmer-inserted dynamic checks} @@ -1215,7 +1215,7 @@ void decode(array_ptr output_buffer, array_ptr input_buffer, |\hl{\textit{// need check that dst and src have at least len bytes of space}}| memcpy(dst, src, len); src += len; - dst += len; + dst += len; break; } case COMPRESSED_INT64: { @@ -1273,7 +1273,7 @@ so there is no way to test the path. \end{lstlisting} The problems with requiring functions that validate buffer lengths to -return status codes for errors are analyzed by O'Donell and Sebor\cite{ODonell2015}. +return status codes for errors are analyzed by O'Donell and Sebor\cite{ODonell2015}. Annex K of the C Standard \cite{ISO2011} introduced a new set of standard library functions to replace functions that provide no way to validate their arguments. These functions return status codes to indicate success or failure. A classic @@ -1281,7 +1281,7 @@ example of a function prone to misuse is \lstinline+strcpy(char *dst, const char *src)+. It copies all bytes in \lstinline+src+ to \lstinline+dst+ until it hits a null byte. If \lstinline+src+ is missing the null byte or -\lstinline+dst+ is too small, this causes a buffer overrunn. The new +\lstinline+dst+ is too small, this causes a buffer overrun. The new function \lstinline+strcpy_s+ takes an additional size parameter for \lstinline+dst+ and has the signature \lstinline+errno_t strcpy_s(char *dst, size_t dest_len, const char *src)+. @@ -1305,7 +1305,7 @@ void decode(array_ptr output_buffer, array_ptr input_buffer, |\hl{\textit{dynamic\_check(dst + len < dst\_bound \&\& src + len < src\_bound);}}| memcpy(dst, src, len); src += len; - dst += len; + dst += len; break; } ... @@ -1321,15 +1321,15 @@ behavior is worse. The following example uses \lstinline+dynamic_check+ to eliminate bounds checks in a loop. It is based on experience hand-optimizing C\# and Java programs. This kind of example is typically found during a performance -tuning phase of program development. In the example, +tuning phase of program development. In the example, the \lstinline|: count(|\var{exp}\lstinline|)| notation indicates that \var{exp} is the length of the buffer. \begin{lstlisting} void append(array_ptr dst : count(dst_count), - array_ptr src : count(src_count), + array_ptr src : count(src_count), size_t dst_count, size_t src_count) -{ +{ for (size_t i = 0; i < src_count; i++) { if (src[i] == marker) { break; @@ -1382,7 +1382,7 @@ A compiler would not introduce this dynamic\_check because it would alter the behavior of \lstinline+append+. The bounds check on \lstinline+dst+ in the original code is done only if a marker is not found in \lstinline+src+ and \lstinline+src_count > 0+. A compiler could -try to deduce a precondition for the loop +try to deduce a precondition for the loop that prevents the bounds check from failing, but this is not possible because the precondition depends on the contents of \lstinline+src+. A compiler would have to clone code to maintain the same @@ -1391,7 +1391,7 @@ sort of transformation or do it sparingly. Programmer control produces better results. Here is the code a compiler might generate with cloning. \begin{lstlisting}[escapechar=\|] -void append(array_ptr dst: count(dest_count), +void append(array_ptr dst: count(dest_count), array_ptr src : count(src_count), size_t dst_count, size_t src_count) { @@ -1493,7 +1493,7 @@ signed integer overflow: \begin{itemize} \item Unchecked pointers shall be treated as addresses of locations in memory, - just as checked pointers are treated as addressses. The addresses shall + just as checked pointers are treated as addresses. The addresses shall be unsigned integers with a defined range of 0 to \code{UINTPTR_MAX}: @@ -1527,7 +1527,7 @@ In the case of a runtime error, program execution cannot continue in a way that uses the value of the expression that produced the error. For programs with expressions and initializers with undefined meanings, -those programs must be rejected at translation-time. +those programs must be rejected at translation-time. Section~\ref{section:avoiding-undefinedness} describes this in detail. diff --git a/spec/bounds_safety/introduction.tex b/spec/bounds_safety/introduction.tex index 04bf840..0b8af26 100644 --- a/spec/bounds_safety/introduction.tex +++ b/spec/bounds_safety/introduction.tex @@ -3,7 +3,7 @@ \chapter{Introduction} \label{chapter:introduction} -The C programming language \cite{Ritchie1988, ISO2011} allows programmers to use +The C programming language \cite{Ritchie1988, ISO2011} allows programmers to use pointers directly. A pointer is an address of a location in memory. Programs may do arithmetic on pointers, dereference them to read memory, or assign through them to modify memory. The ability to use pointers directly @@ -69,7 +69,7 @@ Checked C addresses the bounds checking problem for C by: extension to \arrayptr\ for null-terminated arrays (\ntarrayptr). \item For array pointer types (\arrayptr), in keeping with the low-level - nature of C, bounds checking is placed under programmer control. + nature of C, bounds checking is placed under programmer control. This differs from languages like Java, where bounds checking is completely automatic. A programmer declares \emph{bounds}, where the bounds for an \arrayptr\ @@ -174,11 +174,11 @@ This problem is addressed in two ways. First, the language rules that are used to check the validity of bounds are limited intentionally. The rules can check the validity of bounds that are needed in practice. However, there are bounds that are true that cannot be proved using the rules. In the terminology -of program logics, the rules are incomplete. As an example, the +of program logics, the rules are incomplete. As an example, the rules about distributivity limit the size of -expressions that they produce. In practice, this means that +expressions that they produce. In practice, this means that simple disjunctive bounds can be checked, but complex disjunctive bounds cannot be checked. - + Second, the inference that compilers do for checking program invariants is limited. Compilers act as \emph{checkers} for invariants. They check that declared invariants follow from other declared invariants, the @@ -202,25 +202,25 @@ scope of this technical report. For now, it is assumed that programs are correct in these other aspects. This design is being done in an iterative fashion. To validate the -design, we mocked up modifying a subset of the OpenSSL -code base \cite{OpenSSL2015} to be bounds-safe . -We created C++ templates for the new pointer types and modified OpenSSL to +design, we mocked up modifying a subset of the OpenSSL +code base \cite{OpenSSL2015} to be bounds-safe . +We created C++ templates for the new pointer types and modified OpenSSL to compile as valid C++ code. We hand-edited about 11,000 lines of the code to use checked pointer -types with full bounds annotations. We used macros to encapsulate +types with full bounds annotations. We used macros to encapsulate the bounds annotations so that they could be elided from the code and OpenSSL compiled and tested using the new types. We also modified the generic stack type in OpenSSL to use the \code{ptr} type, which required -cross-cutting changes across the code base (in all, about 160 files were changed) -as well dealing with complicated macros. +cross-cutting changes across the code base (in all, about 160 files were changed) +as well dealing with complicated macros. We learned the following from this experience. First, it was important to have a compact, succinct syntax -for declaring bounds. -Second, in most cases, declaring bounds at declarations was sufficient for -tracking the bounds of variables. Large blocks of code +for declaring bounds. +Second, in most cases, declaring bounds at declarations was sufficient for +tracking the bounds of variables. Large blocks of code remained unchanged, which matches the observations of the -Cyclone project \cite{Jim2002}, an earlier research +Cyclone project \cite{Jim2002}, an earlier research effort to create a type-safe version of C. Third, the expressions allowed in declaring bounds needed to be rich. Fourth, pointer casts were used @@ -230,19 +230,19 @@ easily to be appropriate for the new referent type of the pointer. Fifth, it was clear that there needed to be a graceful way of interoperating with existing libraries that could not be changed. Finally, signed integer overflow was a pervasive -possibility, which raised questions about the meaning of bounds -declarations that used signed integers. +possibility, which raised questions about the meaning of bounds +declarations that used signed integers. We revised the design to address these issues. In particular, we paid close attention to tracking bounds through pointer casts. We also made sure that the -contraints on signed integer expressions used in bounds expressions +constraints on signed integer expressions used in bounds expressions were understood and could be written down in the language of simple invariants. The design is a work-in-progress. Some material will be missing or incomplete because the language design has not yet been done. -Readers should be aware that all parts of the document are subject to change. +Readers should be aware that all parts of the document are subject to change. We are interested in feedback and suggestions about ways to improve the design. @@ -304,7 +304,7 @@ Two specific design principles are adopted based on these principles: inventing new notations. Many systems are hybrid C/C++ systems, so this approach fits with the principle of clarity. It also enables incremental adoption. One of the design goals of C++ has been that C - is a subset of C++. It is a design goal to allow Checked C to be a + is a subset of C++. It is a design goal to allow Checked C to be a subset of C++ too. \end{enumerate} @@ -314,13 +314,13 @@ Operators and symbolic characters will be shown in black (like \lstinline|->|). variables will be shown in light blue (like \lstinline|for (int i = 0; i<10; i++)|). At times, we will define properties or make mathematical statements that range over sets of values or program elements, such as mathematical integers, variables, or expressions. We will use meta-variables in {\it italics} that -range over these values or program elements. We might say something like +range over these values or program elements. We might say something like ``for all expressions \var{e\textsubscript{1}} and \var{e\textsubscript{2}}.'' These meta-variables differ from program variables, which are elements of specific C programs. \section{Organization of this document} -The target audience for this document includes programmers interested in +The target audience for this document includes programmers interested in learning about Checked C, compiler writers, and language designers. These groups have different and conflicting needs. Programmers will find that some sections have too much detail, while language designers will likely @@ -339,7 +339,7 @@ bounds safety even in the face of integer overflow, which is a widespread possibility in C programs. This requires a more nuanced requirement on the result of overflow than ``the result is undefined.'' -Chapters~\ref{chapter:core-extensions} through +Chapters~\ref{chapter:core-extensions} through \ref{chapter:pointers-to-data-with-arrayptrs} present the extensions to C for declaring bounds for \arrayptr\ values stored in variables and structures and @@ -377,13 +377,13 @@ covers inferring effects of an expression on the bounds of variables. This inferred information is then used to validate that declarations and statements correctly declare bounds and maintain the bounds information. -Chapter~\ref{chapter:interoperation} covers interoperation between +Chapter~\ref{chapter:interoperation} covers interoperation between checked and unchecked code. It covers conversions between checked and unchecked pointers, as well as conversions between the new kinds of checked pointers. It pins down the notion of checked code and unchecked code. Finally, it covers bounds-safe interfaces in depth. -Chapter~\ref{chapter:structure-bounds} extends these ideas from variables to data +Chapter~\ref{chapter:structure-bounds} extends these ideas from variables to data structures by introducing member bounds, which are type-level invariants about members of structure types. For now, it assumes that the programmer has done @@ -395,13 +395,13 @@ uses to check programs with member bounds. Chapter~\ref{chapter:pointers-to-data-with-arrayptrs} describes how to extend reasoning about bounds to pointers to data that has \arrayptr-typed values within it. This includes pointers to structures with members that -have \arrayptr\ types and pointers directly to \arrayptr s. Pointers to +have \arrayptr\ types and pointers directly to \arrayptr s. Pointers to structures can be used easily by ensuring that modifications to members -preserve type-level bounds invariants. -This follows the lead of the Deputy system \cite{Condit2007}. -Complexity arises for pointers to \arrayptr s where -the \arrayptr s have bounds that depend on other variables or members. The -other variables or members need to be abstracted by pointers as well to ensure +preserve type-level bounds invariants. +This follows the lead of the Deputy system \cite{Condit2007}. +Complexity arises for pointers to \arrayptr s where +the \arrayptr s have bounds that depend on other variables or members. The +other variables or members need to be abstracted by pointers as well to ensure that invariants for bounds are not violated. Assignments via pointer indirections need to be coordinated to maintain the invariant. This gives rise to constraints on the pointers and the variables or @@ -418,12 +418,12 @@ about whether one fact is true given a set of other facts (for example, given x == y and the statement \code{z = y;} z == x is true after that statement). These rules and facts can be used to deduce the correctness of bounds declarations that differ from those inferred directly by the -checking described in Chapters~\ref{chapter:checking-bounds} -and~\ref{chapter:pointers-to-data-with-arrayptrs}. For example, +checking described in Chapters~\ref{chapter:checking-bounds} +and~\ref{chapter:pointers-to-data-with-arrayptrs}. For example, a programmer may wish to narrow the memory that is accessible via an \arrayptr\ variable by declaring bounds that are a subrange of the bounds inferred by the checking. A programmer may wish to update the -bounds for an \arrayptr\ variable after an assignment \code{x = y}, substituting +bounds for an \arrayptr\ variable after an assignment \code{x = y}, substituting \code{x} for \code{y}. The same static checking that is used for bounds can be used to reason @@ -448,7 +448,7 @@ of the modifications or understand the practical benefits of checking. Still, this gives some idea about the usefulness and applicability of the design. -Chapter~\ref{chapter:open-issues} summarizes the open problems uncovered by +Chapter~\ref{chapter:open-issues} summarizes the open problems uncovered by Chapter~\ref{chapter:eval} or unaddressed by the design. It also describes next steps for implementing this in a C compiler. @@ -460,21 +460,21 @@ that were considered and not chosen. It explains why those alternatives were not chosen. Here are the suggested roadmaps for the different groups of readers. -For programmers interested in learning about Checked C, -we suggest reading Chapter~\ref{chapter:core-extensions} except +For programmers interested in learning about Checked C, +we suggest reading Chapter~\ref{chapter:core-extensions} except Sections~\ref{section:pointer-arithmetic-errors} and \ref{section:changes-to-undefined-behavior}, Chapter~\ref{chapter:tracking-bounds} -except Section~\ref{section:extent-of-declarations}, +except Section~\ref{section:extent-of-declarations}, Section~\ref{section:inferring-expression-bounds} of Chapter~\ref{chapter:checking-bounds}, Chapter~\ref{chapter:interoperation} except for Section~\ref{section:checking-bounds-interfaces}, Chapter~\ref{chapter:structure-bounds} except for Sections~\ref{section:member-bounds-state-extent} and \ref{section:checking-bounds-with-structures}, -and Chapter~\ref{chapter:simple-invariants} through the first section. +and Chapter~\ref{chapter:simple-invariants} through the first section. Sections~\ref{section:pointer-cast-results}, \ref{section:integer-overflow-informal}, and \ref{section:pointer-integer-conversions} are advanced topics that can be read after the other recommended chapters and sections. -We suggest language designers and compiler writers +We suggest language designers and compiler writers read most of the document. Language designers can skip Sections~\ref{section:computing-extent}, \ref{section:canonicalization}, and \ref{section:canonicalization-example}. Compiler writers can skip the discussions in Section~\ref{section:programmer-dynamic-checks} @@ -484,7 +484,7 @@ and Chapters~\ref{chapter:related-work} and \ref{chapter:design-alternatives}. \section{Acknowledgements} -This design has benefited from many discussions with Weidong Cui, Gabriel Dos Reis, +This design has benefited from many discussions with Weidong Cui, Gabriel Dos Reis, Sam Elliott, Chris Hawblitzel, Galen Hunt, Shuvendu Lahiri, and Reuben Olinsky. The design has benefited also from feedback from Michael Hicks, Wonsub Kim, Greg Morrisett, Jonghyun Park, and Andrew Ruef. We thank them for their contributions to diff --git a/spec/bounds_safety/variable-bounds.tex b/spec/bounds_safety/variable-bounds.tex index 6ff0b4e..64993cf 100644 --- a/spec/bounds_safety/variable-bounds.tex +++ b/spec/bounds_safety/variable-bounds.tex @@ -32,7 +32,7 @@ the form: \end{tabbing} where \var{x} is a variable. Additional forms for handling -\void\ pointers and casts are described in +\void\ pointers and casts are described in Sections~\ref{section:pointers-to-void} and \ref{section:pointer-cast-results}. A bounds expression describes the memory that can be accessed using the @@ -67,7 +67,7 @@ include counts, pairs that specify an upper bound and lower bound, Bounds expression use ``non-modifying'' expressions. These are a subset of C expressions that do not modify variables or memory. They include local variables, parameters, constant expressions, casts, and operators -such as addition, subtraction, and address-of. +such as addition, subtraction, and address-of. Section~\ref{section:non-modifying-expressions} describes non-modifying expressions in detail. @@ -80,8 +80,8 @@ type of \var{e1} is a character, a short integer, a bit field, or an enumeration the expression is promoted to an \keyword{int} type if that is large enough to hold all values of the type or an \keyword{unsigned int} type otherwise.} For \boundsdecl{\var{x}}{\bounds{\var{e1}}{\var{e2}}}, \var{e1} and -\var{e2} must be pointers to the same type. Typically \var{x} is also -a pointer to that type or an array of that type. +\var{e2} must be pointers to the same type. Typically \var{x} is also +a pointer to that type or an array of that type. However, \var{x} can be a pointer to or an array of a different type. This is useful for describing the results of casts and bounds for \arrayptrvoid\ pointers. @@ -140,7 +140,7 @@ have been derived via a sequence of operations from a pointer to some object \var{obj} with \bounds{\var{low}}{\var{high}}. The following statement will be true at runtime: \var{ev} \lstinline@== 0 || (@\var{low} \lstinline|<=| \var{lbv} \lstinline|&&| -\var{ubv} \lstinline|<=| \var{high}\lstinline|)|. In other words, +\var{ubv} \lstinline|<=| \var{high}\lstinline|)|. In other words, if \var{ev} is null, the bounds may or may not be valid. If \var{ev} is non-null, the bounds must be valid. This implies that any access to memory where \var{ev} @@ -155,10 +155,10 @@ acceptable to use the address of an address-taken variable in a bounds declaration. This is provided that the resulting address is not used to access memory in the bounds declaration (that is, indirectly access the value of a variable). For example, given the declaration \code{int x}, -the expression \code{&x} may appear in a bounds expression. +the expression \code{&x} may appear in a bounds expression. Chapter~\ref{chapter:pointers-to-data-with-arrayptrs} extends the -design to avoid these restrictions. It covers pointers to data with -\arrayptr\ values, pointers to variables used in bounds, and bounds +design to avoid these restrictions. It covers pointers to data with +\arrayptr\ values, pointers to variables used in bounds, and bounds that use pointers. \section{Using bounds declarations} @@ -169,7 +169,7 @@ by following the declarator with \code{:} \var{bounds-exp}. In that case, the bounds expression applies to the variable that is the subject of the declarator. By making the bounds be part of the program, this preserves the control and efficiency of C. The bounds declarations will -be checked statically for correctness using rules described in +be checked statically for correctness using rules described in Chapter~\ref{chapter:checking-bounds}. \subsection{Bounds declarations at variable declarations} @@ -182,13 +182,13 @@ between \code{start} and \code{end}, where the integer stored at \begin{lstlisting} int sum(array_ptr start : bounds(start, end), array_ptr end) -{ +{ int result = 0; array_ptr current : bounds(start, end) = start; while (current < end) { - result += *current++; // *current is bounds-checked. The checking ensures - // that current is within the bounds of (start, end) - // at the memory access. + result += *current++; // *current is bounds-checked. The checking ensures + // that current is within the bounds of (start, end) + // at the memory access. } return result; } @@ -198,13 +198,13 @@ This can be written using \keyword{where} clauses as well, at the inconvenience of typing variable names twice in declarations: \begin{lstlisting} -int sum(array_ptr start where start : bounds(start, end), +int sum(array_ptr start where start : bounds(start, end), array_ptr end) -{ +{ int result = 0; array_ptr current where current : bounds(start, end) = start; while (current < end) { - result += *current++; + result += *current++; } return result; } @@ -249,12 +249,12 @@ int bad_find(int key, array_ptr a : count(len), int len) This function adds two arrays of 2x2 arrays. \begin{lstlisting} -int add(int a checked[][2][2] : count(len), int b checked[][2][2] : count(len), - int len) +int add(int a checked[][2][2] : count(len), int b checked[][2][2] : count(len), + int len) { for (int i = 0; i < len; i++) { // All array accesses are bounds checked - a[i][0][0] += b[i][0][0]; + a[i][0][0] += b[i][0][0]; a[i][0][1] += b[i][0][1]; a[i][1][0] += b[i][1][0]; a[i][1][1] += b[i][1][1]; @@ -282,9 +282,9 @@ int sum(void) This is a declaration of a function that copies bytes provided that the pointers and lengths are aligned: \begin{lstlisting} -int aligned_memcpy(array dst where dst : count(len) && +int aligned_memcpy(array dst where dst : count(len) && aligned(dst, 4), - array src where src : count(len) && + array src where src : count(len) && aligned(src, 4), int len where len % 4 == 0); \end{lstlisting} @@ -374,7 +374,7 @@ statements. It does not make sense to have dataflow-sensitive bounds for externally-scoped variables and variables with static or thread storage. The bounds extend to the next assignment to any variable in the bounds declaration, with some exceptions for pointer incrementing or -decrementing. Section~\ref{section:extent-of-declarations} +decrementing. Section~\ref{section:extent-of-declarations} describes the extent of flow-sensitive bounds in more detail. @@ -384,7 +384,7 @@ elements and then points to an array with 10 elements; the bounds are adjusted accordingly. \begin{lstlisting} -void f(void) +void f(void) { int x[5]; int y[10]; @@ -401,7 +401,7 @@ lazily to either \code{a} or \code{b}, depending on another parameter \code{val} \begin{lstlisting} /* use either a or b depending on val */ int choose(int val, array_ptr a : count(alen), int alen, - array_ptr b : count(blen), int blen) + array_ptr b : count(blen), int blen) { array_ptr c; int clen; @@ -412,7 +412,7 @@ int choose(int val, array_ptr a : count(alen), int alen, else { clen = blen; c = b where c : count(clen); - } + } ... } \end{lstlisting} @@ -425,7 +425,7 @@ the update. Consider the following example: \begin{lstlisting} /* sum integers stored between start and end, where end is not included */ int sum(array_ptr start : bounds(start, end), array_ptr end) -{ +{ ... // Adjusting end. Can declare new bounds for start at the assignment end = end - 1 where start : bounds(start, end + 1); @@ -466,11 +466,11 @@ added: \code{byte_count} is the identifier \code{byte_count}. The bounds declaration \boundsdecl{\var{x}}{\boundsbytecount{\var{e1}}} -describes the number of bytes that are accessible beginning at \var{x}. -Only memory that is at or above \var{x} and below -\lstinline|(|\arrayptrchar\lstinline|)| \var{x} \code{+} \var{e1} can be accessed through +describes the number of bytes that are accessible beginning at \var{x}. +Only memory that is at or above \var{x} and below +\lstinline|(|\arrayptrchar\lstinline|)| \var{x} \code{+} \var{e1} can be accessed through \var{x}. The type of \var{e1} must be an integral type. The usual C integer conversions are -applied to \var{e1}. This bounds declaration is a synonym for +applied to \var{e1}. This bounds declaration is a synonym for \boundsdecl{\var{x}} {\boundsrel{\lstinline|(|\arrayptrchar)\lstinline|)| \var{x}} {\lstinline|(|\arrayptrchar)\lstinline|)| \var{x} \code{+} \var{e1}} @@ -479,7 +479,7 @@ applied to \var{e1}. This bounds declaration is a synonym for The standard C library functions for \code{malloc}, \code{memcmp}, and \code{memcpy} will be given bounds-safe interfaces to avoid breaking existing code as -described in Section~\ref{section:function-bounds-safe-interfaces}. +described in Section~\ref{section:function-bounds-safe-interfaces}. However, if they were to return checked pointer types, their bounds declarations would be: @@ -509,7 +509,7 @@ in-line bounds declarations and \keyword{where} clauses for declarations: \begin{tabbing} \var{init-}\=\var{declarator:} \\ \>\var{declarator inline-bounds-specifier\textsubscript{opt} where-clause\textsubscript{opt}} \\ -\>\var{declarator inline-bounds-specifier\textsubscript{opt} +\>\var{declarator inline-bounds-specifier\textsubscript{opt} where-clause\textsubscript{opt}} \lstinline|=| \var{initializer where-clause\textsubscript{opt}} \\ \>\ldots{} \\ @@ -518,7 +518,7 @@ where-clause\textsubscript{opt}} \\ \>\var{declaration-specifiers declarator} \var{inline-bounds-specifier\textsubscript{opt} where-clause\textsubscript{opt}} \\ \\ \var{inline-bounds-specifier:}\\ -\>\lstinline|:| \var{bounds-exp} \\ +\>\lstinline|:| \var{bounds-exp} \\ \\ \var{where-clause}: \\ \>\keyword{where} \var{facts} \\ @@ -573,7 +573,7 @@ must be the identifier \code{any} or the identifier \code{unknown}. The semantics of C expressions is subtle. The expression \lstinline|*|\var{e1} does not dereference memory. It produces an lvalue that can be used to access memory. If the expression -occurs on the left-hand side of an assigment, the memory pointed to +occurs on the left-hand side of an assignment, the memory pointed to by the lvalue is updated with the value of the right-hand side of the assignment. For example, given \lstinline|*|\var{e1} \lstinline|=| \var{e2}, @@ -645,7 +645,7 @@ is used to access memory. The compiler computes the bounds of runtime checks that \var{e1} \lstinline|+| \var{e2} is within this bounds as part of the evaluation of \var{e1}\lstinline|[|\var{e2}\lstinline|]|. For example, given \lstinline|x[5]| where \lstinline|x| has \lstinline|bounds(x, x + c)|, the -compiler inserts runtime checks that \lstinline|x <= x + 5 < x + c|. +compiler inserts runtime checks that \lstinline|x <= x + 5 < x + c|. The runtime checks simplify to \lstinline|5 < c|. For a multi-dimensional array access, only one bounds check is done. @@ -685,11 +685,11 @@ has {\bounds{\var{e2}}{\var{e3}}}: \begin{itemize} \item Given an assignment of the form \lstinline|*|\var{e1} \lstinline|=| \var{e4}, the value of \var{e4} is computed to some temporary \var{v}. The -check is \lstinline|(|\var{e2} \lstinline|<=| \var{t} \lstinline|&&| \var{t} \lstinline|<| -\var{e3}\lstinline@) || (@ \var{e2} \lstinline|<=| \var{t} \lstinline|&&| \var{t} +check is \lstinline|(|\var{e2} \lstinline|<=| \var{t} \lstinline|&&| \var{t} \lstinline|<| +\var{e3}\lstinline@) || (@ \var{e2} \lstinline|<=| \var{t} \lstinline|&&| \var{t} \lstinline|==| \var{e3} \lstinline|&&| \var{v} \lstinline|== 0)|. -When checking for a write of \code{0} exactly at the upper bound, we +When checking for a write of \code{0} exactly at the upper bound, we include the first element of the sequence in the allowed memory range. The second lower bound comparison \var{e2} \lstinline|<=| \var{t} prevents an assignment @@ -740,7 +740,7 @@ Additional storage would be particularly problematic when bounds declarations are extend to structure members. It would cause the size and layout of data structures to change. Deferring evaluation also avoids complications when \code{x} is -\code{null}. Section~\ref{section:bounds-declarations-alternate-semantics} +\code{null}. Section~\ref{section:bounds-declarations-alternate-semantics} discusses eager evaluation of bounds expressions at bounds declarations in more detail and explains why this was not chosen. @@ -750,7 +750,7 @@ It is common in C code to use multiple statements to update program state. This can cause problems when variables in a bounds declaration are updated. Invariant bounds declarations must be valid at the end of every statement. When updates involve multiple statements, a bounds declaration may be valid only -after all the updates are done. In Checked C, statements and declarations can be +after all the updates are done. In Checked C, statements and declarations can be grouped into bundled blocks. Bounds declarations are checked only at the end of bundled blocks. Consider the following example where a function is added to the earlier \code{sum} @@ -770,7 +770,7 @@ int sum(void) } /* buggy resize function */ -void resize(int len) +void resize(int len) { array_ptr tmp : count(len) = malloc(sizeof(int) * len); copy(tmp, buf, buflen); @@ -787,7 +787,7 @@ The updates to \code{buflen} and \code{buf} can be grouped together, so the checker considers them to be one action: \begin{lstlisting} /* correct resize function */ -void resize(int len) +void resize(int len) { array_ptr tmp : count(len) = malloc(sizeof(int) * len); copy(tmp, buf, buflen); @@ -812,7 +812,7 @@ The C syntax for is extended with: \\ \var{bundled-item:}\\ \> \var{declaration}\\ -\> \var{expression-statement} +\> \var{expression-statement} \end{tabbing} There is some subtlety with bundled blocks and function calls. The @@ -825,7 +825,7 @@ results in temporary variables and updating static variables \textit{en masse} after the required function calls have been made. Bundled blocks expose an interesting difference between regular C -and Checked C programs. In regular C, +and Checked C programs. In regular C, \begin{lstlisting} expr1, expr2; \end{lstlisting} @@ -843,7 +843,7 @@ statements. Thus bounds could be valid after In the \code{resize} example, the following suceeds with a bundle block: \begin{lstlisting} -void resize(int len) +void resize(int len) { array_ptr tmp = malloc(sizeof(int) * len); copy(tmp, buf, buflen); @@ -1043,7 +1043,7 @@ the declaration. \label{section:integer-overflow-informal} When objects are allocated dynamically in C, programmers have to compute -the amount of memory to allocate for the objects. It is well-known +the amount of memory to allocate for the objects. It is well-known that integer overflow or wraparound in these computations can lead to buffer overruns \cite{Howard2003,Mitre2015-128,Mitre2015-190,Mitre2015-680,Dietz2015}. In Checked C, the explicit size computations are not enough @@ -1073,10 +1073,10 @@ a mismatch when overflow or wraparound can happen, which causes static checking to fail. We expand the count expression; to integer arithmetic to make its size -computation clear. \lstinline|count(|\var{e1}\lstinline|)| expands to +computation clear. \lstinline|count(|\var{e1}\lstinline|)| expands to \bounds{\code{p}}{\code{p +} \var{e1}}. Following the rules in Section~\ref{section:pointers-as-integers}, -the expansion of \lstinline|p +| \var{e1} from pointer arithmetic to integer +the expansion of \lstinline|p +| \var{e1} from pointer arithmetic to integer arithmetic depends on the type of \var{e1}: \begin{itemize} @@ -1086,7 +1086,7 @@ arithmetic depends on the type of \var{e1}: \var{e1}. \item If \var{e1} is a signed integer, \lstinline|p +| \var{e1} expands to - \lstinline|p| \lstinline{+}\textsubscript{ovf} + \lstinline|p| \lstinline{+}\textsubscript{ovf} \lstinline|((signed_size_t)| \sizeof{\var{T}}\lstinline|)| \lstinline{*}\textsubscript{ovf} \var{e1}. \end{itemize} @@ -1103,11 +1103,11 @@ Values differ?\tabularnewline \midrule \endhead Unsigned integer & \sizeof{\var{T}} \lstinline|*|\textsubscript{ovf} \var{e1} & - \sizeof{\var{T}} \lstinline|* |\var{e1} & + \sizeof{\var{T}} \lstinline|* |\var{e1} & On overflow\tabularnewline Signed integer & \lstinline|(signed_size_t)| \sizeof{\var{T}} - \lstinline|*|\textsubscript{ovf} \var{e1} & - \sizeof{\var{T}} \lstinline|* (size_t)| \var{e1} & + \lstinline|*|\textsubscript{ovf} \var{e1} & + \sizeof{\var{T}} \lstinline|* (size_t)| \var{e1} & On overflow or when \code{e1 < 0}.\tabularnewline \bottomrule \end{longtable} @@ -1166,7 +1166,7 @@ Function types can be declared to have parameters and return values that have bo declarations, just like functions. The bounds declarations become part of the function types. Here are simple examples of function pointer types with bounds declarations: \begin{lstlisting} -// Function that takes a pointer to a 5 element array and returns an +// Function that takes a pointer to a 5 element array and returns an // integer. int (array_ptr arr : count(5)) // checked function pointer @@ -1233,7 +1233,7 @@ the pointer has type \arrayptrT. The bounds specify a range of memory that is exactly the size of some array of \var{T} and the pointer points exactly at an element of that array. For -example, suppose short ints are 2 bytes in size and +example, suppose short ints are 2 bytes in size and {\boundsdecl{\code{x}}{\bounds{\code{y}}{\code{z}}}}, where the types of \code{x}, \code{y}, \code{z} are \arrayptrinst{\code{short int}} . @@ -1264,7 +1264,7 @@ becomes more expensive. With relative alignment, the bounds check is alignment, the check for the upper bound needs to compute the highest address that will be accessed using x. For this example, the highest address accessed will be -\lstinline|(|\arrayptrchar\lstinline|) x + sizeof(short int) - 1|, so the check becomes +\lstinline|(|\arrayptrchar\lstinline|) x + sizeof(short int) - 1|, so the check becomes \code{y <= x} and \code{x + 1 < z}. A pointer cast can create a pointer that is not relatively aligned to @@ -1309,8 +1309,8 @@ instead of \code{char}) usually already are doing these checks. \subsection{Representing relative alignment in bounds declarations} \label{section:representing-relative-alignment} -Bounds expressions are extended with an optional relative alignment -clause to represent situations where an \arrayptrT\ is not relatively +Bounds expressions are extended with an optional relative alignment +clause to represent situations where an \arrayptrT\ is not relatively aligned to its bounds for type \var{T}: \begin{tabbing} @@ -1321,7 +1321,7 @@ aligned to its bounds for type \var{T}: \\ \var{relative-alignment-clause:}\\ \> \relalign{\var{type}} \\ -\> \relalignval{\var{constant-exp}} +\> \relalignval{\var{constant-exp}} \end{tabbing} This clause is only added to bounds pairs because (by definition) count @@ -1331,9 +1331,9 @@ identifiers. The optional clause specifies a relative alignment type \var{T} or the required relative alignment in bytes. Given \boundsdecl{\var{x}}{\boundsrel{\var{e1}}{\var{e2}}{\var{T}}}, -\lstinline|((|\arrayptrchar\lstinline|)| \var{x} \lstinline|- (|\arrayptrchar\lstinline|)| \var{e1}) +\lstinline|((|\arrayptrchar\lstinline|)| \var{x} \lstinline|- (|\arrayptrchar\lstinline|)| \var{e1}) \lstinline|%| \sizeof{\var{T}}\lstinline|== 0| and -\lstinline|((|\arrayptrchar\lstinline|)| \var{e2} \lstinline|- (|\arrayptrchar\lstinline|)| \var{x}) +\lstinline|((|\arrayptrchar\lstinline|)| \var{e2} \lstinline|- (|\arrayptrchar\lstinline|)| \var{x}) \lstinline|% sizeof(|\var{T}\lstinline|) == 0| must be true. If the number of bytes is specified, \sizeof{\var{T}} is replaced by the constant expression. The relative alignment clause \relalign{\var{type}} @@ -1346,7 +1346,7 @@ If the default relative alignment has been overridden and \sizeof{referent-type(\var{e1})} is a common factor of \sizeof{\var{T}} If it is, it inserts the same runtime check as before. Otherwise, it inserts a runtime check that -\var{e2} \lstinline|<=| \var{e1} \lstinline|&&| +\var{e2} \lstinline|<=| \var{e1} \lstinline|&&| \var{e1} \lstinline|+ sizeof(|\var{T}) \lstinline|- 1 <| \var{e3}. \subsection{Examples of uses of bounds declarations that specify relative alignment} @@ -1403,7 +1403,7 @@ possible. For simplicity, it is assumed that the memory pointed to by \code{dst} and \code{src} does not overlap and that \code{sizeof(int)} is 4: \begin{lstlisting} void copy(array_ptr dst : bounds(dst, dst + num), - array_ptr src : bounds(src, src + num), + array_ptr src : bounds(src, src + num), size_t num) { if (num % 4 == 0) { @@ -1429,7 +1429,7 @@ relative alignment can be changed: \begin{lstlisting} void copy(array_ptr dst : bounds(dst, dst + num), - array_ptr src : bounds(src, src + num), + array_ptr src : bounds(src, src + num), size_t num) { if (num % 4 == 0) { @@ -1444,7 +1444,7 @@ void copy(array_ptr dst : bounds(dst, dst + num), aligned_dst[i] = aligned_src[i]; } } - else + else ... } \end{lstlisting} @@ -1472,12 +1472,12 @@ array_ptr r : count(2) = (array_ptr) p; Variables that have bounds declared for them at expression statements have dataflow-sensitive bounds declarations. There are two aspects to dataflow-sensitive bounds: extent and consistency. The extent of a -flow-sensitive bounds declaration is the part of the program to which +flow-sensitive bounds declaration is the part of the program to which a flow-sensitive bounds -declaration applies. Within the extent of a flow-sensitive +declaration applies. Within the extent of a flow-sensitive bounds declaration, the bounds for the variable for memory dereferences are given by the bounds -declaration. Consistency is that all the flow-sensitive bounds +declaration. Consistency is that all the flow-sensitive bounds declarations flowing to a statement agree with each other. As mentioned in Section~\ref{section:statement-declarations}, @@ -1511,19 +1511,19 @@ lead to unexpected bounds checks failures. \subsection{Definition of extent} \label{section:extent-definition} -We first define the set of flow-sensitive bounds declarations that apply to a +We first define the set of flow-sensitive bounds declarations that apply to a component of a function. A component is an expression statement, variable declaration, or a compound statement. For any flow-sensitive bounds declaration for a variable \var{v}, if \begin{compactenum} \item - There is some path from the bounds declaration to the + There is some path from the bounds declaration to the component, and \item \var{v} occurs in the component, and \item - there is no other flow-sensitive bounds declaration for + there is no other flow-sensitive bounds declaration for \var{v} along the path \end{compactenum} @@ -1531,26 +1531,26 @@ then \begin{compactenum} \item - if all the variables occurring in the bounds declaration are + if all the variables occurring in the bounds declaration are in-scope at the component, and - \begin{compactenum} + \begin{compactenum} \item No expressions or statements on the path modify a variable occurring in the bounds declaration \item or \begin{compactenum} \item The only expressions or statements on the path that - modify a variable occurring in the bounds declaration + modify a variable occurring in the bounds declaration are pointer increments or decrements of \var{v}, and \item \var{v} does not occur in the bounds expression, and \item the bounds expression is not a count expression \end{compactenum} \end{compactenum} - then the bounds declaration applies to the component -\item + then the bounds declaration applies to the component +\item Otherwise, the bounds declaration \boundsdecl{\var{v}}{\boundsunknown} applies to the component. \end{compactenum} - + \subsection{Consistency} These conditions ensure the consistency of bounds declarations: @@ -1582,7 +1582,7 @@ rejected by the compiler. /* buggy function */ /* sum integers stored between start and end, where end is not included */ int sum(array_ptr start : bounds(start, end), array_ptr end) -{ +{ end = end + 1; // bounds(start, end) does not hold after this, so program is // rejected start[5] = 0; @@ -1595,22 +1595,22 @@ A correct function declares new bounds for \code{start}: \begin{lstlisting} /* sum integers stored between start and end, where end is not included */ int sum(array_ptr start : bounds(start, end), array_ptr end) -{ +{ end = end + 1 where start : bounds(start, end - 1); - start[5] = 0; // program accepted by compiler; may fail bounds check + start[5] = 0; // program accepted by compiler; may fail bounds check // at run time ... } \end{lstlisting} The following example shows the extents of the two bounds declarations for -\code{start}. The bounds declaration at the declaration of \code{start} and -its extent is highlighted in blue. The bounds declaration at the assignment +\code{start}. The bounds declaration at the declaration of \code{start} and +its extent is highlighted in blue. The bounds declaration at the assignment to \code{end} and its extent is highlighted in yellow: \sethlcolor{lightblue} \begin{lstlisting}[escapechar=\|] -/* sum integers stored between start and end, where end is not included */ +/* sum integers stored between start and end, where end is not included */ int sum(array_ptr start : |\hl{bounds(start, end)}|, array_ptr end) |\hl{\{\\ start[5] = 0; // bounds checked using (start, end)}\\ @@ -1620,14 +1620,14 @@ start[5] = 0; // bounds checked using (start, end - 1)\\ } \end{lstlisting} -Figure~\ref{fig:bounds-extent:choose} expands on the \code{choose} function earlier. +Figure~\ref{fig:bounds-extent:choose} expands on the \code{choose} function earlier. There are 4 explicit bounds declarations for \code{c}. The bounds declarations and their extents are also color-coded. Three bounds declarations for \code{c} have the form \boundsdecl{\code{c}}{\boundscount{\code{clen}}}. They must be syntactically identical because there are statements that they cover in common. Their extents are highlighted in blue. Another bounds declaration has the form \boundsdecl{\code{c}}{\boundscount{\code{clen - 1}}}. Its extent is highlighted -in yellow. It covers code where \code{c} is incremented and the +in yellow. It covers code where \code{c} is incremented and the length is decremented. The increment invalidates the bounds declaration \boundsdecl{\code{c}}{\boundscount{\code{clen}}}. A different bounds declaration for \code{c} is needed after the increment. @@ -1650,16 +1650,16 @@ int choose(int val, array_ptr a: count(alen), int alen, clen = blen; c = b where |\hl{c : count(clen);}| |\hl{\}}| - + |\hl{if (clen > 1) \{}| |\hl{result = c[0];}| |\hl{\}}| - + |\hl{if (cond \&\& clen > 1) \{}| c = |\hl{c + 1}| where |\sethlcolor{lightyellow}\hl{c : count(clen - 1);}| clen = |\sethlcolor{lightyellow}\hl{clen - 1}| where |\sethlcolor{lightblue}\hl{c : count(clen);}| |\hl{result += c[0];}| - |\hl{\}}| + |\hl{\}}| |\hl{...}| |\hl{return result;}| } @@ -1687,8 +1687,8 @@ function. In functions that do not have loops in their control-flow, the dataflow analysis can be done in O(N*M) time. The dataflow analysis uses a lattice of values, assigning one lattice -value to each variable at each program point in the function. -Figure~\ref{fig:extent-dataflow-lattice} shows the lattice of values. +value to each variable at each program point in the function. +Figure~\ref{fig:extent-dataflow-lattice} shows the lattice of values. It includes singleton values consisting of the bounds expressions in the flow-sensitive bounds declaration for variables in the function, \boundsunknown\ (indicating absence of bounds @@ -1706,11 +1706,11 @@ information), and \code{Top} (indicating contradiction or error): \caption{The lattice of values used in dataflow computation of extent} \label{fig:extent-dataflow-lattice} \end{figure} - + For an assignment to an \arrayptr\ variable, the existing lattice value for the \arrayptr\ variable is killed, unless the special conditions described in Section~\ref{section:extent-of-declarations} -are met. +are met. A new lattice value is generated for the \arrayptr\ variable. If the assignment declares bounds for the \arrayptr\ variable, the new lattice value is the bounds expression in the bounds declaration. @@ -1749,7 +1749,7 @@ loop. The loop modifies \code{current}, but the bounds for \begin{lstlisting} /* sum integers stored between start and end, where end is not included */ int sum(array_ptr start where start : bounds(start, end), array_ptr end) -{ +{ int sum = 0; array_ptr current : (start, end) = start; @@ -1776,10 +1776,10 @@ eliminate the runtime bounds checks easily. \begin{lstlisting} /* lexicographic comparison of two arrays of integers */ -int compare(array_ptr x : bounds(x, x_end), +int compare(array_ptr x : bounds(x, x_end), array_ptr y : bounds(y, y_end) array_ptr x_end, array_ptr y_end) -{ +{ while (x < x_end && y < y_end) { if (*x == *y) { // bounds check: x >= x && x < x_end; easily // optimizable. @@ -1802,7 +1802,7 @@ int compare(array_ptr x : bounds(x, x_end), return 1; } else { - return -1; + return -1; } } \end{lstlisting}