From a0f15b0848405ae16d63bd5d78c862a6526b338a Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 6 Oct 2010 04:57:26 +0000 Subject: [PATCH] Add support for 4-byte UCNs like \U12345678. Warn about UCNs in c90 mode. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@115743 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/clang/Basic/DiagnosticLexKinds.td | 4 +++ lib/Lex/LiteralSupport.cpp | 39 ++++++++++++++++----- test/CodeGen/string-literal-short-wstring.c | 14 ++++++++ test/CodeGen/string-literal.c | 11 +++++- test/Lexer/c90.c | 4 +++ test/Lexer/wchar.c | 6 ++++ 6 files changed, 68 insertions(+), 10 deletions(-) create mode 100644 test/CodeGen/string-literal-short-wstring.c create mode 100644 test/Lexer/wchar.c diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td index dcb05c8fcd..8f61c69f9e 100644 --- a/include/clang/Basic/DiagnosticLexKinds.td +++ b/include/clang/Basic/DiagnosticLexKinds.td @@ -98,6 +98,10 @@ def warn_hex_escape_too_large : ExtWarn<"hex escape sequence out of range">; def ext_string_too_long : Extension<"string literal of length %0 exceeds " "maximum length %1 that %select{C90|ISO C99|C++}2 compilers are required to " "support">, InGroup<OverlengthStrings>; +def warn_ucn_escape_too_large : ExtWarn< + "character unicode escape sequence too long for its type">; +def warn_ucn_not_valid_in_c89 : ExtWarn< + "unicode escape sequences are only valid in C99 or C++">; //===----------------------------------------------------------------------===// // PTH Diagnostics diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp index fb543d0f03..9b7c46f091 100644 --- a/lib/Lex/LiteralSupport.cpp +++ b/lib/Lex/LiteralSupport.cpp @@ -172,8 +172,8 @@ static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, SourceLocation Loc, Preprocessor &PP, bool wide, bool Complain) { - // FIXME: Add a warning - UCN's are only valid in C++ & C99. - // FIXME: Handle wide strings. 
+ if (!PP.getLangOptions().CPlusPlus && !PP.getLangOptions().C99) + PP.Diag(Loc, diag::warn_ucn_not_valid_in_c89); // Save the beginning of the string (for error diagnostics). const char *ThisTokBegin = ThisTokBuf; @@ -218,13 +218,34 @@ static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, } if (wide) { (void)UcnLenSave; - assert(UcnLenSave == 4 && - "ProcessUCNEscape - only ucn length of 4 supported"); - // little endian assumed. - *ResultBuf++ = (UcnVal & 0x000000FF); - *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8; - *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16; - *ResultBuf++ = (UcnVal & 0xFF000000) >> 24; + assert((UcnLenSave == 4 || UcnLenSave == 8) && + "ProcessUCNEscape - only ucn length of 4 or 8 supported"); + + if (!PP.getLangOptions().ShortWChar) { + // Note: our internal rep of wide char tokens is always little-endian. + *ResultBuf++ = (UcnVal & 0x000000FF); + *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8; + *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16; + *ResultBuf++ = (UcnVal & 0xFF000000) >> 24; + return; + } + + // Convert to UTF16. + if (UcnVal < (UTF32)0xFFFF) { + *ResultBuf++ = (UcnVal & 0x000000FF); + *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8; + return; + } + PP.Diag(Loc, diag::warn_ucn_escape_too_large); + + typedef uint16_t UTF16; + UcnVal -= 0x10000; + UTF16 surrogate1 = 0xD800 + (UcnVal >> 10); + UTF16 surrogate2 = 0xDC00 + (UcnVal & 0x3FF); + *ResultBuf++ = (surrogate1 & 0x000000FF); + *ResultBuf++ = (surrogate1 & 0x0000FF00) >> 8; + *ResultBuf++ = (surrogate2 & 0x000000FF); + *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8; return; } // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8. 
diff --git a/test/CodeGen/string-literal-short-wstring.c b/test/CodeGen/string-literal-short-wstring.c new file mode 100644 index 0000000000..de84953dd3 --- /dev/null +++ b/test/CodeGen/string-literal-short-wstring.c @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 -emit-llvm -fshort-wchar %s -o - | FileCheck %s + +int main() { + // This should convert to utf8. + // CHECK: internal constant [10 x i8] c"\E1\84\A0\C8\A0\F4\82\80\B0\00", align 1 + char b[10] = "\u1120\u0220\U00102030"; + + // CHECK: private constant [6 x i8] c"A\00B\00\00\00" + void *foo = L"AB"; + + // This should convert to utf16. + // CHECK: private constant [10 x i8] c" \11 \02\C8\DB0\DC\00\00" + void *bar = L"\u1120\u0220\U00102030"; +} diff --git a/test/CodeGen/string-literal.c b/test/CodeGen/string-literal.c index 22a81e7185..457ff6ca7a 100644 --- a/test/CodeGen/string-literal.c +++ b/test/CodeGen/string-literal.c @@ -1,7 +1,16 @@ -// RUN: %clang_cc1 -emit-llvm %s -o - +// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s int main() { + // CHECK: internal constant [10 x i8] c"abc\00\00\00\00\00\00\00", align 1 char a[10] = "abc"; + // This should convert to utf8. 
+ // CHECK: internal constant [10 x i8] c"\E1\84\A0\C8\A0\F4\82\80\B0\00", align 1 + char b[10] = "\u1120\u0220\U00102030"; + + // CHECK: private constant [12 x i8] c"A\00\00\00B\00\00\00\00\00\00\00" void *foo = L"AB"; + + // CHECK: private constant [12 x i8] c"4\12\00\00\0B\F0\10\00\00\00\00\00" + void *bar = L"\u1234\U0010F00B"; } diff --git a/test/Lexer/c90.c b/test/Lexer/c90.c index f191397102..f74135542c 100644 --- a/test/Lexer/c90.c +++ b/test/Lexer/c90.c @@ -27,3 +27,7 @@ void test2() { "sdjflksdjf lksdjf skldfjsdkljflksdjf kldsjflkdsj fldks jflsdkjfds" "sdjflksdjf lksdjf skldfjsdkljflksdjf kldsjflkdsj fldks jflsdkjfds"; } + +void test3() { + (void)L"\u1234"; // expected-error {{unicode escape sequences are only valid in C99 or C++}} +} diff --git a/test/Lexer/wchar.c b/test/Lexer/wchar.c new file mode 100644 index 0000000000..cbc0c455f8 --- /dev/null +++ b/test/Lexer/wchar.c @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -fsyntax-only -fshort-wchar -verify %s + +void f() { + (void)L"\U00010000"; // expected-warning {{character unicode escape sequence too long for its type}} +} +