[ruby/yarp] Encoding-dependent escapes

https://github.com/ruby/yarp/commit/36a5b801c4
This commit is contained in:
Kevin Newton 2023-08-23 15:48:24 -04:00
Родитель 9aba46c666
Коммит 432702a427
3 изменённых файлов: 84 добавлений и 64 удалений

Просмотреть файл

@ -14,6 +14,20 @@ yp_char_is_hexadecimal_digits(const char *c, size_t length) {
return true;
}
// Returns the width in bytes of the character that begins at start, looking no
// further than end. Going through the encoding's char_width function pointer
// is expensive, so that call is only made when parser->encoding_changed is set
// or the leading byte is 0x80 or above. In every other case 1 is returned
// directly.
static inline size_t
yp_char_width(yp_parser_t *parser, const char *start, const char *end) {
    // Fast path: the leading byte is only inspected when the encoding flag is
    // not set, mirroring the short-circuit evaluation of the slow-path check.
    if (!parser->encoding_changed && *((const unsigned char *) start) < 0x80) {
        return 1;
    }

    return parser->encoding.char_width(start, end - start);
}
/******************************************************************************/
/* Lookup tables for characters */
/******************************************************************************/
@ -178,7 +192,7 @@ unescape_char(const unsigned char value, const unsigned char flags) {
// Read a specific escape sequence into the given destination.
static const char *
unescape(char *dest, size_t *dest_length, const char *backslash, const char *end, yp_list_t *error_list, const unsigned char flags, bool write_to_str) {
unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backslash, const char *end, const unsigned char flags, bool write_to_str) {
switch (backslash[1]) {
case 'a':
case 'b':
@ -218,7 +232,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
// \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
case 'u': {
if ((flags & YP_UNESCAPE_FLAG_CONTROL) | (flags & YP_UNESCAPE_FLAG_META)) {
yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
return backslash + 2;
}
@ -235,11 +249,11 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
// \u{nnnn} character literal allows only 1-6 hexadecimal digits
if (hexadecimal_length > 6)
yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
// there are no hexadecimal characters
if (hexadecimal_length == 0) {
yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
return unicode_cursor;
}
@ -252,7 +266,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
uint32_t value;
unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value);
if (write_to_str) {
*dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list);
*dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, &parser->error_list);
}
unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
@ -260,7 +274,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
// ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
yp_diagnostic_list_append(&parser->error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
return unicode_cursor + 1;
}
@ -270,12 +284,12 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
unescape_unicode(backslash + 2, 4, &value);
if (write_to_str) {
*dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list);
*dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, &parser->error_list);
}
return backslash + 6;
}
yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
return backslash + 2;
}
// \c\M-x meta control character, where x is an ASCII printable character
@ -283,18 +297,18 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
// \cx control character, where x is an ASCII printable character
case 'c':
if (backslash + 2 >= end) {
yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
return end;
}
if (flags & YP_UNESCAPE_FLAG_CONTROL) {
yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
return backslash + 2;
}
switch (backslash[2]) {
case '\\':
return unescape(dest, dest_length, backslash + 2, end, error_list, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
case '?':
if (write_to_str) {
dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
@ -302,7 +316,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
return backslash + 3;
default: {
if (!char_is_ascii_printable(backslash[2])) {
yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
return backslash + 2;
}
@ -316,23 +330,23 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
// \C-? delete, ASCII 7Fh (DEL)
case 'C':
if (backslash + 3 >= end) {
yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
return end;
}
if (flags & YP_UNESCAPE_FLAG_CONTROL) {
yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
return backslash + 2;
}
if (backslash[2] != '-') {
yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
return backslash + 2;
}
switch (backslash[3]) {
case '\\':
return unescape(dest, dest_length, backslash + 3, end, error_list, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
case '?':
if (write_to_str) {
dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
@ -340,7 +354,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
return backslash + 4;
default:
if (!char_is_ascii_printable(backslash[3])) {
yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid control escape sequence");
return backslash + 2;
}
@ -354,22 +368,22 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
// \M-x meta character, where x is an ASCII printable character
case 'M': {
if (backslash + 3 >= end) {
yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
return end;
}
if (flags & YP_UNESCAPE_FLAG_META) {
yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
return backslash + 2;
}
if (backslash[2] != '-') {
yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid meta escape sequence");
return backslash + 2;
}
if (backslash[3] == '\\') {
return unescape(dest, dest_length, backslash + 3, end, error_list, flags | YP_UNESCAPE_FLAG_META, write_to_str);
return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, write_to_str);
}
if (char_is_ascii_printable(backslash[3])) {
@ -379,7 +393,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
return backslash + 4;
}
yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid meta escape sequence");
return backslash + 3;
}
// \n
@ -390,14 +404,17 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
if (backslash + 2 < end && backslash[2] == '\n') {
return backslash + 3;
}
/* fallthrough */
/* fallthrough */
// In this case we're escaping something that doesn't need escaping.
default: {
size_t width = yp_char_width(parser, backslash + 1, end);
if (write_to_str) {
dest[(*dest_length)++] = backslash[1];
memcpy(dest + *dest_length, backslash + 1, width);
*dest_length += width;
}
return backslash + 2;
return backslash + 1 + width;
}
}
}
@ -431,7 +448,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
// \c? or \C-? delete, ASCII 7Fh (DEL)
//
YP_EXPORTED_FUNCTION void
yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, yp_list_t *error_list) {
yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
if (unescape_type == YP_UNESCAPE_NONE) {
// If we're not unescaping then we can reference the source directly.
return;
@ -448,7 +465,7 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
// within the string.
char *allocated = malloc(string->length);
if (allocated == NULL) {
yp_diagnostic_list_append(error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping.");
yp_diagnostic_list_append(&parser->error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping.");
return;
}
@ -493,7 +510,7 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
// This is the only type of unescaping left. In this case we need to
// handle all of the different unescapes.
assert(unescape_type == YP_UNESCAPE_ALL);
cursor = unescape(dest, &dest_length, backslash, end, error_list, YP_UNESCAPE_FLAG_NONE, true);
cursor = unescape(parser, dest, &dest_length, backslash, end, YP_UNESCAPE_FLAG_NONE, true);
break;
}
@ -521,29 +538,11 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor)));
}
// Unescapes the length bytes beginning at start according to unescape_type and
// stores the outcome in result. A temporary parser and a local error list are
// set up for the duration of the call. Returns true when nothing was appended
// to that error list while unescaping.
YP_EXPORTED_FUNCTION bool
yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
    yp_parser_t parser;
    yp_parser_init(&parser, start, length, "");

    // Errors are collected separately from the parser so they can be inspected
    // and released here.
    yp_list_t errors = YP_LIST_EMPTY;

    // Start out referencing the source directly; the manipulate call below is
    // what performs the unescaping into result.
    yp_string_shared_init(result, start, start + length);
    yp_unescape_manipulate_string(&parser, result, unescape_type, &errors);

    const bool ok = yp_list_empty_p(&errors);
    yp_list_free(&errors);
    yp_parser_free(&parser);
    return ok;
}
// This function is similar to yp_unescape_manipulate_string, except it doesn't
// actually perform any string manipulations. Instead, it calculates how many
// bytes of the source are taken up by the escape sequence that starts at the
// given backslash, and returns that value.
YP_EXPORTED_FUNCTION size_t
yp_unescape_calculate_difference(const char *backslash, const char *end, yp_unescape_type_t unescape_type, bool expect_single_codepoint, yp_list_t *error_list) {
size_t
yp_unescape_calculate_difference(yp_parser_t *parser, const char *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
assert(unescape_type != YP_UNESCAPE_NONE);
switch (backslash[1]) {
@ -551,7 +550,9 @@ yp_unescape_calculate_difference(const char *backslash, const char *end, yp_unes
case '\'':
return 2;
default: {
if (unescape_type == YP_UNESCAPE_MINIMAL) return 2;
if (unescape_type == YP_UNESCAPE_MINIMAL) {
return 1 + yp_char_width(parser, backslash + 1, parser->end);
}
// This is the only type of unescaping left. In this case we need to
// handle all of the different unescapes.
@ -561,10 +562,27 @@ yp_unescape_calculate_difference(const char *backslash, const char *end, yp_unes
if (expect_single_codepoint)
flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;
const char *cursor = unescape(NULL, 0, backslash, end, error_list, flags, false);
const char *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, false);
assert(cursor > backslash);
return (size_t) (cursor - backslash);
}
}
}
// One of the main entry points into the extension. Given a source string
// (start, length) and a type of unescaping, this fills in result with the
// unescaped version and returns a boolean indicating whether the unescaping
// succeeded, i.e. whether the temporary parser's error list is still empty
// afterwards. The caller must yp_string_free(result) after calling this
// function.
YP_EXPORTED_FUNCTION bool
yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
    // A throwaway parser is initialized over the input; any diagnostics
    // produced while unescaping end up on its error list.
    yp_parser_t parser;
    yp_parser_init(&parser, start, length, NULL);

    // Start out referencing the source directly; the manipulate call below is
    // what performs the unescaping into result.
    yp_string_shared_init(result, start, start + length);
    yp_unescape_manipulate_string(&parser, result, unescape_type);

    // The error list has to be read before the parser is freed.
    const bool ok = yp_list_empty_p(&parser.error_list);
    yp_parser_free(&parser);
    return ok;
}

Просмотреть файл

@ -31,12 +31,14 @@ typedef enum {
// Unescape the contents of the given token into the given string using the
// given unescape mode.
YP_EXPORTED_FUNCTION void yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, yp_list_t *error_list);
YP_EXPORTED_FUNCTION void yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type);
// Accepts a source string and a type of unescaping and returns the unescaped version.
// The caller must yp_string_free(result); after calling this function.
YP_EXPORTED_FUNCTION bool yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result);
YP_EXPORTED_FUNCTION size_t yp_unescape_calculate_difference(const char *value, const char *end, yp_unescape_type_t unescape_type, bool expect_single_codepoint, yp_list_t *error_list);
// Returns the number of bytes that encompass the first escape sequence in the
// given string.
size_t yp_unescape_calculate_difference(yp_parser_t *parser, const char *value, yp_unescape_type_t unescape_type, bool expect_single_codepoint);
#endif

Просмотреть файл

@ -3600,7 +3600,7 @@ yp_symbol_node_label_create(yp_parser_t *parser, const yp_token_t *token) {
assert((label.end - label.start) >= 0);
yp_string_shared_init(&node->unescaped, label.start, label.end);
yp_unescape_manipulate_string(parser, &node->unescaped, YP_UNESCAPE_ALL, &parser->error_list);
yp_unescape_manipulate_string(parser, &node->unescaped, YP_UNESCAPE_ALL);
break;
}
case YP_TOKEN_MISSING: {
@ -5104,7 +5104,7 @@ lex_question_mark(yp_parser_t *parser) {
if (parser->current.start[1] == '\\') {
lex_state_set(parser, YP_LEX_STATE_END);
parser->current.end += yp_unescape_calculate_difference(parser->current.start + 1, parser->end, YP_UNESCAPE_ALL, true, &parser->error_list);
parser->current.end += yp_unescape_calculate_difference(parser, parser->current.start + 1, YP_UNESCAPE_ALL, true);
return YP_TOKEN_CHARACTER_LITERAL;
} else {
size_t encoding_width = parser->encoding.char_width(parser->current.end, parser->end - parser->current.end);
@ -6493,7 +6493,7 @@ parser_lex(yp_parser_t *parser) {
// and find the next breakpoint.
if (*breakpoint == '\\') {
yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, unescape_type, false, &parser->error_list);
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
// If the result is an escaped newline, then we need to
// track that newline.
@ -6606,7 +6606,7 @@ parser_lex(yp_parser_t *parser) {
// literally. In this case we'll skip past the next character
// and find the next breakpoint.
if (*breakpoint == '\\') {
size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, YP_UNESCAPE_ALL, false, &parser->error_list);
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false);
// If the result is an escaped newline, then we need to
// track that newline.
@ -6746,7 +6746,7 @@ parser_lex(yp_parser_t *parser) {
// literally. In this case we'll skip past the next character and
// find the next breakpoint.
yp_unescape_type_t unescape_type = parser->lex_modes.current->as.string.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, unescape_type, false, &parser->error_list);
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
// If the result is an escaped newline, then we need to
// track that newline.
@ -6907,7 +6907,7 @@ parser_lex(yp_parser_t *parser) {
breakpoint += le_len;
} else {
yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL;
size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, unescape_type, false, &parser->error_list);
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);
@ -6963,7 +6963,7 @@ yp_regular_expression_node_create_and_unescape(yp_parser_t *parser, const yp_tok
assert((content->end - content->start) >= 0);
yp_string_shared_init(&node->unescaped, content->start, content->end);
yp_unescape_manipulate_string(parser, &node->unescaped, unescape_type, &parser->error_list);
yp_unescape_manipulate_string(parser, &node->unescaped, unescape_type);
return node;
}
@ -6974,7 +6974,7 @@ yp_symbol_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *openin
assert((content->end - content->start) >= 0);
yp_string_shared_init(&node->unescaped, content->start, content->end);
yp_unescape_manipulate_string(parser, &node->unescaped, unescape_type, &parser->error_list);
yp_unescape_manipulate_string(parser, &node->unescaped, unescape_type);
return node;
}
@ -6985,7 +6985,7 @@ yp_string_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *openin
assert((content->end - content->start) >= 0);
yp_string_shared_init(&node->unescaped, content->start, content->end);
yp_unescape_manipulate_string(parser, &node->unescaped, unescape_type, &parser->error_list);
yp_unescape_manipulate_string(parser, &node->unescaped, unescape_type);
return node;
}
@ -6996,7 +6996,7 @@ yp_xstring_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *openi
assert((content->end - content->start) >= 0);
yp_string_shared_init(&node->unescaped, content->start, content->end);
yp_unescape_manipulate_string(parser, &node->unescaped, YP_UNESCAPE_ALL, &parser->error_list);
yp_unescape_manipulate_string(parser, &node->unescaped, YP_UNESCAPE_ALL);
return node;
}
@ -9334,7 +9334,7 @@ parse_heredoc_dedent(yp_parser_t *parser, yp_node_t *node, yp_heredoc_quote_t qu
yp_node_destroy(parser, node);
} else {
string->length = dest_length;
yp_unescape_manipulate_string(parser, string, (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL, &parser->error_list);
yp_unescape_manipulate_string(parser, string, (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL);
nodes->nodes[write_index++] = node;
}