From 512f8217cb378c289b7d79cdf033715afcf82667 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Fri, 1 Sep 2023 10:45:11 -0400 Subject: [PATCH] [ruby/yarp] fix: double-counting of errors in parsing escaped strings Essentially, this change updates `yp_unescape_calculate_difference` to not create syntax errors, and we rely entirely on `yp_unescape_manipulate_string` to report syntax errors. To do that, this PR adds another (!) parameter to `unescape`: `yp_list_t *error_list`. When present, `unescape` reports syntax errors (and otherwise does not). However, an edge case that needed to be addressed is reporting syntax errors in this case: ?\u{1234 2345} In a string context, it's possible to have multiple codepoints by doing something like `"\u{1234 2345}"`; however, in the character literal context, this is a syntax error -- only a single codepoint is allowed. Unfortunately, when `yp_unescape_manipulate_string` is called, there's nothing to indicate that we are in a "character literal" context and that only a single codepoint is valid. To make this work, this PR: - introduces a new static utility function in yarp.c, `yp_char_literal_node_create_and_unescape`, which is called when we're parsing `YP_TOKEN_CHARACTER_LITERAL` - introduces a new (unexported) function, `yp_unescape_manipulate_char_literal` which does the same thing as `yp_unescape_manipulate_string` but tells `unescape` that only a single codepoint is expected https://github.com/ruby/yarp/commit/f6a65840b5 --- test/yarp/errors_test.rb | 3 -- yarp/unescape.c | 94 +++++++++++++++++++++++++--------------- yarp/unescape.h | 4 +- yarp/yarp.c | 13 +++++- 4 files changed, 73 insertions(+), 41 deletions(-) diff --git a/test/yarp/errors_test.rb b/test/yarp/errors_test.rb index faf840e925..caf5db650a 100644 --- a/test/yarp/errors_test.rb +++ b/test/yarp/errors_test.rb @@ -608,7 +608,6 @@ module YARP assert_errors expected, '"\u{0000001}"', [ ["invalid Unicode escape.", 4..11], - ["invalid Unicode escape.", 4..11] ] end @@ -617,14 +616,12 @@ module YARP assert_errors expected, '"\u{000z}"', [ ["unterminated Unicode escape", 7..7], - ["unterminated Unicode escape", 7..7] ] end def test_unterminated_unicode_brackets_should_be_a_syntax_error assert_errors expression('?\\u{3'), '?\\u{3', [ ["invalid Unicode escape.", 1..5], - ["invalid Unicode escape.", 1..5], ] end diff --git a/yarp/unescape.c b/yarp/unescape.c index 14c0faf2eb..84f0095da1 100644 --- a/yarp/unescape.c +++ b/yarp/unescape.c @@ -156,7 +156,7 @@ unescape_unicode_write(uint8_t *dest, uint32_t value, const uint8_t *start, cons // If we get here, then the value is too big. This is an error, but we don't // want to just crash, so instead we'll add an error to the error list and put // in a replacement character instead. - yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence."); + if (error_list) yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence."); dest[0] = 0xEF; dest[1] = 0xBF; dest[2] = 0xBD; @@ -186,7 +186,15 @@ unescape_char(uint8_t value, const uint8_t flags) { // Read a specific escape sequence into the given destination. static const uint8_t * -unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t *backslash, const uint8_t *end, const uint8_t flags) { +unescape( + yp_parser_t *parser, + uint8_t *dest, + size_t *dest_length, + const uint8_t *backslash, + const uint8_t *end, + const uint8_t flags, + yp_list_t *error_list +) { switch (backslash[1]) { case 'a': case 'b': @@ -226,7 +234,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F]) case 'u': { if ((flags & YP_UNESCAPE_FLAG_CONTROL) | (flags & YP_UNESCAPE_FLAG_META)) { - yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags."); + if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags."); return backslash + 2; } @@ -243,12 +251,11 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t // \u{nnnn} character literal allows only 1-6 hexadecimal digits if (hexadecimal_length > 6) { - yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape."); + if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape."); } - // there are not hexadecimal characters - if (hexadecimal_length == 0) { - yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape"); + else if (hexadecimal_length == 0) { + if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape"); return unicode_cursor; } @@ -261,36 +268,36 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t uint32_t value; unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value); if (dest) { - *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, &parser->error_list); + *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list); } unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor); } // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm} - if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1) - yp_diagnostic_list_append(&parser->error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal"); - + if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1) { + if (error_list) yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal"); + } if (unicode_cursor < end && *unicode_cursor == '}') { unicode_cursor++; } else { - yp_diagnostic_list_append(&parser->error_list, backslash, unicode_cursor, "invalid Unicode escape."); + if (error_list) yp_diagnostic_list_append(error_list, backslash, unicode_cursor, "invalid Unicode escape."); } + return unicode_cursor; } - - if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) { + else if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) { uint32_t value; unescape_unicode(backslash + 2, 4, &value); if (dest) { - *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, &parser->error_list); + *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list); } return backslash + 6; } - yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid Unicode escape sequence"); + if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence"); return backslash + 2; } // \c\M-x meta control character, where x is an ASCII printable character @@ -298,18 +305,18 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t // \cx control character, where x is an ASCII printable character case 'c': if (backslash + 2 >= end) { - yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence"); + if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence"); return end; } if (flags & YP_UNESCAPE_FLAG_CONTROL) { - yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled."); + if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled."); return backslash + 2; } switch (backslash[2]) { case '\\': - return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL); + return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list); case '?': if (dest) { dest[(*dest_length)++] = unescape_char(0x7f, flags); @@ -317,7 +324,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t return backslash + 3; default: { if (!char_is_ascii_printable(backslash[2])) { - yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence"); + if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence"); return backslash + 2; } @@ -331,23 +338,23 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t // \C-? delete, ASCII 7Fh (DEL) case 'C': if (backslash + 3 >= end) { - yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence"); + if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence"); return end; } if (flags & YP_UNESCAPE_FLAG_CONTROL) { - yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled."); + if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled."); return backslash + 2; } if (backslash[2] != '-') { - yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence"); + if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence"); return backslash + 2; } switch (backslash[3]) { case '\\': - return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL); + return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list); case '?': if (dest) { dest[(*dest_length)++] = unescape_char(0x7f, flags); @@ -355,7 +362,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t return backslash + 4; default: if (!char_is_ascii_printable(backslash[3])) { - yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid control escape sequence"); + if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence"); return backslash + 2; } @@ -369,22 +376,22 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t // \M-x meta character, where x is an ASCII printable character case 'M': { if (backslash + 3 >= end) { - yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence"); + if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence"); return end; } if (flags & YP_UNESCAPE_FLAG_META) { - yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled."); + if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled."); return backslash + 2; } if (backslash[2] != '-') { - yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid meta escape sequence"); + if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence"); return backslash + 2; } if (backslash[3] == '\\') { - return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META); + return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, error_list); } if (char_is_ascii_printable(backslash[3])) { @@ -394,7 +401,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t return backslash + 4; } - yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid meta escape sequence"); + if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence"); return backslash + 3; } // \n @@ -448,8 +455,8 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t // \c\M-x same as above // \c? or \C-? delete, ASCII 7Fh (DEL) // -YP_EXPORTED_FUNCTION void -yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) { +static void +yp_unescape_manipulate_string_or_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, bool expect_single_codepoint) { if (unescape_type == YP_UNESCAPE_NONE) { // If we're not unescaping then we can reference the source directly. return; @@ -511,7 +518,13 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc // This is the only type of unescaping left. In this case we need to // handle all of the different unescapes. assert(unescape_type == YP_UNESCAPE_ALL); - cursor = unescape(parser, dest, &dest_length, backslash, end, YP_UNESCAPE_FLAG_NONE); + + uint8_t flags = YP_UNESCAPE_FLAG_NONE; + if (expect_single_codepoint) { + flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE; + } + + cursor = unescape(parser, dest, &dest_length, backslash, end, flags, &parser->error_list); break; } @@ -539,6 +552,16 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor))); } +YP_EXPORTED_FUNCTION void +yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) { + yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, false); +} + +void +yp_unescape_manipulate_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) { + yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, true); +} + // This function is similar to yp_unescape_manipulate_string, except it doesn't // actually perform any string manipulations. Instead, it calculates how long // the unescaped character is, and returns that value @@ -564,10 +587,11 @@ yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, assert(unescape_type == YP_UNESCAPE_ALL); uint8_t flags = YP_UNESCAPE_FLAG_NONE; - if (expect_single_codepoint) + if (expect_single_codepoint) { flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE; + } - const uint8_t *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags); + const uint8_t *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, NULL); assert(cursor > backslash); return (size_t) (cursor - backslash); diff --git a/yarp/unescape.h b/yarp/unescape.h index bf8b7e83ec..fb0df9fcb0 100644 --- a/yarp/unescape.h +++ b/yarp/unescape.h @@ -29,9 +29,9 @@ typedef enum { YP_UNESCAPE_ALL } yp_unescape_type_t; -// Unescape the contents of the given token into the given string using the -// given unescape mode. +// Unescape the contents of the given token into the given string using the given unescape mode. YP_EXPORTED_FUNCTION void yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type); +void yp_unescape_manipulate_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type); // Accepts a source string and a type of unescaping and returns the unescaped version. // The caller must yp_string_free(result); after calling this function. diff --git a/yarp/yarp.c b/yarp/yarp.c index 176e9f76b6..7e098daa48 100644 --- a/yarp/yarp.c +++ b/yarp/yarp.c @@ -7547,6 +7547,17 @@ yp_symbol_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *openin return node; } +static yp_string_node_t * +yp_char_literal_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *opening, const yp_token_t *content, const yp_token_t *closing, yp_unescape_type_t unescape_type) { + yp_string_node_t *node = yp_string_node_create(parser, opening, content, closing); + + assert((content->end - content->start) >= 0); + yp_string_shared_init(&node->unescaped, content->start, content->end); + + yp_unescape_manipulate_char_literal(parser, &node->unescaped, unescape_type); + return node; +} + static yp_string_node_t * yp_string_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *opening, const yp_token_t *content, const yp_token_t *closing, yp_unescape_type_t unescape_type) { yp_string_node_t *node = yp_string_node_create(parser, opening, content, closing); @@ -10921,7 +10932,7 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) { yp_token_t closing = not_provided(parser); - return (yp_node_t *) yp_string_node_create_and_unescape(parser, &opening, &content, &closing, YP_UNESCAPE_ALL); + return (yp_node_t *) yp_char_literal_node_create_and_unescape(parser, &opening, &content, &closing, YP_UNESCAPE_ALL); } case YP_TOKEN_CLASS_VARIABLE: { parser_lex(parser);