[ruby/yarp] Encoding-dependent escapes

https://github.com/ruby/yarp/commit/36a5b801c4
2023-08-23 15:48:24 -04:00 · 2023-08-23 15:48:24 -04:00 · 432702a427
--- a/yarp/unescape.c
+++ b/yarp/unescape.c
@ -14,6 +14,20 @@ yp_char_is_hexadecimal_digits(const char *c, size_t length) {
    return true;
 }

+// Returns the byte width of the character at `start`, bounded by `end`. The
+// encoding's char_width callback sits behind a function pointer, so we only
+// pay for that indirection when the one-byte fast path does not apply.
+static inline size_t
+yp_char_width(yp_parser_t *parser, const char *start, const char *end) {
+    // Fast path: no encoding change is flagged and the lead byte is below
+    // 0x80, so the width is 1 and the callback can be skipped.
+    if (!parser->encoding_changed && ((unsigned char) *start < 0x80)) {
+        return 1;
+    }
+
+    return parser->encoding.char_width(start, end - start);
+}
+
 /******************************************************************************/
 /* Lookup tables for characters                                               */
 /******************************************************************************/
@ -178,7 +192,7 @@ unescape_char(const unsigned char value, const unsigned char flags) {

 // Read a specific escape sequence into the given destination.
 static const char *
-unescape(char *dest, size_t *dest_length, const char *backslash, const char *end, yp_list_t *error_list, const unsigned char flags, bool write_to_str) {
+unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backslash, const char *end, const unsigned char flags, bool write_to_str) {
    switch (backslash[1]) {
        case 'a':
        case 'b':
@ -218,7 +232,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
        // \unnnn       Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
        case 'u': {
            if ((flags & YP_UNESCAPE_FLAG_CONTROL) | (flags & YP_UNESCAPE_FLAG_META)) {
-                yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
+                yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags.");
                return backslash + 2;
            }

@ -235,11 +249,11 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end

                    // \u{nnnn} character literal allows only 1-6 hexadecimal digits
                    if (hexadecimal_length > 6)
-                        yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
+                        yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");

                    // there are not hexadecimal characters
                    if (hexadecimal_length == 0) {
-                        yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
+                        yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape");
                        return unicode_cursor;
                    }

@ -252,7 +266,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
                    uint32_t value;
                    unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value);
                    if (write_to_str) {
-                        *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list);
+                        *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, &parser->error_list);
                    }

                    unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
@ -260,7 +274,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end

                // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
                if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
-                    yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
+                    yp_diagnostic_list_append(&parser->error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");

                return unicode_cursor + 1;
            }
@ -270,12 +284,12 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
                unescape_unicode(backslash + 2, 4, &value);

                if (write_to_str) {
-                    *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list);
+                    *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, &parser->error_list);
                }
                return backslash + 6;
            }

-            yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
+            yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid Unicode escape sequence");
            return backslash + 2;
        }
        // \c\M-x       meta control character, where x is an ASCII printable character
@ -283,18 +297,18 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
        // \cx          control character, where x is an ASCII printable character
        case 'c':
            if (backslash + 2 >= end) {
-                yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
+                yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
                return end;
            }

            if (flags & YP_UNESCAPE_FLAG_CONTROL) {
-                yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
+                yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
                return backslash + 2;
            }

            switch (backslash[2]) {
                case '\\':
-                    return unescape(dest, dest_length, backslash + 2, end, error_list, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
+                    return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
                case '?':
                    if (write_to_str) {
                        dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
@ -302,7 +316,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
                    return backslash + 3;
                default: {
                    if (!char_is_ascii_printable(backslash[2])) {
-                        yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
+                        yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
                        return backslash + 2;
                    }

@ -316,23 +330,23 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
        // \C-?         delete, ASCII 7Fh (DEL)
        case 'C':
            if (backslash + 3 >= end) {
-                yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
+                yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
                return end;
            }

            if (flags & YP_UNESCAPE_FLAG_CONTROL) {
-                yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
+                yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled.");
                return backslash + 2;
            }

            if (backslash[2] != '-') {
-                yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
+                yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
                return backslash + 2;
            }

            switch (backslash[3]) {
                case '\\':
-                    return unescape(dest, dest_length, backslash + 3, end, error_list, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
+                    return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str);
                case '?':
                    if (write_to_str) {
                        dest[(*dest_length)++] = (char) unescape_char(0x7f, flags);
@ -340,7 +354,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
                    return backslash + 4;
                default:
                    if (!char_is_ascii_printable(backslash[3])) {
-                        yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid control escape sequence");
+                        yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid control escape sequence");
                        return backslash + 2;
                    }

@ -354,22 +368,22 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
        // \M-x         meta character, where x is an ASCII printable character
        case 'M': {
            if (backslash + 3 >= end) {
-                yp_diagnostic_list_append(error_list, backslash, backslash + 1, "Invalid control escape sequence");
+                yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence");
                return end;
            }

            if (flags & YP_UNESCAPE_FLAG_META) {
-                yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
+                yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled.");
                return backslash + 2;
            }

            if (backslash[2] != '-') {
-                yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
+                yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid meta escape sequence");
                return backslash + 2;
            }

            if (backslash[3] == '\\') {
-                return unescape(dest, dest_length, backslash + 3, end, error_list, flags | YP_UNESCAPE_FLAG_META, write_to_str);
+                return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, write_to_str);
            }

            if (char_is_ascii_printable(backslash[3])) {
@ -379,7 +393,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
                return backslash + 4;
            }

-            yp_diagnostic_list_append(error_list, backslash, backslash + 2, "Invalid meta escape sequence");
+            yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid meta escape sequence");
            return backslash + 3;
        }
        // \n
@ -390,14 +404,17 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
            if (backslash + 2 < end && backslash[2] == '\n') {
                return backslash + 3;
            }
-
-            /* fallthrough */
+        /* fallthrough */
        // In this case we're escaping something that doesn't need escaping.
        default: {
+            size_t width = yp_char_width(parser, backslash + 1, end);
+
            if (write_to_str) {
-                dest[(*dest_length)++] = backslash[1];
+                memcpy(dest + *dest_length, backslash + 1, width);
+                *dest_length += width;
            }
-            return backslash + 2;
+
+            return backslash + 1 + width;
        }
    }
 }
@ -431,7 +448,7 @@ unescape(char *dest, size_t *dest_length, const char *backslash, const char *end
 // \c? or \C-?    delete, ASCII 7Fh (DEL)
 //
 YP_EXPORTED_FUNCTION void
-yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, yp_list_t *error_list) {
+yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
    if (unescape_type == YP_UNESCAPE_NONE) {
        // If we're not unescaping then we can reference the source directly.
        return;
@ -448,7 +465,7 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
    // within the string.
    char *allocated = malloc(string->length);
    if (allocated == NULL) {
-        yp_diagnostic_list_append(error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping.");
+        yp_diagnostic_list_append(&parser->error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping.");
        return;
    }

@ -493,7 +510,7 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
                // This is the only type of unescaping left. In this case we need to
                // handle all of the different unescapes.
                assert(unescape_type == YP_UNESCAPE_ALL);
-                cursor = unescape(dest, &dest_length, backslash, end, error_list, YP_UNESCAPE_FLAG_NONE, true);
+                cursor = unescape(parser, dest, &dest_length, backslash, end, YP_UNESCAPE_FLAG_NONE, true);
                break;
        }

@ -521,29 +538,11 @@ yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unesc
    yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor)));
 }

-YP_EXPORTED_FUNCTION bool
-yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
-    bool success;
-
-    yp_parser_t parser;
-    yp_parser_init(&parser, start, length, "");
-
-    yp_list_t error_list = YP_LIST_EMPTY;
-    yp_string_shared_init(result, start, start + length);
-    yp_unescape_manipulate_string(&parser, result, unescape_type, &error_list);
-    success = yp_list_empty_p(&error_list);
-
-    yp_list_free(&error_list);
-    yp_parser_free(&parser);
-
-    return success;
-}
-
 // This function is similar to yp_unescape_manipulate_string, except it doesn't
 // actually perform any string manipulations. Instead, it calculates how long
 // the unescaped character is, and returns that value
-YP_EXPORTED_FUNCTION size_t
-yp_unescape_calculate_difference(const char *backslash, const char *end, yp_unescape_type_t unescape_type, bool expect_single_codepoint, yp_list_t *error_list) {
+size_t
+yp_unescape_calculate_difference(yp_parser_t *parser, const char *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
    assert(unescape_type != YP_UNESCAPE_NONE);

    switch (backslash[1]) {
@ -551,7 +550,9 @@ yp_unescape_calculate_difference(const char *backslash, const char *end, yp_unes
        case '\'':
            return 2;
        default: {
-            if (unescape_type == YP_UNESCAPE_MINIMAL) return 2;
+            if (unescape_type == YP_UNESCAPE_MINIMAL) {
+                return 1 + yp_char_width(parser, backslash + 1, parser->end);
+            }

            // This is the only type of unescaping left. In this case we need to
            // handle all of the different unescapes.
@ -561,10 +562,27 @@ yp_unescape_calculate_difference(const char *backslash, const char *end, yp_unes
            if (expect_single_codepoint)
                flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE;

-            const char *cursor = unescape(NULL, 0, backslash, end, error_list, flags, false);
+            const char *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, false);
            assert(cursor > backslash);

            return (size_t) (cursor - backslash);
        }
    }
 }
+
+// Exported entry point for unescaping a standalone buffer. A temporary parser
+// is initialized over the `length` bytes at `start`, `result` is unescaped per
+// `unescape_type`, and the return value is whether its error list is empty.
+YP_EXPORTED_FUNCTION bool
+yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
+    yp_parser_t scratch;
+    yp_parser_init(&scratch, start, length, NULL);
+
+    const char *finish = start + length;
+    yp_string_shared_init(result, start, finish);
+    yp_unescape_manipulate_string(&scratch, result, unescape_type);
+
+    const bool clean = yp_list_empty_p(&scratch.error_list);
+    yp_parser_free(&scratch);
+    return clean;
+}
--- a/yarp/unescape.h
+++ b/yarp/unescape.h
@ -31,12 +31,14 @@ typedef enum {

 // Unescape the contents of the given token into the given string using the
 // given unescape mode.
-YP_EXPORTED_FUNCTION void yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, yp_list_t *error_list);
+YP_EXPORTED_FUNCTION void yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type);

 // Accepts a source string and a type of unescaping and returns the unescaped version.
 // The caller must yp_string_free(result); after calling this function.
 YP_EXPORTED_FUNCTION bool yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result);

-YP_EXPORTED_FUNCTION size_t yp_unescape_calculate_difference(const char *value, const char *end, yp_unescape_type_t unescape_type, bool expect_single_codepoint, yp_list_t *error_list);
+// Returns the number of bytes that encompass the first escape sequence in the
+// given string.
+size_t yp_unescape_calculate_difference(yp_parser_t *parser, const char *value, yp_unescape_type_t unescape_type, bool expect_single_codepoint);

 #endif
--- a/yarp/yarp.c
+++ b/yarp/yarp.c
@ -3600,7 +3600,7 @@ yp_symbol_node_label_create(yp_parser_t *parser, const yp_token_t *token) {
            assert((label.end - label.start) >= 0);
            yp_string_shared_init(&node->unescaped, label.start, label.end);

-            yp_unescape_manipulate_string(parser, &node->unescaped, YP_UNESCAPE_ALL, &parser->error_list);
+            yp_unescape_manipulate_string(parser, &node->unescaped, YP_UNESCAPE_ALL);
            break;
        }
        case YP_TOKEN_MISSING: {
@ -5104,7 +5104,7 @@ lex_question_mark(yp_parser_t *parser) {

    if (parser->current.start[1] == '\\') {
        lex_state_set(parser, YP_LEX_STATE_END);
-        parser->current.end += yp_unescape_calculate_difference(parser->current.start + 1, parser->end, YP_UNESCAPE_ALL, true, &parser->error_list);
+        parser->current.end += yp_unescape_calculate_difference(parser, parser->current.start + 1, YP_UNESCAPE_ALL, true);
        return YP_TOKEN_CHARACTER_LITERAL;
    } else {
        size_t encoding_width = parser->encoding.char_width(parser->current.end, parser->end - parser->current.end);
@ -6493,7 +6493,7 @@ parser_lex(yp_parser_t *parser) {
                // and find the next breakpoint.
                if (*breakpoint == '\\') {
                    yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
-                    size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, unescape_type, false, &parser->error_list);
+                    size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);

                    // If the result is an escaped newline, then we need to
                    // track that newline.
@ -6606,7 +6606,7 @@ parser_lex(yp_parser_t *parser) {
                // literally. In this case we'll skip past the next character
                // and find the next breakpoint.
                if (*breakpoint == '\\') {
-                    size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, YP_UNESCAPE_ALL, false, &parser->error_list);
+                    size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false);

                    // If the result is an escaped newline, then we need to
                    // track that newline.
@ -6746,7 +6746,7 @@ parser_lex(yp_parser_t *parser) {
                        // literally. In this case we'll skip past the next character and
                        // find the next breakpoint.
                        yp_unescape_type_t unescape_type = parser->lex_modes.current->as.string.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
-                        size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, unescape_type, false, &parser->error_list);
+                        size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);

                        // If the result is an escaped newline, then we need to
                        // track that newline.
@ -6907,7 +6907,7 @@ parser_lex(yp_parser_t *parser) {
                            breakpoint += le_len;
                        } else {
                            yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL;
-                            size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, unescape_type, false, &parser->error_list);
+                            size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);

                            yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);

@ -6963,7 +6963,7 @@ yp_regular_expression_node_create_and_unescape(yp_parser_t *parser, const yp_tok
    assert((content->end - content->start) >= 0);
    yp_string_shared_init(&node->unescaped, content->start, content->end);

-    yp_unescape_manipulate_string(parser, &node->unescaped, unescape_type, &parser->error_list);
+    yp_unescape_manipulate_string(parser, &node->unescaped, unescape_type);
    return node;
 }

@ -6974,7 +6974,7 @@ yp_symbol_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *openin
    assert((content->end - content->start) >= 0);
    yp_string_shared_init(&node->unescaped, content->start, content->end);

-    yp_unescape_manipulate_string(parser, &node->unescaped, unescape_type, &parser->error_list);
+    yp_unescape_manipulate_string(parser, &node->unescaped, unescape_type);
    return node;
 }

@ -6985,7 +6985,7 @@ yp_string_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *openin
    assert((content->end - content->start) >= 0);
    yp_string_shared_init(&node->unescaped, content->start, content->end);

-    yp_unescape_manipulate_string(parser, &node->unescaped, unescape_type, &parser->error_list);
+    yp_unescape_manipulate_string(parser, &node->unescaped, unescape_type);
    return node;
 }

@ -6996,7 +6996,7 @@ yp_xstring_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *openi
    assert((content->end - content->start) >= 0);
    yp_string_shared_init(&node->unescaped, content->start, content->end);

-    yp_unescape_manipulate_string(parser, &node->unescaped, YP_UNESCAPE_ALL, &parser->error_list);
+    yp_unescape_manipulate_string(parser, &node->unescaped, YP_UNESCAPE_ALL);
    return node;
 }

@ -9334,7 +9334,7 @@ parse_heredoc_dedent(yp_parser_t *parser, yp_node_t *node, yp_heredoc_quote_t qu
            yp_node_destroy(parser, node);
        } else {
            string->length = dest_length;
-            yp_unescape_manipulate_string(parser, string, (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL, &parser->error_list);
+            yp_unescape_manipulate_string(parser, string, (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL);
            nodes->nodes[write_index++] = node;
        }