[ruby/yarp] fix: octal, hex, and unicode strings at the end of a file
(https://github.com/ruby/yarp/pull/1371)

* refactor: move EOF check into yp_unescape_calculate_difference

  parser_lex is a bit more readable when we can rely on that behavior

* fix: octal and hex digits at the end of a file

  Previously this resulted in invalid memory access.

* fix: unicode strings at the end of a file

  Previously this resulted in invalid memory access.

* Unterminated curly-bracket unicode is a syntax error

https://github.com/ruby/yarp/commit/21cf11acb5
2023-08-31 18:40:29 -04:00 · 2023-08-31 18:40:29 -04:00 · df4c77608e
--- a/test/yarp/errors_test.rb
+++ b/test/yarp/errors_test.rb
@ -621,6 +621,13 @@ module YARP
      ]
    end

+    def test_unterminated_unicode_brackets_should_be_a_syntax_error
+      assert_errors expression('?\\u{3'), '?\\u{3', [
+        ["invalid Unicode escape.", 1..5],
+        ["invalid Unicode escape.", 1..5],
+      ]
+    end
+
    def test_method_parameters_after_block
      expected = DefNode(
        Location(),
--- a/test/yarp/fuzzer_test.rb
+++ b/test/yarp/fuzzer_test.rb
@ -22,6 +22,19 @@ module YARP
    snippet "incomplete escaped list", "%w[\\"
    snippet "incomplete escaped regex", "/a\\"
    snippet "unterminated heredoc with unterminated escape at end of file", "<<A\n\\"
+    snippet "escaped octal at end of file 1", '"\\3'
+    snippet "escaped octal at end of file 2", '"\\33'
+    snippet "escaped hex at end of file 1", '"\\x'
+    snippet "escaped hex at end of file 2", '"\\x3'
+    snippet "escaped unicode at end of file 1", '"\\u{3'
+    snippet "escaped unicode at end of file 2", '"\\u{33'
+    snippet "escaped unicode at end of file 3", '"\\u{333'
+    snippet "escaped unicode at end of file 4", '"\\u{3333'
+    snippet "escaped unicode at end of file 5", '"\\u{33333'
+    snippet "escaped unicode at end of file 6", '"\\u{333333'
+    snippet "escaped unicode at end of file 7", '"\\u3'
+    snippet "escaped unicode at end of file 8", '"\\u33'
+    snippet "escaped unicode at end of file 9", '"\\u333'

    snippet "statements node with multiple heredocs", <<~EOF
      for <<A + <<B
--- a/yarp/unescape.c
+++ b/yarp/unescape.c
@ -69,17 +69,15 @@ char_is_ascii_printable(const uint8_t b) {
 // Scan the 1-3 digits of octal into the value. Returns the number of digits
 // scanned.
 static inline size_t
-unescape_octal(const uint8_t *backslash, uint8_t *value) {
+unescape_octal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
    *value = (uint8_t) (backslash[1] - '0');
-    if (!yp_char_is_octal_digit(backslash[2])) {
+    if (backslash + 2 >= end || !yp_char_is_octal_digit(backslash[2])) {
        return 2;
    }
-
    *value = (uint8_t) ((*value << 3) | (backslash[2] - '0'));
-    if (!yp_char_is_octal_digit(backslash[3])) {
+    if (backslash + 3 >= end || !yp_char_is_octal_digit(backslash[3])) {
        return 3;
    }
-
    *value = (uint8_t) ((*value << 3) | (backslash[3] - '0'));
    return 4;
 }
@ -93,12 +91,15 @@ unescape_hexadecimal_digit(const uint8_t value) {
 // Scan the 1-2 digits of hexadecimal into the value. Returns the number of
 // digits scanned.
 static inline size_t
-unescape_hexadecimal(const uint8_t *backslash, uint8_t *value) {
+unescape_hexadecimal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
+    *value = 0;
+    if (backslash + 2 >= end || !yp_char_is_hexadecimal_digit(backslash[2])) {
+        return 2;
+    }
    *value = unescape_hexadecimal_digit(backslash[2]);
-    if (!yp_char_is_hexadecimal_digit(backslash[3])) {
+    if (backslash + 3 >=  end || !yp_char_is_hexadecimal_digit(backslash[3])) {
        return 3;
    }
-
    *value = (uint8_t) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
    return 4;
 }
@ -204,7 +205,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9': {
            uint8_t value;
-            const uint8_t *cursor = backslash + unescape_octal(backslash, &value);
+            const uint8_t *cursor = backslash + unescape_octal(backslash, &value, end);

            if (dest) {
                dest[(*dest_length)++] = unescape_char(value, flags);
@ -214,7 +215,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
        // \xnn         hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
        case 'x': {
            uint8_t value;
-            const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value);
+            const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value, end);

            if (dest) {
                dest[(*dest_length)++] = unescape_char(value, flags);
@ -236,13 +237,14 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t

                unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);

-                while ((*unicode_cursor != '}') && (unicode_cursor < end)) {
+                while ((unicode_cursor < end) && (*unicode_cursor != '}')) {
                    const uint8_t *unicode_start = unicode_cursor;
                    size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);

                    // \u{nnnn} character literal allows only 1-6 hexadecimal digits
-                    if (hexadecimal_length > 6)
+                    if (hexadecimal_length > 6) {
                        yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
+                    }

                    // there are not hexadecimal characters
                    if (hexadecimal_length == 0) {
@ -269,10 +271,16 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
                if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
                    yp_diagnostic_list_append(&parser->error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");

-                return unicode_cursor + 1;
+
+                if (unicode_cursor < end && *unicode_cursor == '}') {
+                    unicode_cursor++;
+                } else {
+                    yp_diagnostic_list_append(&parser->error_list, backslash, unicode_cursor, "invalid Unicode escape.");
+                }
+                return unicode_cursor;
            }

-            if ((backslash + 2) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
+            if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
                uint32_t value;
                unescape_unicode(backslash + 2, 4, &value);

@ -538,6 +546,10 @@ size_t
 yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
    assert(unescape_type != YP_UNESCAPE_NONE);

+    if (backslash + 1 >= parser->end) {
+        return 0;
+    }
+
    switch (backslash[1]) {
        case '\\':
        case '\'':
--- a/yarp/yarp.c
+++ b/yarp/yarp.c
@ -7002,17 +7002,16 @@ parser_lex(yp_parser_t *parser) {
                // literally. In this case we'll skip past the next character
                // and find the next breakpoint.
                if (*breakpoint == '\\') {
-                    // Check that we're not at the end of the file.
-                    if (breakpoint + 1 >= parser->end) {
+                    yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
+                    size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
+                    if (difference == 0) {
+                        // we're at the end of the file
                        breakpoint = NULL;
                        continue;
                    }

-                    yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
-                    size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
-
                    // If the result is an escaped newline ...
-                    if (*(breakpoint + difference - 1) == '\n') {
+                    if (breakpoint[difference - 1] == '\n') {
                        if (parser->heredoc_end) {
                            // ... if we are on the same line as a heredoc, flush the heredoc and
                            // continue parsing after heredoc_end.
@ -7141,16 +7140,15 @@ parser_lex(yp_parser_t *parser) {
                // literally. In this case we'll skip past the next character
                // and find the next breakpoint.
                if (*breakpoint == '\\') {
-                    // Check that we're not at the end of the file.
-                    if (breakpoint + 1 >= parser->end) {
+                    size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false);
+                    if (difference == 0) {
+                        // we're at the end of the file
                        breakpoint = NULL;
                        continue;
                    }

-                    size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false);
-
                    // If the result is an escaped newline ...
-                    if (*(breakpoint + difference - 1) == '\n') {
+                    if (breakpoint[difference - 1] == '\n') {
                        if (parser->heredoc_end) {
                            // ... if we are on the same line as a heredoc, flush the heredoc and
                            // continue parsing after heredoc_end.
@ -7293,20 +7291,19 @@ parser_lex(yp_parser_t *parser) {
                        breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
                        break;
                    case '\\': {
-                        // Check that we're not at the end of the file.
-                        if (breakpoint + 1 >= parser->end) {
-                            breakpoint = NULL;
-                            break;
-                        }
-
                        // If we hit escapes, then we need to treat the next token
                        // literally. In this case we'll skip past the next character and
                        // find the next breakpoint.
                        yp_unescape_type_t unescape_type = parser->lex_modes.current->as.string.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
                        size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
+                        if (difference == 0) {
+                            // we're at the end of the file
+                            breakpoint = NULL;
+                            break;
+                        }

                        // If the result is an escaped newline ...
-                        if (*(breakpoint + difference - 1) == '\n') {
+                        if (breakpoint[difference - 1] == '\n') {
                            if (parser->heredoc_end) {
                                // ... if we are on the same line as a heredoc, flush the heredoc and
                                // continue parsing after heredoc_end.
@ -7463,12 +7460,6 @@ parser_lex(yp_parser_t *parser) {
                        break;
                    }
                    case '\\': {
-                        // Check that we're not at the end of the file.
-                        if (breakpoint + 1 >= parser->end) {
-                            breakpoint = NULL;
-                            break;
-                        }
-
                        // If we hit an escape, then we need to skip past
                        // however many characters the escape takes up. However
                        // it's important that if \n or \r\n are escaped that we
@ -7481,6 +7472,11 @@ parser_lex(yp_parser_t *parser) {
                        } else {
                            yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL;
                            size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
+                            if (difference == 0) {
+                                // we're at the end of the file
+                                breakpoint = NULL;
+                                break;
+                            }

                            yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);