[ruby/prism] Use current_string for escapes in heredocs

https://github.com/ruby/prism/commit/fc49acfc59
2023-10-11 10:59:26 -04:00 · 2023-10-11 10:59:26 -04:00 · 24768d8a57
--- a/prism/prism.c
+++ b/prism/prism.c
@ -8426,14 +8426,15 @@ parser_lex(pm_parser_t *parser) {

            // Now let's grab the information about the identifier off of the current
            // lex mode.
-            const uint8_t *ident_start = parser->lex_modes.current->as.heredoc.ident_start;
-            size_t ident_length = parser->lex_modes.current->as.heredoc.ident_length;
+            pm_lex_mode_t *lex_mode = parser->lex_modes.current;
+            const uint8_t *ident_start = lex_mode->as.heredoc.ident_start;
+            size_t ident_length = lex_mode->as.heredoc.ident_length;

            // If we are immediately following a newline and we have hit the
            // terminator, then we need to return the ending of the heredoc.
            if (current_token_starts_line(parser)) {
                const uint8_t *start = parser->current.start;
-                if (parser->lex_modes.current->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) {
+                if (lex_mode->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) {
                    start += pm_strspn_inline_whitespace(start, parser->end - start);
                }

@ -8453,10 +8454,10 @@ parser_lex(pm_parser_t *parser) {
                    }

                    if (matched) {
-                        if (*parser->lex_modes.current->as.heredoc.next_start == '\\') {
+                        if (*lex_mode->as.heredoc.next_start == '\\') {
                            parser->next_start = NULL;
                        } else {
-                            parser->next_start = parser->lex_modes.current->as.heredoc.next_start;
+                            parser->next_start = lex_mode->as.heredoc.next_start;
                            parser->heredoc_end = parser->current.end;
                        }

@ -8469,17 +8470,18 @@ parser_lex(pm_parser_t *parser) {
                }
            }

-            // Otherwise we'll be parsing string content. These are the places where
-            // we need to split up the content of the heredoc. We'll use strpbrk to
-            // find the first of these characters.
+            // Otherwise we'll be parsing string content. These are the places
+            // where we need to split up the content of the heredoc. We'll use
+            // strpbrk to find the first of these characters.
            uint8_t breakpoints[] = "\n\\#";

-            pm_heredoc_quote_t quote = parser->lex_modes.current->as.heredoc.quote;
+            pm_heredoc_quote_t quote = lex_mode->as.heredoc.quote;
            if (quote == PM_HEREDOC_QUOTE_SINGLE) {
                breakpoints[2] = '\0';
            }

            const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
+            pm_token_buffer_t token_buffer = { 0 };

            while (breakpoint != NULL) {
                switch (*breakpoint) {
@ -8491,13 +8493,14 @@ parser_lex(pm_parser_t *parser) {
                        if (parser->heredoc_end != NULL && (parser->heredoc_end > breakpoint)) {
                            parser_flush_heredoc_end(parser);
                            parser->current.end = breakpoint + 1;
+                            pm_token_buffer_flush(parser, &token_buffer);
                            LEX(PM_TOKEN_STRING_CONTENT);
                        }

                        pm_newline_list_append(&parser->newline_list, breakpoint);

                        const uint8_t *start = breakpoint + 1;
-                        if (parser->lex_modes.current->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) {
+                        if (lex_mode->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) {
                            start += pm_strspn_inline_whitespace(start, parser->end - start);
                        }

@ -8515,6 +8518,7 @@ parser_lex(pm_parser_t *parser) {
                                match_eol_at(parser, start + ident_length)
                            ) {
                                parser->current.end = breakpoint + 1;
+                                pm_token_buffer_flush(parser, &token_buffer);
                                LEX(PM_TOKEN_STRING_CONTENT);
                            }
                        }
@ -8531,37 +8535,83 @@ parser_lex(pm_parser_t *parser) {
                        // stop looping before the newline and not after the
                        // newline so that we can still potentially find the
                        // terminator of the heredoc.
-                        size_t eol_length = match_eol_at(parser, breakpoint + 1);
-                        if (eol_length) {
-                            breakpoint += eol_length;
-                        } else {
-                            pm_unescape_type_t unescape_type = (quote == PM_HEREDOC_QUOTE_SINGLE) ? PM_UNESCAPE_MINIMAL : PM_UNESCAPE_ALL;
-                            size_t difference = pm_unescape_calculate_difference(parser, breakpoint, unescape_type);
-                            if (difference == 0) {
-                                // we're at the end of the file
-                                breakpoint = NULL;
-                                break;
-                            }
+                        parser->current.end = breakpoint + 1;
+                        pm_token_buffer_escape(parser, &token_buffer);

-                            pm_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);
-
-                            breakpoint = pm_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
+                        // If we've hit the end of the file, then break out of
+                        // the loop by setting the breakpoint to NULL.
+                        if (parser->current.end == parser->end) {
+                            breakpoint = NULL;
+                            continue;
                        }

+                        uint8_t peeked = peek(parser);
+                        switch (peeked) {
+                            case '\r':
+                                parser->current.end++;
+                                if (peek(parser) != '\n') {
+                                    pm_token_buffer_push(&token_buffer, '\r');
+                                    break;
+                                }
+                            /* fallthrough */
+                            case '\n':
+                                // If this is a dedenting heredoc then we need
+                                // to leave the escaped newline in place so that
+                                // it can be removed later when we dedent the
+                                // heredoc.
+                                if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
+                                    pm_token_buffer_push(&token_buffer, '\\');
+                                    pm_token_buffer_push(&token_buffer, '\n');
+                                }
+
+                                if (parser->heredoc_end) {
+                                    // ... if we are on the same line as a heredoc,
+                                    // flush the heredoc and continue parsing after
+                                    // heredoc_end.
+                                    parser_flush_heredoc_end(parser);
+                                    pm_token_buffer_copy(parser, &token_buffer);
+                                    LEX(PM_TOKEN_STRING_CONTENT);
+                                } else {
+                                    // ... else track the newline.
+                                    pm_newline_list_append(&parser->newline_list, parser->current.end);
+                                }
+
+                                parser->current.end++;
+                                break;
+                            default:
+                                if (quote == PM_HEREDOC_QUOTE_SINGLE) {
+                                    pm_token_buffer_push(&token_buffer, '\\');
+                                    pm_token_buffer_push(&token_buffer, peeked);
+                                    parser->current.end++;
+                                } else {
+                                    escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
+                                }
+
+                                break;
+                        }
+
+                        token_buffer.cursor = parser->current.end;
+                        breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
                        break;
                    }
                    case '#': {
                        pm_token_type_t type = lex_interpolation(parser, breakpoint);
-                        if (type != PM_TOKEN_NOT_PROVIDED) {
-                            LEX(type);
+
+                        if (type == PM_TOKEN_NOT_PROVIDED) {
+                            // If we haven't returned at this point then we had
+                            // something that looked like an interpolated class
+                            // or instance variable like "#@" but wasn't
+                            // actually. In this case we'll just skip to the
+                            // next breakpoint.
+                            breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
+                            break;
                        }

-                        // If we haven't returned at this point then we had something
-                        // that looked like an interpolated class or instance variable
-                        // like "#@" but wasn't actually. In this case we'll just skip
-                        // to the next breakpoint.
-                        breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
-                        break;
+                        if (type == PM_TOKEN_STRING_CONTENT) {
+                            pm_token_buffer_flush(parser, &token_buffer);
+                        }
+
+                        LEX(type);
                    }
                    default:
                        assert(false && "unreachable");
@ -12499,7 +12549,14 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {

                node->location.end = opening.end;
            } else {
-                part = parse_string_part(parser);
+                if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
+                    pm_token_t opening = not_provided(parser);
+                    pm_token_t closing = not_provided(parser);
+                    part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
+                    parser_lex(parser);
+                } else {
+                    part = parse_string_part(parser);
+                }

                if (part == NULL) {
                    // If we get here, then we tried to find something in the
@ -12539,7 +12596,16 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
                    pm_node_list_append(&parts, part);

                    while (!match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) {
-                        if ((part = parse_string_part(parser)) != NULL) {
+                        if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
+                            pm_token_t opening = not_provided(parser);
+                            pm_token_t closing = not_provided(parser);
+                            part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
+                            parser_lex(parser);
+                        } else {
+                            part = parse_string_part(parser);
+                        }
+
+                        if (part != NULL) {
                            pm_node_list_append(&parts, part);
                        }
                    }
--- a/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt
+++ b/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt
@ -114,7 +114,7 @@
        │   ├── opening_loc: (51,0)-(51,9) = "<<-'HERE'"
        │   ├── content_loc: (52,0)-(53,0) = "a\\\nb\n"
        │   ├── closing_loc: (54,0)-(54,0) = "HERE\n"
-        │   └── unescaped: "a\\\nb\n"
+        │   └── unescaped: "ab\n"
        ├── @ XStringNode (location: (56,0)-(56,9))
        │   ├── opening_loc: (56,0)-(56,9) = "<<-`HERE`"
        │   ├── content_loc: (57,0)-(58,0) = "a\\\nb\n"
--- a/test/prism/snapshots/whitequark/ruby_bug_11989.txt
+++ b/test/prism/snapshots/whitequark/ruby_bug_11989.txt
@ -16,7 +16,7 @@
            │           ├── opening_loc: (1,2)-(1,8) = "<<~\"E\""
            │           ├── content_loc: (2,0)-(2,0) = "  x\\n   y\n"
            │           ├── closing_loc: (3,0)-(3,0) = "E\n"
-            │           └── unescaped: "x\n   y\n"
+            │           └── unescaped: "x\n y\n"
            ├── closing_loc: ∅
            ├── block: ∅
            ├── flags: ∅