[ruby/yarp] Fix heredocs inside %W and %w lists

The problem was that we were treating heredoc bodies as part of the %W list, because we didn't advance the scanning cursor past the heredoc after lexing it. To fix this, we changed the whitespace scanning function to stop scanning when it reaches a newline, but only when a heredoc is pending.

Additionally, we need to prevent double counting newlines in the case of a heredoc. For example:

```ruby
%W(<<foo 123)
foo
```

The newline after the `)` is counted as part of scanning the heredoc, so we added logic to prevent double counting that newline when scanning the rest of the %W list.

https://github.com/ruby/yarp/commit/eb090d8126

Co-authored-by: Jemma Issroff <jemmaissroff@gmail.com>
2023-07-18 14:37:26 -07:00 · 2023-07-18 14:37:26 -07:00 · abce8583e2
--- a/test/snapshots/seattlerb/pct_w_heredoc_interp_nested.txt
+++ b/test/snapshots/seattlerb/pct_w_heredoc_interp_nested.txt
@ -0,0 +1,28 @@
+ProgramNode(0...30)(
+  [],
+  StatementsNode(0...30)(
+    [ArrayNode(0...30)(
+       [StringNode(4...5)(nil, (4...5), nil, "1"),
+        InterpolatedStringNode(0...12)(
+          nil,
+          [EmbeddedStatementsNode(6...12)(
+             (6...8),
+             StatementsNode(8...19)(
+               [InterpolatedStringNode(8...19)(
+                  (8...11),
+                  [StringNode(15...17)(nil, (15...17), nil, "2\n")],
+                  (17...19)
+                )]
+             ),
+             (11...12)
+           )],
+          nil
+        ),
+        StringNode(13...14)(nil, (13...14), nil, "3"),
+        StringNode(25...26)(nil, (25...26), nil, "4"),
+        StringNode(27...28)(nil, (27...28), nil, "5")],
+       (0...3),
+       (29...30)
+     )]
+  )
+)
--- a/test/yarp/parse_test.rb
+++ b/test/yarp/parse_test.rb
@ -28,7 +28,6 @@ class ParseTest < Test::Unit::TestCase

  known_failures = %w[
    seattlerb/heredoc_nested.txt
-    seattlerb/pct_w_heredoc_interp_nested.txt
  ]

  def find_source_file_node(node)
--- a/yarp/util/yp_char.c
+++ b/yarp/util/yp_char.c
@ -75,7 +75,7 @@ yp_strspn_whitespace(const char *string, ptrdiff_t length) {
 // whitespace while also tracking the location of each newline. Disallows
 // searching past the given maximum number of characters.
 size_t
-yp_strspn_whitespace_newlines(const char *string, long length, yp_newline_list_t *newline_list) {
+yp_strspn_whitespace_newlines(const char *string, long length, yp_newline_list_t *newline_list, bool stop_at_newline) {
    if (length <= 0) return 0;

    size_t size = 0;
@ -83,7 +83,12 @@ yp_strspn_whitespace_newlines(const char *string, long length, yp_newline_list_t

    while (size < maximum && (yp_char_table[(unsigned char) string[size]] & YP_CHAR_BIT_WHITESPACE)) {
        if (string[size] == '\n') {
-            yp_newline_list_append(newline_list, string + size);
+            if (stop_at_newline) {
+                return size + 1;
+            }
+            else {
+                yp_newline_list_append(newline_list, string + size);
+            }
        }

        size++;
--- a/yarp/util/yp_char.h
+++ b/yarp/util/yp_char.h
@ -15,7 +15,7 @@ size_t yp_strspn_whitespace(const char *string, ptrdiff_t length);
 // whitespace while also tracking the location of each newline. Disallows
 // searching past the given maximum number of characters.
 size_t
-yp_strspn_whitespace_newlines(const char *string, long length, yp_newline_list_t *newline_list);
+yp_strspn_whitespace_newlines(const char *string, long length, yp_newline_list_t *newline_list, bool);

 // Returns the number of characters at the start of the string that are inline
 // whitespace. Disallows searching past the given maximum number of characters.
--- a/yarp/util/yp_newline_list.c
+++ b/yarp/util/yp_newline_list.c
@ -25,13 +25,15 @@ yp_newline_list_init(yp_newline_list_t *list, const char *start, size_t capacity
 bool
 yp_newline_list_append(yp_newline_list_t *list, const char *cursor) {
    if (list->size == list->capacity) {
-        list->capacity = list->capacity * 3 / 2;
+        list->capacity = (list->capacity * 3) / 2;
        list->offsets = (size_t *) realloc(list->offsets, list->capacity * sizeof(size_t));
        if (list->offsets == NULL) return false;
    }

    assert(cursor >= list->start);
-    list->offsets[list->size++] = (size_t) (cursor - list->start + 1);
+    size_t newline_offset = (size_t) (cursor - list->start + 1);
+    assert(list->size == 0 || newline_offset > list->offsets[list->size - 1]);
+    list->offsets[list->size++] = newline_offset;

    return true;
 }
--- a/yarp/yarp.c
+++ b/yarp/yarp.c
@ -6505,14 +6505,26 @@ parser_lex(yp_parser_t *parser) {
            }
        }
        case YP_LEX_LIST:
+            if (parser->next_start != NULL) {
+                parser->current.end = parser->next_start;
+                parser->next_start = NULL;
+            }
+
            // First we'll set the beginning of the token.
            parser->current.start = parser->current.end;

            // If there's any whitespace at the start of the list, then we're
            // going to trim it off the beginning and create a new token.
            size_t whitespace;
-            if ((whitespace = yp_strspn_whitespace_newlines(parser->current.end, parser->end - parser->current.end, &parser->newline_list)) > 0) {
+
+            bool should_stop = parser->heredoc_end;
+
+            if ((whitespace = yp_strspn_whitespace_newlines(parser->current.end, parser->end - parser->current.end, &parser->newline_list, should_stop)) > 0) {
                parser->current.end += whitespace;
+                if (parser->current.end[-1] == '\n') {
+                    // mutates next_start
+                    parser_flush_heredoc_end(parser);
+                }
                LEX(YP_TOKEN_WORDS_SEP);
            }