From 0cda3ac45441a8325d40ab71074e93fe4c628c97 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Fri, 15 Sep 2023 21:17:37 -0400 Subject: [PATCH] [ruby/yarp] fix: handling escaped whitespace in a %w list Introduces a new flavor of unescaping, YP_UNESCAPE_WHITESPACE, which is the same as MINIMAL but also unescapes whitespace. Note that a spanning_heredoc.txt fixture test is updated to be less wrong, but YARP's behavior doesn't yet fully match Ruby in this case. Fixes https://github.com/ruby/yarp/pull/1505 https://github.com/ruby/yarp/commit/0af69bdeb1 --- test/yarp/snapshots/spanning_heredoc.txt | 2 +- test/yarp/snapshots/strings.txt | 4 ++-- ...ser_slash_slash_n_escaping_in_literals.txt | 2 +- test/yarp/unescape_test.rb | 7 ++++++ yarp/extension.c | 7 ++++++ yarp/unescape.c | 23 +++++++++++++++++-- yarp/unescape.h | 6 ++++- yarp/yarp.c | 2 +- 8 files changed, 45 insertions(+), 8 deletions(-) diff --git a/test/yarp/snapshots/spanning_heredoc.txt b/test/yarp/snapshots/spanning_heredoc.txt index 63a9727927..591f4bbf7e 100644 --- a/test/yarp/snapshots/spanning_heredoc.txt +++ b/test/yarp/snapshots/spanning_heredoc.txt @@ -179,7 +179,7 @@ │ │ │ │ ├── opening_loc: ∅ │ │ │ │ ├── content_loc: (532...535) = "j\\\n" │ │ │ │ ├── closing_loc: ∅ - │ │ │ │ └── unescaped: "j\\\n" + │ │ │ │ └── unescaped: "j\n" │ │ │ └── @ StringNode (location: (539...540)) │ │ │ ├── flags: ∅ │ │ │ ├── opening_loc: ∅ diff --git a/test/yarp/snapshots/strings.txt b/test/yarp/snapshots/strings.txt index ef08cb6994..0040191d3e 100644 --- a/test/yarp/snapshots/strings.txt +++ b/test/yarp/snapshots/strings.txt @@ -292,7 +292,7 @@ │ │ │ ├── opening_loc: ∅ │ │ │ ├── content_loc: (290...298) = "foo\\ bar" │ │ │ ├── closing_loc: ∅ - │ │ │ └── unescaped: "foo\\ bar" + │ │ │ └── unescaped: "foo bar" │ │ └── @ StringNode (location: (299...304)) │ │ ├── flags: ∅ │ │ ├── opening_loc: ∅ @@ -308,7 +308,7 @@ │ │ │ ├── opening_loc: ∅ │ │ │ ├── content_loc: (310...318) = "foo\\ bar" │ │ │ ├── closing_loc: ∅ - │ │ │ └── unescaped: "foo\\ bar" + │ │ │ └── unescaped: "foo bar" │ │ └── @ StringNode (location: (319...322)) │ │ ├── flags: ∅ │ │ ├── opening_loc: ∅ diff --git a/test/yarp/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt b/test/yarp/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt index ae5873a7fa..ee11d2662c 100644 --- a/test/yarp/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt +++ b/test/yarp/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt @@ -67,7 +67,7 @@ │ │ ├── opening_loc: ∅ │ │ ├── content_loc: (81...85) = "a\\\nb" │ │ ├── closing_loc: ∅ - │ │ └── unescaped: "a\\\nb" + │ │ └── unescaped: "a\nb" │ ├── opening_loc: (78...81) = "%w{" │ └── closing_loc: (85...86) = "}" ├── @ XStringNode (location: (88...96)) diff --git a/test/yarp/unescape_test.rb b/test/yarp/unescape_test.rb index f39bdd0e39..a7d955b315 100644 --- a/test/yarp/unescape_test.rb +++ b/test/yarp/unescape_test.rb @@ -136,6 +136,13 @@ module YARP assert_unescape_all("g", "\\g") end + def test_whitespace_escaping_string_list + assert_equal("a b", Debug.unescape_whitespace("a\\ b")) + assert_equal("a\tb", Debug.unescape_whitespace("a\\\tb")) + assert_equal("a\nb", Debug.unescape_whitespace("a\\\nb")) + assert_equal("a\nb", Debug.unescape_whitespace("a\\\r\nb")) + end + private def unescape_all(source) diff --git a/yarp/extension.c b/yarp/extension.c index f951b0c4a7..b5fe43b353 100644 --- a/yarp/extension.c +++ b/yarp/extension.c @@ -491,6 +491,12 @@ unescape_minimal(VALUE self, VALUE source) { return unescape(source, YP_UNESCAPE_MINIMAL); } +// Escape the given string minimally plus whitespace. Returns the unescaped string. +static VALUE +unescape_whitespace(VALUE self, VALUE source) { + return unescape(source, YP_UNESCAPE_WHITESPACE); +} + // Unescape everything in the given string. Return the unescaped string. static VALUE unescape_all(VALUE self, VALUE source) { @@ -608,6 +614,7 @@ Init_yarp(void) { rb_define_singleton_method(rb_cYARPDebug, "named_captures", named_captures, 1); rb_define_singleton_method(rb_cYARPDebug, "unescape_none", unescape_none, 1); rb_define_singleton_method(rb_cYARPDebug, "unescape_minimal", unescape_minimal, 1); + rb_define_singleton_method(rb_cYARPDebug, "unescape_whitespace", unescape_whitespace, 1); rb_define_singleton_method(rb_cYARPDebug, "unescape_all", unescape_all, 1); rb_define_singleton_method(rb_cYARPDebug, "memsize", memsize, 1); rb_define_singleton_method(rb_cYARPDebug, "profile_file", profile_file, 1); diff --git a/yarp/unescape.c b/yarp/unescape.c index 9da52eaea9..6ecb8f49c4 100644 --- a/yarp/unescape.c +++ b/yarp/unescape.c @@ -509,7 +509,17 @@ yp_unescape_manipulate_string_or_char_literal(yp_parser_t *parser, yp_string_t * cursor = backslash + 2; break; default: - if (unescape_type == YP_UNESCAPE_MINIMAL) { + if (unescape_type == YP_UNESCAPE_WHITESPACE) { + if (backslash[1] == '\r' && backslash[2] == '\n') { + cursor = backslash + 2; + break; + } + if (yp_strspn_whitespace(backslash + 1, 1)) { + cursor = backslash + 1; + break; + } + } + if (unescape_type == YP_UNESCAPE_WHITESPACE || unescape_type == YP_UNESCAPE_MINIMAL) { // In this case we're escaping something that doesn't need escaping. dest[dest_length++] = '\\'; cursor = backslash + 1; @@ -579,7 +589,16 @@ yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, case '\'': return 2; default: { - if (unescape_type == YP_UNESCAPE_MINIMAL) { + if (unescape_type == YP_UNESCAPE_WHITESPACE) { + if (backslash[1] == '\r' && backslash[2] == '\n') { + return 2; + } + size_t whitespace = yp_strspn_whitespace(backslash + 1, 1); + if (whitespace > 0) { + return whitespace; + } + } + if (unescape_type == YP_UNESCAPE_WHITESPACE || unescape_type == YP_UNESCAPE_MINIMAL) { return 1 + yp_char_width(parser, backslash + 1, parser->end); } diff --git a/yarp/unescape.h b/yarp/unescape.h index fb0df9fcb0..bb0ef315a9 100644 --- a/yarp/unescape.h +++ b/yarp/unescape.h @@ -24,9 +24,13 @@ typedef enum { // single quotes and backslashes. YP_UNESCAPE_MINIMAL, + // When we're unescaping a string list, in addition to MINIMAL, we need to + // unescape whitespace. + YP_UNESCAPE_WHITESPACE, + // When we're unescaping a double-quoted string, we need to unescape all // escapes. - YP_UNESCAPE_ALL + YP_UNESCAPE_ALL, } yp_unescape_type_t; // Unescape the contents of the given token into the given string using the given unescape mode. diff --git a/yarp/yarp.c b/yarp/yarp.c index 30491fe7a1..d38871f070 100644 --- a/yarp/yarp.c +++ b/yarp/yarp.c @@ -12987,7 +12987,7 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) { yp_token_t opening = not_provided(parser); yp_token_t closing = not_provided(parser); - yp_node_t *string = (yp_node_t *) yp_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, YP_UNESCAPE_MINIMAL); + yp_node_t *string = (yp_node_t *) yp_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, YP_UNESCAPE_WHITESPACE); yp_array_node_elements_append(array, string); }