From 3605d6076dab516c7b483a8be6038d5b6da1845a Mon Sep 17 00:00:00 2001 From: Koichi ITO Date: Sat, 16 Mar 2024 15:21:36 +0900 Subject: [PATCH] [ruby/prism] Fix token incompatibility for `Prism::Translation::Parser::Lexer` This PR fixes token incompatibility for `Prism::Translation::Parser::Lexer` when using backquoted heredoc identifier: ```ruby <<-` FOO` a b FOO ``` ## Parser gem (Expected) Returns `tXSTRING_BEG` as the first token: ```console $ bundle exec ruby -Ilib -rparser/ruby33 -ve \ 'buf = Parser::Source::Buffer.new("example.rb"); buf.source = File.read("example.rb"); p Parser::Ruby33.new.tokenize(buf)' ruby 3.3.0 (2023-12-25 revision https://github.com/ruby/prism/commit/5124f9ac75) [x86_64-darwin22] [s(:xstr, s(:str, "a\n"), s(:str, "b\n")), [], [[:tXSTRING_BEG, ["<<`", #]], [:tSTRING_CONTENT, ["a\n", #]], [:tSTRING_CONTENT, ["b\n", #]], [:tSTRING_END, [" FOO", #]], [:tNL, [nil, #]]]] ``` ## `Prism::Translation::Parser` (Actual) Previously, the tokens differed from those returned by the Parser gem: the type and value of the `tSTRING_BEG` token and the value of the `tSTRING_END` token did not match. 
```console $ bundle exec ruby -Ilib -rprism -rprism/translation/parser33 -ve \ 'buf = Parser::Source::Buffer.new("example.rb"); buf.source = File.read("example.rb"); p Prism::Translation::Parser33.new.tokenize(buf)' ruby 3.3.0 (2023-12-25 revision https://github.com/ruby/prism/commit/5124f9ac75) [x86_64-darwin22] [s(:xstr, s(:str, "a\n"), s(:str, "b\n")), [], [[:tSTRING_BEG, ["<<\"", #]], [:tSTRING_CONTENT, ["a\n", #]], [:tSTRING_CONTENT, ["b\n", #]], [:tSTRING_END, ["` FOO`", #]], [:tNL, [nil, #]]]] ``` After this correction, the AST and tokens returned by the Parser gem are the same: ```console $ bundle exec ruby -Ilib -rprism -rprism/translation/parser33 -ve \ 'buf = Parser::Source::Buffer.new("example.rb"); buf.source = File.read("example.rb"); p Prism::Translation::Parser33.new.tokenize(buf)' ruby 3.3.0 (2023-12-25 revision https://github.com/ruby/prism/commit/5124f9ac75) [x86_64-darwin22] [s(:xstr, s(:str, "a\n"), s(:str, "b\n")), [], [[:tXSTRING_BEG, ["<<`", #]], [:tSTRING_CONTENT, ["a\n", #]], [:tSTRING_CONTENT, ["b\n", #]], [:tSTRING_END, [" FOO", #]], [:tNL, [nil, #]]]] ``` https://github.com/ruby/prism/commit/308f8d85a1 --- lib/prism/translation/parser/lexer.rb | 9 +++- .../fixtures/heredocs_leading_whitespace.txt | 5 ++ .../snapshots/heredocs_leading_whitespace.txt | 46 +++++++++++-------- 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb index cb23fe8ac0..9d7caae0ba 100644 --- a/lib/prism/translation/parser/lexer.rb +++ b/lib/prism/translation/parser/lexer.rb @@ -278,7 +278,7 @@ module Prism value = nil when :tSTRING_BEG if token.type == :HEREDOC_START - heredoc_identifier_stack.push(value.match(/<<[-~]?["']?(?<heredoc_identifier>.*?)["']?\z/)[:heredoc_identifier]) + heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier]) end if ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_END next_location = 
token.location.join(next_token.location) @@ -294,7 +294,12 @@ module Prism index += 2 elsif value.start_with?("<<") quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2] - value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}" + if quote == "`" + type = :tXSTRING_BEG + value = "<<`" + else + value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}" + end end when :tSTRING_CONTENT unless (lines = token.value.lines).one? diff --git a/test/prism/fixtures/heredocs_leading_whitespace.txt b/test/prism/fixtures/heredocs_leading_whitespace.txt index 8f19836943..660ecb4543 100644 --- a/test/prism/fixtures/heredocs_leading_whitespace.txt +++ b/test/prism/fixtures/heredocs_leading_whitespace.txt @@ -8,6 +8,11 @@ a b FOO +<<-` FOO` +a +b + FOO + <<-' FOO' a b diff --git a/test/prism/snapshots/heredocs_leading_whitespace.txt b/test/prism/snapshots/heredocs_leading_whitespace.txt index 5412f7d290..332dfa2986 100644 --- a/test/prism/snapshots/heredocs_leading_whitespace.txt +++ b/test/prism/snapshots/heredocs_leading_whitespace.txt @@ -1,8 +1,8 @@ -@ ProgramNode (location: (1,0)-(21,10)) +@ ProgramNode (location: (1,0)-(26,10)) ├── locals: [] └── statements: - @ StatementsNode (location: (1,0)-(21,10)) - └── body: (length: 5) + @ StatementsNode (location: (1,0)-(26,10)) + └── body: (length: 6) ├── @ StringNode (location: (1,0)-(1,10)) │ ├── flags: ∅ │ ├── opening_loc: (1,0)-(1,10) = "<<-' FOO'" @@ -15,41 +15,47 @@ │ ├── content_loc: (7,0)-(9,0) = "a\nb\n" │ ├── closing_loc: (9,0)-(10,0) = " FOO\n" │ └── unescaped: "a\nb\n" - ├── @ StringNode (location: (11,0)-(11,10)) + ├── @ XStringNode (location: (11,0)-(11,10)) │ ├── flags: ∅ - │ ├── opening_loc: (11,0)-(11,10) = "<<-' FOO'" + │ ├── opening_loc: (11,0)-(11,10) = "<<-` FOO`" │ ├── content_loc: (12,0)-(14,0) = "a\nb\n" - │ ├── closing_loc: (14,0)-(15,0) = " FOO\n" + │ ├── closing_loc: (14,0)-(15,0) = " FOO\n" │ └── unescaped: "a\nb\n" - ├── @ InterpolatedStringNode (location: (16,0)-(16,10)) - │ ├── 
opening_loc: (16,0)-(16,10) = "<<~' FOO'" + ├── @ StringNode (location: (16,0)-(16,10)) + │ ├── flags: ∅ + │ ├── opening_loc: (16,0)-(16,10) = "<<-' FOO'" + │ ├── content_loc: (17,0)-(19,0) = "a\nb\n" + │ ├── closing_loc: (19,0)-(20,0) = " FOO\n" + │ └── unescaped: "a\nb\n" + ├── @ InterpolatedStringNode (location: (21,0)-(21,10)) + │ ├── opening_loc: (21,0)-(21,10) = "<<~' FOO'" │ ├── parts: (length: 2) - │ │ ├── @ StringNode (location: (17,0)-(18,0)) + │ │ ├── @ StringNode (location: (22,0)-(23,0)) │ │ │ ├── flags: ∅ │ │ │ ├── opening_loc: ∅ - │ │ │ ├── content_loc: (17,0)-(18,0) = "a\n" + │ │ │ ├── content_loc: (22,0)-(23,0) = "a\n" │ │ │ ├── closing_loc: ∅ │ │ │ └── unescaped: "a\n" - │ │ └── @ StringNode (location: (18,0)-(19,0)) + │ │ └── @ StringNode (location: (23,0)-(24,0)) │ │ ├── flags: ∅ │ │ ├── opening_loc: ∅ - │ │ ├── content_loc: (18,0)-(19,0) = "b\n" + │ │ ├── content_loc: (23,0)-(24,0) = "b\n" │ │ ├── closing_loc: ∅ │ │ └── unescaped: "b\n" - │ └── closing_loc: (19,0)-(20,0) = " FOO\n" - └── @ InterpolatedStringNode (location: (21,0)-(21,10)) - ├── opening_loc: (21,0)-(21,10) = "<<~' FOO'" + │ └── closing_loc: (24,0)-(25,0) = " FOO\n" + └── @ InterpolatedStringNode (location: (26,0)-(26,10)) + ├── opening_loc: (26,0)-(26,10) = "<<~' FOO'" ├── parts: (length: 2) - │ ├── @ StringNode (location: (22,0)-(23,0)) + │ ├── @ StringNode (location: (27,0)-(28,0)) │ │ ├── flags: ∅ │ │ ├── opening_loc: ∅ - │ │ ├── content_loc: (22,0)-(23,0) = "a\n" + │ │ ├── content_loc: (27,0)-(28,0) = "a\n" │ │ ├── closing_loc: ∅ │ │ └── unescaped: "a\n" - │ └── @ StringNode (location: (23,0)-(24,0)) + │ └── @ StringNode (location: (28,0)-(29,0)) │ ├── flags: ∅ │ ├── opening_loc: ∅ - │ ├── content_loc: (23,0)-(24,0) = "b\n" + │ ├── content_loc: (28,0)-(29,0) = "b\n" │ ├── closing_loc: ∅ │ └── unescaped: "b\n" - └── closing_loc: (24,0)-(25,0) = " FOO\n" + └── closing_loc: (29,0)-(30,0) = " FOO\n"