ruby/lib/syntax_suggest/clean_document.rb

307 строки
8.7 KiB
Ruby

# frozen_string_literal: true
module SyntaxSuggest
# Parses and sanitizes source into a lexically aware document
#
# Internally the document is represented by an array with each
# index containing a CodeLine correlating to a line from the source code.
#
# There are three main phases in the algorithm:
#
# 1. Sanitize/format input source
# 2. Search for invalid blocks
# 3. Format invalid blocks into something meaningful
#
# This class handles the first part.
#
# The reason this class exists is to format input source
# for better/easier/cleaner exploration.
#
# The CodeSearch class operates at the line level so
# we must be careful to not introduce lines that look
# valid by themselves, but when removed will trigger syntax errors
# or strange behavior.
#
# ## Join Trailing slashes
#
# Code with a trailing slash is logically treated as a single line:
#
# 1 it "code can be split" \
# 2 "across multiple lines" do
#
# In this case removing line 2 would add a syntax error. We get around
# this by internally joining the two lines into a single "line" object
#
# ## Logically Consecutive lines
#
# Code that can be broken over multiple
# lines such as method calls are on different lines:
#
# 1 User.
# 2 where(name: "schneems").
# 3 first
#
# Removing line 2 can introduce a syntax error. To fix this, all lines
# are joined into one.
#
# ## Heredocs
#
# A heredoc is a way of defining a multi-line string. They can cause many
# problems. If left as a single line, the parser would try to parse the contents
# as ruby code rather than as a string. Even without this problem, we still
# hit an issue with indentation:
#
# 1 foo = <<~HEREDOC
# 2 "Be yourself; everyone else is already taken."
# 3 ― Oscar Wilde
# 4 puts "I look like ruby code" # but i'm still a heredoc
# 5 HEREDOC
#
# If we didn't join these lines then our algorithm would think that line 4
# is separate from the rest, has a higher indentation, then look at it first
# and remove it.
#
# If the code evaluates line 5 by itself it will think line 5 is a constant,
# remove it, and introduce a syntax error.
#
# All of these problems are fixed by joining the whole heredoc into a single
# line.
#
# ## Comments and whitespace
#
# Comments can throw off the way the lexer tells us that the line
# logically belongs with the next line. This is valid ruby but
# results in a different lex output than before:
#
# 1 User.
# 2 where(name: "schneems").
# 3 # Comment here
# 4 first
#
# To handle this we can replace comment lines with empty lines
# and then re-lex the source. This removal and re-lexing preserves
# line index and document size, but generates an easier to work with
# document.
#
class CleanDocument
def initialize(source:)
  # Blank out comment-only lines first; the cleaned lines are then handed to
  # CodeLine.from_source both joined into one string and as the line array.
  sanitized = clean_sweep(source: source)
  @document = CodeLine.from_source(sanitized.join, lines: sanitized)
end
# Call all of the document "cleaners"
# and return self
def call
  # `tap` yields the receiver and returns it, so the cleaners run in this
  # exact order and the document itself is the return value.
  tap do
    join_trailing_slash!
    join_consecutive!
    join_heredoc!
  end
end
# Return an array of CodeLines in the
# document
def lines
# Hands back the backing array itself; no copy is made here.
@document
end
# Renders the document back to a string
def to_s
# Concatenates the string form of every entry in the document;
# no separator argument is passed to `join`.
@document.join
end
# Remove comments
#
# replace with empty newlines
#
# source = <<~'EOM'
# # Comment 1
# puts "hello"
# # Comment 2
# puts "world"
# EOM
#
# lines = CleanDocument.new(source: source).lines
# expect(lines[0].to_s).to eq("\n")
# expect(lines[1].to_s).to eq("puts \"hello\"\n")
# expect(lines[2].to_s).to eq("\n")
# expect(lines[3].to_s).to eq("puts \"world\"\n")
#
# Important: This must be done before lexing.
#
# After this change is made, we lex the document because
# removing comments can change how the doc is parsed.
#
# For example:
#
# values = LexAll.new(source: <<~EOM)
# User.
# # comment
# where(name: 'schneems')
# EOM
# expect(
# values.count {|v| v.type == :on_ignored_nl}
# ).to eq(1)
#
# After the comment is removed:
#
# values = LexAll.new(source: <<~EOM)
# User.
#
# where(name: 'schneems')
# EOM
# expect(
# values.count {|v| v.type == :on_ignored_nl}
# ).to eq(2)
#
def clean_sweep(source:)
  # Match comments, but not HEREDOC strings with #{variable} interpolation
  # https://rubular.com/r/HPwtW9OYxKUHXQ
  comment_only = /^\s*#([^{].*|)$/

  # Split on, and substitute with, a literal "\n" rather than the mutable
  # global `$/`. `String#lines` defaults its separator to `$/`, so relying on
  # the global for either step would make the result depend on process-wide
  # state. With default globals the output is unchanged.
  #
  # Returns an Array<String> with one entry per input line; comment-only
  # lines become "\n" so line indexes and document size are preserved.
  source.lines("\n").map do |line|
    line.match?(comment_only) ? "\n" : line
  end
end
# Smushes all heredoc lines into one line
#
# source = <<~'EOM'
# foo = <<~HEREDOC
# lol
# hehehe
# HEREDOC
# EOM
#
# lines = CleanDocument.new(source: source).join_heredoc!.lines
# expect(lines[0].to_s).to eq(source)
# expect(lines[1].to_s).to eq("")
def join_heredoc!
  # Indexes of lines that opened a heredoc which has not been closed yet
  pending_starts = []
  # [opening line index, closing line index] pairs, in the order they close
  spans = []

  lines.each do |code_line|
    code_line.lex.each do |token|
      kind = token.type
      if kind == :on_heredoc_beg
        pending_starts.push(code_line.index)
      elsif kind == :on_heredoc_end
        # The most recently opened heredoc is paired with this terminator
        spans.push([pending_starts.pop, code_line.index])
      end
    end
  end

  join_groups(spans.map { |first, last| @document[first..last] })
  self
end
# Smushes logically "consecutive" lines
#
# source = <<~'EOM'
# User.
# where(name: 'schneems').
# first
# EOM
#
# lines = CleanDocument.new(source: source).join_consecutive!.lines
# expect(lines[0].to_s).to eq(source)
# expect(lines[1].to_s).to eq("")
#
# The one known case this doesn't handle is:
#
# Ripper.lex <<~EOM
# a &&
# b ||
# c
# EOM
#
# For some reason this introduces `on_ignored_nl` but with BEG type
#
def join_consecutive!
  # Every line that logically continues onto the next one starts a group
  starters = @document.select { |candidate| candidate.ignore_newline_not_beg? }

  groups = starters.map do |starter|
    from_here = (starter.index..)
    # Keep taking continuation lines, plus the line that ends the chain
    take_while_including(from_here) { |member| member.ignore_newline_not_beg? }
  end

  join_groups(groups)
  self
end
# Join lines with a trailing slash
#
# source = <<~'EOM'
# it "code can be split" \
# "across multiple lines" do
# EOM
#
# lines = CleanDocument.new(source: source).join_trailing_slash!.lines
# expect(lines[0].to_s).to eq(source)
# expect(lines[1].to_s).to eq("")
def join_trailing_slash!
  # Each line reporting a trailing slash opens a group
  continued = @document.select { |candidate| candidate.trailing_slash? }

  groups = continued.map do |opener|
    from_here = (opener.index..)
    # Collect the run of trailing-slash lines plus the line that ends it
    take_while_including(from_here) { |member| member.trailing_slash? }
  end

  join_groups(groups)
  self
end
# Helper method for joining "groups" of lines
#
# Input is expected to be type Array<Array<CodeLine>>
#
# The outer array holds the various "groups" while the
# inner array holds code lines.
#
# All code lines are "joined" into the first line in
# their group.
#
# To preserve document size, empty lines are placed
# in the place of the lines that were "joined"
def join_groups(groups)
  groups.each do |group|
    head, *rest = group

    # Handle the case of multiple groups in a row:
    # if this group's first line was already replaced, move on
    next if @document[head.index].empty?

    # Join the whole group into its first line
    @document[head.index] = CodeLine.new(
      lex: group.map(&:lex).flatten,
      line: group.join,
      index: head.index
    )

    # Hide the rest of the lines. The joined line already carries their
    # newlines, so an empty string is used here to avoid doubling them.
    rest.each do |hidden|
      @document[hidden.index] = CodeLine.new(line: "", index: hidden.index, lex: [])
    end
  end
  self
end
# Helper method for grabbing elements from document
#
# Like `take_while` except when it stops
# iterating, it also returns the line
# that caused it to stop
def take_while_including(range = 0..)
  collected = []
  @document[range].each do |code_line|
    # The line is kept before the check, so the first line that fails the
    # block is still part of the result
    collected << code_line
    break unless yield(code_line)
  end
  collected
end
end
end