Enhance keep_tokens option for RubyVM::AbstractSyntaxTree parsing methods

Implementations of the Language Server Protocol (LSP) sometimes need token information.
For example, both `m(1)` and `m(1, )` have the same AST structure apart from node locations,
so it's impossible to check for the existence of `,` from the AST. However, in the latter case,
it might be better to suggest a list of variables for the second argument.
Token information is important for such cases.

This commit adds the following option and methods.

* Add `keep_tokens` option for `RubyVM::AbstractSyntaxTree.parse`, `.parse_file` and `.of`
* Add `RubyVM::AbstractSyntaxTree::Node#tokens` which returns tokens for the node, including tokens for descendant nodes.
* Add `RubyVM::AbstractSyntaxTree::Node#all_tokens` which returns all tokens for the input script regardless of the receiver node.

[Feature #19070]

Impacts on memory usage and performance are below:

Memory usage:

```
$ cat test.rb
root = RubyVM::AbstractSyntaxTree.parse_file(File.expand_path('../test/ruby/test_keyword.rb', __FILE__), keep_tokens: true)

$ /usr/bin/time -f %Mkb /usr/local/bin/ruby -v
ruby 3.2.0dev (2022-11-19T09:41:54Z 19070-keep_tokens d3af1b8057) [x86_64-linux]
11408kb

# keep_tokens :false
$ /usr/bin/time -f %Mkb /usr/local/bin/ruby test.rb
17508kb

# keep_tokens :true
$ /usr/bin/time -f %Mkb /usr/local/bin/ruby test.rb
30960kb
```

Performance:

```
$ cat ../ast_keep_tokens.yml
prelude: |
  src = <<~SRC
    module M
      class C
        def m1(a, b)
          1 + a + b
        end
      end
    end
  SRC
benchmark:
  without_keep_tokens: |
    RubyVM::AbstractSyntaxTree.parse(src, keep_tokens: false)
  with_keep_tokens: |
    RubyVM::AbstractSyntaxTree.parse(src, keep_tokens: true)

$ make benchmark COMPARE_RUBY="./ruby" ARGS=../ast_keep_tokens.yml
/home/kaneko.y/.rbenv/shims/ruby --disable=gems -rrubygems -I../benchmark/lib ../benchmark/benchmark-driver/exe/benchmark-driver \
            --executables="compare-ruby::./ruby -I.ext/common --disable-gem" \
            --executables="built-ruby::./miniruby -I../lib -I. -I.ext/common  ../tool/runruby.rb --extout=.ext  -- --disable-gems --disable-gem" \
            --output=markdown --output-compare -v ../ast_keep_tokens.yml
compare-ruby: ruby 3.2.0dev (2022-11-19T09:41:54Z 19070-keep_tokens d3af1b8057) [x86_64-linux]
built-ruby: ruby 3.2.0dev (2022-11-19T09:41:54Z 19070-keep_tokens d3af1b8057) [x86_64-linux]
warming up..

|                     |compare-ruby|built-ruby|
|:--------------------|-----------:|---------:|
|without_keep_tokens  |     21.659k|   21.303k|
|                     |       1.02x|         -|
|with_keep_tokens     |      6.220k|    5.691k|
|                     |       1.09x|         -|
```
This commit is contained in:
yui-knk 2022-09-23 22:40:02 +09:00 коммит произвёл Yuichiro Kaneko
Родитель bbc4cf5f76
Коммит d8601621ed
9 изменённых файлов: 556 добавлений и 104 удалений

Просмотреть файл

@ -192,6 +192,14 @@ Note: We're only listing outstanding class updates.
* RubyVM::AbstractSyntaxTree * RubyVM::AbstractSyntaxTree
* Add `error_tolerant` option for `parse`, `parse_file` and `of`. [[Feature #19013]] * Add `error_tolerant` option for `parse`, `parse_file` and `of`. [[Feature #19013]]
* Add `keep_tokens` option for `parse`, `parse_file` and `of`. Add `#tokens` and `#all_tokens`
for `RubyVM::AbstractSyntaxTree::Node` [[Feature #19070]]
```ruby
root = RubyVM::AbstractSyntaxTree.parse("x = 1 + 2", keep_tokens: true)
root.tokens # => [[0, :tIDENTIFIER, "x", [1, 0, 1, 1]], [1, :tSP, " ", [1, 1, 1, 2]], ...]
root.tokens.map{_1[2]}.join # => "x = 1 + 2"
```
* Set * Set
* Set is now available as a built-in class without the need for `require "set"`. [[Feature #16989]] * Set is now available as a built-in class without the need for `require "set"`. [[Feature #16989]]

38
ast.c
Просмотреть файл

@ -64,8 +64,8 @@ ast_new_internal(rb_ast_t *ast, const NODE *node)
return obj; return obj;
} }
static VALUE rb_ast_parse_str(VALUE str, VALUE keep_script_lines, VALUE error_tolerant); static VALUE rb_ast_parse_str(VALUE str, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens);
static VALUE rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant); static VALUE rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens);
static VALUE static VALUE
ast_parse_new(void) ast_parse_new(void)
@ -85,13 +85,13 @@ ast_parse_done(rb_ast_t *ast)
} }
static VALUE static VALUE
ast_s_parse(rb_execution_context_t *ec, VALUE module, VALUE str, VALUE keep_script_lines, VALUE error_tolerant) ast_s_parse(rb_execution_context_t *ec, VALUE module, VALUE str, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
{ {
return rb_ast_parse_str(str, keep_script_lines, error_tolerant); return rb_ast_parse_str(str, keep_script_lines, error_tolerant, keep_tokens);
} }
static VALUE static VALUE
rb_ast_parse_str(VALUE str, VALUE keep_script_lines, VALUE error_tolerant) rb_ast_parse_str(VALUE str, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
{ {
rb_ast_t *ast = 0; rb_ast_t *ast = 0;
@ -99,18 +99,19 @@ rb_ast_parse_str(VALUE str, VALUE keep_script_lines, VALUE error_tolerant)
VALUE vparser = ast_parse_new(); VALUE vparser = ast_parse_new();
if (RTEST(keep_script_lines)) rb_parser_keep_script_lines(vparser); if (RTEST(keep_script_lines)) rb_parser_keep_script_lines(vparser);
if (RTEST(error_tolerant)) rb_parser_error_tolerant(vparser); if (RTEST(error_tolerant)) rb_parser_error_tolerant(vparser);
if (RTEST(keep_tokens)) rb_parser_keep_tokens(vparser);
ast = rb_parser_compile_string_path(vparser, Qnil, str, 1); ast = rb_parser_compile_string_path(vparser, Qnil, str, 1);
return ast_parse_done(ast); return ast_parse_done(ast);
} }
static VALUE static VALUE
ast_s_parse_file(rb_execution_context_t *ec, VALUE module, VALUE path, VALUE keep_script_lines, VALUE error_tolerant) ast_s_parse_file(rb_execution_context_t *ec, VALUE module, VALUE path, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
{ {
return rb_ast_parse_file(path, keep_script_lines, error_tolerant); return rb_ast_parse_file(path, keep_script_lines, error_tolerant, keep_tokens);
} }
static VALUE static VALUE
rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant) rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
{ {
VALUE f; VALUE f;
rb_ast_t *ast = 0; rb_ast_t *ast = 0;
@ -122,6 +123,7 @@ rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant)
VALUE vparser = ast_parse_new(); VALUE vparser = ast_parse_new();
if (RTEST(keep_script_lines)) rb_parser_keep_script_lines(vparser); if (RTEST(keep_script_lines)) rb_parser_keep_script_lines(vparser);
if (RTEST(error_tolerant)) rb_parser_error_tolerant(vparser); if (RTEST(error_tolerant)) rb_parser_error_tolerant(vparser);
if (RTEST(keep_tokens)) rb_parser_keep_tokens(vparser);
ast = rb_parser_compile_file_path(vparser, Qnil, f, 1); ast = rb_parser_compile_file_path(vparser, Qnil, f, 1);
rb_io_close(f); rb_io_close(f);
return ast_parse_done(ast); return ast_parse_done(ast);
@ -141,7 +143,7 @@ lex_array(VALUE array, int index)
} }
static VALUE static VALUE
rb_ast_parse_array(VALUE array, VALUE keep_script_lines, VALUE error_tolerant) rb_ast_parse_array(VALUE array, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
{ {
rb_ast_t *ast = 0; rb_ast_t *ast = 0;
@ -149,6 +151,7 @@ rb_ast_parse_array(VALUE array, VALUE keep_script_lines, VALUE error_tolerant)
VALUE vparser = ast_parse_new(); VALUE vparser = ast_parse_new();
if (RTEST(keep_script_lines)) rb_parser_keep_script_lines(vparser); if (RTEST(keep_script_lines)) rb_parser_keep_script_lines(vparser);
if (RTEST(error_tolerant)) rb_parser_error_tolerant(vparser); if (RTEST(error_tolerant)) rb_parser_error_tolerant(vparser);
if (RTEST(keep_tokens)) rb_parser_keep_tokens(vparser);
ast = rb_parser_compile_generic(vparser, lex_array, Qnil, array, 1); ast = rb_parser_compile_generic(vparser, lex_array, Qnil, array, 1);
return ast_parse_done(ast); return ast_parse_done(ast);
} }
@ -208,7 +211,7 @@ node_id_for_backtrace_location(rb_execution_context_t *ec, VALUE module, VALUE l
} }
static VALUE static VALUE
ast_s_of(rb_execution_context_t *ec, VALUE module, VALUE body, VALUE keep_script_lines, VALUE error_tolerant) ast_s_of(rb_execution_context_t *ec, VALUE module, VALUE body, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
{ {
VALUE node, lines = Qnil; VALUE node, lines = Qnil;
const rb_iseq_t *iseq; const rb_iseq_t *iseq;
@ -247,13 +250,13 @@ ast_s_of(rb_execution_context_t *ec, VALUE module, VALUE body, VALUE keep_script
} }
if (!NIL_P(lines) || !NIL_P(lines = script_lines(path))) { if (!NIL_P(lines) || !NIL_P(lines = script_lines(path))) {
node = rb_ast_parse_array(lines, keep_script_lines, error_tolerant); node = rb_ast_parse_array(lines, keep_script_lines, error_tolerant, keep_tokens);
} }
else if (e_option) { else if (e_option) {
node = rb_ast_parse_str(rb_e_script, keep_script_lines, error_tolerant); node = rb_ast_parse_str(rb_e_script, keep_script_lines, error_tolerant, keep_tokens);
} }
else { else {
node = rb_ast_parse_file(path, keep_script_lines, error_tolerant); node = rb_ast_parse_file(path, keep_script_lines, error_tolerant, keep_tokens);
} }
return node_find(node, node_id); return node_find(node, node_id);
@ -715,6 +718,15 @@ ast_node_last_column(rb_execution_context_t *ec, VALUE self)
return INT2NUM(nd_last_column(data->node)); return INT2NUM(nd_last_column(data->node));
} }
/*
 * Primitive behind RubyVM::AbstractSyntaxTree::Node#all_tokens.
 *
 * Unwraps the ASTNodeData carried by +self+ and returns whatever
 * rb_ast_tokens() reports for the AST that owns the node. The result does
 * not depend on which node of that AST is the receiver.
 */
static VALUE
ast_node_all_tokens(rb_execution_context_t *ec, VALUE self)
{
struct ASTNodeData *data;
TypedData_Get_Struct(self, struct ASTNodeData, &rb_node_type, data);
return rb_ast_tokens(data->ast);
}
static VALUE static VALUE
ast_node_inspect(rb_execution_context_t *ec, VALUE self) ast_node_inspect(rb_execution_context_t *ec, VALUE self)
{ {

52
ast.rb
Просмотреть файл

@ -29,8 +29,8 @@ module RubyVM::AbstractSyntaxTree
# #
# RubyVM::AbstractSyntaxTree.parse("x = 1 + 2") # RubyVM::AbstractSyntaxTree.parse("x = 1 + 2")
# # => #<RubyVM::AbstractSyntaxTree::Node:SCOPE@1:0-1:9> # # => #<RubyVM::AbstractSyntaxTree::Node:SCOPE@1:0-1:9>
def self.parse string, keep_script_lines: false, error_tolerant: false def self.parse string, keep_script_lines: false, error_tolerant: false, keep_tokens: false
Primitive.ast_s_parse string, keep_script_lines, error_tolerant Primitive.ast_s_parse string, keep_script_lines, error_tolerant, keep_tokens
end end
# call-seq: # call-seq:
@ -44,8 +44,8 @@ module RubyVM::AbstractSyntaxTree
# #
# RubyVM::AbstractSyntaxTree.parse_file("my-app/app.rb") # RubyVM::AbstractSyntaxTree.parse_file("my-app/app.rb")
# # => #<RubyVM::AbstractSyntaxTree::Node:SCOPE@1:0-31:3> # # => #<RubyVM::AbstractSyntaxTree::Node:SCOPE@1:0-31:3>
def self.parse_file pathname, keep_script_lines: false, error_tolerant: false def self.parse_file pathname, keep_script_lines: false, error_tolerant: false, keep_tokens: false
Primitive.ast_s_parse_file pathname, keep_script_lines, error_tolerant Primitive.ast_s_parse_file pathname, keep_script_lines, error_tolerant, keep_tokens
end end
# call-seq: # call-seq:
@ -63,8 +63,8 @@ module RubyVM::AbstractSyntaxTree
# #
# RubyVM::AbstractSyntaxTree.of(method(:hello)) # RubyVM::AbstractSyntaxTree.of(method(:hello))
# # => #<RubyVM::AbstractSyntaxTree::Node:SCOPE@1:0-3:3> # # => #<RubyVM::AbstractSyntaxTree::Node:SCOPE@1:0-3:3>
def self.of body, keep_script_lines: false, error_tolerant: false def self.of body, keep_script_lines: false, error_tolerant: false, keep_tokens: false
Primitive.ast_s_of body, keep_script_lines, error_tolerant Primitive.ast_s_of body, keep_script_lines, error_tolerant, keep_tokens
end end
# call-seq: # call-seq:
@ -136,6 +136,46 @@ module RubyVM::AbstractSyntaxTree
Primitive.ast_node_last_column Primitive.ast_node_last_column
end end
# call-seq:
#    node.tokens -> array
#
# Returns the tokens that lie within the location of the node, i.e. every
# token that starts at or after the node's first position and ends at or
# before its last position (tokens of descendant nodes are included).
# Returns nil if keep_tokens was not enabled when the parse method was called.
# Each token is an array of:
#
# - id
# - token type
# - source code text
# - location [first_lineno, first_column, last_lineno, last_column]
#
#    root = RubyVM::AbstractSyntaxTree.parse("x = 1 + 2", keep_tokens: true)
#    root.tokens # => [[0, :tIDENTIFIER, "x", [1, 0, 1, 1]], [1, :tSP, " ", [1, 1, 1, 2]], ...]
#    root.tokens.map{_1[2]}.join # => "x = 1 + 2"
def tokens
  # Fetch the token list once; it is the same array for every call.
  every_token = all_tokens
  return nil unless every_token

  # The node's bounds do not change while filtering, so build them once
  # instead of once per token.
  node_start = [first_lineno, first_column]
  node_finish = [last_lineno, last_column]

  every_token.select do |token|
    loc = token.last
    (node_start <=> [loc[0], loc[1]]) <= 0 &&
      (node_finish <=> [loc[2], loc[3]]) >= 0
  end
end
# call-seq:
#    node.all_tokens -> array
#
# Returns all tokens for the input script regardless of the receiver node.
# Returns nil if keep_tokens was not enabled when the parse method was called.
#
#    root = RubyVM::AbstractSyntaxTree.parse("x = 1 + 2", keep_tokens: true)
#    root.all_tokens # => [[0, :tIDENTIFIER, "x", [1, 0, 1, 1]], [1, :tSP, " ", [1, 1, 1, 2]], ...]
#    root.children[-1].all_tokens # => [[0, :tIDENTIFIER, "x", [1, 0, 1, 1]], [1, :tSP, " ", [1, 1, 1, 2]], ...]
def all_tokens
Primitive.ast_node_all_tokens
end
# call-seq: # call-seq:
# node.children -> array # node.children -> array
# #

Просмотреть файл

@ -1,22 +1,3 @@
enum {
tIGNORED_NL = tLAST_TOKEN + 1,
# define tIGNORED_NL ((enum yytokentype)tIGNORED_NL)
tCOMMENT,
# define tCOMMENT ((enum yytokentype)tCOMMENT)
tEMBDOC_BEG,
# define tEMBDOC_BEG ((enum yytokentype)tEMBDOC_BEG)
tEMBDOC,
# define tEMBDOC ((enum yytokentype)tEMBDOC)
tEMBDOC_END,
# define tEMBDOC_END ((enum yytokentype)tEMBDOC_END)
tHEREDOC_BEG,
# define tHEREDOC_BEG ((enum yytokentype)tHEREDOC_BEG)
tHEREDOC_END,
# define tHEREDOC_END ((enum yytokentype)tHEREDOC_END)
k__END__,
# define k__END__ ((enum yytokentype)k__END__)
};
typedef struct { typedef struct {
ID ripper_id_backref; ID ripper_id_backref;
ID ripper_id_backtick; ID ripper_id_backtick;

Просмотреть файл

@ -16,6 +16,7 @@ VALUE rb_parser_set_yydebug(VALUE, VALUE);
void *rb_parser_load_file(VALUE parser, VALUE name); void *rb_parser_load_file(VALUE parser, VALUE name);
void rb_parser_keep_script_lines(VALUE vparser); void rb_parser_keep_script_lines(VALUE vparser);
void rb_parser_error_tolerant(VALUE vparser); void rb_parser_error_tolerant(VALUE vparser);
void rb_parser_keep_tokens(VALUE vparser);
RUBY_SYMBOL_EXPORT_BEGIN RUBY_SYMBOL_EXPORT_BEGIN
VALUE rb_parser_set_context(VALUE, const struct rb_iseq_struct *, int); VALUE rb_parser_set_context(VALUE, const struct rb_iseq_struct *, int);

24
node.c
Просмотреть файл

@ -1161,6 +1161,12 @@ struct node_buffer_struct {
node_buffer_list_t markable; node_buffer_list_t markable;
struct rb_ast_local_table_link *local_tables; struct rb_ast_local_table_link *local_tables;
VALUE mark_hash; VALUE mark_hash;
// - id (sequence number)
// - token_type
// - text of token
// - location info
// Array, whose entry is array
VALUE tokens;
}; };
static void static void
@ -1187,6 +1193,7 @@ rb_node_buffer_new(void)
init_node_buffer_list(&nb->markable, (node_buffer_elem_t*)((size_t)nb->unmarkable.head + bucket_size)); init_node_buffer_list(&nb->markable, (node_buffer_elem_t*)((size_t)nb->unmarkable.head + bucket_size));
nb->local_tables = 0; nb->local_tables = 0;
nb->mark_hash = Qnil; nb->mark_hash = Qnil;
nb->tokens = Qnil;
return nb; return nb;
} }
@ -1418,7 +1425,10 @@ rb_ast_update_references(rb_ast_t *ast)
void void
rb_ast_mark(rb_ast_t *ast) rb_ast_mark(rb_ast_t *ast)
{ {
if (ast->node_buffer) rb_gc_mark(ast->node_buffer->mark_hash); if (ast->node_buffer) {
rb_gc_mark(ast->node_buffer->mark_hash);
rb_gc_mark(ast->node_buffer->tokens);
}
if (ast->body.compile_option) rb_gc_mark(ast->body.compile_option); if (ast->body.compile_option) rb_gc_mark(ast->body.compile_option);
if (ast->node_buffer) { if (ast->node_buffer) {
node_buffer_t *nb = ast->node_buffer; node_buffer_t *nb = ast->node_buffer;
@ -1477,3 +1487,15 @@ rb_ast_add_mark_object(rb_ast_t *ast, VALUE obj)
} }
rb_hash_aset(ast->node_buffer->mark_hash, obj, Qtrue); rb_hash_aset(ast->node_buffer->mark_hash, obj, Qtrue);
} }
/*
 * Return the token array stored in the AST's node buffer.
 *
 * The slot is initialized to Qnil by rb_node_buffer_new() and only changes
 * when rb_ast_set_tokens() stores a value, so Qnil means no tokens were kept.
 * NOTE(review): ast->node_buffer is dereferenced without a NULL check --
 * confirm callers never pass an AST whose node buffer is gone.
 */
VALUE
rb_ast_tokens(rb_ast_t *ast)
{
return ast->node_buffer->tokens;
}
/*
 * Store +tokens+ in the AST's node buffer.
 *
 * The assignment goes through RB_OBJ_WRITE with +ast+ as the owning object
 * rather than a plain store; rb_ast_mark() marks the same slot.
 */
void
rb_ast_set_tokens(rb_ast_t *ast, VALUE tokens)
{
RB_OBJ_WRITE(ast, &ast->node_buffer->tokens, tokens);
}

2
node.h
Просмотреть файл

@ -421,6 +421,8 @@ void rb_ast_dispose(rb_ast_t*);
void rb_ast_free(rb_ast_t*); void rb_ast_free(rb_ast_t*);
size_t rb_ast_memsize(const rb_ast_t*); size_t rb_ast_memsize(const rb_ast_t*);
void rb_ast_add_mark_object(rb_ast_t*, VALUE); void rb_ast_add_mark_object(rb_ast_t*, VALUE);
void rb_ast_set_tokens(rb_ast_t*, VALUE);
VALUE rb_ast_tokens(rb_ast_t *ast);
NODE *rb_ast_newnode(rb_ast_t*, enum node_type type); NODE *rb_ast_newnode(rb_ast_t*, enum node_type type);
void rb_ast_delete_node(rb_ast_t*, NODE *n); void rb_ast_delete_node(rb_ast_t*, NODE *n);
rb_ast_id_table_t *rb_ast_new_local_table(rb_ast_t*, int); rb_ast_id_table_t *rb_ast_new_local_table(rb_ast_t*, int);

480
parse.y
Просмотреть файл

@ -124,7 +124,13 @@ RBIMPL_WARNING_POP()
#define RUBY_SET_YYLLOC_FROM_STRTERM_HEREDOC(Current) \ #define RUBY_SET_YYLLOC_FROM_STRTERM_HEREDOC(Current) \
rb_parser_set_location_from_strterm_heredoc(p, &p->lex.strterm->u.heredoc, &(Current)) rb_parser_set_location_from_strterm_heredoc(p, &p->lex.strterm->u.heredoc, &(Current))
#define RUBY_SET_YYLLOC_OF_NONE(Current) \ #define RUBY_SET_YYLLOC_OF_DELAYED_TOKEN(Current) \
rb_parser_set_location_of_delayed_token(p, &(Current))
#define RUBY_SET_YYLLOC_OF_HEREDOC_END(Current) \
rb_parser_set_location_of_heredoc_end(p, &(Current))
#define RUBY_SET_YYLLOC_OF_DUMMY_END(Current) \
rb_parser_set_location_of_dummy_end(p, &(Current))
#define RUBY_SET_YYLLOC_OF_NONE(Current) \
rb_parser_set_location_of_none(p, &(Current)) rb_parser_set_location_of_none(p, &(Current))
#define RUBY_SET_YYLLOC(Current) \ #define RUBY_SET_YYLLOC(Current) \
rb_parser_set_location(p, &(Current)) rb_parser_set_location(p, &(Current))
@ -272,12 +278,12 @@ struct parser_params {
rb_imemo_tmpbuf_t *heap; rb_imemo_tmpbuf_t *heap;
YYSTYPE *lval; YYSTYPE *lval;
YYLTYPE *yylloc;
struct { struct {
rb_strterm_t *strterm; rb_strterm_t *strterm;
VALUE (*gets)(struct parser_params*,VALUE); VALUE (*gets)(struct parser_params*,VALUE);
VALUE input; VALUE input;
VALUE prevline;
VALUE lastline; VALUE lastline;
VALUE nextline; VALUE nextline;
const char *pbeg; const char *pbeg;
@ -320,6 +326,14 @@ struct parser_params {
VALUE debug_buffer; VALUE debug_buffer;
VALUE debug_output; VALUE debug_output;
struct {
VALUE token;
int beg_line;
int beg_col;
int end_line;
int end_col;
} delayed;
ID cur_arg; ID cur_arg;
rb_ast_t *ast; rb_ast_t *ast;
@ -351,6 +365,7 @@ struct parser_params {
unsigned int do_split: 1; unsigned int do_split: 1;
unsigned int keep_script_lines: 1; unsigned int keep_script_lines: 1;
unsigned int error_tolerant: 1; unsigned int error_tolerant: 1;
unsigned int keep_tokens: 1;
NODE *eval_tree_begin; NODE *eval_tree_begin;
NODE *eval_tree; NODE *eval_tree;
@ -359,15 +374,13 @@ struct parser_params {
const struct rb_iseq_struct *parent_iseq; const struct rb_iseq_struct *parent_iseq;
/* store specific keyword locations to generate dummy end token */ /* store specific keyword locations to generate dummy end token */
VALUE end_expect_token_locations; VALUE end_expect_token_locations;
/* id for terms */
int token_id;
/* Array for term tokens */
VALUE tokens;
#else #else
/* Ripper only */ /* Ripper only */
struct {
VALUE token;
int line;
int col;
} delayed;
VALUE value; VALUE value;
VALUE result; VALUE result;
VALUE parsing_thread; VALUE parsing_thread;
@ -447,6 +460,177 @@ peek_end_expect_token_locations(struct parser_params *p)
if(NIL_P(p->end_expect_token_locations)) return Qnil; if(NIL_P(p->end_expect_token_locations)) return Qnil;
return rb_ary_last(0, 0, p->end_expect_token_locations); return rb_ary_last(0, 0, p->end_expect_token_locations);
} }
/*
 * Map a lexer token type to the ID naming it in kept tokens
 * (parser_append_tokens wraps the result with ID2SYM).
 *
 * TOKEN2ID(tok) interns the enumerator's own spelling (#tok);
 * TOKEN2ID2(tok, name) interns an explicit name, used here for the
 * single-character tokens. Most of those are named by the character itself;
 * ' ' becomes "words_sep" and '\n' becomes "nl".
 *
 * Both macros already end in ';', so the ';' written after most entries is
 * just an empty statement, and the entries without one are equally valid.
 *
 * Any value without a case falls out of the switch and reaches rb_bug().
 * NOTE(review): tDUMNY_END is presumably the spelling used by the token's
 * declaration elsewhere in the grammar -- confirm before "fixing" it.
 */
static ID
parser_token2id(enum yytokentype tok)
{
switch ((int) tok) {
#define TOKEN2ID(tok) case tok: return rb_intern(#tok);
#define TOKEN2ID2(tok, name) case tok: return rb_intern(name);
TOKEN2ID2(' ', "words_sep")
TOKEN2ID2('!', "!")
TOKEN2ID2('%', "%");
TOKEN2ID2('&', "&");
TOKEN2ID2('*', "*");
TOKEN2ID2('+', "+");
TOKEN2ID2('-', "-");
TOKEN2ID2('/', "/");
TOKEN2ID2('<', "<");
TOKEN2ID2('=', "=");
TOKEN2ID2('>', ">");
TOKEN2ID2('?', "?");
TOKEN2ID2('^', "^");
TOKEN2ID2('|', "|");
TOKEN2ID2('~', "~");
TOKEN2ID2(':', ":");
TOKEN2ID2(',', ",");
TOKEN2ID2('.', ".");
TOKEN2ID2(';', ";");
TOKEN2ID2('`', "`");
TOKEN2ID2('\n', "nl");
TOKEN2ID2('{', "{");
TOKEN2ID2('}', "}");
TOKEN2ID2('[', "[");
TOKEN2ID2(']', "]");
TOKEN2ID2('(', "(");
TOKEN2ID2(')', ")");
TOKEN2ID(keyword_class);
TOKEN2ID(keyword_module);
TOKEN2ID(keyword_def);
TOKEN2ID(keyword_undef);
TOKEN2ID(keyword_begin);
TOKEN2ID(keyword_rescue);
TOKEN2ID(keyword_ensure);
TOKEN2ID(keyword_end);
TOKEN2ID(keyword_if);
TOKEN2ID(keyword_unless);
TOKEN2ID(keyword_then);
TOKEN2ID(keyword_elsif);
TOKEN2ID(keyword_else);
TOKEN2ID(keyword_case);
TOKEN2ID(keyword_when);
TOKEN2ID(keyword_while);
TOKEN2ID(keyword_until);
TOKEN2ID(keyword_for);
TOKEN2ID(keyword_break);
TOKEN2ID(keyword_next);
TOKEN2ID(keyword_redo);
TOKEN2ID(keyword_retry);
TOKEN2ID(keyword_in);
TOKEN2ID(keyword_do);
TOKEN2ID(keyword_do_cond);
TOKEN2ID(keyword_do_block);
TOKEN2ID(keyword_do_LAMBDA);
TOKEN2ID(keyword_return);
TOKEN2ID(keyword_yield);
TOKEN2ID(keyword_super);
TOKEN2ID(keyword_self);
TOKEN2ID(keyword_nil);
TOKEN2ID(keyword_true);
TOKEN2ID(keyword_false);
TOKEN2ID(keyword_and);
TOKEN2ID(keyword_or);
TOKEN2ID(keyword_not);
TOKEN2ID(modifier_if);
TOKEN2ID(modifier_unless);
TOKEN2ID(modifier_while);
TOKEN2ID(modifier_until);
TOKEN2ID(modifier_rescue);
TOKEN2ID(keyword_alias);
TOKEN2ID(keyword_defined);
TOKEN2ID(keyword_BEGIN);
TOKEN2ID(keyword_END);
TOKEN2ID(keyword__LINE__);
TOKEN2ID(keyword__FILE__);
TOKEN2ID(keyword__ENCODING__);
TOKEN2ID(tIDENTIFIER);
TOKEN2ID(tFID);
TOKEN2ID(tGVAR);
TOKEN2ID(tIVAR);
TOKEN2ID(tCONSTANT);
TOKEN2ID(tCVAR);
TOKEN2ID(tLABEL);
TOKEN2ID(tINTEGER);
TOKEN2ID(tFLOAT);
TOKEN2ID(tRATIONAL);
TOKEN2ID(tIMAGINARY);
TOKEN2ID(tCHAR);
TOKEN2ID(tNTH_REF);
TOKEN2ID(tBACK_REF);
TOKEN2ID(tSTRING_CONTENT);
TOKEN2ID(tREGEXP_END);
TOKEN2ID(tDUMNY_END);
TOKEN2ID(tSP);
TOKEN2ID(tUPLUS);
TOKEN2ID(tUMINUS);
TOKEN2ID(tPOW);
TOKEN2ID(tCMP);
TOKEN2ID(tEQ);
TOKEN2ID(tEQQ);
TOKEN2ID(tNEQ);
TOKEN2ID(tGEQ);
TOKEN2ID(tLEQ);
TOKEN2ID(tANDOP);
TOKEN2ID(tOROP);
TOKEN2ID(tMATCH);
TOKEN2ID(tNMATCH);
TOKEN2ID(tDOT2);
TOKEN2ID(tDOT3);
TOKEN2ID(tBDOT2);
TOKEN2ID(tBDOT3);
TOKEN2ID(tAREF);
TOKEN2ID(tASET);
TOKEN2ID(tLSHFT);
TOKEN2ID(tRSHFT);
TOKEN2ID(tANDDOT);
TOKEN2ID(tCOLON2);
TOKEN2ID(tCOLON3);
TOKEN2ID(tOP_ASGN);
TOKEN2ID(tASSOC);
TOKEN2ID(tLPAREN);
TOKEN2ID(tLPAREN_ARG);
TOKEN2ID(tRPAREN);
TOKEN2ID(tLBRACK);
TOKEN2ID(tLBRACE);
TOKEN2ID(tLBRACE_ARG);
TOKEN2ID(tSTAR);
TOKEN2ID(tDSTAR);
TOKEN2ID(tAMPER);
TOKEN2ID(tLAMBDA);
TOKEN2ID(tSYMBEG);
TOKEN2ID(tSTRING_BEG);
TOKEN2ID(tXSTRING_BEG);
TOKEN2ID(tREGEXP_BEG);
TOKEN2ID(tWORDS_BEG);
TOKEN2ID(tQWORDS_BEG);
TOKEN2ID(tSYMBOLS_BEG);
TOKEN2ID(tQSYMBOLS_BEG);
TOKEN2ID(tSTRING_END);
TOKEN2ID(tSTRING_DEND);
TOKEN2ID(tSTRING_DBEG);
TOKEN2ID(tSTRING_DVAR);
TOKEN2ID(tLAMBEG);
TOKEN2ID(tLABEL_END);
TOKEN2ID(tIGNORED_NL);
TOKEN2ID(tCOMMENT);
TOKEN2ID(tEMBDOC_BEG);
TOKEN2ID(tEMBDOC);
TOKEN2ID(tEMBDOC_END);
TOKEN2ID(tHEREDOC_BEG);
TOKEN2ID(tHEREDOC_END);
TOKEN2ID(k__END__);
TOKEN2ID(tLOWEST);
TOKEN2ID(tUMINUS_NUM);
TOKEN2ID(tLAST_TOKEN);
#undef TOKEN2ID
#undef TOKEN2ID2
}
rb_bug("parser_token2id: unknown token %d", tok);
UNREACHABLE_RETURN(0);
}
#endif #endif
RBIMPL_ATTR_NONNULL((1, 2, 3)) RBIMPL_ATTR_NONNULL((1, 2, 3))
@ -457,6 +641,9 @@ static int parser_yyerror0(struct parser_params*, const char*);
#define yyerror1(loc, msg) parser_yyerror(p, (loc), (msg)) #define yyerror1(loc, msg) parser_yyerror(p, (loc), (msg))
#define yyerror(yylloc, p, msg) parser_yyerror(p, yylloc, msg) #define yyerror(yylloc, p, msg) parser_yyerror(p, yylloc, msg)
#define token_flush(ptr) ((ptr)->lex.ptok = (ptr)->lex.pcur) #define token_flush(ptr) ((ptr)->lex.ptok = (ptr)->lex.pcur)
#define lex_goto_eol(p) ((p)->lex.pcur = (p)->lex.pend)
#define lex_eol_p(p) ((p)->lex.pcur >= (p)->lex.pend)
#define lex_eol_n_p(p,n) ((p)->lex.pcur+(n) >= (p)->lex.pend)
static void token_info_setup(token_info *ptinfo, const char *ptr, const rb_code_location_t *loc); static void token_info_setup(token_info *ptinfo, const char *ptr, const rb_code_location_t *loc);
static void token_info_push(struct parser_params*, const char *token, const rb_code_location_t *loc); static void token_info_push(struct parser_params*, const char *token, const rb_code_location_t *loc);
@ -707,6 +894,9 @@ VALUE rb_parser_lex_state_name(enum lex_state_e state);
void rb_parser_show_bitstack(struct parser_params *, stack_type, const char *, int); void rb_parser_show_bitstack(struct parser_params *, stack_type, const char *, int);
PRINTF_ARGS(void rb_parser_fatal(struct parser_params *p, const char *fmt, ...), 2, 3); PRINTF_ARGS(void rb_parser_fatal(struct parser_params *p, const char *fmt, ...), 2, 3);
YYLTYPE *rb_parser_set_location_from_strterm_heredoc(struct parser_params *p, rb_strterm_heredoc_t *here, YYLTYPE *yylloc); YYLTYPE *rb_parser_set_location_from_strterm_heredoc(struct parser_params *p, rb_strterm_heredoc_t *here, YYLTYPE *yylloc);
YYLTYPE *rb_parser_set_location_of_delayed_token(struct parser_params *p, YYLTYPE *yylloc);
YYLTYPE *rb_parser_set_location_of_heredoc_end(struct parser_params *p, YYLTYPE *yylloc);
YYLTYPE *rb_parser_set_location_of_dummy_end(struct parser_params *p, YYLTYPE *yylloc);
YYLTYPE *rb_parser_set_location_of_none(struct parser_params *p, YYLTYPE *yylloc); YYLTYPE *rb_parser_set_location_of_none(struct parser_params *p, YYLTYPE *yylloc);
YYLTYPE *rb_parser_set_location(struct parser_params *p, YYLTYPE *yylloc); YYLTYPE *rb_parser_set_location(struct parser_params *p, YYLTYPE *yylloc);
RUBY_SYMBOL_EXPORT_END RUBY_SYMBOL_EXPORT_END
@ -1057,6 +1247,8 @@ endless_method_name(struct parser_params *p, NODE *defn, const YYLTYPE *loc)
token_info_drop(p, "def", loc->beg_pos); token_info_drop(p, "def", loc->beg_pos);
} }
#define debug_token_line(p, name, line) if (p->debug) rb_parser_printf(p, name ":%d (%d: %ld|%ld|%ld)\n", line, p->ruby_sourceline, p->lex.ptok - p->lex.pbeg, p->lex.pcur - p->lex.ptok, p->lex.pend - p->lex.pcur)
#ifndef RIPPER #ifndef RIPPER
# define Qnone 0 # define Qnone 0
# define Qnull 0 # define Qnull 0
@ -1356,6 +1548,9 @@ static int looking_at_eol_p(struct parser_params *p);
%token tSTRING_DEND "'}'" %token tSTRING_DEND "'}'"
%token tSTRING_DBEG tSTRING_DVAR tLAMBEG tLABEL_END %token tSTRING_DBEG tSTRING_DVAR tLAMBEG tLABEL_END
%token tIGNORED_NL tCOMMENT tEMBDOC_BEG tEMBDOC tEMBDOC_END
%token tHEREDOC_BEG tHEREDOC_END k__END__
/* /*
* precedence table * precedence table
*/ */
@ -3447,7 +3642,7 @@ k_if : keyword_if
token_info_push(p, "if", &@$); token_info_push(p, "if", &@$);
if (p->token_info && p->token_info->nonspc && if (p->token_info && p->token_info->nonspc &&
p->token_info->next && !strcmp(p->token_info->next->token, "else")) { p->token_info->next && !strcmp(p->token_info->next->token, "else")) {
const char *tok = p->lex.ptok; const char *tok = p->lex.ptok - rb_strlen_lit("if");
const char *beg = p->lex.pbeg + p->token_info->next->beg.column; const char *beg = p->lex.pbeg + p->token_info->next->beg.column;
beg += rb_strlen_lit("else"); beg += rb_strlen_lit("else");
while (beg < tok && ISSPACE(*beg)) beg++; while (beg < tok && ISSPACE(*beg)) beg++;
@ -5906,7 +6101,11 @@ trailer : opt_nl
; ;
term : ';' {yyerrok;token_flush(p);} term : ';' {yyerrok;token_flush(p);}
| '\n' {token_flush(p);} | '\n'
{
@$.end_pos = @$.beg_pos;
token_flush(p);
}
; ;
terms : term terms : term
@ -5967,12 +6166,91 @@ ripper_yylval_id(struct parser_params *p, ID x)
#endif #endif
#define set_yylval_noname() set_yylval_id(keyword_nil) #define set_yylval_noname() set_yylval_id(keyword_nil)
#define has_delayed_token(p) (!NIL_P(p->delayed.token))
#ifndef RIPPER #ifndef RIPPER
#define literal_flush(p, ptr) ((p)->lex.ptok = (ptr)) #define literal_flush(p, ptr) ((p)->lex.ptok = (ptr))
#define dispatch_scan_event(p, t) ((void)0) #define dispatch_scan_event(p, t) parser_dispatch_scan_event(p, t, __LINE__)
#define dispatch_delayed_token(p, t) ((void)0)
#define has_delayed_token(p) (0) static bool
parser_has_token(struct parser_params *p)
{
if (p->keep_tokens && (p->lex.pcur < p->lex.ptok)) rb_bug("lex.pcur < lex.ptok. (line: %d) %ld|%ld|%ld", p->ruby_sourceline, p->lex.ptok - p->lex.pbeg, p->lex.pcur - p->lex.ptok, p->lex.pend - p->lex.pcur);
return p->lex.pcur > p->lex.ptok;
}
/*
 * Convert a code location into a frozen four-element Ruby array:
 * [begin lineno, begin column, end lineno, end column].
 */
static VALUE
code_loc_to_ary(const rb_code_location_t *loc)
{
    VALUE first_line = INT2NUM(loc->beg_pos.lineno);
    VALUE first_col = INT2NUM(loc->beg_pos.column);
    VALUE final_line = INT2NUM(loc->end_pos.lineno);
    VALUE final_col = INT2NUM(loc->end_pos.column);
    VALUE quad = rb_ary_new_from_args(4, first_line, first_col, final_line, final_col);

    rb_obj_freeze(quad);
    return quad;
}
/*
 * Append one token entry to p->tokens.
 *
 * The entry is a frozen array [sequence id, token type symbol, text,
 * location array]; the location is taken from p->yylloc, so the caller must
 * have set it for this token beforehand. p->token_id is advanced after the
 * entry has been stored. +line+ is only used for the debug trace.
 */
static void
parser_append_tokens(struct parser_params *p, VALUE str, enum yytokentype t, int line)
{
    const int seq = p->token_id;
    VALUE type_sym = ID2SYM(parser_token2id(t));
    VALUE loc_ary = code_loc_to_ary(p->yylloc);
    VALUE entry = rb_ary_new_from_args(4, INT2FIX(seq), type_sym, str, loc_ary);

    rb_obj_freeze(entry);
    rb_ary_push(p->tokens, entry);
    p->token_id++;

    if (p->debug) {
        rb_parser_printf(p, "Append tokens (line: %d) %"PRIsVALUE"\n", line, entry);
    }
}
/*
 * Handle the token currently spanning lex.ptok..lex.pcur.
 *
 * Does nothing (beyond the debug trace) when parser_has_token() reports no
 * pending text. Otherwise the location in *p->yylloc is refreshed first,
 * because parser_append_tokens() reads it; the token is recorded only when
 * keep_tokens is set, and the token start is then flushed forward to pcur.
 */
static void
parser_dispatch_scan_event(struct parser_params *p, enum yytokentype t, int line)
{
    debug_token_line(p, "parser_dispatch_scan_event", line);

    if (parser_has_token(p)) {
        RUBY_SET_YYLLOC(*p->yylloc);

        if (p->keep_tokens) {
            const char *start = p->lex.ptok;
            long len = p->lex.pcur - start;

            parser_append_tokens(p, STR_NEW(start, len), t, line);
        }

        token_flush(p);
    }
}
#define dispatch_delayed_token(p, t) parser_dispatch_delayed_token(p, t, __LINE__)
/*
 * Emit the pending delayed token (p->delayed.token), if any, as a token of
 * type +t+, then clear it. +line+ is the caller's __LINE__, used only for
 * debug traces.
 */
static void
parser_dispatch_delayed_token(struct parser_params *p, enum yytokentype t, int line)
{
/* Saved so the lexer position can be restored after the temporary override below. */
int saved_line = p->ruby_sourceline;
const char *saved_tokp = p->lex.ptok;
debug_token_line(p, "parser_dispatch_delayed_token", line);
if (!has_delayed_token(p)) return;
/* Must precede parser_append_tokens(), which reads p->yylloc.
 * Presumably fills it from p->delayed.{beg,end}_{line,col} -- see
 * rb_parser_set_location_of_delayed_token. */
RUBY_SET_YYLLOC_OF_DELAYED_TOKEN(*p->yylloc);
if (p->keep_tokens) {
/* Point the lexer state at the delayed token's start while it is recorded... */
p->ruby_sourceline = p->delayed.beg_line;
p->lex.ptok = p->lex.pbeg + p->delayed.beg_col;
parser_append_tokens(p, p->delayed.token, t, line);
/* ...then put the real position back. */
p->ruby_sourceline = saved_line;
p->lex.ptok = saved_tokp;
}
/* The delayed token is consumed whether or not it was recorded. */
p->delayed.token = Qnil;
}
#else #else
#define literal_flush(p, ptr) ((void)(ptr)) #define literal_flush(p, ptr) ((void)(ptr))
@ -5997,6 +6275,7 @@ ripper_scan_event_val(struct parser_params *p, enum yytokentype t)
{ {
VALUE str = STR_NEW(p->lex.ptok, p->lex.pcur - p->lex.ptok); VALUE str = STR_NEW(p->lex.ptok, p->lex.pcur - p->lex.ptok);
VALUE rval = ripper_dispatch1(p, ripper_token2eventid(t), str); VALUE rval = ripper_dispatch1(p, ripper_token2eventid(t), str);
RUBY_SET_YYLLOC(*p->yylloc);
token_flush(p); token_flush(p);
return rval; return rval;
} }
@ -6016,15 +6295,14 @@ ripper_dispatch_delayed_token(struct parser_params *p, enum yytokentype t)
const char *saved_tokp = p->lex.ptok; const char *saved_tokp = p->lex.ptok;
if (NIL_P(p->delayed.token)) return; if (NIL_P(p->delayed.token)) return;
p->ruby_sourceline = p->delayed.line; p->ruby_sourceline = p->delayed.beg_line;
p->lex.ptok = p->lex.pbeg + p->delayed.col; p->lex.ptok = p->lex.pbeg + p->delayed.beg_col;
add_mark_object(p, yylval_rval = ripper_dispatch1(p, ripper_token2eventid(t), p->delayed.token)); add_mark_object(p, yylval_rval = ripper_dispatch1(p, ripper_token2eventid(t), p->delayed.token));
p->delayed.token = Qnil; p->delayed.token = Qnil;
p->ruby_sourceline = saved_line; p->ruby_sourceline = saved_line;
p->lex.ptok = saved_tokp; p->lex.ptok = saved_tokp;
} }
#define dispatch_delayed_token(p, t) ripper_dispatch_delayed_token(p, t) #define dispatch_delayed_token(p, t) ripper_dispatch_delayed_token(p, t)
#define has_delayed_token(p) (!NIL_P(p->delayed.token))
#endif /* RIPPER */ #endif /* RIPPER */
static inline int static inline int
@ -6495,7 +6773,6 @@ yycompile0(VALUE arg)
p->lex.strterm = 0; p->lex.strterm = 0;
p->lex.pcur = p->lex.pbeg = p->lex.pend = 0; p->lex.pcur = p->lex.pbeg = p->lex.pend = 0;
p->lex.prevline = p->lex.lastline = p->lex.nextline = 0;
if (n || p->error_p) { if (n || p->error_p) {
VALUE mesg = p->error_buffer; VALUE mesg = p->error_buffer;
if (!mesg) { if (!mesg) {
@ -6512,6 +6789,7 @@ yycompile0(VALUE arg)
} }
else { else {
VALUE opt = p->compile_option; VALUE opt = p->compile_option;
VALUE tokens = p->tokens;
NODE *prelude; NODE *prelude;
NODE *body = parser_append_options(p, tree->nd_body); NODE *body = parser_append_options(p, tree->nd_body);
if (!opt) opt = rb_obj_hide(rb_ident_hash_new()); if (!opt) opt = rb_obj_hide(rb_ident_hash_new());
@ -6519,6 +6797,10 @@ yycompile0(VALUE arg)
prelude = block_append(p, p->eval_tree_begin, body); prelude = block_append(p, p->eval_tree_begin, body);
tree->nd_body = prelude; tree->nd_body = prelude;
RB_OBJ_WRITE(p->ast, &p->ast->body.compile_option, opt); RB_OBJ_WRITE(p->ast, &p->ast->body.compile_option, opt);
if (p->keep_tokens) {
rb_obj_freeze(tokens);
rb_ast_set_tokens(p->ast, tokens);
}
} }
p->ast->body.root = tree; p->ast->body.root = tree;
if (!p->ast->body.script_lines) p->ast->body.script_lines = INT2FIX(p->line_count); if (!p->ast->body.script_lines) p->ast->body.script_lines = INT2FIX(p->line_count);
@ -6709,32 +6991,31 @@ parser_str_new(const char *ptr, long len, rb_encoding *enc, int func, rb_encodin
return str; return str;
} }
#define lex_goto_eol(p) ((p)->lex.pcur = (p)->lex.pend)
#define lex_eol_p(p) ((p)->lex.pcur >= (p)->lex.pend)
#define lex_eol_n_p(p,n) ((p)->lex.pcur+(n) >= (p)->lex.pend)
#define peek(p,c) peek_n(p, (c), 0) #define peek(p,c) peek_n(p, (c), 0)
#define peek_n(p,c,n) (!lex_eol_n_p(p, n) && (c) == (unsigned char)(p)->lex.pcur[n]) #define peek_n(p,c,n) (!lex_eol_n_p(p, n) && (c) == (unsigned char)(p)->lex.pcur[n])
#define peekc(p) peekc_n(p, 0) #define peekc(p) peekc_n(p, 0)
#define peekc_n(p,n) (lex_eol_n_p(p, n) ? -1 : (unsigned char)(p)->lex.pcur[n]) #define peekc_n(p,n) (lex_eol_n_p(p, n) ? -1 : (unsigned char)(p)->lex.pcur[n])
#ifdef RIPPER
static void static void
add_delayed_token(struct parser_params *p, const char *tok, const char *end) add_delayed_token(struct parser_params *p, const char *tok, const char *end, int line)
{ {
#ifndef RIPPER
debug_token_line(p, "add_delayed_token", line);
#endif
if (tok < end) { if (tok < end) {
if (!has_delayed_token(p)) { if (!has_delayed_token(p)) {
p->delayed.token = rb_str_buf_new(end - tok); p->delayed.token = rb_str_buf_new(end - tok);
rb_enc_associate(p->delayed.token, p->enc); rb_enc_associate(p->delayed.token, p->enc);
p->delayed.line = p->ruby_sourceline; p->delayed.beg_line = p->ruby_sourceline;
p->delayed.col = rb_long2int(tok - p->lex.pbeg); p->delayed.beg_col = rb_long2int(tok - p->lex.pbeg);
} }
rb_str_buf_cat(p->delayed.token, tok, end - tok); rb_str_buf_cat(p->delayed.token, tok, end - tok);
p->delayed.end_line = p->ruby_sourceline;
p->delayed.end_col = rb_long2int(end - p->lex.pbeg);
p->lex.ptok = end; p->lex.ptok = end;
} }
} }
#else
#define add_delayed_token(p, tok, end) ((void)(tok), (void)(end))
#endif
static int static int
nextline(struct parser_params *p, int set_encoding) nextline(struct parser_params *p, int set_encoding)
@ -6767,7 +7048,7 @@ nextline(struct parser_params *p, int set_encoding)
/* after here-document without terminator */ /* after here-document without terminator */
goto end_of_input; goto end_of_input;
} }
add_delayed_token(p, p->lex.ptok, p->lex.pend); add_delayed_token(p, p->lex.ptok, p->lex.pend, __LINE__);
if (p->heredoc_end > 0) { if (p->heredoc_end > 0) {
p->ruby_sourceline = p->heredoc_end; p->ruby_sourceline = p->heredoc_end;
p->heredoc_end = 0; p->heredoc_end = 0;
@ -6776,7 +7057,6 @@ nextline(struct parser_params *p, int set_encoding)
p->lex.pbeg = p->lex.pcur = RSTRING_PTR(v); p->lex.pbeg = p->lex.pcur = RSTRING_PTR(v);
p->lex.pend = p->lex.pcur + RSTRING_LEN(v); p->lex.pend = p->lex.pcur + RSTRING_LEN(v);
token_flush(p); token_flush(p);
p->lex.prevline = p->lex.lastline;
p->lex.lastline = v; p->lex.lastline = v;
return 0; return 0;
} }
@ -6929,20 +7209,22 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp,
{ {
size_t numlen; size_t numlen;
int codepoint = scan_hex(p->lex.pcur, wide ? p->lex.pend - p->lex.pcur : 4, &numlen); int codepoint = scan_hex(p->lex.pcur, wide ? p->lex.pend - p->lex.pcur : 4, &numlen);
literal_flush(p, p->lex.pcur);
p->lex.pcur += numlen; p->lex.pcur += numlen;
if (p->lex.strterm == NULL || if (p->lex.strterm == NULL ||
(p->lex.strterm->flags & STRTERM_HEREDOC) || (p->lex.strterm->flags & STRTERM_HEREDOC) ||
(p->lex.strterm->u.literal.u1.func != str_regexp)) { (p->lex.strterm->u.literal.u1.func != str_regexp)) {
if (wide ? (numlen == 0 || numlen > 6) : (numlen < 4)) { if (wide ? (numlen == 0 || numlen > 6) : (numlen < 4)) {
literal_flush(p, p->lex.pcur);
yyerror0("invalid Unicode escape"); yyerror0("invalid Unicode escape");
return wide && numlen > 0; return wide && numlen > 0;
} }
if (codepoint > 0x10ffff) { if (codepoint > 0x10ffff) {
literal_flush(p, p->lex.pcur);
yyerror0("invalid Unicode codepoint (too large)"); yyerror0("invalid Unicode codepoint (too large)");
return wide; return wide;
} }
if ((codepoint & 0xfffff800) == 0xd800) { if ((codepoint & 0xfffff800) == 0xd800) {
literal_flush(p, p->lex.pcur);
yyerror0("invalid Unicode codepoint"); yyerror0("invalid Unicode codepoint");
return wide; return wide;
} }
@ -7363,7 +7645,6 @@ tokadd_string(struct parser_params *p,
} }
} }
else if (c == '\\') { else if (c == '\\') {
literal_flush(p, p->lex.pcur - 1);
c = nextc(p); c = nextc(p);
switch (c) { switch (c) {
case '\n': case '\n':
@ -7511,7 +7792,21 @@ flush_string_content(struct parser_params *p, rb_encoding *enc)
yylval.val = content; yylval.val = content;
} }
#else #else
#define flush_string_content(p, enc) ((void)(enc)) static void
flush_string_content(struct parser_params *p, rb_encoding *enc)
{
/*
 * Emit the string content scanned so far as tSTRING_CONTENT.
 * NOTE(review): this appears to be the non-Ripper counterpart of the
 * Ripper flush_string_content defined just above -- confirm against the
 * enclosing #if, which is not visible from this hunk.
 */
if (has_delayed_token(p)) {
/* Bytes between the token start (ptok) and the cursor (pcur) that have
 * not yet been folded into the pending delayed token. */
ptrdiff_t len = p->lex.pcur - p->lex.ptok;
if (len > 0) {
rb_enc_str_buf_cat(p->delayed.token, p->lex.ptok, len, enc);
/* Extend the delayed token's end position to the current cursor. */
p->delayed.end_line = p->ruby_sourceline;
p->delayed.end_col = rb_long2int(p->lex.pcur - p->lex.pbeg);
}
dispatch_delayed_token(p, tSTRING_CONTENT);
/* Mark everything up to the cursor as consumed before the scan event below. */
p->lex.ptok = p->lex.pcur;
}
dispatch_scan_event(p, tSTRING_CONTENT);
}
#endif #endif
RUBY_FUNC_EXPORTED const unsigned int ruby_global_name_punct_bits[(0x7e - 0x20 + 31) / 32]; RUBY_FUNC_EXPORTED const unsigned int ruby_global_name_punct_bits[(0x7e - 0x20 + 31) / 32];
@ -7630,14 +7925,14 @@ parse_string(struct parser_params *p, rb_strterm_literal_t *quote)
if (func & STR_FUNC_QWORDS) { if (func & STR_FUNC_QWORDS) {
quote->u1.func |= STR_FUNC_TERM; quote->u1.func |= STR_FUNC_TERM;
pushback(p, c); /* dispatch the term at tSTRING_END */ pushback(p, c); /* dispatch the term at tSTRING_END */
add_delayed_token(p, p->lex.ptok, p->lex.pcur); add_delayed_token(p, p->lex.ptok, p->lex.pcur, __LINE__);
return ' '; return ' ';
} }
return parser_string_term(p, func); return parser_string_term(p, func);
} }
if (space) { if (space) {
pushback(p, c); pushback(p, c);
add_delayed_token(p, p->lex.ptok, p->lex.pcur); add_delayed_token(p, p->lex.ptok, p->lex.pcur, __LINE__);
return ' '; return ' ';
} }
newtok(p); newtok(p);
@ -7997,12 +8292,29 @@ dispatch_heredoc_end(struct parser_params *p)
dispatch_delayed_token(p, tSTRING_CONTENT); dispatch_delayed_token(p, tSTRING_CONTENT);
str = STR_NEW(p->lex.ptok, p->lex.pend - p->lex.ptok); str = STR_NEW(p->lex.ptok, p->lex.pend - p->lex.ptok);
ripper_dispatch1(p, ripper_token2eventid(tHEREDOC_END), str); ripper_dispatch1(p, ripper_token2eventid(tHEREDOC_END), str);
RUBY_SET_YYLLOC_FROM_STRTERM_HEREDOC(*p->yylloc);
lex_goto_eol(p); lex_goto_eol(p);
token_flush(p); token_flush(p);
} }
#else #else
#define dispatch_heredoc_end(p) ((void)0) #define dispatch_heredoc_end(p) parser_dispatch_heredoc_end(p, __LINE__)
/*
 * Handle the terminator line of a here-document.
 *
 * Reached through the dispatch_heredoc_end(p) macro, which passes __LINE__
 * as `line`; `line` is only forwarded to parser_append_tokens().
 *
 * Steps, in order:
 *   1. dispatch any pending delayed token as tSTRING_CONTENT;
 *   2. when p->keep_tokens is set, record the text from p->lex.ptok to
 *      p->lex.pend as a tHEREDOC_END token, with *p->yylloc set for the
 *      heredoc end first;
 *   3. set *p->yylloc from the heredoc strterm, move the cursor to the end
 *      of the line and flush the token start.
 */
static void
parser_dispatch_heredoc_end(struct parser_params *p, int line)
{
if (has_delayed_token(p))
dispatch_delayed_token(p, tSTRING_CONTENT);
if (p->keep_tokens) {
VALUE str = STR_NEW(p->lex.ptok, p->lex.pend - p->lex.ptok);
/* presumably expands to rb_parser_set_location_of_heredoc_end() -- verify the macro */
RUBY_SET_YYLLOC_OF_HEREDOC_END(*p->yylloc);
parser_append_tokens(p, str, tHEREDOC_END, line);
}
/* Overwrites the location set inside the keep_tokens branch above. */
RUBY_SET_YYLLOC_FROM_STRTERM_HEREDOC(*p->yylloc);
lex_goto_eol(p);
token_flush(p);
}
#endif #endif
static enum yytokentype static enum yytokentype
@ -9430,6 +9742,16 @@ parse_ident(struct parser_params *p, int c, int cmd_state)
return result; return result;
} }
/*
 * Warn about a carriage return found in the middle of a line.
 * The warning is issued only while p->cr_seen is still unset; the flag is
 * set before warning so later calls return immediately.
 */
static void
warn_cr(struct parser_params *p)
{
    if (p->cr_seen) return;

    /* carried over with p->lex.nextline for nextc() */
    p->cr_seen = TRUE;
    rb_warn0("encountered \\r in middle of line, treated as a mere space");
}
static enum yytokentype static enum yytokentype
parser_yylex(struct parser_params *p) parser_yylex(struct parser_params *p)
{ {
@ -9443,6 +9765,7 @@ parser_yylex(struct parser_params *p)
if (p->lex.strterm) { if (p->lex.strterm) {
if (p->lex.strterm->flags & STRTERM_HEREDOC) { if (p->lex.strterm->flags & STRTERM_HEREDOC) {
token_flush(p);
return here_document(p, &p->lex.strterm->u.heredoc); return here_document(p, &p->lex.strterm->u.heredoc);
} }
else { else {
@ -9453,11 +9776,11 @@ parser_yylex(struct parser_params *p)
cmd_state = p->command_start; cmd_state = p->command_start;
p->command_start = FALSE; p->command_start = FALSE;
p->token_seen = TRUE; p->token_seen = TRUE;
retry:
last_state = p->lex.state;
#ifndef RIPPER #ifndef RIPPER
token_flush(p); token_flush(p);
#endif #endif
retry:
last_state = p->lex.state;
switch (c = nextc(p)) { switch (c = nextc(p)) {
case '\0': /* NUL */ case '\0': /* NUL */
case '\004': /* ^D */ case '\004': /* ^D */
@ -9467,26 +9790,27 @@ parser_yylex(struct parser_params *p)
#ifndef RIPPER #ifndef RIPPER
if (!NIL_P(p->end_expect_token_locations) && RARRAY_LEN(p->end_expect_token_locations) > 0) { if (!NIL_P(p->end_expect_token_locations) && RARRAY_LEN(p->end_expect_token_locations) > 0) {
pop_end_expect_token_locations(p); pop_end_expect_token_locations(p);
RUBY_SET_YYLLOC_OF_DUMMY_END(*p->yylloc);
return tDUMNY_END; return tDUMNY_END;
} }
#endif #endif
/* Set location for end-of-input because dispatch_scan_event is not called. */
RUBY_SET_YYLLOC(*p->yylloc);
return 0; return 0;
/* white spaces */ /* white spaces */
case '\r': case '\r':
if (!p->cr_seen) { warn_cr(p);
p->cr_seen = TRUE;
/* carried over with p->lex.nextline for nextc() */
rb_warn0("encountered \\r in middle of line, treated as a mere space");
}
/* fall through */ /* fall through */
case ' ': case '\t': case '\f': case ' ': case '\t': case '\f':
case '\13': /* '\v' */ case '\13': /* '\v' */
space_seen = 1; space_seen = 1;
#ifdef RIPPER
while ((c = nextc(p))) { while ((c = nextc(p))) {
switch (c) { switch (c) {
case ' ': case '\t': case '\f': case '\r': case '\r':
warn_cr(p);
/* fall through */
case ' ': case '\t': case '\f':
case '\13': /* '\v' */ case '\13': /* '\v' */
break; break;
default: default:
@ -9496,6 +9820,8 @@ parser_yylex(struct parser_params *p)
outofloop: outofloop:
pushback(p, c); pushback(p, c);
dispatch_scan_event(p, tSP); dispatch_scan_event(p, tSP);
#ifndef RIPPER
token_flush(p);
#endif #endif
goto retry; goto retry;
@ -9533,7 +9859,10 @@ parser_yylex(struct parser_params *p)
break; break;
case '#': case '#':
pushback(p, c); pushback(p, c);
if (space_seen) dispatch_scan_event(p, tSP); if (space_seen) {
dispatch_scan_event(p, tSP);
token_flush(p);
}
goto retry; goto retry;
case '&': case '&':
case '.': { case '.': {
@ -9548,18 +9877,10 @@ parser_yylex(struct parser_params *p)
p->ruby_sourceline--; p->ruby_sourceline--;
p->lex.nextline = p->lex.lastline; p->lex.nextline = p->lex.lastline;
case -1: /* EOF no decrement*/ case -1: /* EOF no decrement*/
#ifndef RIPPER
if (p->lex.prevline && !p->eofp) p->lex.lastline = p->lex.prevline;
p->lex.pbeg = RSTRING_PTR(p->lex.lastline);
p->lex.pend = p->lex.pcur = p->lex.pbeg + RSTRING_LEN(p->lex.lastline);
pushback(p, 1); /* always pushback */
p->lex.ptok = p->lex.pcur;
#else
lex_goto_eol(p); lex_goto_eol(p);
if (c != -1) { if (c != -1) {
p->lex.ptok = p->lex.pcur; p->lex.ptok = p->lex.pcur;
} }
#endif
goto normal_newline; goto normal_newline;
} }
} }
@ -10157,12 +10478,9 @@ yylex(YYSTYPE *lval, YYLTYPE *yylloc, struct parser_params *p)
p->lval = lval; p->lval = lval;
lval->val = Qundef; lval->val = Qundef;
t = parser_yylex(p); p->yylloc = yylloc;
if (p->lex.strterm && (p->lex.strterm->flags & STRTERM_HEREDOC)) t = parser_yylex(p);
RUBY_SET_YYLLOC_FROM_STRTERM_HEREDOC(*yylloc);
else
RUBY_SET_YYLLOC(*yylloc);
if (has_delayed_token(p)) if (has_delayed_token(p))
dispatch_delayed_token(p, t); dispatch_delayed_token(p, t);
@ -11053,6 +11371,34 @@ rb_parser_set_location_from_strterm_heredoc(struct parser_params *p, rb_strterm_
return rb_parser_set_pos(yylloc, sourceline, beg_pos, end_pos); return rb_parser_set_pos(yylloc, sourceline, beg_pos, end_pos);
} }
/*
 * Copy the span recorded for the delayed token into *yylloc.
 *
 * The begin line/column and end line/column are taken verbatim from
 * p->delayed (beg_line, beg_col, end_line, end_col); add_delayed_token()
 * sets the begin fields when the token is first created and updates the
 * end fields on every append.
 *
 * Returns yylloc.
 */
YYLTYPE *
rb_parser_set_location_of_delayed_token(struct parser_params *p, YYLTYPE *yylloc)
{
yylloc->beg_pos.lineno = p->delayed.beg_line;
yylloc->beg_pos.column = p->delayed.beg_col;
yylloc->end_pos.lineno = p->delayed.end_line;
yylloc->end_pos.column = p->delayed.end_col;
return yylloc;
}
/*
 * Set *yylloc to the heredoc terminator span on the current source line:
 * from the token start (p->lex.ptok) to the end of the line buffer
 * (p->lex.pend), both expressed as offsets from p->lex.pbeg.
 *
 * Returns whatever rb_parser_set_pos() returns.
 */
YYLTYPE *
rb_parser_set_location_of_heredoc_end(struct parser_params *p, YYLTYPE *yylloc)
{
    const char *base = p->lex.pbeg;

    return rb_parser_set_pos(yylloc,
                             p->ruby_sourceline,
                             (int)(p->lex.ptok - base),
                             (int)(p->lex.pend - base));
}
/*
 * Collapse *yylloc to a zero-width span at its current begin position by
 * copying beg_pos into end_pos. `p` is not used.
 * NOTE(review): presumably the backing function of
 * RUBY_SET_YYLLOC_OF_DUMMY_END, used when the lexer returns tDUMNY_END --
 * confirm against the macro definition.
 *
 * Returns yylloc.
 */
YYLTYPE *
rb_parser_set_location_of_dummy_end(struct parser_params *p, YYLTYPE *yylloc)
{
yylloc->end_pos = yylloc->beg_pos;
return yylloc;
}
YYLTYPE * YYLTYPE *
rb_parser_set_location_of_none(struct parser_params *p, YYLTYPE *yylloc) rb_parser_set_location_of_none(struct parser_params *p, YYLTYPE *yylloc)
{ {
@ -13329,13 +13675,15 @@ parser_initialize(struct parser_params *p)
p->ruby_sourcefile_string = Qnil; p->ruby_sourcefile_string = Qnil;
p->lex.lpar_beg = -1; /* make lambda_beginning_p() == FALSE at first */ p->lex.lpar_beg = -1; /* make lambda_beginning_p() == FALSE at first */
p->node_id = 0; p->node_id = 0;
#ifdef RIPPER
p->delayed.token = Qnil; p->delayed.token = Qnil;
#ifdef RIPPER
p->result = Qnil; p->result = Qnil;
p->parsing_thread = Qnil; p->parsing_thread = Qnil;
#else #else
p->error_buffer = Qfalse; p->error_buffer = Qfalse;
p->end_expect_token_locations = Qnil; p->end_expect_token_locations = Qnil;
p->token_id = 0;
p->tokens = Qnil;
#endif #endif
p->debug_buffer = Qnil; p->debug_buffer = Qnil;
p->debug_output = rb_ractor_stdout(); p->debug_output = rb_ractor_stdout();
@ -13353,20 +13701,20 @@ parser_mark(void *ptr)
struct parser_params *p = (struct parser_params*)ptr; struct parser_params *p = (struct parser_params*)ptr;
rb_gc_mark(p->lex.input); rb_gc_mark(p->lex.input);
rb_gc_mark(p->lex.prevline);
rb_gc_mark(p->lex.lastline); rb_gc_mark(p->lex.lastline);
rb_gc_mark(p->lex.nextline); rb_gc_mark(p->lex.nextline);
rb_gc_mark(p->ruby_sourcefile_string); rb_gc_mark(p->ruby_sourcefile_string);
rb_gc_mark((VALUE)p->lex.strterm); rb_gc_mark((VALUE)p->lex.strterm);
rb_gc_mark((VALUE)p->ast); rb_gc_mark((VALUE)p->ast);
rb_gc_mark(p->case_labels); rb_gc_mark(p->case_labels);
rb_gc_mark(p->delayed.token);
#ifndef RIPPER #ifndef RIPPER
rb_gc_mark(p->debug_lines); rb_gc_mark(p->debug_lines);
rb_gc_mark(p->compile_option); rb_gc_mark(p->compile_option);
rb_gc_mark(p->error_buffer); rb_gc_mark(p->error_buffer);
rb_gc_mark(p->end_expect_token_locations); rb_gc_mark(p->end_expect_token_locations);
rb_gc_mark(p->tokens);
#else #else
rb_gc_mark(p->delayed.token);
rb_gc_mark(p->value); rb_gc_mark(p->value);
rb_gc_mark(p->result); rb_gc_mark(p->result);
rb_gc_mark(p->parsing_thread); rb_gc_mark(p->parsing_thread);
@ -13480,6 +13828,16 @@ rb_parser_error_tolerant(VALUE vparser)
p->end_expect_token_locations = rb_ary_new(); p->end_expect_token_locations = rb_ary_new();
} }
/*
 * Turn on token recording for the parser object `vparser`.
 *
 * Sets p->keep_tokens and gives p->tokens a fresh empty array. When
 * keep_tokens is set, yycompile0() freezes that array and attaches it to
 * the AST via rb_ast_set_tokens() on a successful parse.
 */
void
rb_parser_keep_tokens(VALUE vparser)
{
struct parser_params *p;
/* Unwrap the parser_params struct behind the typed-data object. */
TypedData_Get_Struct(vparser, struct parser_params, &parser_data_type, p);
p->keep_tokens = 1;
p->tokens = rb_ary_new();
}
#endif #endif
#ifdef RIPPER #ifdef RIPPER

Просмотреть файл

@ -132,6 +132,34 @@ class TestAst < Test::Unit::TestCase
end end
end end
# Define one test per Ruby file under test/. Parsing with keep_tokens: true
# must yield tokens that, in position order,
#   * reproduce the file byte-for-byte when their text is concatenated, and
#   * sit directly next to each other, with no gap between neighbours.
Dir.glob("test/**/*.rb", base: SRCDIR).each do |path|
  define_method("test_all_tokens:#{path}") do
    node = RubyVM::AbstractSyntaxTree.parse_file("#{SRCDIR}/#{path}", keep_tokens: true)

    # The last element of a token is its location; entries 0 and 1 are the
    # start line and start column, which give the source order.
    tokens = node.all_tokens.sort_by do |token|
      location = token.last
      [location[0], location[1]]
    end

    tokens_bytes = tokens.map { |token| token[2] }.join.bytes
    source_bytes = File.read("#{SRCDIR}/#{path}").bytes
    assert_equal(source_bytes, tokens_bytes)

    tokens.each_cons(2) do |token_0, token_1|
      end_line, end_col = token_0.last[2..3]
      beg_line, beg_col = token_1.last[0..1]
      message = "#{token_0}. #{token_1}"

      if end_line == beg_line
        # Both tokens are on the same line: the columns must be contiguous.
        assert_equal(beg_col, end_col, message)
      else
        # The next token must be on the following line ...
        assert_equal(beg_line, end_line + 1, message)
        # ... and must start at the beginning of that line.
        assert_equal(0, beg_col, message)
      end
    end
  end
end
private def parse(src) private def parse(src)
EnvUtil.suppress_warning { EnvUtil.suppress_warning {
RubyVM::AbstractSyntaxTree.parse(src) RubyVM::AbstractSyntaxTree.parse(src)
@ -705,11 +733,11 @@ dummy
a = 1 a = 1
else else
STR STR
(SCOPE@1:0-3:5 (SCOPE@1:0-3:4
tbl: [:a] tbl: [:a]
args: nil args: nil
body: body:
(IF@1:0-3:5 (VCALL@1:3-1:7 :cond) (LASGN@2:2-2:7 :a (LIT@2:6-2:7 1)) (IF@1:0-3:4 (VCALL@1:3-1:7 :cond) (LASGN@2:2-2:7 :a (LIT@2:6-2:7 1))
(BEGIN@3:4-3:4 nil))) (BEGIN@3:4-3:4 nil)))
EXP EXP
end end
@ -732,11 +760,11 @@ dummy
a = 1 a = 1
else else
STR STR
(SCOPE@1:0-3:5 (SCOPE@1:0-3:4
tbl: [:a] tbl: [:a]
args: nil args: nil
body: body:
(UNLESS@1:0-3:5 (VCALL@1:7-1:11 :cond) (LASGN@2:2-2:7 :a (LIT@2:6-2:7 1)) (UNLESS@1:0-3:4 (VCALL@1:7-1:11 :cond) (LASGN@2:2-2:7 :a (LIT@2:6-2:7 1))
(BEGIN@3:4-3:4 nil))) (BEGIN@3:4-3:4 nil)))
EXP EXP
end end