Skip to content

Commit

Permalink
Enhance keep_tokens option for RubyVM::AbstractSyntaxTree parsing met…
Browse files Browse the repository at this point in the history
…hods

Implementation for Language Server Protocol (LSP) sometimes needs token information.
For example both `m(1)` and `m(1, )` has same AST structure other than node locations
then it's impossible to check the existence of `,` from AST. However in later case,
it might be better to suggest variables list for the second argument.
Token information is important for such case.

This commit adds these methods.

* Add `keep_tokens` option for `RubyVM::AbstractSyntaxTree.parse`, `.parse_file` and `.of`
* Add `RubyVM::AbstractSyntaxTree::Node#tokens` which returns tokens for the node including tokens for descendants nodes.
* Add `RubyVM::AbstractSyntaxTree::Node#all_tokens` which returns all tokens for the input script regardless the receiver node.

[Feature #19070]

Impacts on memory usage and performance are below:

Memory usage:

```
$ cat test.rb
root = RubyVM::AbstractSyntaxTree.parse_file(File.expand_path('../test/ruby/test_keyword.rb', __FILE__), keep_tokens: true)

$ /usr/bin/time -f %Mkb /usr/local/bin/ruby -v
ruby 3.2.0dev (2022-11-19T09:41:54Z 19070-keep_tokens d3af1b8) [x86_64-linux]
11408kb

# keep_tokens :false
$ /usr/bin/time -f %Mkb /usr/local/bin/ruby test.rb
17508kb

# keep_tokens :true
$ /usr/bin/time -f %Mkb /usr/local/bin/ruby test.rb
30960kb
```

Performance:

```
$ cat ../ast_keep_tokens.yml
prelude: |
  src = <<~SRC
    module M
      class C
        def m1(a, b)
          1 + a + b
        end
      end
    end
  SRC
benchmark:
  without_keep_tokens: |
    RubyVM::AbstractSyntaxTree.parse(src, keep_tokens: false)
  with_keep_tokens: |
    RubyVM::AbstractSyntaxTree.parse(src, keep_tokens: true)

$ make benchmark COMPARE_RUBY="./ruby" ARGS=../ast_keep_tokens.yml
/home/kaneko.y/.rbenv/shims/ruby --disable=gems -rrubygems -I../benchmark/lib ../benchmark/benchmark-driver/exe/benchmark-driver \
            --executables="compare-ruby::./ruby -I.ext/common --disable-gem" \
            --executables="built-ruby::./miniruby -I../lib -I. -I.ext/common  ../tool/runruby.rb --extout=.ext  -- --disable-gems --disable-gem" \
            --output=markdown --output-compare -v ../ast_keep_tokens.yml
compare-ruby: ruby 3.2.0dev (2022-11-19T09:41:54Z 19070-keep_tokens d3af1b8) [x86_64-linux]
built-ruby: ruby 3.2.0dev (2022-11-19T09:41:54Z 19070-keep_tokens d3af1b8) [x86_64-linux]
warming up..

|                     |compare-ruby|built-ruby|
|:--------------------|-----------:|---------:|
|without_keep_tokens  |     21.659k|   21.303k|
|                     |       1.02x|         -|
|with_keep_tokens     |      6.220k|    5.691k|
|                     |       1.09x|         -|
```
  • Loading branch information
yui-knk committed Nov 21, 2022
1 parent bbc4cf5 commit d860162
Show file tree
Hide file tree
Showing 9 changed files with 556 additions and 104 deletions.
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,14 @@ Note: We're only listing outstanding class updates.

* RubyVM::AbstractSyntaxTree
* Add `error_tolerant` option for `parse`, `parse_file` and `of`. [[Feature #19013]]
* Add `keep_tokens` option for `parse`, `parse_file` and `of`. Add `#tokens` and `#all_tokens`
for `RubyVM::AbstractSyntaxTree::Node` [[Feature #19070]]

```ruby
root = RubyVM::AbstractSyntaxTree.parse("x = 1 + 2", keep_tokens: true)
root.tokens # => [[0, :tIDENTIFIER, "x", [1, 0, 1, 1]], [1, :tSP, " ", [1, 1, 1, 2]], ...]
root.tokens.map{_1[2]}.join # => "x = 1 + 2"
```

* Set
* Set is now available as a built-in class without the need for `require "set"`. [[Feature #16989]]
Expand Down
38 changes: 25 additions & 13 deletions ast.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ ast_new_internal(rb_ast_t *ast, const NODE *node)
return obj;
}

static VALUE rb_ast_parse_str(VALUE str, VALUE keep_script_lines, VALUE error_tolerant);
static VALUE rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant);
static VALUE rb_ast_parse_str(VALUE str, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens);
static VALUE rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens);

static VALUE
ast_parse_new(void)
Expand All @@ -85,32 +85,33 @@ ast_parse_done(rb_ast_t *ast)
}

static VALUE
ast_s_parse(rb_execution_context_t *ec, VALUE module, VALUE str, VALUE keep_script_lines, VALUE error_tolerant)
ast_s_parse(rb_execution_context_t *ec, VALUE module, VALUE str, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
{
return rb_ast_parse_str(str, keep_script_lines, error_tolerant);
return rb_ast_parse_str(str, keep_script_lines, error_tolerant, keep_tokens);
}

static VALUE
rb_ast_parse_str(VALUE str, VALUE keep_script_lines, VALUE error_tolerant)
rb_ast_parse_str(VALUE str, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
{
rb_ast_t *ast = 0;

StringValue(str);
VALUE vparser = ast_parse_new();
if (RTEST(keep_script_lines)) rb_parser_keep_script_lines(vparser);
if (RTEST(error_tolerant)) rb_parser_error_tolerant(vparser);
if (RTEST(keep_tokens)) rb_parser_keep_tokens(vparser);
ast = rb_parser_compile_string_path(vparser, Qnil, str, 1);
return ast_parse_done(ast);
}

static VALUE
ast_s_parse_file(rb_execution_context_t *ec, VALUE module, VALUE path, VALUE keep_script_lines, VALUE error_tolerant)
ast_s_parse_file(rb_execution_context_t *ec, VALUE module, VALUE path, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
{
return rb_ast_parse_file(path, keep_script_lines, error_tolerant);
return rb_ast_parse_file(path, keep_script_lines, error_tolerant, keep_tokens);
}

static VALUE
rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant)
rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
{
VALUE f;
rb_ast_t *ast = 0;
Expand All @@ -122,6 +123,7 @@ rb_ast_parse_file(VALUE path, VALUE keep_script_lines, VALUE error_tolerant)
VALUE vparser = ast_parse_new();
if (RTEST(keep_script_lines)) rb_parser_keep_script_lines(vparser);
if (RTEST(error_tolerant)) rb_parser_error_tolerant(vparser);
if (RTEST(keep_tokens)) rb_parser_keep_tokens(vparser);
ast = rb_parser_compile_file_path(vparser, Qnil, f, 1);
rb_io_close(f);
return ast_parse_done(ast);
Expand All @@ -141,14 +143,15 @@ lex_array(VALUE array, int index)
}

static VALUE
rb_ast_parse_array(VALUE array, VALUE keep_script_lines, VALUE error_tolerant)
rb_ast_parse_array(VALUE array, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
{
rb_ast_t *ast = 0;

array = rb_check_array_type(array);
VALUE vparser = ast_parse_new();
if (RTEST(keep_script_lines)) rb_parser_keep_script_lines(vparser);
if (RTEST(error_tolerant)) rb_parser_error_tolerant(vparser);
if (RTEST(keep_tokens)) rb_parser_keep_tokens(vparser);
ast = rb_parser_compile_generic(vparser, lex_array, Qnil, array, 1);
return ast_parse_done(ast);
}
Expand Down Expand Up @@ -208,7 +211,7 @@ node_id_for_backtrace_location(rb_execution_context_t *ec, VALUE module, VALUE l
}

static VALUE
ast_s_of(rb_execution_context_t *ec, VALUE module, VALUE body, VALUE keep_script_lines, VALUE error_tolerant)
ast_s_of(rb_execution_context_t *ec, VALUE module, VALUE body, VALUE keep_script_lines, VALUE error_tolerant, VALUE keep_tokens)
{
VALUE node, lines = Qnil;
const rb_iseq_t *iseq;
Expand Down Expand Up @@ -247,13 +250,13 @@ ast_s_of(rb_execution_context_t *ec, VALUE module, VALUE body, VALUE keep_script
}

if (!NIL_P(lines) || !NIL_P(lines = script_lines(path))) {
node = rb_ast_parse_array(lines, keep_script_lines, error_tolerant);
node = rb_ast_parse_array(lines, keep_script_lines, error_tolerant, keep_tokens);
}
else if (e_option) {
node = rb_ast_parse_str(rb_e_script, keep_script_lines, error_tolerant);
node = rb_ast_parse_str(rb_e_script, keep_script_lines, error_tolerant, keep_tokens);
}
else {
node = rb_ast_parse_file(path, keep_script_lines, error_tolerant);
node = rb_ast_parse_file(path, keep_script_lines, error_tolerant, keep_tokens);
}

return node_find(node, node_id);
Expand Down Expand Up @@ -715,6 +718,15 @@ ast_node_last_column(rb_execution_context_t *ec, VALUE self)
return INT2NUM(nd_last_column(data->node));
}

static VALUE
ast_node_all_tokens(rb_execution_context_t *ec, VALUE self)
{
struct ASTNodeData *data;
TypedData_Get_Struct(self, struct ASTNodeData, &rb_node_type, data);

return rb_ast_tokens(data->ast);
}

static VALUE
ast_node_inspect(rb_execution_context_t *ec, VALUE self)
{
Expand Down
52 changes: 46 additions & 6 deletions ast.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ module RubyVM::AbstractSyntaxTree
#
# RubyVM::AbstractSyntaxTree.parse("x = 1 + 2")
# # => #<RubyVM::AbstractSyntaxTree::Node:SCOPE@1:0-1:9>
def self.parse string, keep_script_lines: false, error_tolerant: false
Primitive.ast_s_parse string, keep_script_lines, error_tolerant
def self.parse string, keep_script_lines: false, error_tolerant: false, keep_tokens: false
Primitive.ast_s_parse string, keep_script_lines, error_tolerant, keep_tokens
end

# call-seq:
Expand All @@ -44,8 +44,8 @@ def self.parse string, keep_script_lines: false, error_tolerant: false
#
# RubyVM::AbstractSyntaxTree.parse_file("my-app/app.rb")
# # => #<RubyVM::AbstractSyntaxTree::Node:SCOPE@1:0-31:3>
def self.parse_file pathname, keep_script_lines: false, error_tolerant: false
Primitive.ast_s_parse_file pathname, keep_script_lines, error_tolerant
def self.parse_file pathname, keep_script_lines: false, error_tolerant: false, keep_tokens: false
Primitive.ast_s_parse_file pathname, keep_script_lines, error_tolerant, keep_tokens
end

# call-seq:
Expand All @@ -63,8 +63,8 @@ def self.parse_file pathname, keep_script_lines: false, error_tolerant: false
#
# RubyVM::AbstractSyntaxTree.of(method(:hello))
# # => #<RubyVM::AbstractSyntaxTree::Node:SCOPE@1:0-3:3>
def self.of body, keep_script_lines: false, error_tolerant: false
Primitive.ast_s_of body, keep_script_lines, error_tolerant
def self.of body, keep_script_lines: false, error_tolerant: false, keep_tokens: false
Primitive.ast_s_of body, keep_script_lines, error_tolerant, keep_tokens
end

# call-seq:
Expand Down Expand Up @@ -136,6 +136,46 @@ def last_column
Primitive.ast_node_last_column
end

# call-seq:
# node.tokens -> array
#
# Returns tokens corresponding to the location of the node.
# Returns nil if keep_tokens is not enabled when parse method is called.
# Token is an array of:
#
# - id
# - token type
# - source code text
# - location [first_lineno, first_column, last_lineno, last_column]
#
# root = RubyVM::AbstractSyntaxTree.parse("x = 1 + 2", keep_tokens: true)
# root.tokens # => [[0, :tIDENTIFIER, "x", [1, 0, 1, 1]], [1, :tSP, " ", [1, 1, 1, 2]], ...]
# root.tokens.map{_1[2]}.join # => "x = 1 + 2"
def tokens
return nil unless all_tokens

all_tokens.each_with_object([]) do |token, a|
loc = token.last
if ([first_lineno, first_column] <=> [loc[0], loc[1]]) <= 0 &&
([last_lineno, last_column] <=> [loc[2], loc[3]]) >= 0
a << token
end
end
end

# call-seq:
# node.all_tokens -> array
#
# Returns all tokens for the input script regardless the receiver node.
# Returns nil if keep_tokens is not enabled when parse method is called.
#
# root = RubyVM::AbstractSyntaxTree.parse("x = 1 + 2", keep_tokens: true)
# root.all_tokens # => [[0, :tIDENTIFIER, "x", [1, 0, 1, 1]], [1, :tSP, " ", [1, 1, 1, 2]], ...]
# root.children[-1].all_tokens # => [[0, :tIDENTIFIER, "x", [1, 0, 1, 1]], [1, :tSP, " ", [1, 1, 1, 2]], ...]
def all_tokens
Primitive.ast_node_all_tokens
end

# call-seq:
# node.children -> array
#
Expand Down
19 changes: 0 additions & 19 deletions ext/ripper/eventids2.c
Original file line number Diff line number Diff line change
@@ -1,22 +1,3 @@
enum {
tIGNORED_NL = tLAST_TOKEN + 1,
# define tIGNORED_NL ((enum yytokentype)tIGNORED_NL)
tCOMMENT,
# define tCOMMENT ((enum yytokentype)tCOMMENT)
tEMBDOC_BEG,
# define tEMBDOC_BEG ((enum yytokentype)tEMBDOC_BEG)
tEMBDOC,
# define tEMBDOC ((enum yytokentype)tEMBDOC)
tEMBDOC_END,
# define tEMBDOC_END ((enum yytokentype)tEMBDOC_END)
tHEREDOC_BEG,
# define tHEREDOC_BEG ((enum yytokentype)tHEREDOC_BEG)
tHEREDOC_END,
# define tHEREDOC_END ((enum yytokentype)tHEREDOC_END)
k__END__,
# define k__END__ ((enum yytokentype)k__END__)
};

typedef struct {
ID ripper_id_backref;
ID ripper_id_backtick;
Expand Down
1 change: 1 addition & 0 deletions internal/parse.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ VALUE rb_parser_set_yydebug(VALUE, VALUE);
void *rb_parser_load_file(VALUE parser, VALUE name);
void rb_parser_keep_script_lines(VALUE vparser);
void rb_parser_error_tolerant(VALUE vparser);
void rb_parser_keep_tokens(VALUE vparser);

RUBY_SYMBOL_EXPORT_BEGIN
VALUE rb_parser_set_context(VALUE, const struct rb_iseq_struct *, int);
Expand Down
24 changes: 23 additions & 1 deletion node.c
Original file line number Diff line number Diff line change
Expand Up @@ -1161,6 +1161,12 @@ struct node_buffer_struct {
node_buffer_list_t markable;
struct rb_ast_local_table_link *local_tables;
VALUE mark_hash;
// - id (sequence number)
// - token_type
// - text of token
// - location info
// Array, whose entry is array
VALUE tokens;
};

static void
Expand All @@ -1187,6 +1193,7 @@ rb_node_buffer_new(void)
init_node_buffer_list(&nb->markable, (node_buffer_elem_t*)((size_t)nb->unmarkable.head + bucket_size));
nb->local_tables = 0;
nb->mark_hash = Qnil;
nb->tokens = Qnil;
return nb;
}

Expand Down Expand Up @@ -1418,7 +1425,10 @@ rb_ast_update_references(rb_ast_t *ast)
void
rb_ast_mark(rb_ast_t *ast)
{
if (ast->node_buffer) rb_gc_mark(ast->node_buffer->mark_hash);
if (ast->node_buffer) {
rb_gc_mark(ast->node_buffer->mark_hash);
rb_gc_mark(ast->node_buffer->tokens);
}
if (ast->body.compile_option) rb_gc_mark(ast->body.compile_option);
if (ast->node_buffer) {
node_buffer_t *nb = ast->node_buffer;
Expand Down Expand Up @@ -1477,3 +1487,15 @@ rb_ast_add_mark_object(rb_ast_t *ast, VALUE obj)
}
rb_hash_aset(ast->node_buffer->mark_hash, obj, Qtrue);
}

VALUE
rb_ast_tokens(rb_ast_t *ast)
{
return ast->node_buffer->tokens;
}

void
rb_ast_set_tokens(rb_ast_t *ast, VALUE tokens)
{
RB_OBJ_WRITE(ast, &ast->node_buffer->tokens, tokens);
}
2 changes: 2 additions & 0 deletions node.h
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,8 @@ void rb_ast_dispose(rb_ast_t*);
void rb_ast_free(rb_ast_t*);
size_t rb_ast_memsize(const rb_ast_t*);
void rb_ast_add_mark_object(rb_ast_t*, VALUE);
void rb_ast_set_tokens(rb_ast_t*, VALUE);
VALUE rb_ast_tokens(rb_ast_t *ast);
NODE *rb_ast_newnode(rb_ast_t*, enum node_type type);
void rb_ast_delete_node(rb_ast_t*, NODE *n);
rb_ast_id_table_t *rb_ast_new_local_table(rb_ast_t*, int);
Expand Down
Loading

0 comments on commit d860162

Please sign in to comment.