Skip to content

Commit

Permalink
Extract and recover structure of extras (#48)
Browse files Browse the repository at this point in the history
* Collect and translate named extras

* Test and fix the extraction of extras, and in particular of extras
embedded within other extras.

* Add locations to the root of extra subtrees.
We could add locations to all nodes of the CST but that would require
massive (but easy) changes to the legacy code in semgrep.

* Update test expectations

* Add a test (which currently fails)

* Fix: Translators for rules appearing only in extras were removed.

* Port extras to the new dumping mechanism

* Add a naming test

* Fix extra dumpers

* Use polymorphic variants to represent extras. This is for consistency
with the CST as well as allowing tricks such as merging the TypeScript
and TSX CST types.

* Populate tree-sitter-version file without an extra step

* Fix 'cd: downloads: No such file or directory'

* Make script more robust (?)
  • Loading branch information
mjambon authored Sep 18, 2024
1 parent e74ffc6 commit e063ec5
Show file tree
Hide file tree
Showing 29 changed files with 435 additions and 86 deletions.
2 changes: 0 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ build:
#
.PHONY: setup
setup:
test -f tree-sitter-version \
|| cp tree-sitter-version.default tree-sitter-version
./scripts/check-prerequisites
./scripts/install-tree-sitter-cli
./scripts/install-tree-sitter-lib
Expand Down
1 change: 1 addition & 0 deletions scripts/download-tree-sitter
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ while [[ $# -gt 0 ]]; do
shift
done

./scripts/update-version-symlinks
version=$(cat tree-sitter-version)

mkdir -p downloads
Expand Down
4 changes: 4 additions & 0 deletions scripts/update-version-symlinks
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,13 @@
#
set -eu

if [[ ! -e tree-sitter-version ]]; then
cp tree-sitter-version.default tree-sitter-version
fi
version=$(cat tree-sitter-version)

echo "Updating symlinks 'downloads/tree-sitter' and 'tree-sitter'"
mkdir -p downloads

(
cd downloads
Expand Down
4 changes: 2 additions & 2 deletions src/bindings/lib/Tree_sitter_output.atd
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,6 @@ type node = {
}

type position = {
row: int;
column: int;
row: int; (* 0-based *)
column: int; (* 0-based *)
}
5 changes: 3 additions & 2 deletions src/gen/lib/CST_grammar.ml
Original file line number Diff line number Diff line change
Expand Up @@ -163,8 +163,9 @@ type grammar = {
(* rules, grouped and sorted in dependency order. *)

extras: string list;
(* node names that don't belong to any rule and can occur anywhere,
such as comments. *)
(* rules names for constructs that can occur anywhere independently from
the grammar, such as comments. Other extras such as string literals
and patterns were removed because we don't need them. *)
}
[@@deriving show {with_path = false}]

Expand Down
25 changes: 14 additions & 11 deletions src/gen/lib/CST_grammar_conv.ml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
open CST_grammar

(*
Traverse the grammar starting from the entrypoint, return the set of
visited rule names.
Traverse the grammar starting from all the entrypoints, which
are the main entrypoint (first rule in the tree-sitter grammar)
and the extras. Return the set of visited rule names.
*)
let detect_used ~entrypoint rules =
let detect_used ~entrypoints rules =
let rule_tbl = Hashtbl.create 100 in
List.iter (fun (name, x) -> Hashtbl.add rule_tbl name x) rules;
let get_rule name =
Expand Down Expand Up @@ -48,7 +49,7 @@ let detect_used ~entrypoint rules =
| None -> ()
| Some x -> scan x
in
visit entrypoint;
List.iter visit entrypoints;
was_visited

let name_of_body opt_rule_name body =
Expand Down Expand Up @@ -241,14 +242,16 @@ let tsort_rules rules =
) group
) sorted

(*
Extras can be rule names, strings or patterns.
Here we only keep rule names. We need them to identify tree nodes that
are extras and should be handled independently from the grammar.
*)
let filter_extras bodies =
List.filter_map (fun (x : Tree_sitter_t.rule_body) ->
match x with
| SYMBOL name -> Some name
| STRING name ->
(* Results in tree-sitter parse error at the moment.
Presumably not super useful. *)
Some name
| STRING _ -> None
| _ -> None
) bodies

Expand All @@ -259,7 +262,8 @@ let of_tree_sitter (x : Tree_sitter_t.grammar) : t =
| (name, _) :: _ -> name
| _ -> "program"
in
let is_used = detect_used ~entrypoint x.rules in
let extras = filter_extras x.extras in
let is_used = detect_used ~entrypoints:(entrypoint :: extras) x.rules in
let grammar_rules = translate_rules x.rules in
let all_rules =
make_external_rules x.externals @ grammar_rules
Expand All @@ -270,12 +274,11 @@ let of_tree_sitter (x : Tree_sitter_t.grammar) : t =
body;
is_rec = true; (* set correctly by tsort below *)
is_inlined_rule = is_inlined_rule;
is_inlined_type = false
is_inlined_type = false;
}
)
in
let sorted_rules = tsort_rules all_rules in
let extras = filter_extras x.extras in
{
name = x.name;
entrypoint;
Expand Down
2 changes: 2 additions & 0 deletions src/gen/lib/Codegen.ml
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,14 @@ let () =
~parse_source_file:%s.parse_source_file
~parse_input_tree:%s.parse_input_tree
~dump_tree:%s.dump_tree
~dump_extras:%s.dump_extras
"
lib_module_name
lang
parse_module_name
parse_module_name
boilerplate_module_name
boilerplate_module_name

let ocaml ?out_dir ~lang grammar =
let cst_module_name = "CST" in
Expand Down
50 changes: 43 additions & 7 deletions src/gen/lib/Codegen_CST.ml
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,20 @@ module Fmt = struct
let type_app param type_name =
label param (atom type_name)

let product l =
(* About the need for parentheses:
- For polymorphic variants, the presence of parentheses changes nothing,
a standalone tuple is always created.
- For classic variants though, the presence parentheses forces the
creation of a tuple which is detachable from the variant constructor.
Without parentheses though, we save the allocation of a block. *)
let product ~paren l =
match l with
| [x] -> x
| l -> E.List (("(", "*", ")", Style.left_sep_paren_list), l)
| l ->
if paren then
E.List (("(", "*", ")", Style.left_sep_paren_list), l)
else
E.List (("", "*", "", Style.left_sep_paren_list), l)

let classic_variant l =
let cases =
Expand Down Expand Up @@ -110,7 +120,7 @@ module Fmt = struct
let top_sequence l =
E.List (("", "", "", Style.vert_seq), l)

let typedef pos (name, inlined, rhs) =
let typedef pos (name, inlined, opt_rhs) =
let is_first = (pos = 0) in
let type_ =
if is_first then
Expand All @@ -122,7 +132,11 @@ module Fmt = struct
if inlined then " (* inlined *)"
else ""
in
let code = def (sprintf "%s %s%s =" type_ name comment) rhs in
let code =
match opt_rhs with
| Some rhs -> def (sprintf "%s %s%s =" type_ name comment) rhs
| None -> atom (sprintf "%s %s%s" type_ name comment)
in
if is_first then code
else
top_sequence [
Expand Down Expand Up @@ -206,7 +220,7 @@ let rec format_body ?def_name body : E.t =
| Optional body ->
Fmt.type_app (format_body body) "option"
| Seq body_list ->
Fmt.product (format_seq body_list)
Fmt.product ~paren:true (format_seq body_list)

and format_choice l =
List.map (fun (name, body) ->
Expand All @@ -219,7 +233,29 @@ and format_seq l =
let format_rule (rule : rule) =
(trans rule.name,
rule.is_inlined_type,
format_body ~def_name:rule.name rule.body)
Some (format_body ~def_name:rule.name rule.body))

let format_extra_def extras =
let rhs =
match extras with
| [] -> None
| extras ->
let cases =
extras
|> List.map (fun rule_name ->
let constructor =
Codegen_util.translate_ident_uppercase rule_name in
(constructor, Some (Fmt.product ~paren:false [
format_body (Symbol "Loc.t");
format_body (Symbol rule_name);
]))
)
|> Fmt.poly_variant
in
Some cases
in
[[("extra", false, rhs)];
[("extras", false, Some (Fmt.atom "extra list"))]]

(*
1. Identify names that are used at most once, becoming candidates
Expand All @@ -242,7 +278,7 @@ let format_types grammar =
Fmt.recursive_typedefs x;
Fmt.atom ""
]
) semi_formatted_defs
) (semi_formatted_defs @ format_extra_def grammar.extras)
|> Fmt.top_sequence

let generate grammar =
Expand Down
59 changes: 54 additions & 5 deletions src/gen/lib/Codegen_boilerplate.ml
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ let gen_rule_mapper_binding ~cst_module_name (rule : rule) =
Block (gen_mapper_body_multi env);
]

let gen ~cst_module_name grammar =
let gen ~cst_module_name ~is_extra grammar =
List.filter_map (fun rule_group ->
let is_rec =
match rule_group with
Expand All @@ -182,7 +182,7 @@ let gen ~cst_module_name grammar =
in
let bindings =
List.filter_map (fun rule ->
if rule.is_inlined_type then
if rule.is_inlined_type && not (is_extra rule.name) then
None
else
Some (gen_rule_mapper_binding ~cst_module_name rule)
Expand All @@ -204,14 +204,63 @@ let generate_dumper grammar =
let dump_tree root =
map_%s () root
|> Tree_sitter_run.Raw_tree.to_string
|> print_string
|> Tree_sitter_run.Raw_tree.to_channel stdout
"
(trans grammar.entrypoint)

let generate_map_extra grammar =
let cases =
grammar.extras
|> List.map (fun rule_name ->
let constructor =
Codegen_util.translate_ident_uppercase rule_name in
sprintf " | `%s (_loc, x) -> (%S, %S, map_%s env x)\n"
constructor rule_name (trans rule_name) (trans rule_name)
)
|> String.concat ""
in
sprintf "\
let map_extra (env : env) (x : CST.extra) =
match x with
%s"
cases

let generate_extra_dumper grammar =
match grammar.extras with
| [] ->
"\
let dump_extras (extras : CST.extras) = ()
"
| _ ->
sprintf "\
%s
let dump_extras (extras : CST.extras) =
List.iter (fun extra ->
let ts_rule_name, ocaml_type_name, raw_tree = map_extra () extra in
let details =
if ocaml_type_name <> ts_rule_name then
Printf.sprintf \" (OCaml type '%%s')\" ocaml_type_name
else
\"\"
in
Printf.printf \"%%s%%s:\\n\" ts_rule_name details;
Tree_sitter_run.Raw_tree.to_channel stdout raw_tree
) extras
"
(generate_map_extra grammar)

let make_is_extra grammar =
let tbl = Hashtbl.create 100 in
List.iter (fun rule_name -> Hashtbl.replace tbl rule_name ()) grammar.extras;
fun rule_name ->
Hashtbl.mem tbl rule_name

let generate ~cst_module_name grammar =
let inline_grammar = Nice_typedefs.rearrange_rules grammar in
let tree = gen ~cst_module_name inline_grammar in
let is_extra = make_is_extra grammar in
let tree = gen ~cst_module_name ~is_extra inline_grammar in
make_header grammar
^ Indent.to_string tree
^ generate_dumper grammar
^ generate_extra_dumper grammar
Loading

0 comments on commit e063ec5

Please sign in to comment.