diff --git a/Makefile b/Makefile index 3299e3d..3c98a4b 100644 --- a/Makefile +++ b/Makefile @@ -18,8 +18,6 @@ build: # .PHONY: setup setup: - test -f tree-sitter-version \ - || cp tree-sitter-version.default tree-sitter-version ./scripts/check-prerequisites ./scripts/install-tree-sitter-cli ./scripts/install-tree-sitter-lib diff --git a/scripts/download-tree-sitter b/scripts/download-tree-sitter index 7a8f5b7..6062705 100755 --- a/scripts/download-tree-sitter +++ b/scripts/download-tree-sitter @@ -41,6 +41,7 @@ while [[ $# -gt 0 ]]; do shift done +./scripts/update-version-symlinks version=$(cat tree-sitter-version) mkdir -p downloads diff --git a/scripts/update-version-symlinks b/scripts/update-version-symlinks index e95f6b6..1d4b2ed 100755 --- a/scripts/update-version-symlinks +++ b/scripts/update-version-symlinks @@ -15,9 +15,13 @@ # set -eu +if [[ ! -e tree-sitter-version ]]; then + cp tree-sitter-version.default tree-sitter-version +fi version=$(cat tree-sitter-version) echo "Updating symlinks 'downloads/tree-sitter' and 'tree-sitter'" +mkdir -p downloads ( cd downloads diff --git a/src/bindings/lib/Tree_sitter_output.atd b/src/bindings/lib/Tree_sitter_output.atd index 973d827..56480e2 100644 --- a/src/bindings/lib/Tree_sitter_output.atd +++ b/src/bindings/lib/Tree_sitter_output.atd @@ -42,6 +42,6 @@ type node = { } type position = { - row: int; - column: int; + row: int; (* 0-based *) + column: int; (* 0-based *) } diff --git a/src/gen/lib/CST_grammar.ml b/src/gen/lib/CST_grammar.ml index b0a4a03..87ac503 100644 --- a/src/gen/lib/CST_grammar.ml +++ b/src/gen/lib/CST_grammar.ml @@ -163,8 +163,9 @@ type grammar = { (* rules, grouped and sorted in dependency order. *) extras: string list; - (* node names that don't belong to any rule and can occur anywhere, - such as comments. *) + (* rules names for constructs that can occur anywhere independently from + the grammar, such as comments. Other extras such as string literals + and patterns were removed because we don't need them. *) } [@@deriving show {with_path = false}] diff --git a/src/gen/lib/CST_grammar_conv.ml b/src/gen/lib/CST_grammar_conv.ml index 22f162e..6e06a8b 100644 --- a/src/gen/lib/CST_grammar_conv.ml +++ b/src/gen/lib/CST_grammar_conv.ml @@ -5,10 +5,11 @@ open CST_grammar (* - Traverse the grammar starting from the entrypoint, return the set of - visited rule names. + Traverse the grammar starting from all the entrypoints, which + are the main entrypoint (first rule in the tree-sitter grammar) + and the extras. Return the set of visited rule names. *) -let detect_used ~entrypoint rules = +let detect_used ~entrypoints rules = let rule_tbl = Hashtbl.create 100 in List.iter (fun (name, x) -> Hashtbl.add rule_tbl name x) rules; let get_rule name = @@ -48,7 +49,7 @@ let detect_used ~entrypoint rules = | None -> () | Some x -> scan x in - visit entrypoint; + List.iter visit entrypoints; was_visited let name_of_body opt_rule_name body = @@ -241,14 +242,16 @@ let tsort_rules rules = ) group ) sorted +(* + Extras can be rule names, strings or patterns. + Here we only keep rule names. We need them to identify tree nodes that + are extras and should be handled independently from the grammar. +*) let filter_extras bodies = List.filter_map (fun (x : Tree_sitter_t.rule_body) -> match x with | SYMBOL name -> Some name - | STRING name -> - (* Results in tree-sitter parse error at the moment. - Presumably not super useful. *) - Some name + | STRING _ -> None | _ -> None ) bodies @@ -259,7 +262,8 @@ let of_tree_sitter (x : Tree_sitter_t.grammar) : t = | (name, _) :: _ -> name | _ -> "program" in - let is_used = detect_used ~entrypoint x.rules in + let extras = filter_extras x.extras in + let is_used = detect_used ~entrypoints:(entrypoint :: extras) x.rules in let grammar_rules = translate_rules x.rules in let all_rules = make_external_rules x.externals @ grammar_rules @@ -270,12 +274,11 @@ let of_tree_sitter (x : Tree_sitter_t.grammar) : t = body; is_rec = true; (* set correctly by tsort below *) is_inlined_rule = is_inlined_rule; - is_inlined_type = false + is_inlined_type = false; } ) in let sorted_rules = tsort_rules all_rules in - let extras = filter_extras x.extras in { name = x.name; entrypoint; diff --git a/src/gen/lib/Codegen.ml b/src/gen/lib/Codegen.ml index 700625b..349a38a 100644 --- a/src/gen/lib/Codegen.ml +++ b/src/gen/lib/Codegen.ml @@ -50,12 +50,14 @@ let () = ~parse_source_file:%s.parse_source_file ~parse_input_tree:%s.parse_input_tree ~dump_tree:%s.dump_tree + ~dump_extras:%s.dump_extras " lib_module_name lang parse_module_name parse_module_name boilerplate_module_name + boilerplate_module_name let ocaml ?out_dir ~lang grammar = let cst_module_name = "CST" in diff --git a/src/gen/lib/Codegen_CST.ml b/src/gen/lib/Codegen_CST.ml index b64f2ba..950c720 100644 --- a/src/gen/lib/Codegen_CST.ml +++ b/src/gen/lib/Codegen_CST.ml @@ -71,10 +71,20 @@ module Fmt = struct let type_app param type_name = label param (atom type_name) - let product l = + (* About the need for parentheses: + - For polymorphic variants, the presence of parentheses changes nothing, + a standalone tuple is always created. + - For classic variants though, the presence parentheses forces the + creation of a tuple which is detachable from the variant constructor. + Without parentheses though, we save the allocation of a block. *) + let product ~paren l = match l with | [x] -> x - | l -> E.List (("(", "*", ")", Style.left_sep_paren_list), l) + | l -> + if paren then + E.List (("(", "*", ")", Style.left_sep_paren_list), l) + else + E.List (("", "*", "", Style.left_sep_paren_list), l) let classic_variant l = let cases = @@ -110,7 +120,7 @@ module Fmt = struct let top_sequence l = E.List (("", "", "", Style.vert_seq), l) - let typedef pos (name, inlined, rhs) = + let typedef pos (name, inlined, opt_rhs) = let is_first = (pos = 0) in let type_ = if is_first then @@ -122,7 +132,11 @@ module Fmt = struct if inlined then " (* inlined *)" else "" in - let code = def (sprintf "%s %s%s =" type_ name comment) rhs in + let code = + match opt_rhs with + | Some rhs -> def (sprintf "%s %s%s =" type_ name comment) rhs + | None -> atom (sprintf "%s %s%s" type_ name comment) + in if is_first then code else top_sequence [ @@ -206,7 +220,7 @@ let rec format_body ?def_name body : E.t = | Optional body -> Fmt.type_app (format_body body) "option" | Seq body_list -> - Fmt.product (format_seq body_list) + Fmt.product ~paren:true (format_seq body_list) and format_choice l = List.map (fun (name, body) -> @@ -219,7 +233,29 @@ and format_seq l = let format_rule (rule : rule) = (trans rule.name, rule.is_inlined_type, - format_body ~def_name:rule.name rule.body) + Some (format_body ~def_name:rule.name rule.body)) + +let format_extra_def extras = + let rhs = + match extras with + | [] -> None + | extras -> + let cases = + extras + |> List.map (fun rule_name -> + let constructor = + Codegen_util.translate_ident_uppercase rule_name in + (constructor, Some (Fmt.product ~paren:false [ + format_body (Symbol "Loc.t"); + format_body (Symbol rule_name); + ])) + ) + |> Fmt.poly_variant + in + Some cases + in + [[("extra", false, rhs)]; + [("extras", false, Some (Fmt.atom "extra list"))]] (* 1. Identify names that are used at most once, becoming candidates @@ -242,7 +278,7 @@ let format_types grammar = Fmt.recursive_typedefs x; Fmt.atom "" ] - ) semi_formatted_defs + ) (semi_formatted_defs @ format_extra_def grammar.extras) |> Fmt.top_sequence let generate grammar = diff --git a/src/gen/lib/Codegen_boilerplate.ml b/src/gen/lib/Codegen_boilerplate.ml index bf5186e..da4bd88 100644 --- a/src/gen/lib/Codegen_boilerplate.ml +++ b/src/gen/lib/Codegen_boilerplate.ml @@ -173,7 +173,7 @@ let gen_rule_mapper_binding ~cst_module_name (rule : rule) = Block (gen_mapper_body_multi env); ] -let gen ~cst_module_name grammar = +let gen ~cst_module_name ~is_extra grammar = List.filter_map (fun rule_group -> let is_rec = match rule_group with @@ -182,7 +182,7 @@ let gen ~cst_module_name grammar = in let bindings = List.filter_map (fun rule -> - if rule.is_inlined_type then + if rule.is_inlined_type && not (is_extra rule.name) then None else Some (gen_rule_mapper_binding ~cst_module_name rule) @@ -204,14 +204,63 @@ let generate_dumper grammar = let dump_tree root = map_%s () root - |> Tree_sitter_run.Raw_tree.to_string - |> print_string + |> Tree_sitter_run.Raw_tree.to_channel stdout " (trans grammar.entrypoint) +let generate_map_extra grammar = + let cases = + grammar.extras + |> List.map (fun rule_name -> + let constructor = + Codegen_util.translate_ident_uppercase rule_name in + sprintf " | `%s (_loc, x) -> (%S, %S, map_%s env x)\n" + constructor rule_name (trans rule_name) (trans rule_name) + ) + |> String.concat "" + in + sprintf "\ + +let map_extra (env : env) (x : CST.extra) = + match x with +%s" + cases + +let generate_extra_dumper grammar = + match grammar.extras with + | [] -> + "\ +let dump_extras (extras : CST.extras) = () +" + | _ -> + sprintf "\ +%s +let dump_extras (extras : CST.extras) = + List.iter (fun extra -> + let ts_rule_name, ocaml_type_name, raw_tree = map_extra () extra in + let details = + if ocaml_type_name <> ts_rule_name then + Printf.sprintf \" (OCaml type '%%s')\" ocaml_type_name + else + \"\" + in + Printf.printf \"%%s%%s:\\n\" ts_rule_name details; + Tree_sitter_run.Raw_tree.to_channel stdout raw_tree + ) extras +" + (generate_map_extra grammar) + +let make_is_extra grammar = + let tbl = Hashtbl.create 100 in + List.iter (fun rule_name -> Hashtbl.replace tbl rule_name ()) grammar.extras; + fun rule_name -> + Hashtbl.mem tbl rule_name + let generate ~cst_module_name grammar = let inline_grammar = Nice_typedefs.rearrange_rules grammar in - let tree = gen ~cst_module_name inline_grammar in + let is_extra = make_is_extra grammar in + let tree = gen ~cst_module_name ~is_extra inline_grammar in make_header grammar ^ Indent.to_string tree ^ generate_dumper grammar + ^ generate_extra_dumper grammar diff --git a/src/gen/lib/Codegen_parse.ml b/src/gen/lib/Codegen_parse.ml index 6e29c5e..bdd00f8 100644 --- a/src/gen/lib/Codegen_parse.ml +++ b/src/gen/lib/Codegen_parse.ml @@ -32,13 +32,13 @@ let mli_contents grammar : string = *) val string : ?src_file:string -> string -> - CST.%s Tree_sitter_run.Parsing_result.t + (CST.%s, CST.extra) Tree_sitter_run.Parsing_result.t (** Parse a %s program from a file into a typed OCaml CST. See the [string] function above for details. *) val file : string -> - CST.%s Tree_sitter_run.Parsing_result.t + (CST.%s, CST.extra) Tree_sitter_run.Parsing_result.t (** Whether to print debugging information. Default: false. *) val debug : bool ref @@ -56,7 +56,7 @@ val parse_source_file : string -> Tree_sitter_run.Tree_sitter_parsing.t (** Parse a tree-sitter CST into an OCaml typed CST. *) val parse_input_tree : Tree_sitter_run.Tree_sitter_parsing.t -> - CST.%s Tree_sitter_run.Parsing_result.t + (CST.%s, CST.extra) Tree_sitter_run.Parsing_result.t |} lang lang (trans root_type) @@ -385,24 +385,92 @@ let gen_translators ~cst_module_name grammar = |> Codegen_util.interleave [Line ""] |> List.flatten +let trans_tree = + {|(* + Costly operation that translates a whole tree or subtree. + + The first pass translates it into a generic tree structure suitable + to guess which node corresponds to each grammar rule. + The second pass is a translation into a typed tree where each grammar + node has its own type. + + This function is called: + - once on the root of the program after removing extras + (comments and other nodes that occur anywhere independently from + the grammar); + - once of each extra node, resulting in its own independent tree of type + 'extra'. +*) +let translate_tree src node trans_x = + let matched_tree = Run.match_tree children_regexps src node in + Option.map trans_x matched_tree +|} + +let gen_trans_extras grammar = + let body = + match grammar.extras with + | [] -> [ Line "None" ] + | extras -> + let cases = + (extras + |> List.map (fun name -> + Inline [ + Line (sprintf "| %S ->" name); + Block [ + Block [ + Line (sprintf + "(match translate_tree src node trans_%s with" + (trans name)); + Line "| None -> None"; + Line (sprintf "| Some x -> Some (`%s (Run.get_loc node, x)))" + (Codegen_util.translate_ident_uppercase name)); + ]; + ] + ] + )) + @ [ Line "| _ -> None" ] + in + [ + Line "match node.type_ with"; + Inline cases; + ] + in + [ + Line "let translate_extra src (node : Tree_sitter_output_t.node) \ + : CST.extra option ="; + Block body + ] + let gen ~cst_module_name grammar = let regexps = gen_regexps grammar in let translators = gen_translators ~cst_module_name grammar in + let trans_extras = gen_trans_extras grammar in [ Inline regexps; Line ""; Inline translators; + Line ""; + Line trans_tree; + Line ""; + Inline trans_extras; ] let ml_trailer grammar = sprintf {| +let translate_root src root_node = + translate_tree src root_node trans_%s + let parse_input_tree input_tree = let orig_root_node = Tree_sitter_parsing.root input_tree in let src = Tree_sitter_parsing.src input_tree in let errors = Run.extract_errors src orig_root_node in - let root_node = Run.remove_extras ~extras orig_root_node in - let matched_tree = Run.match_tree children_regexps src root_node in - let opt_program = Option.map trans_%s matched_tree in - Parsing_result.create src opt_program errors + let opt_program, extras = + Run.translate + ~extras + ~translate_root:(translate_root src) + ~translate_extra:(translate_extra src) + orig_root_node + in + Parsing_result.create src opt_program extras errors let string ?src_file contents = let input_tree = parse_source_string ?src_file contents in diff --git a/src/gen/lib/Codegen_util.ml b/src/gen/lib/Codegen_util.ml index 329fef4..2c6cbef 100644 --- a/src/gen/lib/Codegen_util.ml +++ b/src/gen/lib/Codegen_util.ml @@ -10,6 +10,22 @@ let translate_ident = in fun ident -> Protect_ident.add_translation map ident +(* Capitalize an identifier to be used in OCaml code for variants and such. + It checks that the argument starts with an ASCII letter. *) +let translate_ident_uppercase s = + let res = String.capitalize_ascii s |> translate_ident in + if String.length res > 0 && + match res.[0] with + | 'A'..'Z' -> true + | _ -> false + then + res + else + invalid_arg ( + sprintf "translate_ident_uppercase: cannot capitalize identifier %S" + s + ) + let interleave sep l = let rec loop = function | [] -> [] diff --git a/src/gen/lib/Protect_ident.ml b/src/gen/lib/Protect_ident.ml index b403716..665f22a 100644 --- a/src/gen/lib/Protect_ident.ml +++ b/src/gen/lib/Protect_ident.ml @@ -77,7 +77,13 @@ let ocaml_builtin_types = [ "option"; ] -let ocaml_reserved = ocaml_keywords @ ocaml_builtin_types +let reserved_type_names = [ + "extra"; + "extras"; +] + +let ocaml_reserved = + ocaml_keywords @ ocaml_builtin_types @ reserved_type_names (* Map from input identifier to output identifier and vice-versa. diff --git a/src/gen/lib/Protect_ident.mli b/src/gen/lib/Protect_ident.mli index e786e67..c3a2d1e 100644 --- a/src/gen/lib/Protect_ident.mli +++ b/src/gen/lib/Protect_ident.mli @@ -41,5 +41,8 @@ val ocaml_keywords : string list (* Lowercase identifiers that are built-in type names in OCaml. *) val ocaml_builtin_types : string list -(* Union of ocaml_keywords and ocaml_builtin_types *) +(* Type names reserved for use by the code generator. *) +val reserved_type_names : string list + +(* Union of ocaml_keywords, ocaml_builtin_types, reserved_type_names *) val ocaml_reserved : string list diff --git a/src/run/lib/Main.ml b/src/run/lib/Main.ml index 5b74f0a..f8928d6 100644 --- a/src/run/lib/Main.ml +++ b/src/run/lib/Main.ml @@ -182,6 +182,7 @@ let parse_and_dump ~parse_source_file ~parse_input_tree ~dump_tree + ~dump_extras conf = let input_tree = load_input_tree ~parse_source_file conf in if conf.output_json then ( @@ -197,6 +198,8 @@ let parse_and_dump printf "---\n"; printf "Recovered typed CST:\n%!"; dump_tree matched_tree; + printf "Extras:\n%!"; + dump_extras res.extras; true | None -> eprintf "Error: \ @@ -238,7 +241,7 @@ success: %.2f%% in exit_code -let run ~lang ~parse_source_file ~parse_input_tree ~dump_tree = +let run ~lang ~parse_source_file ~parse_input_tree ~dump_tree ~dump_extras = let conf = parse_command_line ~lang in safe_run (fun () -> let suggested_exit_code = @@ -246,6 +249,7 @@ let run ~lang ~parse_source_file ~parse_input_tree ~dump_tree = ~parse_source_file ~parse_input_tree ~dump_tree + ~dump_extras conf in exit suggested_exit_code diff --git a/src/run/lib/Main.mli b/src/run/lib/Main.mli index 5dd73b1..72b4c41 100644 --- a/src/run/lib/Main.mli +++ b/src/run/lib/Main.mli @@ -5,5 +5,7 @@ val run : lang:string -> parse_source_file:(string -> Tree_sitter_parsing.t) -> - parse_input_tree:(Tree_sitter_parsing.t -> 'a Parsing_result.t) -> - dump_tree:('a -> unit) -> unit + parse_input_tree: + (Tree_sitter_parsing.t -> ('prog, 'extra) Parsing_result.t) -> + dump_tree:('prog -> unit) -> + dump_extras:('extra list -> unit) -> unit diff --git a/src/run/lib/Parsing_result.ml b/src/run/lib/Parsing_result.ml index 0138664..3929381 100644 --- a/src/run/lib/Parsing_result.ml +++ b/src/run/lib/Parsing_result.ml @@ -10,8 +10,9 @@ type stat = { error_count: int; } -type 'a t = { +type ('a, 'b) t = { program: 'a option; + extras: 'b list; errors: Tree_sitter_error.t list; stat: stat; } @@ -40,10 +41,11 @@ let create_stat src errors = error_count; } -let create src program errors = +let create src program extras errors = let stat = create_stat src errors in { program; + extras; errors; stat; } diff --git a/src/run/lib/Parsing_result.mli b/src/run/lib/Parsing_result.mli index e64a13d..26f9e5a 100644 --- a/src/run/lib/Parsing_result.mli +++ b/src/run/lib/Parsing_result.mli @@ -8,12 +8,24 @@ type stat = { error_count: int; } -type 'a t = { - program: 'a option; +(* + There's one type for the program which is the grammar's entrypoint, + and one type for all the extras which are independent entrypoints. + For example: + + type program = ... + type extra = + | Comment of ... + | Heredoc of ... +*) +type ('program, 'extra) t = { + program: 'program option; + extras: 'extra list; errors: Tree_sitter_error.t list; stat: stat; } -val create : Src_file.t -> 'a option -> Tree_sitter_error.t list -> 'a t +val create : + Src_file.t -> 'a option -> 'b list -> Tree_sitter_error.t list -> ('a, 'b) t val export_stat : out_file:string -> stat -> unit diff --git a/src/run/lib/Raw_tree.ml b/src/run/lib/Raw_tree.ml index 755a09c..3eea01c 100644 --- a/src/run/lib/Raw_tree.ml +++ b/src/run/lib/Raw_tree.ml @@ -64,3 +64,7 @@ let format ?(format_any = fun _ -> [Line "??"]) x = let to_string ?format_any x = format ?format_any x |> Tree_sitter_gen.Indent.to_string + +let to_channel ?format_any oc x = + to_string ?format_any x + |> output_string oc diff --git a/src/run/lib/Raw_tree.mli b/src/run/lib/Raw_tree.mli index baac262..ac8b692 100644 --- a/src/run/lib/Raw_tree.mli +++ b/src/run/lib/Raw_tree.mli @@ -49,3 +49,8 @@ val format : val to_string : ?format_any: ('a -> Tree_sitter_gen.Indent.t) -> 'a t -> string + +val to_channel : + ?format_any: ('a -> Tree_sitter_gen.Indent.t) -> + out_channel -> + 'a t -> unit diff --git a/src/run/lib/Run.ml b/src/run/lib/Run.ml index d780a6a..158f7c1 100644 --- a/src/run/lib/Run.ml +++ b/src/run/lib/Run.ml @@ -216,39 +216,81 @@ let extract_errors src root_node = | Error -> "???" (* should not happen *))) ) -let rec filter_nodes keep nodes = - List.filter_map (filter_node keep) nodes - -and filter_node keep node = - if keep node then - Some { node with children = Option.map (filter_nodes keep) node.children } +(* Remove extras from the tree, leaving only nodes matching the entrypoint + rule. *) +let rec remove_extras_from_opt_nodes ~keep_node opt_nodes = + match opt_nodes with + | None -> None + | Some nodes -> + Some (List.filter_map (remove_extras_from_node ~keep_node) nodes) + +and remove_extras_from_node ~keep_node node = + if keep_node node then + Some { + node with + children = remove_extras_from_opt_nodes ~keep_node node.children + } else None -let make_keep ~blacklist = - let tbl = Hashtbl.create 100 in - List.iter (fun s -> Hashtbl.replace tbl s ()) blacklist; - let keep node = +(* + Produce fast functions for identifying whether a node is an extra + and whether a node should be removed to allow matching with the structure + of the grammar. This involves removing error nodes and other extra nodes + tree-sitter may decide to insert. +*) +let make_filters ~extras = + let extra_tbl = Hashtbl.create 100 in + List.iter (fun s -> Hashtbl.replace extra_tbl s ()) extras; + let is_extra node = Hashtbl.mem extra_tbl node.type_ in + let keep_node node = not (node.kind = Error) - && not (Hashtbl.mem tbl node.type_) + && not (is_extra node) in - keep + is_extra, keep_node (* - Remove error nodes, missing nodes, and extra nodes. - - Error nodes indicate unexpected input and can be removed from the tree - while respecting the grammar (because the error bubbles up until - it's an optional position such as in a repeat()). - - Missing nodes are suggested nodes that don't exist in the input. - Missing nodes are left in place so as to keep the tree well-formed - but they're reported as errors. - - Extra nodes are nodes that can appear anywhere in the tree such as comments. + Remove error nodes and extra nodes, which are nodes that can appear + anywhere in the tree. *) -let remove_extras ~extras = - let keep = make_keep ~blacklist:extras in - fun root_node -> - { root_node with - children = Option.map (filter_nodes keep) root_node.children } +let remove_extras ~keep_node root_node = + { root_node with + children = remove_extras_from_opt_nodes ~keep_node root_node.children } + +(* Translate extras and accumulate them into a list. *) +let scan_node_for_extras ~is_extra ~keep_node ~translate_extra node = + let rec scan_nodes_for_extras acc opt_nodes = + match opt_nodes with + | None -> acc + | Some nodes -> + List.fold_left (scan_node_for_extras ~translate_extra) acc nodes + + and scan_node_for_extras ~translate_extra acc node = + let acc = + if is_extra node then + (* We must remove the other extras from the child subtrees. + This is inefficient if we have large, nested extras due to the + subtree being rewritten each time we encounter an extra node. + In practice, it should be ok. *) + match remove_extras ~keep_node node + |> translate_extra with + | None -> acc + | Some x -> x :: acc + else + acc + in + (* An extra can contain other extras, so we must recurse into the children + regardless of whether the current node is an extra. *) + scan_nodes_for_extras acc node.children + in + scan_node_for_extras ~translate_extra [] node |> List.rev + +let translate ~extras ~translate_root ~translate_extra orig_root_node = + let is_extra, keep_node = make_filters ~extras in + let pure_root_node = remove_extras ~keep_node orig_root_node in + let root = translate_root pure_root_node in + let extras = + scan_node_for_extras + ~is_extra ~keep_node ~translate_extra orig_root_node + in + (root, extras) diff --git a/src/run/lib/Run.mli b/src/run/lib/Run.mli index e971a7f..daa726e 100644 --- a/src/run/lib/Run.mli +++ b/src/run/lib/Run.mli @@ -15,6 +15,8 @@ type exp = type capture = matcher_token Matcher.capture +val get_loc : Tree_sitter_bindings.Tree_sitter_output_t.node -> Loc.t + val match_tree : (string * exp option) list -> Src_file.t -> @@ -38,6 +40,14 @@ val extract_errors : Tree_sitter_error.t list val remove_extras : - extras:string list -> + keep_node:(Tree_sitter_bindings.Tree_sitter_output_t.node -> bool) -> Tree_sitter_bindings.Tree_sitter_output_t.node -> Tree_sitter_bindings.Tree_sitter_output_t.node + +val translate : + extras:string list -> + translate_root:(Tree_sitter_bindings.Tree_sitter_output_t.node -> 'a) -> + translate_extra:(Tree_sitter_bindings.Tree_sitter_output_t.node -> + 'b option) -> + Tree_sitter_bindings.Tree_sitter_output_t.node -> + 'a * 'b list diff --git a/src/run/lib/Sample.ml b/src/run/lib/Sample.ml index c6af8e7..3c5e678 100644 --- a/src/run/lib/Sample.ml +++ b/src/run/lib/Sample.ml @@ -25,6 +25,10 @@ module CST = struct ] type statement = (expression * (Loc.t * string (* ";" *))) type program = statement list (* zero or more *) + type comment = (Loc.t * string) + type extra = [ + | `Comment of comment + ] end module Parse = struct @@ -81,6 +85,10 @@ module Parse = struct |]; ) ); + ( + "comment", + None + ); ] (* generated *) @@ -130,21 +138,50 @@ module Parse = struct ) | _ -> assert false - (* generated *) + (* generated - entrypoint *) let trans_program ((kind, body) : mt) = match body with | Children v -> Run.repeat (fun v -> trans_statement (Run.matcher_token v)) v | _ -> assert false + (* generated - extra *) + let trans_comment ((kind, body) : mt) : CST.variable = + match body with + | Leaf v -> v + | _ -> assert false + + let translate_tree src node trans_x = + let matched_tree = Run.match_tree children_regexps src node in + Option.map trans_x matched_tree + + (* generated *) + let translate_extra src + (node : Tree_sitter_bindings.Tree_sitter_output_t.node) + : CST.extra option = + match node.type_ with + | "comment" -> + (match translate_tree src node trans_comment with + | None -> None + | Some x -> Some (`Comment x)) + | _ -> None + + (* generated *) + let translate_root src root_node = + translate_tree src root_node trans_program + let parse_input_tree input_tree = let orig_root_node = Tree_sitter_parsing.root input_tree in let src = Tree_sitter_parsing.src input_tree in let errors = Run.extract_errors src orig_root_node in - let root_node = Run.remove_extras ~extras orig_root_node in - let matched_tree = Run.match_tree children_regexps src root_node in - let program = Option.map trans_program matched_tree in - Parsing_result.create src program errors + let opt_program, extras = + Run.translate + ~extras + ~translate_root:(translate_root src) + ~translate_extra:(translate_extra src) + orig_root_node + in + Parsing_result.create src opt_program extras errors let string ?src_file contents = let input_tree = parse_source_string ?src_file contents in diff --git a/test/extras/check-test-output b/test/extras/check-test-output new file mode 100755 index 0000000..9b18100 --- /dev/null +++ b/test/extras/check-test-output @@ -0,0 +1,10 @@ +#! /usr/bin/env bash +# +# Check specific bits of output +# +set -eu + +if ! grep 'complex_extra:' test.out/ok/complex_extra.cst > /dev/null; then + echo "Extra 'complex_extra' is missing from test.out/ok/complex_extra.cst" + exit 1 +fi diff --git a/test/extras/grammar.js b/test/extras/grammar.js index bcf9cf8..fe9e501 100644 --- a/test/extras/grammar.js +++ b/test/extras/grammar.js @@ -5,10 +5,18 @@ module.exports = grammar({ $.number ), number: $ => /[0-9]+/, - comment: $ => /#.*/ + letter: $ => /[a-zA-Z]/, + comment: $ => /#.*/, + complex_extra: $ => seq('(', repeat(choice($.number, $.letter)), ')'), + extra: $ => 'extra', // a rule named 'extra' (!) + extras: $ => 'extras', // a rule named 'extras' (!) }, extras: $ => [ - $.comment, - /\s|\\\n/ + $.comment, // appears in the CST + $.complex_extra, // same + $.extra, + $.extras, + "IGNOREME", // doesn't appear in the CST because it doesn't have a name + /\s|\\\n/ // same ] }) diff --git a/test/extras/test/ok/a_rule_named_extra b/test/extras/test/ok/a_rule_named_extra new file mode 100644 index 0000000..072889c --- /dev/null +++ b/test/extras/test/ok/a_rule_named_extra @@ -0,0 +1,5 @@ +extra +extras +42 +# something +17 diff --git a/test/extras/test/ok/complex_extra b/test/extras/test/ok/complex_extra new file mode 100644 index 0000000..5ba1689 --- /dev/null +++ b/test/extras/test/ok/complex_extra @@ -0,0 +1,3 @@ +# first comment +1 2 (3 A # extra in the middle of another extra! + 4) 5 diff --git a/test/extras/test/ok/ex2 b/test/extras/test/ok/ex2 new file mode 100644 index 0000000..cf76a52 --- /dev/null +++ b/test/extras/test/ok/ex2 @@ -0,0 +1,2 @@ +1 2 # hello +3 IGNOREME 4 diff --git a/test/implicit-extra/check-test-output b/test/implicit-extra/check-test-output index ceee77b..a9275ed 100755 --- a/test/implicit-extra/check-test-output +++ b/test/implicit-extra/check-test-output @@ -14,6 +14,5 @@ for actual in $targets; do diff -u "$expected" "$actual" done -echo "Checking parse results." - -diff example-parse.expected <(./parse test/ok/example) +echo "Checking parse results: example-parse.expected" +diff -u example-parse.expected <(./parse test/ok/example) diff --git a/test/implicit-extra/example-parse.expected b/test/implicit-extra/example-parse.expected index 5285f7e..9d71a1c 100644 --- a/test/implicit-extra/example-parse.expected +++ b/test/implicit-extra/example-parse.expected @@ -39,6 +39,23 @@ Recovered typed CST: ) ] ) +Extras: +period: +"." +comma: +"," +period: +"." +comma: +"," +comma: +"," +period: +"." +period: +"." +comma: +"," total lines: 2 error lines: 0 error count: 0