Skip to content

Store raw content of code blocks in parser's AST #1325

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
which is more friendly for caching and voodoo (@jonludlam, #1304)
- Filter out warnings coming from linking implementations (@jonludlam, #1319)
- Output warnings coming from the `compile` phase in the driver (@jonludlam, #1323)
- Changed the rules for code block indentation (@panglesd, #1137)
- Changed the rules for code block and verbatim content (@panglesd, #1317)
- Store raw content in verbatim and code block, and expose a function to process
it (@panglesd, #1325)

### Fixed

Expand Down
6 changes: 6 additions & 0 deletions doc/dune
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,9 @@
odoc-config.sexp
(odoc_logo_placeholder.jpg as odoc-pages/odoc_logo_placeholder.jpg))
(package odoc))

(install
(section doc)
(files
(odoc-parser-config.sexp as odoc-config.sexp))
(package odoc-parser))
1 change: 1 addition & 0 deletions doc/odoc-parser-config.sexp
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
(packages odoc)
12 changes: 11 additions & 1 deletion src/model/semantics.ml
Original file line number Diff line number Diff line change
Expand Up @@ -228,9 +228,19 @@ let rec nestable_block_element :
| None -> None
| Some l -> Some (List.map nestable_block_element l)
in
let trimmed_content, warnings =
Odoc_parser.codeblock_content location content.value
in
let warnings = List.map Error.t_of_parser_t warnings in
List.iter (Error.raise_warning ~non_fatal:true) warnings;
let content = Location.at content.location trimmed_content in
Location.at location (`Code_block (lang_tag, content, outputs))
| { value = `Math_block s; location } -> Location.at location (`Math_block s)
| { value = `Verbatim _; _ } as element -> element
| { value = `Verbatim v; location } ->
let v, warnings = Odoc_parser.codeblock_content location v in
let warnings = List.map Error.t_of_parser_t warnings in
List.iter (Error.raise_warning ~non_fatal:true) warnings;
Location.at location (`Verbatim v)
| { value = `Modules modules; location } ->
let modules =
List.fold_left
Expand Down
6 changes: 6 additions & 0 deletions src/parser/ast.ml
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,19 @@ type code_block = {
meta : code_block_meta option;
delimiter : string option;
content : string with_location;
(** This is the raw content, that is the exact string inside the
delimiters. In order to get the "processed" content, see
{!Odoc_parser.codeblock_content} *)
output : nestable_block_element with_location list option;
}

and nestable_block_element =
[ `Paragraph of inline_element with_location list
| `Code_block of code_block
| `Verbatim of string
(** This is the raw content, that is the exact string inside the delimiters.
In order to get the "processed" content, see
{!Odoc_parser.verbatim_content} *)
| `Modules of string with_location list
| `List of
[ `Unordered | `Ordered ]
Expand Down
80 changes: 0 additions & 80 deletions src/parser/lexer.mll
Original file line number Diff line number Diff line change
Expand Up @@ -133,89 +133,10 @@ let verbatim_whitespace_last text =
| exception Invalid_argument _ -> ""
| _ -> text

(** [deindent ~what input ~start_offset s] "deindents" [s] by an offset computed
from [start_offset] and [input], corresponding to the beginning of a code
block or verbatim. If that is not possible (eg there is a non-whitespace
line starting with less than [offset] whitespaces), it unindents as much as
possible and raises a warning.

[what] is forwarded to the constructor of that warning. The result is the
lines of [s], each with the computed amount of leading characters removed,
joined back with ["\n"]. *)
let deindent : what:string -> _ -> start_offset:_ -> string -> string =
fun ~what input ~start_offset s ->
let start_location = input.offset_to_location start_offset in
(* The column of the starting location is the amount of indentation we would
like to remove from every line. *)
let offset = start_location.Loc.column in
(* Whitespace-only lines do not count, so they return [None]. Otherwise the
result is [Some i] where [i] is the index of the first character that is
neither a space nor a tab. *)
let count_leading_whitespace line =
let rec count_leading_whitespace' index len =
if index = len then None
else
match line.[index] with
| ' ' | '\t' -> count_leading_whitespace' (index + 1) len
| _ -> Some index
in
let len = String.length line in
(* '\r' may remain because we only split on '\n' below. This is important
for the first line, which would be considered not empty without this check. *)
let len = if len > 0 && line.[len - 1] = '\r' then len - 1 else len in
count_leading_whitespace' 0 len
in

let lines = Astring.String.cuts ~sep:"\n" s in

(* Minimum indentation over all non-blank lines. The fold is seeded with
[offset], so the result is never greater than [offset]. *)
let least_amount_of_whitespace =
List.fold_left
(fun least_so_far line ->
match (count_leading_whitespace line, least_so_far) with
| Some n, least when n < least -> n
| _ -> least_so_far)
offset lines
in
(* Some non-blank line is indented less than [offset]: report it, then fall
back to removing only the smaller amount. *)
if least_amount_of_whitespace < offset then
warning input ~start_offset
(Parse_error.not_enough_indentation_in_code_block ~what);
let drop n line =
(* Since blank lines were ignored when calculating
[least_amount_of_whitespace], their length might be less than the
amount. *)
if String.length line < n then ""
else String.sub line n (String.length line - n)
in
let lines = List.map (drop least_amount_of_whitespace) lines in
String.concat "\n" lines

(** Implements the rules for code block as specified in [odoc_for_authors],
section on code blocks and indentation.

[code_block_content input ~what ~start_offset s] works in three steps:
- if the first line of [s] only contains spaces, tabs or '\r', it is removed
(up to and including its '\n'); otherwise [s] is prefixed with as many
spaces as the column of the location computed from [start_offset];
- if the last line only contains spaces, tabs or '\r', it is removed
(together with the '\n' preceding it);
- the result is passed to [deindent], with the same [what], [input] and
[start_offset]. *)
let code_block_content input ~what ~start_offset s =
let start_location = input.offset_to_location start_offset in
let indent = start_location.column in
let rec handle_first_newline index =
(* No newline at all and nothing but blanks: pad like a non-empty first line. *)
if index >= String.length s then String.make indent ' ' ^ s
else
match s.[index] with
| ' ' | '\t' | '\r' -> handle_first_newline (index + 1)
(* Multiline starting with an empty line *)
| '\n' -> String.sub s (index + 1) (String.length s - index - 1)
(* Multiline NOT starting with an empty line *)
| _ -> String.make indent ' ' ^ s
in
let s = handle_first_newline 0 in
let rec handle_last_newline index =
(* Scanned back past the start without meeting a newline: keep [s] as is. *)
if index < 0 then s
else
match s.[index] with
| ' ' | '\t' | '\r' -> handle_last_newline (index - 1)
(* Content ending with a whitespace-only line: drop that line, including
the newline that precedes it *)
| '\n' -> String.sub s 0 index
(* Last line has non-whitespace content: keep [s] unchanged *)
| _ -> s
in
let s = handle_last_newline (String.length s - 1) in
deindent ~what input ~start_offset s


(* [emit_verbatim input start_offset buffer] takes the text accumulated in
[buffer], runs it through [verbatim_whitespace_first], then
[verbatim_whitespace_last], then [code_block_content] (with ["verbatim"] as
the [what] used in warnings), and hands the result to [emit] as a [`Verbatim]
value located at [start_offset].
NOTE(review): the two [verbatim_whitespace_*] helpers presumably validate or
trim the whitespace next to the delimiters; confirm against their
definitions. *)
let emit_verbatim input start_offset buffer =
let t = Buffer.contents buffer in
let t = verbatim_whitespace_first input start_offset t in
let t = verbatim_whitespace_last t in
let t = code_block_content input ~what:"verbatim" ~start_offset t in
emit input (`Verbatim t) ~start_offset

(* The locations have to be treated carefully in this function. We need to ensure that
Expand All @@ -229,7 +150,6 @@ let emit_code_block ~start_offset content_offset input metadata delim terminator
let c = Buffer.contents c in
(* We first handle the case where there is no line at the beginning, then
remove trailing, leading lines and deindent *)
let c = code_block_content input ~what:"code block" ~start_offset c in
let c =
with_location_adjustments ~adjust_end_by:terminator
~start_offset:content_offset
Expand Down
85 changes: 85 additions & 0 deletions src/parser/odoc_parser.ml
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,88 @@ let parse_comment ~location ~text =
(* Accessor functions, as [t] is opaque *)
let warnings t = t.warnings
let ast t = t.ast

(** [deindent ~what ~loc s] removes a common amount of leading characters from
    every line of [s]. The amount aimed for is the start column of [loc], the
    location of the enclosing code block or verbatim. When some non-blank line
    is indented less than that, only the smallest indentation found is removed
    and a "not enough indentation" warning (mentioning [what]) is returned.

    Returns the deindented text paired with the list of warnings, which is
    either empty or a singleton. *)
let deindent : what:string -> loc:Loc.span -> string -> string * Warning.t list
    =
 fun ~what ~loc s ->
  let target = loc.start.column in
  (* Index of the first character of [line] that is neither a space nor a
     tab, or [None] when the line is blank. *)
  let first_non_blank line =
    (* Lines are split on '\n' only, so a '\r' may be left at the end of a
       line. It is excluded from the scan so that a line made of blanks
       followed by '\r' still counts as blank. *)
    let stop =
      match String.length line with
      | 0 -> 0
      | n when line.[n - 1] = '\r' -> n - 1
      | n -> n
    in
    let rec scan i =
      if i = stop then None
      else
        match line.[i] with
        | ' ' | '\t' -> scan (i + 1)
        | _ -> Some i
    in
    scan 0
  in
  let lines = Astring.String.cuts ~sep:"\n" s in
  (* Smallest indentation among non-blank lines, capped by [target] since the
     fold starts from it. *)
  let strip =
    List.fold_left
      (fun acc line ->
        match first_non_blank line with
        | Some n -> if n < acc then n else acc
        | None -> acc)
      target lines
  in
  let warnings =
    if strip < target then
      [ Parse_error.not_enough_indentation_in_code_block ~what loc ]
    else []
  in
  (* Blank lines were not taken into account when computing [strip], so they
     may be shorter than it: those become empty. *)
  let cut line =
    let len = String.length line in
    if len < strip then "" else String.sub line strip (len - strip)
  in
  (lines |> List.map cut |> String.concat "\n", warnings)

(** Implements the rules for code block as specified in [odoc_for_authors],
    section on code blocks and indentation.

    [code_block_content ~what ~loc s] drops a whitespace-only first line,
    drops a whitespace-only last line, and deindents what remains with
    {!deindent}. The processed content and the warnings produced by
    {!deindent} are returned. *)
let code_block_content ~what ~loc s =
  let is_blank = function ' ' | '\t' | '\r' -> true | _ -> false in
  (* First line: when it only holds blanks, remove it up to and including its
     '\n'. Otherwise (or when there is no '\n' after the blanks) pad [s] with
     as many spaces as the start column of [loc], to account for the offset
     of the first line. *)
  let without_first_line =
    let len = String.length s in
    let padded () = String.make loc.Loc.start.column ' ' ^ s in
    let rec go i =
      if i >= len then padded ()
      else if is_blank s.[i] then go (i + 1)
      else if s.[i] = '\n' then String.sub s (i + 1) (len - i - 1)
      else padded ()
    in
    go 0
  in
  (* Last line: when it only holds blanks, remove it together with the '\n'
     that precedes it; otherwise leave the text untouched. *)
  let trimmed =
    let rec go i =
      if i < 0 then without_first_line
      else
        let c = without_first_line.[i] in
        if is_blank c then go (i - 1)
        else if c = '\n' then String.sub without_first_line 0 i
        else without_first_line
    in
    go (String.length without_first_line - 1)
  in
  deindent ~what ~loc trimmed

(** [verbatim_content loc c] processes the raw content [c] of a verbatim
    located at [loc]: it is {!code_block_content} with warnings worded for a
    "verbatim". *)
let verbatim_content loc c = code_block_content ~what:"verbatim" ~loc c
(** [codeblock_content loc c] processes the raw content [c] of a code block
    located at [loc]: it is {!code_block_content} with warnings worded for a
    "code block". *)
let codeblock_content loc c = code_block_content ~what:"code block" ~loc c
26 changes: 26 additions & 0 deletions src/parser/odoc_parser.mli
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,29 @@ val position_of_point : t -> Loc.point -> Lexing.position
the usual representation in the Lexing module. Note that this relies on the
information passed in {!parse_comment}, and hence requires the result of
that call in addition to the {!Loc.point} being converted. *)

val codeblock_content : Loc.span -> string -> string * Warning.t list
(** Process the content of a code block, following the rules described
{{!/odoc/odoc_for_authors.indentation_code_blocks}here}. To achieve this, it
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't that cross-package link cool?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is.

needs the location of the code block (including the separators) and the raw
content of the code block. For instance, with the following code block:

{delim@ocaml[
{[
hello
]}
]delim}

We can go from the raw content ["\n hello\n "] to the processed content
[" hello"] with:
{[
match codeblock.value with
| `Code_block { content; _ } ->
codeblock_content codeblock.location content.value
]}

Also returns a list of warnings, eg if the content is not appropriately
indented. *)

val verbatim_content : Loc.span -> string -> string * Warning.t list
(** Similar to {!codeblock_content} but for verbatims: given the location of a
verbatim and its raw content, returns the processed content together with a
list of warnings (worded for a verbatim rather than for a code block). *)
Loading