Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/changes/fixed/12869.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
- Fix greedy version location in lang declarations. Previously, error locations for
invalid lang versions would span multiple bytes for multi-byte UTF-8 characters,
causing carets to appear misaligned and seemingly include the closing
parenthesis. Now, error locations for ASCII strings show the full length (e.g.,
"Ali" shows `^^^`), while non-ASCII strings show only the first byte (e.g., "è"
shows `^`) to avoid multi-byte character display issues. (#12869, fixes #12806,
@benodiwal)
25 changes: 24 additions & 1 deletion src/dune_sexp/versioned_file_first_line.mll
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,18 @@ let invalid_lang_line start lexbuf =
lexbuf.Lexing.lex_start_p <- start;
User_error.raise ~loc:(Loc.of_lexbuf lexbuf)
[ Pp.text "Invalid first line, expected: (lang <lang> <version>)" ]

let rec find_first_non_ascii s i =
if i >= String.length s then None
else if Char.code s.[i] >= 128 then Some i
else find_first_non_ascii s (i + 1)
}

let newline = '\r'? '\n'
let blank = [' ' '\t']
let atom_char = [^';' '(' ')' '"' '#' '|' '\000'-'\032']
let ascii = [ '\000' - '\127' ]
let atom_char = [^ ';' '(' ')' '"' '\000'-'\032' '\127'-'\255']
let atom_char_with_junk = [^';' '(' ')' '"' '#' '|' '\000'-'\032'] | [ '\127'-'\255' ]

rule lex_opt = parse
| '(' blank* "lang"
Expand All @@ -34,6 +41,22 @@ and atom start = parse
| atom_char+ as s
{ (Loc.of_lexbuf lexbuf, s)
}
| atom_char_with_junk+ as s
{ let atom_start = Lexing.lexeme_start_p lexbuf in
(* CR-someday benodiwal: Consider using Uuseg for more accurate grapheme
cluster detection instead of the simple byte-based check. This would
better handle complex Unicode like emoji sequences, though terminal
rendering inconsistencies would still cause display issues. *)
let loc_start, loc_end =
match find_first_non_ascii s 0 with
| None -> assert false
| Some offset ->
let error_start = { atom_start with pos_cnum = atom_start.pos_cnum + offset } in
let error_end = { error_start with pos_cnum = error_start.pos_cnum + 1 } in
error_start, error_end
in
(Loc.create ~start:loc_start ~stop:loc_end, s)
}
| _ | eof
{ to_eol lexbuf;
invalid_lang_line start lexbuf
Expand Down
62 changes: 46 additions & 16 deletions test/blackbox-tests/test-cases/lang-invalid-version.t
Original file line number Diff line number Diff line change
Expand Up @@ -24,48 +24,78 @@ atoms without needing validation in every decoder.

Test with various non-ASCII characters:

CR-someday benodiwal: The version_loc is greedy and captures the closing
parenthesis.

$ test_invalid_version "è"
File "dune-project", line 1, characters 11-13:
File "dune-project", line 1, characters 11-12:
1 | (lang dune è)
^^
^
Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain
ASCII characters.
[1]

$ test_invalid_version "π3.14"
File "dune-project", line 1, characters 11-17:
File "dune-project", line 1, characters 11-12:
1 | (lang dune π3.14)
^^^^^^
^
Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain
ASCII characters.
[1]

$ test_invalid_version "α"
File "dune-project", line 1, characters 11-13:
File "dune-project", line 1, characters 11-12:
1 | (lang dune α)
^^
^
Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain
ASCII characters.
[1]

$ test_invalid_version "😀"
File "dune-project", line 1, characters 11-15:
File "dune-project", line 1, characters 11-12:
1 | (lang dune 😀)
^^^^
^
Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain
ASCII characters.
[1]

CR-someday benodiwal: Unicode string lengths are miscomputed in location
excerpts for East Asian characters.

$ test_invalid_version "中3.16文"
File "dune-project", line 1, characters 11-21:
File "dune-project", line 1, characters 11-12:
1 | (lang dune 中3.16文)
^^^^^^^^^^
^
Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain
ASCII characters.
[1]

Test where non-ASCII character is not the first character:

$ test_invalid_version "3.14è"
File "dune-project", line 1, characters 15-16:
1 | (lang dune 3.14è)
^
Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain
ASCII characters.
[1]

$ test_invalid_version "2è3"
File "dune-project", line 1, characters 12-13:
1 | (lang dune 2è3)
^
Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain
ASCII characters.
[1]

$ test_invalid_version "abc😀def"
File "dune-project", line 1, characters 14-15:
1 | (lang dune abc😀def)
^
Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain
ASCII characters.
[1]

Test with multiple non-ASCII characters (caret points to first one):

$ test_invalid_version "aè😀"
File "dune-project", line 1, characters 12-13:
1 | (lang dune aè😀)
^
Error: Invalid atom: contains non-ASCII character(s). Atoms must only contain
ASCII characters.
[1]
Loading