Skip to content

Commit

Permalink
fix: handle utf8 characters in the dune files(ocaml#9728)
Browse files Browse the repository at this point in the history
Signed-off-by: Alpha DIALLO <moyodiallo@gmail.com>
  • Loading branch information
moyodiallo committed Mar 15, 2024
1 parent 6848420 commit 509b6f5
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 33 deletions.
122 changes: 101 additions & 21 deletions src/dune_sexp/escape.ml
Original file line number Diff line number Diff line change
@@ -1,26 +1,92 @@
open! Stdune

module Utf8 = struct
(*
The first byte of an utf8 character gives the size in bytes of the utf8:
0xxxxxxxx -> 1
110xxxxxx -> 2
1110xxxxx -> 3
11110xxxx -> 4
*)
let utf8_byte_length u =
match Char.code u with
| u when u < 128 -> 1
| u when u < 194 -> 0
| u when u < 224 -> 2
| u when u < 240 -> 3
| u when u < 245 -> 4
| _ -> 0
;;

let unsafe_get s i = String.unsafe_get s i |> Char.code
let next_utf8_length s i = utf8_byte_length (String.unsafe_get s i)

let is_utf8_valid s i l =
assert (String.length s >= l);
match l with
| 1 -> true
| 2 ->
let b1 = unsafe_get s (i + 1) in
if b1 lsr 6 != 0b10 then false else true
| 3 ->
let b0 = unsafe_get s i in
let b1 = unsafe_get s (i + 1) in
let b2 = unsafe_get s (i + 2) in
if b2 lsr 6 != 0b10
then false
else (
match b0 with
| 0xE0 -> if b1 < 0xA0 || 0xBF < b1 then false else true
| 0xED -> if b1 < 0x80 || 0x9F < b1 then false else true
| _ -> if b1 lsr 6 != 0b10 then false else true)
| 4 ->
let b0 = unsafe_get s i in
let b1 = unsafe_get s (i + 1) in
let b2 = unsafe_get s (i + 2) in
let b3 = unsafe_get s (i + 3) in
if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10
then false
else (
match b0 with
| 0xF0 -> if b1 < 0x90 || 0xBF < b1 then false else true
| 0xF4 -> if b1 < 0x80 || 0x8F < b1 then false else true
| _ -> if b1 lsr 6 != 0b10 then false else true)
| _ -> false
;;
end

let quote_length s =
let n = ref 0 in
let len = String.length s in
for i = 0 to len - 1 do
n
:= !n
+
match String.unsafe_get s i with
| '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2
| '%' -> if i + 1 < len && s.[i + 1] = '{' then 2 else 1
| ' ' .. '~' -> 1
| _ -> 4
let i = ref 0 in
while !i < len do
(n
:= !n
+
match String.unsafe_get s !i with
| '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2
| '%' -> if !i + 1 < len && s.[!i + 1] = '{' then 2 else 1
| ' ' .. '~' -> 1
| _ ->
let uchar_len = Utf8.next_utf8_length s !i in
(match Utf8.is_utf8_valid s !i uchar_len with
| true ->
assert (uchar_len > 1 && uchar_len < 5);
i := !i + uchar_len - 1;
uchar_len
| false -> 4));
incr i
done;
!n
;;

let escape_to s ~dst:s' ~ofs =
let n = ref ofs in
let len = String.length s in
for i = 0 to len - 1 do
(match String.unsafe_get s i with
let i = ref 0 in
while !i < len do
(match String.unsafe_get s !i with
| ('\"' | '\\') as c ->
Bytes.unsafe_set s' !n '\\';
incr n;
Expand All @@ -41,21 +107,35 @@ let escape_to s ~dst:s' ~ofs =
Bytes.unsafe_set s' !n '\\';
incr n;
Bytes.unsafe_set s' !n 'b'
| '%' when i + 1 < len && s.[i + 1] = '{' ->
| '%' when !i + 1 < len && s.[!i + 1] = '{' ->
Bytes.unsafe_set s' !n '\\';
incr n;
Bytes.unsafe_set s' !n '%'
| ' ' .. '~' as c -> Bytes.unsafe_set s' !n c
| c ->
let a = Char.code c in
Bytes.unsafe_set s' !n '\\';
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 100)));
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 10 mod 10)));
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a mod 10))));
incr n
let uchar_len = Utf8.next_utf8_length s !i in
(match Utf8.is_utf8_valid s !i uchar_len with
| true ->
assert (uchar_len > 1 && uchar_len < 5);
Bytes.unsafe_set s' !n (String.unsafe_get s !i);
Bytes.unsafe_set s' (!n + 1) (String.unsafe_get s (!i + 1));
if uchar_len > 2
then Bytes.unsafe_set s' (!n + 2) (String.unsafe_get s (!i + 2));
if uchar_len > 3
then Bytes.unsafe_set s' (!n + 3) (String.unsafe_get s (!i + 3));
n := !n + uchar_len - 1;
i := !i + uchar_len - 1
| false ->
let a = Char.code c in
Bytes.unsafe_set s' !n '\\';
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 100)));
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 10 mod 10)));
incr n;
Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a mod 10)))));
incr n;
incr i
done
;;

Expand Down
5 changes: 5 additions & 0 deletions src/dune_sexp/escape.mli
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
module Utf8 : sig
val next_utf8_length : string -> int -> int
val is_utf8_valid : string -> int -> int -> bool
end

val escaped : string -> string
val quoted : string -> string
36 changes: 28 additions & 8 deletions src/dune_sexp/lexer.mll
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@ end

type t = with_comments:bool -> Lexing.lexbuf -> Token.t

let error ?(delta = 0) lexbuf message =
let error ?(delta = 0) ?(delta_stop = 0) lexbuf message =
let start = Lexing.lexeme_start_p lexbuf in
let stop = Lexing.lexeme_end_p lexbuf in
let loc =
Loc.create ~start:{ start with pos_cnum = start.pos_cnum + delta }
~stop:(Lexing.lexeme_end_p lexbuf)
~stop:{ stop with pos_cnum = stop.pos_cnum + delta_stop }
in
User_error.raise ~loc [ Pp.text message ]

Expand Down Expand Up @@ -144,6 +145,8 @@ let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F']
let atom_char = [^ ';' '(' ')' '"' '\000'-'\032' '\127'-'\255']
let varname_char = atom_char # [ ':' '%' '{' '}' ]

let non_ascii = ['\128'-'\255']

rule token with_comments = parse
| newline
{ Lexing.new_line lexbuf; token with_comments lexbuf }
Expand Down Expand Up @@ -352,13 +355,30 @@ and template_variable = parse
}
| '}' | eof
{ error lexbuf "%{...} forms cannot be empty" }
| (varname_char* as skip) (_ as other)
| (varname_char+ ':' ((':' | varname_char)*) as skip) (_ as other)
| (varname_char* as skip) (non_ascii* as maybe_utf) (_ as other)
| (varname_char+ ':' ((':' | varname_char)*) as skip) (non_ascii* as maybe_utf) (_ as other)
{
error
~delta:(String.length skip)
lexbuf
(Printf.sprintf "The character %C is not allowed inside %%{...} forms" other)
let utf_len = String.length maybe_utf in
let uchar =
if utf_len > 1 then
let uchar_len = Escape.Utf8.next_utf8_length maybe_utf 0 in
if uchar_len <= utf_len && Escape.Utf8.is_utf8_valid maybe_utf 0 uchar_len then
Some (String.sub maybe_utf ~pos:0 ~len:uchar_len, uchar_len)
else None
else None
in
match uchar with
| Some (uchar, len) ->
error
~delta:(String.length skip)
~delta_stop:(-len)
lexbuf
(Printf.sprintf "The character %s is not allowed inside %%{...} forms" uchar)
| _ ->
error
~delta:(String.length skip)
lexbuf
(Printf.sprintf "The character %C is not allowed inside %%{...} forms" other)
}
{
Expand Down
25 changes: 21 additions & 4 deletions test/blackbox-tests/test-cases/formatting/non-ascii-characters.t
Original file line number Diff line number Diff line change
@@ -1,13 +1,30 @@
How the non-ASCII characters are handled, this is also related to the issue #9728
Utf8 characters are handled for now, this is also related to the issue #9728

$ dune format-dune-file <<EOF
> ("É")
> ("Éff ĎúÑȨ")
> EOF
("\195\137")
("Éff ĎúÑȨ")

$ dune format-dune-file <<EOF
> (run foo %{bin:é})
> EOF
File "", line 1, characters 15-16:
Error: The character '\195' is not allowed inside %{...} forms
Error: The character é is not allowed inside %{...} forms
[1]

$ dune format-dune-file <<EOF
> (echo "hÉllo")
> EOF
(echo "hÉllo")

$ dune format-dune-file <<EOF
> (echo "É")
> EOF
(echo "É")

$ dune format-dune-file <<EOF
> (Écho "hello")
> EOF
File "", line 1, characters 1-1:
Error: Invalid . file
[1]

0 comments on commit 509b6f5

Please sign in to comment.