Skip to content

Commit 792fdd7

Browse files
committed
CA-361221: utf8_recode: use Uutf.{Buffer.add_utf_8,String.fold_utf_8} instead of Uutf.{encode,decoder}
Uutf.encoder allocates a 64k buffer internally (in addition to our 1k buffer). Use Uutf.Buffer to write UTF-8 characters directly into our buffer instead. Also size the buffer based on the initial string size (it can grow if needed). This should reduce the allocation rate. Thanks to a hint from https://discuss.ocaml.org/t/decoding-many-unicode-strings-with-uutf/8910/2. Also avoid allocations if the string is already entirely valid UTF-8. This could be further optimized to stop on the first invalid UTF-8 character and do the recode on the rest. Signed-off-by: Edwin Török <edvin.torok@citrix.com>
1 parent 0c46479 commit 792fdd7

File tree

1 file changed

+14
-15
lines changed

1 file changed

+14
-15
lines changed

lib/xenops_utils.ml

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -91,22 +91,21 @@ let ignore_bool (_: bool) = ()
9191
(** [ignore_int n] discards the integer [n] and returns unit. The explicit
    [int] type makes accidental use on a non-integer a compile-time error. *)
let ignore_int : int -> unit = fun _ -> ()
9292

9393
(* Recode an incoming string as valid UTF-8 *)
94+
(** Folder for [Uutf.String.fold_utf_8]: appends the decoded character to
    [buf] encoded as UTF-8 and returns [buf] as the next accumulator.

    Despite the name, nothing is skipped: a [`Malformed] result is appended as
    the replacement character [Uutf.u_rep] rather than dropped. The second
    argument (supplied by the fold) is ignored. *)
let utf8_add_if_valid buf _ next =
  ( match next with
  | `Uchar u ->
      Uutf.Buffer.add_utf_8 buf u
  | `Malformed _ ->
      Uutf.Buffer.add_utf_8 buf Uutf.u_rep ) ;
  buf
98+
99+
(** Fold step used to check that a string decodes cleanly: the accumulator
    [prev] is carried through unchanged for a [`Uchar] result and forced to
    [false] by a [`Malformed] one. Once [false], it stays [false]. The second
    argument is ignored. *)
let is_valid prev _ decoded =
  match decoded with `Uchar _ -> prev | `Malformed _ -> false
100+
94101
(** [utf8_recode str] returns a string in which malformed byte sequences of
    [str] have been replaced.

    As rendered here this is a diff hunk: lines following a gutter number
    ending in [-] are the removed implementation (a [Uutf.decoder] /
    [Uutf.encoder] loop writing into a 1024-byte buffer), and lines following
    a gutter number ending in [+] are the added implementation.

    Added implementation: [str] is first folded with [is_valid]; if the
    result is [true], [str] itself is returned and no buffer is created.
    Otherwise a buffer with initial size [String.length str] is created and
    [str] is folded a second time with [utf8_add_if_valid], which writes
    [Uutf.u_rep] in place of each malformed sequence; the buffer contents are
    returned. Note that an input with malformed data is therefore scanned
    twice. *)
let utf8_recode str =
95-
let out_encoding = `UTF_8 in
96-
let b = Buffer.create 1024 in
97-
let dst = `Buffer b in
98-
let src = `String str in
99-
let rec loop d e =
100-
match Uutf.decode d with
101-
| `Uchar _ as u -> ignore (Uutf.encode e u); loop d e
102-
| `End -> ignore (Uutf.encode e `End)
103-
|`Malformed _ -> ignore (Uutf.encode e (`Uchar Uutf.u_rep)); loop d e
104-
| `Await -> assert false
105-
in
106-
let d = Uutf.decoder src in
107-
let e = Uutf.encoder out_encoding dst in
108-
loop d e;
109-
Buffer.contents b
102+
(* optimistic assumption that the string contains only valid utf8,
103+
avoids allocations *)
104+
if Uutf.String.fold_utf_8 is_valid true str then
105+
str
106+
else
107+
let b = Buffer.create (String.length str) in
108+
Uutf.String.fold_utf_8 utf8_add_if_valid b str |> Buffer.contents
110109

111110
module Mutex = struct
112111
include Mutex

0 commit comments

Comments
 (0)