
Commit 1ce702b

Update text encoding in kaun
1 parent 51bc7ab commit 1ce702b

4 files changed (+92, -65 lines)

kaun/lib/kaun/dataset/dataset.ml

Lines changed: 64 additions & 62 deletions
@@ -185,72 +185,73 @@ let from_tensors (x, y) =
   }
 
 (* Text Data Sources *)
+let from_text_file ?(encoding = `UTF8) ?(chunk_size = 65536) path =
+  match encoding with
+  | `UTF8 | `ASCII | `LATIN1 ->
+      let enc_name =
+        match encoding with
+        | `UTF8 -> "utf-8"
+        | `ASCII -> "us-ascii"
+        | `LATIN1 -> "iso-8859-1"
+      in
+      let uenc_opt = Uutf.encoding_of_string enc_name in
+      let dec = Uutf.decoder ?encoding:uenc_opt `Manual in
+      let handle = create_mmap path in
+      let offset = ref 0 in
+      let closed = ref false in
+      let buf = Buffer.create 512 in
+      let lines_queue = Queue.create () in
+
+      let push_line_from_buf () =
+        let s = Buffer.contents buf in
+        Buffer.clear buf;
+        Queue.add s lines_queue
+      in
 
-let from_text_file ?encoding ?(chunk_size = 65536) path =
-  let _ = encoding in
-  (* TODO: Handle different encodings *)
-  let handle = create_mmap path in
-  let offset = ref 0 in
-  let buffer = ref "" in
-  let buffer_pos = ref 0 in
-  let closed = ref false in
-
-  let rec next_line () =
-    if !closed then None
-    else
-      (* Look for newline in buffer *)
-      try
-        let nl_pos = String.index_from !buffer !buffer_pos '\n' in
-        let line = String.sub !buffer !buffer_pos (nl_pos - !buffer_pos) in
-        buffer_pos := nl_pos + 1;
-        Some line
-      with Not_found ->
-        (* Need more data *)
-        if !offset >= handle.size then
-          (* End of file - return remaining buffer if any *)
-          if !buffer_pos < String.length !buffer then (
-            let line =
-              String.sub !buffer !buffer_pos
-                (String.length !buffer - !buffer_pos)
-            in
-            buffer := "";
-            buffer_pos := 0;
-            Some line)
-          else (
+      let rec drain_decoder () =
+        match Uutf.decode dec with
+        | `Uchar u ->
+            if Uchar.to_int u = 0x000A then push_line_from_buf ();
+            if Uchar.to_int u <> 0x000A then Uutf.Buffer.add_utf_8 buf u;
+            drain_decoder ()
+        | `Malformed _ ->
+            Uutf.Buffer.add_utf_8 buf Uutf.u_rep;
+            drain_decoder ()
+        | `Await ->
+            if !offset >= handle.size then
+              Uutf.Manual.src dec (Bytes.create 0) 0 0
+            else
+              let chunk =
+                read_mmap_chunk handle ~offset:!offset ~length:chunk_size
+              in
+              offset := !offset + String.length chunk;
+              let bytes = Bytes.of_string chunk in
+              Uutf.Manual.src dec bytes 0 (Bytes.length bytes);
+              drain_decoder ()
+        | `End ->
+            if Buffer.length buf > 0 then push_line_from_buf ();
+            ()
+      in
+
+      let rec next_line () =
+        if not (Queue.is_empty lines_queue) then Some (Queue.take lines_queue)
+        else if !closed then None
+        else (
+          drain_decoder ();
+          if not (Queue.is_empty lines_queue) then Some (Queue.take lines_queue)
+          else if !offset >= handle.size then (
             close_mmap handle;
             closed := true;
             None)
-        else
-          (* Read next chunk *)
-          let chunk =
-            read_mmap_chunk handle ~offset:!offset ~length:chunk_size
-          in
-          offset := !offset + String.length chunk;
-
-          (* Append to remaining buffer *)
-          if !buffer_pos < String.length !buffer then
-            buffer :=
-              String.sub !buffer !buffer_pos
-                (String.length !buffer - !buffer_pos)
-              ^ chunk
-          else buffer := chunk;
-          buffer_pos := 0;
-          next_line ()
-  in
-
-  let reset () =
-    offset := 0;
-    buffer := "";
-    buffer_pos := 0;
-    closed := false
-  in
+          else next_line ())
+      in
 
-  {
-    next = next_line;
-    cardinality = (fun () -> Unknown);
-    reset = Some reset;
-    spec = (fun () -> Scalar "string");
-  }
+      {
+        next = next_line;
+        cardinality = (fun () -> Unknown);
+        reset = None;
+        spec = (fun () -> Scalar "string");
+      }
 
 let from_text_files ?(encoding = `UTF8) ?(chunk_size = 65536) paths =
   let current_file = ref 0 in
@@ -262,7 +263,8 @@ let from_text_files ?(encoding = `UTF8) ?(chunk_size = 65536) paths =
         if !current_file >= List.length paths then None
        else
          let path = List.nth paths !current_file in
-          current_dataset := Some (from_text_file ~encoding ~chunk_size path);
+          let ds = from_text_file ~encoding ~chunk_size path in
+          current_dataset := Some ds;
          incr current_file;
          next ()
    | Some ds -> (
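
For readers unfamiliar with uutf's `Manual source, the sketch below (not part of this commit) shows the same decode loop in isolation: bytes are fed to the decoder in fixed-size chunks, decoded code points are re-encoded as UTF-8 into a buffer, and the buffer is flushed into a line on each U+000A. The name decode_lines, the default chunk_size, and feeding from an in-memory string instead of the mmap handle are illustrative assumptions; note it takes uutf's own encoding names (`UTF_8, `ISO_8859_1), not kaun's `UTF8/`LATIN1.

(* Illustrative only -- not part of this commit. Same `Manual-source uutf
   pattern as the new from_text_file, but fed from an in-memory string. *)
let decode_lines ?(encoding = `UTF_8) ?(chunk_size = 8) (data : string) =
  let dec = Uutf.decoder ~encoding `Manual in
  let buf = Buffer.create 64 in
  let lines = ref [] in
  let push_line () =
    lines := Buffer.contents buf :: !lines;
    Buffer.clear buf
  in
  let offset = ref 0 in
  let rec drain () =
    match Uutf.decode dec with
    | `Uchar u when Uchar.to_int u = 0x000A ->
        (* newline: flush the current line *)
        push_line ();
        drain ()
    | `Uchar u ->
        Uutf.Buffer.add_utf_8 buf u;
        drain ()
    | `Malformed _ ->
        (* undecodable bytes become U+FFFD, as in the commit *)
        Uutf.Buffer.add_utf_8 buf Uutf.u_rep;
        drain ()
    | `Await ->
        if !offset >= String.length data then
          (* no more input: signal end of stream *)
          Uutf.Manual.src dec Bytes.empty 0 0
        else begin
          let len = min chunk_size (String.length data - !offset) in
          let bytes = Bytes.of_string (String.sub data !offset len) in
          offset := !offset + len;
          Uutf.Manual.src dec bytes 0 (Bytes.length bytes)
        end;
        drain ()
    | `End ->
        if Buffer.length buf > 0 then push_line ();
        List.rev !lines
  in
  drain ()

(* e.g. decode_lines ~encoding:`ISO_8859_1 "caf\xE9\nna\xEFve\n"
   returns [ "café"; "naïve" ] *)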

kaun/lib/kaun/dataset/dataset.mli

Lines changed: 8 additions & 2 deletions
@@ -70,15 +70,21 @@ val from_file : (string -> 'a) -> string -> 'a t
 (** {2 Text Data Sources} *)
 
 val from_text_file :
-  ?encoding:[ `UTF8 | `ASCII ] -> ?chunk_size:int -> string -> string t
+  ?encoding:[ `UTF8 | `ASCII | `LATIN1 ] ->
+  ?chunk_size:int ->
+  string ->
+  string t
 (** [from_text_file ?encoding ?chunk_size path] creates a memory-mapped text
     dataset yielding lines as strings.
     - [encoding]: Text encoding (default: UTF8)
     - [chunk_size]: Size of chunks to read at once (default: 64KB) The file is
       memory-mapped and read lazily in chunks. *)
 
 val from_text_files :
-  ?encoding:[ `UTF8 | `ASCII ] -> ?chunk_size:int -> string list -> string t
+  ?encoding:[ `UTF8 | `ASCII | `LATIN1 ] ->
+  ?chunk_size:int ->
+  string list ->
+  string t
 (** [from_text_files paths] creates a dataset from multiple text files. Files
     are processed sequentially without loading all into memory. *)
 
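
A hedged usage sketch of the widened signatures above (not from this commit): the Kaun.Dataset module path, the file names, and the fact that the datasets are simply ignored are assumptions for illustration.

(* Hypothetical call sites; module path and file names are assumptions. *)
let () =
  (* Latin-1 file, explicit encoding *)
  let latin1_ds =
    Kaun.Dataset.from_text_file ~encoding:`LATIN1 "corpus_latin1.txt"
  in
  (* multiple UTF-8 files (the default encoding) with a larger read chunk *)
  let multi_ds =
    Kaun.Dataset.from_text_files ~chunk_size:(128 * 1024)
      [ "part1.txt"; "part2.txt" ]
  in
  ignore latin1_ds;
  ignore multi_ds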

kaun/lib/kaun/dune

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 (library
  (name kaun)
  (public_name kaun)
- (libraries rune unix str nx nx.core nx.io yojson domainslib))
+ (libraries rune unix str nx nx.core nx.io yojson domainslib uutf))

kaun/test/test_dataset.ml

Lines changed: 19 additions & 0 deletions
@@ -69,6 +69,23 @@ let test_from_text_file () =
        [ "line1"; "line2"; "line3" ]
        collected)
 
+(* Test for utf8 *)
+let test_from_text_file_utf8 () =
+  let content = "hello \xF0\x9F\x98\x8A\nsecond\n" in
+  with_temp_file content (fun path ->
+      let ds = from_text_file ~encoding:`UTF8 path in
+      let lines = collect_dataset ds in
+      Alcotest.(check (list string))
+        "utf8 emoji preserved" [ "hello 😊"; "second" ] lines)
+
+(* Test for Latin1 *)
+let test_from_text_file_latin1 () =
+  let content = "caf\xE9\nna\xEFve\n" in
+  with_temp_file content (fun path ->
+      let ds = from_text_file ~encoding:`LATIN1 path in
+      let lines = collect_dataset ds in
+      Alcotest.(check (list string)) "latin1 decoded" [ "café"; "naïve" ] lines)
+
 let test_from_text_file_large_lines () =
   let line = String.make 1000 'x' in
   let content = line ^ "\n" ^ line ^ "\n" in
@@ -618,6 +635,8 @@ let () =
     ( "text_files",
       [
        test_case "from_text_file" `Quick test_from_text_file;
+        test_case "from_text_file_utf8" `Quick test_from_text_file_utf8;
+        test_case "from_text_file_latin1" `Quick test_from_text_file_latin1;
        test_case "from_text_file_large_lines" `Quick
          test_from_text_file_large_lines;
        test_case "from_text_files" `Quick test_from_text_files;
