
Commit 1ce702b

Update text encoding in kaun
1 parent 51bc7ab commit 1ce702b

4 files changed (+92, -65 lines)

kaun/lib/kaun/dataset/dataset.ml

Lines changed: 64 additions & 62 deletions
@@ -185,72 +185,73 @@ let from_tensors (x, y) =
   }
 
 (* Text Data Sources *)
+let from_text_file ?(encoding = `UTF8) ?(chunk_size = 65536) path =
+  match encoding with
+  | `UTF8 | `ASCII | `LATIN1 ->
+      let enc_name =
+        match encoding with
+        | `UTF8 -> "utf-8"
+        | `ASCII -> "us-ascii"
+        | `LATIN1 -> "iso-8859-1"
+      in
+      let uenc_opt = Uutf.encoding_of_string enc_name in
+      let dec = Uutf.decoder ?encoding:uenc_opt `Manual in
+      let handle = create_mmap path in
+      let offset = ref 0 in
+      let closed = ref false in
+      let buf = Buffer.create 512 in
+      let lines_queue = Queue.create () in
+
+      let push_line_from_buf () =
+        let s = Buffer.contents buf in
+        Buffer.clear buf;
+        Queue.add s lines_queue
+      in
 
-let from_text_file ?encoding ?(chunk_size = 65536) path =
-  let _ = encoding in
-  (* TODO: Handle different encodings *)
-  let handle = create_mmap path in
-  let offset = ref 0 in
-  let buffer = ref "" in
-  let buffer_pos = ref 0 in
-  let closed = ref false in
-
-  let rec next_line () =
-    if !closed then None
-    else
-      (* Look for newline in buffer *)
-      try
-        let nl_pos = String.index_from !buffer !buffer_pos '\n' in
-        let line = String.sub !buffer !buffer_pos (nl_pos - !buffer_pos) in
-        buffer_pos := nl_pos + 1;
-        Some line
-      with Not_found ->
-        (* Need more data *)
-        if !offset >= handle.size then
-          (* End of file - return remaining buffer if any *)
-          if !buffer_pos < String.length !buffer then (
-            let line =
-              String.sub !buffer !buffer_pos
-                (String.length !buffer - !buffer_pos)
-            in
-            buffer := "";
-            buffer_pos := 0;
-            Some line)
-          else (
+      let rec drain_decoder () =
+        match Uutf.decode dec with
+        | `Uchar u ->
+            if Uchar.to_int u = 0x000A then push_line_from_buf ();
+            if Uchar.to_int u <> 0x000A then Uutf.Buffer.add_utf_8 buf u;
+            drain_decoder ()
+        | `Malformed _ ->
+            Uutf.Buffer.add_utf_8 buf Uutf.u_rep;
+            drain_decoder ()
+        | `Await ->
+            if !offset >= handle.size then
+              Uutf.Manual.src dec (Bytes.create 0) 0 0
+            else
+              let chunk =
+                read_mmap_chunk handle ~offset:!offset ~length:chunk_size
+              in
+              offset := !offset + String.length chunk;
+              let bytes = Bytes.of_string chunk in
+              Uutf.Manual.src dec bytes 0 (Bytes.length bytes);
+              drain_decoder ()
+        | `End ->
+            if Buffer.length buf > 0 then push_line_from_buf ();
+            ()
+      in
+
+      let rec next_line () =
+        if not (Queue.is_empty lines_queue) then Some (Queue.take lines_queue)
+        else if !closed then None
+        else (
+          drain_decoder ();
+          if not (Queue.is_empty lines_queue) then Some (Queue.take lines_queue)
+          else if !offset >= handle.size then (
             close_mmap handle;
             closed := true;
             None)
-        else
-          (* Read next chunk *)
-          let chunk =
-            read_mmap_chunk handle ~offset:!offset ~length:chunk_size
-          in
-          offset := !offset + String.length chunk;
-
-          (* Append to remaining buffer *)
-          if !buffer_pos < String.length !buffer then
-            buffer :=
-              String.sub !buffer !buffer_pos
-                (String.length !buffer - !buffer_pos)
-              ^ chunk
-          else buffer := chunk;
-          buffer_pos := 0;
-          next_line ()
-  in
-
-  let reset () =
-    offset := 0;
-    buffer := "";
-    buffer_pos := 0;
-    closed := false
-  in
+          else next_line ())
+      in
 
-  {
-    next = next_line;
-    cardinality = (fun () -> Unknown);
-    reset = Some reset;
-    spec = (fun () -> Scalar "string");
-  }
+      {
+        next = next_line;
+        cardinality = (fun () -> Unknown);
+        reset = None;
+        spec = (fun () -> Scalar "string");
+      }
 
 let from_text_files ?(encoding = `UTF8) ?(chunk_size = 65536) paths =
   let current_file = ref 0 in
@@ -262,7 +263,8 @@ let from_text_files ?(encoding = `UTF8) ?(chunk_size = 65536) paths =
         if !current_file >= List.length paths then None
        else
          let path = List.nth paths !current_file in
-          current_dataset := Some (from_text_file ~encoding ~chunk_size path);
+          let ds = from_text_file ~encoding ~chunk_size path in
+          current_dataset := Some ds;
          incr current_file;
          next ()
    | Some ds -> (
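
For readers unfamiliar with uutf's `Manual source, the sketch below (not part of this commit) shows the same decode loop in isolation: bytes are fed to the decoder in fixed-size chunks, decoded code points are re-encoded as UTF-8 into a buffer, and the buffer is flushed into a line on each U+000A. The name decode_lines, the default chunk_size, and feeding from an in-memory string instead of the mmap handle are illustrative assumptions; note it takes uutf's own encoding names (`UTF_8, `ISO_8859_1), not kaun's `UTF8/`LATIN1.

(* Illustrative only -- not part of this commit. Same `Manual-source uutf
   pattern as the new from_text_file, but fed from an in-memory string. *)
let decode_lines ?(encoding = `UTF_8) ?(chunk_size = 8) (data : string) =
  let dec = Uutf.decoder ~encoding `Manual in
  let buf = Buffer.create 64 in
  let lines = ref [] in
  let push_line () =
    lines := Buffer.contents buf :: !lines;
    Buffer.clear buf
  in
  let offset = ref 0 in
  let rec drain () =
    match Uutf.decode dec with
    | `Uchar u when Uchar.to_int u = 0x000A ->
        (* newline: flush the current line *)
        push_line ();
        drain ()
    | `Uchar u ->
        Uutf.Buffer.add_utf_8 buf u;
        drain ()
    | `Malformed _ ->
        (* undecodable bytes become U+FFFD, as in the commit *)
        Uutf.Buffer.add_utf_8 buf Uutf.u_rep;
        drain ()
    | `Await ->
        if !offset >= String.length data then
          (* no more input: signal end of stream *)
          Uutf.Manual.src dec Bytes.empty 0 0
        else begin
          let len = min chunk_size (String.length data - !offset) in
          let bytes = Bytes.of_string (String.sub data !offset len) in
          offset := !offset + len;
          Uutf.Manual.src dec bytes 0 (Bytes.length bytes)
        end;
        drain ()
    | `End ->
        if Buffer.length buf > 0 then push_line ();
        List.rev !lines
  in
  drain ()

(* e.g. decode_lines ~encoding:`ISO_8859_1 "caf\xE9\nna\xEFve\n"
   returns [ "café"; "naïve" ] *)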

kaun/lib/kaun/dataset/dataset.mli

Lines changed: 8 additions & 2 deletions
@@ -70,15 +70,21 @@ val from_file : (string -> 'a) -> string -> 'a t
 (** {2 Text Data Sources} *)
 
 val from_text_file :
-  ?encoding:[ `UTF8 | `ASCII ] -> ?chunk_size:int -> string -> string t
+  ?encoding:[ `UTF8 | `ASCII | `LATIN1 ] ->
+  ?chunk_size:int ->
+  string ->
+  string t
 (** [from_text_file ?encoding ?chunk_size path] creates a memory-mapped text
     dataset yielding lines as strings.
     - [encoding]: Text encoding (default: UTF8)
     - [chunk_size]: Size of chunks to read at once (default: 64KB) The file is
       memory-mapped and read lazily in chunks. *)
 
 val from_text_files :
-  ?encoding:[ `UTF8 | `ASCII ] -> ?chunk_size:int -> string list -> string t
+  ?encoding:[ `UTF8 | `ASCII | `LATIN1 ] ->
+  ?chunk_size:int ->
+  string list ->
+  string t
 (** [from_text_files paths] creates a dataset from multiple text files. Files
     are processed sequentially without loading all into memory. *)
 
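
A hedged usage sketch of the widened signatures above (not from this commit): the Kaun.Dataset module path, the file names, and the fact that the datasets are simply ignored are assumptions for illustration.

(* Hypothetical call sites; module path and file names are assumptions. *)
let () =
  (* Latin-1 file, explicit encoding *)
  let latin1_ds =
    Kaun.Dataset.from_text_file ~encoding:`LATIN1 "corpus_latin1.txt"
  in
  (* multiple UTF-8 files (the default encoding) with a larger read chunk *)
  let multi_ds =
    Kaun.Dataset.from_text_files ~chunk_size:(128 * 1024)
      [ "part1.txt"; "part2.txt" ]
  in
  ignore latin1_ds;
  ignore multi_ds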

kaun/lib/kaun/dune

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 (library
  (name kaun)
  (public_name kaun)
- (libraries rune unix str nx nx.core nx.io yojson domainslib))
+ (libraries rune unix str nx nx.core nx.io yojson domainslib uutf))

kaun/test/test_dataset.ml

Lines changed: 19 additions & 0 deletions
@@ -69,6 +69,23 @@ let test_from_text_file () =
        [ "line1"; "line2"; "line3" ]
        collected)
 
+(* Test for utf8 *)
+let test_from_text_file_utf8 () =
+  let content = "hello \xF0\x9F\x98\x8A\nsecond\n" in
+  with_temp_file content (fun path ->
+      let ds = from_text_file ~encoding:`UTF8 path in
+      let lines = collect_dataset ds in
+      Alcotest.(check (list string))
+        "utf8 emoji preserved" [ "hello 😊"; "second" ] lines)
+
+(* Test for Latin1 *)
+let test_from_text_file_latin1 () =
+  let content = "caf\xE9\nna\xEFve\n" in
+  with_temp_file content (fun path ->
+      let ds = from_text_file ~encoding:`LATIN1 path in
+      let lines = collect_dataset ds in
+      Alcotest.(check (list string)) "latin1 decoded" [ "café"; "naïve" ] lines)
+
 let test_from_text_file_large_lines () =
   let line = String.make 1000 'x' in
   let content = line ^ "\n" ^ line ^ "\n" in
@@ -618,6 +635,8 @@ let () =
     ( "text_files",
       [
        test_case "from_text_file" `Quick test_from_text_file;
+        test_case "from_text_file_utf8" `Quick test_from_text_file_utf8;
+        test_case "from_text_file_latin1" `Quick test_from_text_file_latin1;
        test_case "from_text_file_large_lines" `Quick
          test_from_text_file_large_lines;
        test_case "from_text_files" `Quick test_from_text_files;
