@@ -185,72 +185,73 @@ let from_tensors (x, y) =
185185 }
186186
187187(* Text Data Sources *)
(** [from_text_file ?encoding ?chunk_size path] builds a dataset whose [next]
    function yields the lines of the file at [path], one string per call.

    The file is opened with [create_mmap], read in chunks of [chunk_size]
    bytes with [read_mmap_chunk], and decoded incrementally with a manual
    [Uutf] decoder. Each decoded character is re-encoded as UTF-8 in the
    returned string.

    - [encoding] selects the input encoding: [`UTF8] (default), [`ASCII] or
      [`LATIN1].
    - [chunk_size] is the number of bytes requested per read (default 65536).

    Lines are split on U+000A only; the terminator is not part of the
    returned string and a preceding carriage return is kept. Malformed input
    sequences are replaced by [Uutf.u_rep]. A final line that is not
    terminated by a newline is still returned. The mapping is closed with
    [close_mmap] once the decoder reports end of input; after that [next]
    always returns [None].

    The returned dataset has [reset = None].
    NOTE(review): the pre-patch implementation exposed a [reset]; confirm
    that no caller depends on it.

    @raise Invalid_argument if [chunk_size] is not positive. *)
let from_text_file ?(encoding = `UTF8) ?(chunk_size = 65536) path =
  (* A zero-length source means "end of input" to Uutf, so a non-positive
     chunk size could never make progress through the file. *)
  if chunk_size <= 0 then
    invalid_arg "from_text_file: chunk_size must be positive";
  let decoder_encoding =
    match encoding with
    | `UTF8 -> `UTF_8
    | `ASCII -> `US_ASCII
    | `LATIN1 -> `ISO_8859_1
  in
  let dec = Uutf.decoder ~encoding:decoder_encoding `Manual in
  let handle = create_mmap path in
  let offset = ref 0 in
  let closed = ref false in
  (* Accumulates the UTF-8 bytes of the line currently being decoded. *)
  let line_buf = Buffer.create 512 in

  (* Return the accumulated line and start a fresh one. *)
  let take_line () =
    let line = Buffer.contents line_buf in
    Buffer.clear line_buf;
    line
  in

  (* Hand the decoder its next input: the next chunk of the mapping, or a
     zero-length source to signal end of input once the file is exhausted. *)
  let feed_decoder () =
    if !offset >= handle.size then Uutf.Manual.src dec Bytes.empty 0 0
    else begin
      let chunk = read_mmap_chunk handle ~offset:!offset ~length:chunk_size in
      offset := !offset + String.length chunk;
      Uutf.Manual.src dec (Bytes.of_string chunk) 0 (String.length chunk)
    end
  in

  (* Decode only as far as needed to complete one line, so at most one chunk
     beyond the current line is read per call. *)
  let rec next_line () =
    if !closed then None
    else
      match Uutf.decode dec with
      | `Uchar u ->
          if Uchar.to_int u = 0x000A then Some (take_line ())
          else begin
            Uutf.Buffer.add_utf_8 line_buf u;
            next_line ()
          end
      | `Malformed _ ->
          Uutf.Buffer.add_utf_8 line_buf Uutf.u_rep;
          next_line ()
      | `Await ->
          feed_decoder ();
          next_line ()
      | `End ->
          (* Close only after the decoder itself reports the end, so that a
             trailing line without a newline is flushed instead of dropped. *)
          close_mmap handle;
          closed := true;
          if Buffer.length line_buf > 0 then Some (take_line ()) else None
  in

  {
    next = next_line;
    cardinality = (fun () -> Unknown);
    reset = None;
    spec = (fun () -> Scalar "string");
  }
254255
255256let from_text_files ?(encoding = `UTF8 ) ?(chunk_size = 65536 ) paths =
256257 let current_file = ref 0 in
@@ -262,7 +263,8 @@ let from_text_files ?(encoding = `UTF8) ?(chunk_size = 65536) paths =
262263 if ! current_file > = List. length paths then None
263264 else
264265 let path = List. nth paths ! current_file in
265- current_dataset := Some (from_text_file ~encoding ~chunk_size path);
266+ let ds = from_text_file ~encoding ~chunk_size path in
267+ current_dataset := Some ds;
266268 incr current_file;
267269 next ()
268270 | Some ds -> (
0 commit comments