1
+ use std:: sync:: Arc ;
2
+
3
+ use bytes:: Bytes ;
1
4
use chrono:: { DateTime , Utc } ;
5
+ use futures:: stream:: BoxStream ;
6
+ use futures:: StreamExt ;
2
7
use object_store:: { GetOptions , GetResult , ObjectStore } ;
3
- use pyo3:: exceptions:: PyValueError ;
8
+ use pyo3:: exceptions:: { PyStopAsyncIteration , PyStopIteration , PyValueError } ;
4
9
use pyo3:: prelude:: * ;
5
10
use pyo3:: types:: PyBytes ;
6
11
use pyo3_object_store:: error:: { PyObjectStoreError , PyObjectStoreResult } ;
7
12
use pyo3_object_store:: PyObjectStore ;
13
+ use tokio:: sync:: Mutex ;
8
14
9
15
use crate :: list:: PyObjectMeta ;
10
16
use crate :: runtime:: get_runtime;
11
17
18
+ /// 10MB default chunk size
19
+ const DEFAULT_BYTES_CHUNK_SIZE : usize = 10 * 1024 * 1024 ;
20
+
12
21
#[ derive( FromPyObject ) ]
13
22
pub ( crate ) struct PyGetOptions {
14
23
if_match : Option < String > ,
@@ -54,7 +63,7 @@ impl PyGetResult {
54
63
let runtime = get_runtime ( py) ?;
55
64
py. allow_threads ( || {
56
65
let bytes = runtime. block_on ( get_result. bytes ( ) ) ?;
57
- Ok :: < _ , PyObjectStoreError > ( PyBytesWrapper ( bytes) )
66
+ Ok :: < _ , PyObjectStoreError > ( PyBytesWrapper :: new ( bytes) )
58
67
} )
59
68
}
60
69
@@ -68,7 +77,7 @@ impl PyGetResult {
68
77
. bytes ( )
69
78
. await
70
79
. map_err ( PyObjectStoreError :: ObjectStoreError ) ?;
71
- Ok ( PyBytesWrapper ( bytes) )
80
+ Ok ( PyBytesWrapper :: new ( bytes) )
72
81
} )
73
82
}
74
83
@@ -80,14 +89,129 @@ impl PyGetResult {
80
89
. ok_or ( PyValueError :: new_err ( "Result has already been disposed." ) ) ?;
81
90
Ok ( PyObjectMeta :: new ( inner. meta . clone ( ) ) )
82
91
}
92
+
93
+ #[ pyo3( signature = ( min_chunk_size = DEFAULT_BYTES_CHUNK_SIZE ) ) ]
94
+ fn stream ( & mut self , min_chunk_size : usize ) -> PyResult < PyBytesStream > {
95
+ let get_result = self
96
+ . 0
97
+ . take ( )
98
+ . ok_or ( PyValueError :: new_err ( "Result has already been disposed." ) ) ?;
99
+ Ok ( PyBytesStream :: new ( get_result. into_stream ( ) , min_chunk_size) )
100
+ }
101
+
102
+ fn __aiter__ ( & mut self ) -> PyResult < PyBytesStream > {
103
+ self . stream ( DEFAULT_BYTES_CHUNK_SIZE )
104
+ }
105
+
106
+ fn __iter__ ( & mut self ) -> PyResult < PyBytesStream > {
107
+ self . stream ( DEFAULT_BYTES_CHUNK_SIZE )
108
+ }
109
+ }
110
+
111
+ #[ pyclass( name = "BytesStream" ) ]
112
+ pub struct PyBytesStream {
113
+ stream : Arc < Mutex < BoxStream < ' static , object_store:: Result < Bytes > > > > ,
114
+ min_chunk_size : usize ,
115
+ }
116
+
117
+ impl PyBytesStream {
118
+ fn new ( stream : BoxStream < ' static , object_store:: Result < Bytes > > , min_chunk_size : usize ) -> Self {
119
+ Self {
120
+ stream : Arc :: new ( Mutex :: new ( stream) ) ,
121
+ min_chunk_size,
122
+ }
123
+ }
124
+ }
125
+
126
+ async fn next_stream (
127
+ stream : Arc < Mutex < BoxStream < ' static , object_store:: Result < Bytes > > > > ,
128
+ min_chunk_size : usize ,
129
+ sync : bool ,
130
+ ) -> PyResult < PyBytesWrapper > {
131
+ let mut stream = stream. lock ( ) . await ;
132
+ let mut buffers: Vec < Bytes > = vec ! [ ] ;
133
+ loop {
134
+ match stream. next ( ) . await {
135
+ Some ( Ok ( bytes) ) => {
136
+ buffers. push ( bytes) ;
137
+ let total_buffer_len = buffers. iter ( ) . fold ( 0 , |acc, buf| acc + buf. len ( ) ) ;
138
+ if total_buffer_len >= min_chunk_size {
139
+ return Ok ( PyBytesWrapper :: new_multiple ( buffers) ) ;
140
+ }
141
+ }
142
+ Some ( Err ( e) ) => return Err ( PyObjectStoreError :: from ( e) . into ( ) ) ,
143
+ None => {
144
+ if buffers. is_empty ( ) {
145
+ // Depending on whether the iteration is sync or not, we raise either a
146
+ // StopIteration or a StopAsyncIteration
147
+ if sync {
148
+ return Err ( PyStopIteration :: new_err ( "stream exhausted" ) ) ;
149
+ } else {
150
+ return Err ( PyStopAsyncIteration :: new_err ( "stream exhausted" ) ) ;
151
+ }
152
+ } else {
153
+ return Ok ( PyBytesWrapper :: new_multiple ( buffers) ) ;
154
+ }
155
+ }
156
+ } ;
157
+ }
158
+ }
159
+
160
+ #[ pymethods]
161
+ impl PyBytesStream {
162
+ fn __aiter__ ( slf : Py < Self > ) -> Py < Self > {
163
+ slf
164
+ }
165
+
166
+ fn __iter__ ( slf : Py < Self > ) -> Py < Self > {
167
+ slf
168
+ }
169
+
170
+ fn __anext__ < ' py > ( & ' py self , py : Python < ' py > ) -> PyResult < Bound < PyAny > > {
171
+ let stream = self . stream . clone ( ) ;
172
+ pyo3_async_runtimes:: tokio:: future_into_py (
173
+ py,
174
+ next_stream ( stream, self . min_chunk_size , false ) ,
175
+ )
176
+ }
177
+
178
+ fn __next__ < ' py > ( & ' py self , py : Python < ' py > ) -> PyResult < PyBytesWrapper > {
179
+ let runtime = get_runtime ( py) ?;
180
+ let stream = self . stream . clone ( ) ;
181
+ runtime. block_on ( next_stream ( stream, self . min_chunk_size , true ) )
182
+ }
83
183
}
84
184
85
- pub ( crate ) struct PyBytesWrapper ( bytes:: Bytes ) ;
185
+ pub ( crate ) struct PyBytesWrapper ( Vec < Bytes > ) ;
186
+
187
+ impl PyBytesWrapper {
188
+ pub fn new ( buf : Bytes ) -> Self {
189
+ Self ( vec ! [ buf] )
190
+ }
86
191
87
- // TODO: return buffer protocol object
192
+ pub fn new_multiple ( buffers : Vec < Bytes > ) -> Self {
193
+ Self ( buffers)
194
+ }
195
+ }
196
+
197
+ // TODO: return buffer protocol object? This isn't possible on an array of Bytes, so if you want to
198
+ // support the buffer protocol in the future (e.g. for get_range) you may need to have a separate
199
+ // wrapper of Bytes
88
200
impl IntoPy < PyObject > for PyBytesWrapper {
89
201
fn into_py ( self , py : Python < ' _ > ) -> PyObject {
90
- PyBytes :: new_bound ( py, & self . 0 ) . into_py ( py)
202
+ let total_len = self . 0 . iter ( ) . fold ( 0 , |acc, buf| acc + buf. len ( ) ) ;
203
+ // Copy all internal Bytes objects into a single PyBytes
204
+ // Since our inner callback is infallible, this will only panic on out of memory
205
+ PyBytes :: new_bound_with ( py, total_len, |target| {
206
+ let mut offset = 0 ;
207
+ for buf in self . 0 . iter ( ) {
208
+ target[ offset..offset + buf. len ( ) ] . copy_from_slice ( buf) ;
209
+ offset += buf. len ( ) ;
210
+ }
211
+ Ok ( ( ) )
212
+ } )
213
+ . unwrap ( )
214
+ . into_py ( py)
91
215
}
92
216
}
93
217
@@ -144,7 +268,7 @@ pub(crate) fn get_range(
144
268
let range = offset..offset + length;
145
269
py. allow_threads ( || {
146
270
let out = runtime. block_on ( store. as_ref ( ) . get_range ( & location. into ( ) , range) ) ?;
147
- Ok :: < _ , PyObjectStoreError > ( PyBytesWrapper ( out) )
271
+ Ok :: < _ , PyObjectStoreError > ( PyBytesWrapper :: new ( out) )
148
272
} )
149
273
}
150
274
@@ -163,7 +287,7 @@ pub(crate) fn get_range_async(
163
287
. get_range ( & location. into ( ) , range)
164
288
. await
165
289
. map_err ( PyObjectStoreError :: ObjectStoreError ) ?;
166
- Ok ( PyBytesWrapper ( out) )
290
+ Ok ( PyBytesWrapper :: new ( out) )
167
291
} )
168
292
}
169
293
@@ -183,7 +307,7 @@ pub(crate) fn get_ranges(
183
307
. collect :: < Vec < _ > > ( ) ;
184
308
py. allow_threads ( || {
185
309
let out = runtime. block_on ( store. as_ref ( ) . get_ranges ( & location. into ( ) , & ranges) ) ?;
186
- Ok :: < _ , PyObjectStoreError > ( out. into_iter ( ) . map ( PyBytesWrapper ) . collect ( ) )
310
+ Ok :: < _ , PyObjectStoreError > ( out. into_iter ( ) . map ( PyBytesWrapper :: new ) . collect ( ) )
187
311
} )
188
312
}
189
313
@@ -206,6 +330,6 @@ pub(crate) fn get_ranges_async(
206
330
. get_ranges ( & location. into ( ) , & ranges)
207
331
. await
208
332
. map_err ( PyObjectStoreError :: ObjectStoreError ) ?;
209
- Ok ( out. into_iter ( ) . map ( PyBytesWrapper ) . collect :: < Vec < _ > > ( ) )
333
+ Ok ( out. into_iter ( ) . map ( PyBytesWrapper :: new ) . collect :: < Vec < _ > > ( ) )
210
334
} )
211
335
}
0 commit comments