
Commit c10f3fe

Add async iterator over bytes in result of get (#11)
* Add async iterator over bytes in result of `get`
* Add synchronous iteration
* Implement `aiter` and `iter` on `GetResult`
* Chunk size
* Add python test
1 parent 922b58f commit c10f3fe
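
As a quick orientation before the diff, here is a minimal sketch of the usage this commit enables. It is not part of the commit itself; the names (`MemoryStore`, `put_file`, `get_async`, `stream`) are taken from the type stubs and tests below.

```py
import asyncio

import object_store_rs as obs
from object_store_rs.store import MemoryStore


async def main():
    store = MemoryStore()
    obs.put_file(store, "data.bin", b"hello object store " * 500_000)

    # Explicit stream() call with a custom minimum chunk size.
    resp = await obs.get_async(store, "data.bin")
    async for chunk in resp.stream(min_chunk_size=1 * 1024 * 1024):
        print(len(chunk))

    # Or rely on __aiter__, which streams with the default 10MB chunk size.
    resp = await obs.get_async(store, "data.bin")
    async for chunk in resp:
        print(len(chunk))


asyncio.run(main())
```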

File tree

6 files changed (+262 lines, -13 lines)


object-store-rs/python/object_store_rs/_get.pyi

Lines changed: 67 additions & 1 deletion
@@ -82,7 +82,34 @@ class GetOptions(TypedDict):
     """
 
 class GetResult:
-    """Result for a get request"""
+    """Result for a get request.
+
+    You can materialize the entire buffer by using either `bytes` or `bytes_async`, or
+    you can stream the result using `stream`. `__iter__` and `__aiter__` are implemented
+    as aliases to `stream`, so you can alternatively call `iter()` or `aiter()` on
+    `GetResult` to start an iterator.
+
+    Using as an async iterator:
+    ```py
+    resp = await obs.get_async(store, path)
+    # 5MB chunk size in stream
+    stream = resp.stream(min_chunk_size=5 * 1024 * 1024)
+    async for buf in stream:
+        print(len(buf))
+    ```
+
+    Using as a sync iterator:
+    ```py
+    resp = obs.get(store, path)
+    # 20MB chunk size in stream
+    stream = resp.stream(min_chunk_size=20 * 1024 * 1024)
+    for buf in stream:
+        print(len(buf))
+    ```
+
+    Note that after calling `bytes`, `bytes_async`, or `stream`, you will no longer be
+    able to call other methods on this object, such as the `meta` attribute.
+    """
 
     def bytes(self) -> bytes:
         """
@@ -98,6 +125,45 @@ class GetResult:
     def meta(self) -> ObjectMeta:
         """The ObjectMeta for this object"""
 
+    def stream(self, min_chunk_size: int = 10 * 1024 * 1024) -> BytesStream:
+        """Return a chunked stream over the result's bytes.
+
+        Args:
+            min_chunk_size: The minimum size in bytes for each chunk in the returned
+                `BytesStream`. All chunks except for the last chunk will be at least
+                this size. Defaults to 10*1024*1024 (10MB).
+
+        Returns:
+            A chunked stream
+        """
+
+    def __aiter__(self) -> BytesStream:
+        """
+        Return a chunked stream over the result's bytes with the default (10MB) chunk
+        size.
+        """
+
+    def __iter__(self) -> BytesStream:
+        """
+        Return a chunked stream over the result's bytes with the default (10MB) chunk
+        size.
+        """
+
+class BytesStream:
+    """An async stream of bytes."""
+
+    def __aiter__(self) -> BytesStream:
+        """Return `Self` as an async iterator."""
+
+    def __iter__(self) -> BytesStream:
+        """Return `Self` as a sync iterator."""
+
+    async def __anext__(self) -> bytes:
+        """Return the next chunk of bytes in the stream."""
+
+    def __next__(self) -> bytes:
+        """Return the next chunk of bytes in the stream."""
+
 def get(
     store: ObjectStore, location: str, *, options: GetOptions | None = None
 ) -> GetResult:
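
Since `__iter__` and `__aiter__` are aliases to `stream`, the stream can also be driven manually with Python's `iter`/`next` (or `aiter`/`anext`) builtins. A hedged sketch, assuming `store` and `path` are set up as in the docstring examples; per the Rust implementation below, exhaustion is signalled with `StopIteration` (sync) or `StopAsyncIteration` (async):

```py
resp = obs.get(store, path)
it = iter(resp)  # same as resp.stream() with the default 10MB minimum chunk size

try:
    while True:
        chunk = next(it)
        print(len(chunk))
except StopIteration:
    pass  # stream exhausted
```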

object-store-rs/src/get.rs

Lines changed: 134 additions & 10 deletions
@@ -1,14 +1,23 @@
+use std::sync::Arc;
+
+use bytes::Bytes;
 use chrono::{DateTime, Utc};
+use futures::stream::BoxStream;
+use futures::StreamExt;
 use object_store::{GetOptions, GetResult, ObjectStore};
-use pyo3::exceptions::PyValueError;
+use pyo3::exceptions::{PyStopAsyncIteration, PyStopIteration, PyValueError};
 use pyo3::prelude::*;
 use pyo3::types::PyBytes;
 use pyo3_object_store::error::{PyObjectStoreError, PyObjectStoreResult};
 use pyo3_object_store::PyObjectStore;
+use tokio::sync::Mutex;
 
 use crate::list::PyObjectMeta;
 use crate::runtime::get_runtime;
 
+/// 10MB default chunk size
+const DEFAULT_BYTES_CHUNK_SIZE: usize = 10 * 1024 * 1024;
+
 #[derive(FromPyObject)]
 pub(crate) struct PyGetOptions {
     if_match: Option<String>,
@@ -54,7 +63,7 @@ impl PyGetResult {
         let runtime = get_runtime(py)?;
         py.allow_threads(|| {
             let bytes = runtime.block_on(get_result.bytes())?;
-            Ok::<_, PyObjectStoreError>(PyBytesWrapper(bytes))
+            Ok::<_, PyObjectStoreError>(PyBytesWrapper::new(bytes))
         })
     }
 
@@ -68,7 +77,7 @@ impl PyGetResult {
                 .bytes()
                 .await
                 .map_err(PyObjectStoreError::ObjectStoreError)?;
-            Ok(PyBytesWrapper(bytes))
+            Ok(PyBytesWrapper::new(bytes))
         })
     }
 
@@ -80,14 +89,129 @@ impl PyGetResult {
             .ok_or(PyValueError::new_err("Result has already been disposed."))?;
         Ok(PyObjectMeta::new(inner.meta.clone()))
     }
+
+    #[pyo3(signature = (min_chunk_size = DEFAULT_BYTES_CHUNK_SIZE))]
+    fn stream(&mut self, min_chunk_size: usize) -> PyResult<PyBytesStream> {
+        let get_result = self
+            .0
+            .take()
+            .ok_or(PyValueError::new_err("Result has already been disposed."))?;
+        Ok(PyBytesStream::new(get_result.into_stream(), min_chunk_size))
+    }
+
+    fn __aiter__(&mut self) -> PyResult<PyBytesStream> {
+        self.stream(DEFAULT_BYTES_CHUNK_SIZE)
+    }
+
+    fn __iter__(&mut self) -> PyResult<PyBytesStream> {
+        self.stream(DEFAULT_BYTES_CHUNK_SIZE)
+    }
+}
+
+#[pyclass(name = "BytesStream")]
+pub struct PyBytesStream {
+    stream: Arc<Mutex<BoxStream<'static, object_store::Result<Bytes>>>>,
+    min_chunk_size: usize,
+}
+
+impl PyBytesStream {
+    fn new(stream: BoxStream<'static, object_store::Result<Bytes>>, min_chunk_size: usize) -> Self {
+        Self {
+            stream: Arc::new(Mutex::new(stream)),
+            min_chunk_size,
+        }
+    }
+}
+
+async fn next_stream(
+    stream: Arc<Mutex<BoxStream<'static, object_store::Result<Bytes>>>>,
+    min_chunk_size: usize,
+    sync: bool,
+) -> PyResult<PyBytesWrapper> {
+    let mut stream = stream.lock().await;
+    let mut buffers: Vec<Bytes> = vec![];
+    loop {
+        match stream.next().await {
+            Some(Ok(bytes)) => {
+                buffers.push(bytes);
+                let total_buffer_len = buffers.iter().fold(0, |acc, buf| acc + buf.len());
+                if total_buffer_len >= min_chunk_size {
+                    return Ok(PyBytesWrapper::new_multiple(buffers));
+                }
+            }
+            Some(Err(e)) => return Err(PyObjectStoreError::from(e).into()),
+            None => {
+                if buffers.is_empty() {
+                    // Depending on whether the iteration is sync or not, we raise either a
+                    // StopIteration or a StopAsyncIteration
+                    if sync {
+                        return Err(PyStopIteration::new_err("stream exhausted"));
+                    } else {
+                        return Err(PyStopAsyncIteration::new_err("stream exhausted"));
+                    }
+                } else {
+                    return Ok(PyBytesWrapper::new_multiple(buffers));
+                }
+            }
+        };
+    }
+}
+
+#[pymethods]
+impl PyBytesStream {
+    fn __aiter__(slf: Py<Self>) -> Py<Self> {
+        slf
+    }
+
+    fn __iter__(slf: Py<Self>) -> Py<Self> {
+        slf
+    }
+
+    fn __anext__<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<PyAny>> {
+        let stream = self.stream.clone();
+        pyo3_async_runtimes::tokio::future_into_py(
+            py,
+            next_stream(stream, self.min_chunk_size, false),
+        )
+    }
+
+    fn __next__<'py>(&'py self, py: Python<'py>) -> PyResult<PyBytesWrapper> {
+        let runtime = get_runtime(py)?;
+        let stream = self.stream.clone();
+        runtime.block_on(next_stream(stream, self.min_chunk_size, true))
+    }
 }
 
-pub(crate) struct PyBytesWrapper(bytes::Bytes);
+pub(crate) struct PyBytesWrapper(Vec<Bytes>);
+
+impl PyBytesWrapper {
+    pub fn new(buf: Bytes) -> Self {
+        Self(vec![buf])
+    }
 
-// TODO: return buffer protocol object
+    pub fn new_multiple(buffers: Vec<Bytes>) -> Self {
+        Self(buffers)
+    }
+}
+
+// TODO: return buffer protocol object? This isn't possible on an array of Bytes, so if you want to
+// support the buffer protocol in the future (e.g. for get_range) you may need to have a separate
+// wrapper of Bytes
 impl IntoPy<PyObject> for PyBytesWrapper {
     fn into_py(self, py: Python<'_>) -> PyObject {
-        PyBytes::new_bound(py, &self.0).into_py(py)
+        let total_len = self.0.iter().fold(0, |acc, buf| acc + buf.len());
+        // Copy all internal Bytes objects into a single PyBytes
+        // Since our inner callback is infallible, this will only panic on out of memory
+        PyBytes::new_bound_with(py, total_len, |target| {
+            let mut offset = 0;
+            for buf in self.0.iter() {
+                target[offset..offset + buf.len()].copy_from_slice(buf);
+                offset += buf.len();
+            }
+            Ok(())
+        })
+        .unwrap()
+        .into_py(py)
     }
 }
 
@@ -144,7 +268,7 @@ pub(crate) fn get_range(
     let range = offset..offset + length;
     py.allow_threads(|| {
         let out = runtime.block_on(store.as_ref().get_range(&location.into(), range))?;
-        Ok::<_, PyObjectStoreError>(PyBytesWrapper(out))
+        Ok::<_, PyObjectStoreError>(PyBytesWrapper::new(out))
     })
 }
 
@@ -163,7 +287,7 @@ pub(crate) fn get_range_async(
             .get_range(&location.into(), range)
             .await
             .map_err(PyObjectStoreError::ObjectStoreError)?;
-        Ok(PyBytesWrapper(out))
+        Ok(PyBytesWrapper::new(out))
    })
 }
 
@@ -183,7 +307,7 @@ pub(crate) fn get_ranges(
         .collect::<Vec<_>>();
     py.allow_threads(|| {
         let out = runtime.block_on(store.as_ref().get_ranges(&location.into(), &ranges))?;
-        Ok::<_, PyObjectStoreError>(out.into_iter().map(PyBytesWrapper).collect())
+        Ok::<_, PyObjectStoreError>(out.into_iter().map(PyBytesWrapper::new).collect())
     })
 }
 
@@ -206,6 +330,6 @@ pub(crate) fn get_ranges_async(
             .get_ranges(&location.into(), &ranges)
             .await
             .map_err(PyObjectStoreError::ObjectStoreError)?;
-        Ok(out.into_iter().map(PyBytesWrapper).collect::<Vec<_>>())
+        Ok(out.into_iter().map(PyBytesWrapper::new).collect::<Vec<_>>())
     })
 }
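
The chunking rule in `next_stream` above is worth spelling out: buffers are pulled off the underlying stream and accumulated until their combined length reaches `min_chunk_size`, then handed back as a single chunk; only the final chunk may be smaller. A rough Python paraphrase of that loop (illustrative only, not the actual implementation, which is the Rust shown above):

```py
def next_chunk(buffer_iter, min_chunk_size: int) -> bytes:
    """Accumulate raw buffers until the running total reaches min_chunk_size."""
    buffers = []
    total = 0
    for buf in buffer_iter:
        buffers.append(buf)
        total += len(buf)
        if total >= min_chunk_size:
            break
    if not buffers:
        # Mirrors PyStopIteration / PyStopAsyncIteration in the Rust code.
        raise StopIteration("stream exhausted")
    # Mirrors PyBytesWrapper: the accumulated buffers are copied into one bytes object.
    return b"".join(buffers)
```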

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ dev-dependencies = [
     "mkdocstrings[python]>=0.26.1",
     "pandas>=2.2.3",
     "pip>=24.2",
+    "pytest-asyncio>=0.24.0",
     "pytest>=8.3.3",
 ]

tests/test_get.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+import object_store_rs as obs
+import pytest
+from object_store_rs.store import MemoryStore
+
+
+def test_stream_sync():
+    store = MemoryStore()
+
+    data = b"the quick brown fox jumps over the lazy dog," * 5000
+    path = "big-data.txt"
+
+    obs.put_file(store, path, data)
+    resp = obs.get(store, path)
+    stream = resp.stream(min_chunk_size=0)
+
+    # Note: from manual testing, it looks like with the local store we only get one
+    # chunk, so we aren't able to test the chunk sizing here.
+    pos = 0
+    for chunk in stream:
+        size = len(chunk)
+        assert chunk == data[pos : pos + size]
+        pos += size
+
+    assert pos == len(data)
+
+
+@pytest.mark.asyncio
+async def test_stream_async():
+    store = MemoryStore()
+
+    data = b"the quick brown fox jumps over the lazy dog," * 5000
+    path = "big-data.txt"
+
+    await obs.put_file_async(store, path, data)
+    resp = await obs.get_async(store, path)
+    stream = resp.stream(min_chunk_size=0)
+
+    # Note: from manual testing, it looks like with the local store we only get one
+    # chunk, so we aren't able to test the chunk sizing here.
+    pos = 0
+    async for chunk in stream:
+        size = len(chunk)
+        assert chunk == data[pos : pos + size]
+        pos += size
+
+    assert pos == len(data)

tests/test_hello.py

Lines changed: 0 additions & 2 deletions
This file was deleted.

uv.lock

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default.

Comments (0)