Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: support reading from in-memory buffers #25

Merged
merged 20 commits into from
Apr 2, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
0a10bc4
ENH: support reading from in-memory buffers
jorisvandenbossche Nov 12, 2021
60b8d28
passthrough bytes in read_dataframe
jorisvandenbossche Nov 12, 2021
d81a70c
Merge remote-tracking branch 'upstream/main' into read-in-memory
jorisvandenbossche Jan 27, 2022
dadea0c
fixup merge
jorisvandenbossche Jan 27, 2022
083a32a
attribute buffer_to_virtual_file to fiona + clean up ogr.pxd
jorisvandenbossche Jan 27, 2022
9b3bdf3
clean-up + docstrings
jorisvandenbossche Jan 27, 2022
fa9808b
parametrize test
jorisvandenbossche Feb 9, 2022
6d1aeff
Merge remote-tracking branch 'upstream/main' into read-in-memory
jorisvandenbossche Feb 9, 2022
719399d
support file-like objects
jorisvandenbossche Feb 9, 2022
1502ab1
Merge remote-tracking branch 'upstream/main' into read-in-memory
jorisvandenbossche Feb 25, 2022
b8f7985
make first argument positional-only
jorisvandenbossche Feb 25, 2022
bc1b4f1
skip pygeos if not present
jorisvandenbossche Mar 11, 2022
792210b
Merge remote-tracking branch 'upstream/main' into read-in-memory
jorisvandenbossche Mar 11, 2022
3e055f3
fixup merge
jorisvandenbossche Mar 11, 2022
74f12a9
Update pyogrio/tests/test_raw_io.py
jorisvandenbossche Mar 11, 2022
41f70d4
Update pyogrio/_ogr.pyx
jorisvandenbossche Apr 1, 2022
bd68a50
Merge remote-tracking branch 'upstream/main' into read-in-memory
jorisvandenbossche Apr 1, 2022
dadcc40
small edits
jorisvandenbossche Apr 1, 2022
985a1ab
add whatsnew
jorisvandenbossche Apr 1, 2022
332680e
update comment
jorisvandenbossche Apr 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions pyogrio/_ogr.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Contains declarations against GDAL / OGR API
from libc.stdint cimport int64_t
from libc.stdio cimport FILE


cdef extern from "cpl_conv.h":
Expand Down Expand Up @@ -33,6 +34,38 @@ cdef extern from "cpl_string.h":
char** CSLAddString(char **list, const char *string)


# Partial declaration of the C `stat` struct from <sys/stat.h>; only the
# `st_mode` field is exposed here.
cdef extern from "sys/stat.h" nogil:
struct stat:
int st_mode


cdef extern from "cpl_vsi.h" nogil:

ctypedef int vsi_l_offset
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
ctypedef FILE VSILFILE
ctypedef stat VSIStatBufL

unsigned char *VSIGetMemFileBuffer(const char *path,
vsi_l_offset *data_len,
int take_ownership)
VSILFILE *VSIFileFromMemBuffer(const char *path, void *data,
vsi_l_offset data_len, int take_ownership)
VSILFILE* VSIFOpenL(const char *path, const char *mode)
int VSIFCloseL(VSILFILE *fp)
int VSIUnlink(const char *path)
int VSIMkdir(const char *path, long mode)
int VSIRmdir(const char *path)
int VSIFFlushL(VSILFILE *fp)
size_t VSIFReadL(void *buffer, size_t nSize, size_t nCount, VSILFILE *fp)
char** VSIReadDir(const char* pszPath)
int VSIFSeekL(VSILFILE *fp, vsi_l_offset nOffset, int nWhence)
vsi_l_offset VSIFTellL(VSILFILE *fp)
int VSIFTruncateL(VSILFILE *fp, vsi_l_offset nNewSize)
size_t VSIFWriteL(void *buffer, size_t nSize, size_t nCount, VSILFILE *fp)
int VSIStatL(const char *pszFilename, VSIStatBufL *psStatBuf)
int VSI_ISDIR(int mode)
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved


cdef extern from "ogr_core.h":
ctypedef enum OGRErr:
OGRERR_NONE # success
Expand Down
23 changes: 23 additions & 0 deletions pyogrio/_ogr.pyx
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from uuid import uuid4


cdef get_string(const char *c_str, str encoding="UTF-8"):
"""Get Python string from a char *

Expand Down Expand Up @@ -113,3 +116,23 @@ def ogr_list_drivers():

return drivers


def buffer_to_virtual_file(bytesbuf, ext=''):
"""Maps a bytes buffer to a virtual file.
`ext` is empty or begins with a period and contains at most one period.
"""
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved

vsi_filename = '/vsimem/{}'.format(uuid4().hex + ext)
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved

vsi_handle = VSIFileFromMemBuffer(vsi_filename.encode("utf8"), <unsigned char *>bytesbuf, len(bytesbuf), 0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Elsewhere we usually handle Python => C strings in multiple steps, I thought in part because not doing so triggers a compilation error. And we've standardized on using "UTF-8" to refer to unicode in Cython.

So this would be

char *filename_c = NULL
filename_b = vsi_filename.encode("UTF-8")
filename_c = filename_b
vsi_handle = VSIFileFromMemBuffer(filename_c, <unsigned char *>bytesbuf, len(bytesbuf), 0)

Though I'm not sure that is strictly necessary. (same for remove_virtual_filename too)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't seem to be necessary, since it is compiling here?

(but already changed utf8 to UTF-8)


if vsi_handle == NULL:
raise OSError('failed to map buffer to file')
if VSIFCloseL(vsi_handle) != 0:
raise OSError('failed to close mapped file handle')

return vsi_filename


def remove_virtual_file(vsi_filename):
    """Unlink the virtual file named by ``vsi_filename``.

    The name is encoded to bytes and handed to ``VSIUnlink``; the integer
    status code it reports is returned unchanged.
    """
    # Keep the encoded bytes in a named local for the duration of the call.
    encoded_name = vsi_filename.encode("utf8")
    status = VSIUnlink(encoded_name)
    return status
46 changes: 32 additions & 14 deletions pyogrio/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

with GDALEnv():
from pyogrio._io import ogr_read, ogr_read_info, ogr_list_layers, ogr_write
from pyogrio._ogr import buffer_to_virtual_file, remove_virtual_file


DRIVERS = {
Expand All @@ -16,7 +17,7 @@


def read(
path,
path_or_buffer,
layer=None,
encoding=None,
columns=None,
Expand All @@ -34,7 +35,7 @@ def read(

Parameters
----------
path : pathlib.Path or str
path_or_buffer : pathlib.Path or str
data source path
layer : int or str, optional (default: first layer)
If an integer is provided, it corresponds to the index of the layer
Expand Down Expand Up @@ -84,19 +85,36 @@ def read(
"geometry": "<geometry type>"
}
"""
from_buffer = False
if isinstance(path_or_buffer, bytes):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I am checking for bytes vs strings to determine whether it's a path or in-memory bytes. I don't know if that is robust enough? Or do we want a separate read_buffer or so?
Alternatively (or in addition), we could also support "file-like" objects (objects that have a read() method that will return the bytes). That's eg what fiona does in their open() method.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think file-like objects that have a read() method would provide a more general solution? For example, the is_zipped check below is limited to a single zip format, whereas the file-like pattern would let the user construct and pass in a ZipFile or GzipFile class instance. I'm not familiar with fsspec, but it seems like the file-like pattern would support that as well.

from_buffer = True
ext = ""
if path_or_buffer[:4].startswith(b'PK\x03\x04'):
ext = ".zip"
brendan-ward marked this conversation as resolved.
Show resolved Hide resolved
path = buffer_to_virtual_file(path_or_buffer, ext=ext)
if path_or_buffer[:4].startswith(b'PK\x03\x04'):
path = "/vsizip/" + path
brendan-ward marked this conversation as resolved.
Show resolved Hide resolved
else:
path = str(path_or_buffer)

try:
result = ogr_read(
path,
layer=layer,
encoding=encoding,
columns=columns,
read_geometry=read_geometry,
force_2d=force_2d,
skip_features=skip_features,
max_features=max_features or 0,
where=where,
bbox=bbox,
)
finally:
if from_buffer:
remove_virtual_file(path)

return ogr_read(
str(path),
layer=layer,
encoding=encoding,
columns=columns,
read_geometry=read_geometry,
force_2d=force_2d,
skip_features=skip_features,
max_features=max_features or 0,
where=where,
bbox=bbox,
)
return result


def write(
Expand Down
2 changes: 1 addition & 1 deletion pyogrio/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def naturalearth_lowres_vsi(tmp_path, naturalearth_lowres):
filename = f"{naturalearth_lowres.stem}.{ext}"
out.write(naturalearth_lowres.parent / filename, filename)

return f"/vsizip/{path}/{naturalearth_lowres.name}"
return path, f"/vsizip/{path}/{naturalearth_lowres.name}"


@pytest.fixture(scope="session")
Expand Down
2 changes: 1 addition & 1 deletion pyogrio/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def test_list_layers(naturalearth_lowres, naturalearth_lowres_vsi, test_fgdb_vsi
)

assert array_equal(
list_layers(naturalearth_lowres_vsi), [["naturalearth_lowres", "Polygon"]]
list_layers(naturalearth_lowres_vsi[1]), [["naturalearth_lowres", "Polygon"]]
)

# Measured 3D is downgraded to 2.5D during read
Expand Down
2 changes: 1 addition & 1 deletion pyogrio/tests/test_geopandas_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def test_read_dataframe(naturalearth_lowres):


def test_read_dataframe_vsi(naturalearth_lowres_vsi):
df = read_dataframe(naturalearth_lowres_vsi)
df = read_dataframe(naturalearth_lowres_vsi[1])
assert len(df) == 177


Expand Down
47 changes: 47 additions & 0 deletions pyogrio/tests/test_raw_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def test_read(naturalearth_lowres):


def test_vsi_read_layers(naturalearth_lowres_vsi):
_, naturalearth_lowres_vsi = naturalearth_lowres_vsi
assert array_equal(
list_layers(naturalearth_lowres_vsi), [["naturalearth_lowres", "Polygon"]]
)
Expand Down Expand Up @@ -219,3 +220,49 @@ def test_write_unsupported(tmpdir, naturalearth_lowres):

with pytest.raises(DriverError, match="does not support write functionality"):
write(filename, geometry, field_data, driver="OpenFileGDB", **meta)


def assert_equal_result(result1, result2):
meta1, geometry1, field_data1 = result1
meta2, geometry2, field_data2 = result2

assert np.array_equal(meta1["fields"], meta2["fields"])
# assert np.array_equal(geometry1, geometry2)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor nit: remove commented line

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Turned it into a small explanation why we are using pygeos here

import pygeos
assert pygeos.equals_exact(
pygeos.from_wkb(geometry1), pygeos.from_wkb(geometry2), tolerance=0.00001
).all()
brendan-ward marked this conversation as resolved.
Show resolved Hide resolved
assert all([np.array_equal(f1, f2) for f1, f2 in zip(field_data1, field_data2)])


def test_read_from_bytes(tmpdir, naturalearth_lowres):
meta, geometry, field_data = read(naturalearth_lowres)
filename = os.path.join(str(tmpdir), "test.gpkg")
write(filename, geometry, field_data, driver="GPKG", **meta)

with open(filename, "rb") as f:
buffer = f.read()

meta2, geometry2, field_data2 = read(buffer)
assert_equal_result((meta, geometry, field_data), (meta2, geometry2, field_data2))


filename = os.path.join(str(tmpdir), "test.geojson")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even though there is some shared code, I'd suggest simplifying this a little bit and using pytest.mark.parametrize with varying driver

write(filename, geometry, field_data, driver="GeoJSON", **meta)

with open(filename, "rb") as f:
buffer = f.read()

meta2, geometry2, field_data2 = read(buffer)
assert_equal_result((meta, geometry, field_data), (meta2, geometry2, field_data2))


def test_read_from_bytes_zipped(tmpdir, naturalearth_lowres_vsi):
    """Reading the zip archive as raw bytes must match reading it by VSI path."""
    # The fixture yields the on-disk zip path and its /vsizip/ counterpart.
    zip_path, zip_vsi_path = naturalearth_lowres_vsi
    meta_vsi, geometry_vsi, fields_vsi = read(zip_vsi_path)

    with open(zip_path, "rb") as archive:
        zipped_bytes = archive.read()

    meta_buf, geometry_buf, fields_buf = read(zipped_bytes)
    expected = (meta_vsi, geometry_vsi, fields_vsi)
    actual = (meta_buf, geometry_buf, fields_buf)
    assert_equal_result(expected, actual)