Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: support reading from in-memory buffers #25

Merged
merged 20 commits into from
Apr 2, 2022
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
0a10bc4
ENH: support reading from in-memory buffers
jorisvandenbossche Nov 12, 2021
60b8d28
passthrough bytes in read_dataframe
jorisvandenbossche Nov 12, 2021
d81a70c
Merge remote-tracking branch 'upstream/main' into read-in-memory
jorisvandenbossche Jan 27, 2022
dadea0c
fixup merge
jorisvandenbossche Jan 27, 2022
083a32a
attribute buffer_to_virtual_file to fiona + clean up ogr.pxd
jorisvandenbossche Jan 27, 2022
9b3bdf3
clean-up + docstrings
jorisvandenbossche Jan 27, 2022
fa9808b
parametrize test
jorisvandenbossche Feb 9, 2022
6d1aeff
Merge remote-tracking branch 'upstream/main' into read-in-memory
jorisvandenbossche Feb 9, 2022
719399d
support file-like objects
jorisvandenbossche Feb 9, 2022
1502ab1
Merge remote-tracking branch 'upstream/main' into read-in-memory
jorisvandenbossche Feb 25, 2022
b8f7985
make first argument positional-only
jorisvandenbossche Feb 25, 2022
bc1b4f1
skip pygeos if not present
jorisvandenbossche Mar 11, 2022
792210b
Merge remote-tracking branch 'upstream/main' into read-in-memory
jorisvandenbossche Mar 11, 2022
3e055f3
fixup merge
jorisvandenbossche Mar 11, 2022
74f12a9
Update pyogrio/tests/test_raw_io.py
jorisvandenbossche Mar 11, 2022
41f70d4
Update pyogrio/_ogr.pyx
jorisvandenbossche Apr 1, 2022
bd68a50
Merge remote-tracking branch 'upstream/main' into read-in-memory
jorisvandenbossche Apr 1, 2022
dadcc40
small edits
jorisvandenbossche Apr 1, 2022
985a1ab
add whatsnew
jorisvandenbossche Apr 1, 2022
332680e
update comment
jorisvandenbossche Apr 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions pyogrio/_ogr.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Contains declarations against GDAL / OGR API
from libc.stdint cimport int64_t
from libc.stdio cimport FILE


cdef extern from "cpl_conv.h":
Expand Down Expand Up @@ -34,6 +35,16 @@ cdef extern from "cpl_string.h":
char** CSLAddString(char **list, const char *string)


cdef extern from "cpl_vsi.h" nogil:

ctypedef FILE VSILFILE

VSILFILE *VSIFileFromMemBuffer(const char *path, void *data,
int data_len, int take_ownership)
int VSIFCloseL(VSILFILE *fp)
int VSIUnlink(const char *path)


cdef extern from "ogr_core.h":
ctypedef enum OGRErr:
OGRERR_NONE # success
Expand Down
25 changes: 25 additions & 0 deletions pyogrio/_ogr.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import sys
from uuid import uuid4
import warnings

from pyogrio._err cimport exc_wrap_int, exc_wrap_ogrerr
Expand Down Expand Up @@ -122,6 +123,30 @@ def ogr_list_drivers():
return drivers


def buffer_to_virtual_file(bytesbuf, ext=''):
"""Maps a bytes buffer to a virtual file.
`ext` is empty or begins with a period and contains at most one period.

This (and remove_virtual_file) is originally copied from the Fiona project
(https://github.com/Toblerity/Fiona/blob/c388e9adcf9d33e3bb04bf92b2ff210bbce452d9/fiona/ogrext.pyx#L1863-L1879)
"""
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved

vsi_filename = '/vsimem/{}'.format(uuid4().hex + ext)
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved

vsi_handle = VSIFileFromMemBuffer(vsi_filename.encode("utf8"), <unsigned char *>bytesbuf, len(bytesbuf), 0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Elsewhere we usually handle Python => C strings in multiple steps, I thought in part because not doing so triggers a compilation error. And we've standardized on using "UTF-8" to refer to unicode in Cython.

So this would be

char *filename_c = NULL
filename_b = vsi_filename.encode("UTF-8")
filename_c = filename_b
vsi_handle = VSIFileFromMemBuffer(filename_c, <unsigned char *>bytesbuf, len(bytesbuf), 0)

Though I'm not sure that is strictly necessary. (same for remove_virtual_filename too)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't seem to be necessary, since it is compiling here?

(but already changed utf8 to UTF-8)


if vsi_handle == NULL:
raise OSError('failed to map buffer to file')
if VSIFCloseL(vsi_handle) != 0:
raise OSError('failed to close mapped file handle')

return vsi_filename


def remove_virtual_file(vsi_filename):
    """Remove a virtual file by name via GDAL's ``VSIUnlink``.

    Parameters
    ----------
    vsi_filename : str
        Name of the virtual file to remove, e.g. the value returned by
        ``buffer_to_virtual_file``.

    Returns
    -------
    int
        The status code returned by ``VSIUnlink``.
        NOTE(review): per the GDAL documentation this is 0 on success and
        -1 on error; the code is passed through unchecked, so callers must
        inspect it themselves if they care about failures.
    """
    # "UTF-8" is the spelling the project standardised on for encodings in
    # Cython code; it names the same codec as "utf8".
    return VSIUnlink(vsi_filename.encode("UTF-8"))


cdef void set_proj_search_path(str path):
"""Set PROJ library data file search path for use in GDAL."""
cdef char **paths = NULL
Expand Down
28 changes: 24 additions & 4 deletions pyogrio/geopandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,26 @@
from pyogrio.raw import read, write


def _stringify_path(path):
"""
Convert path-like to a string if possible, pass-through other objects
"""
if isinstance(path, str):
return path

# checking whether path implements the filesystem protocol
try:
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
return path.__fspath__() # new in python 3.6
except AttributeError:
pass

# pass-though other objects
return path


def read_dataframe(
path,
path_or_buffer,
brendan-ward marked this conversation as resolved.
Show resolved Hide resolved
/,
layer=None,
encoding=None,
columns=None,
Expand All @@ -26,8 +44,8 @@ def read_dataframe(

Parameters
----------
path : str
A dataset path or URI.
path_or_buffer : pathlib.Path or str, or bytes buffer
A dataset path or URI, or raw buffer.
layer : int or str, optional (default: first layer)
If an integer is provided, it corresponds to the index of the layer
with the data source. If a string is provided, it must match the name
Expand Down Expand Up @@ -85,8 +103,10 @@ def read_dataframe(
except ImportError:
raise ImportError("geopandas is required to use pyogrio.read_dataframe()")

path_or_buffer = _stringify_path(path_or_buffer)

meta, index, geometry, field_data = read(
path,
path_or_buffer,
layer=layer,
encoding=encoding,
columns=columns,
Expand Down
58 changes: 40 additions & 18 deletions pyogrio/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

with GDALEnv():
from pyogrio._io import ogr_read, ogr_read_info, ogr_list_layers, ogr_write
from pyogrio._ogr import buffer_to_virtual_file, remove_virtual_file


DRIVERS = {
Expand All @@ -17,7 +18,8 @@


def read(
path,
path_or_buffer,
/,
brendan-ward marked this conversation as resolved.
Show resolved Hide resolved
layer=None,
encoding=None,
columns=None,
Expand All @@ -37,8 +39,8 @@ def read(

Parameters
----------
path : pathlib.Path or str
A dataset path or URI.
path_or_buffer : pathlib.Path or str, or bytes buffer
A dataset path or URI, or raw buffer.
layer : int or str, optional (default: first layer)
If an integer is provided, it corresponds to the index of the layer
with the data source. If a string is provided, it must match the name
Expand Down Expand Up @@ -98,26 +100,46 @@ def read(
"geometry": "<geometry type>"
}
"""
path = vsi_path(str(path))
if hasattr(path_or_buffer, "read"):
path_or_buffer = path_or_buffer.read()

from_buffer = False
if isinstance(path_or_buffer, bytes):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here I am checking for bytes vs strings to determine whether it's a path or in-memory bytes. I don't know if that is robust enough? Or do we want a separate read_buffer or so?
Alternatively (or in addition), we could also support "file-like" objects (objects that have a read() method that will return the bytes). That's eg what fiona does in their open() method.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think file-like objects that have a read() method would provide a more general solution? For example, the is_zipped below check is limited to a single zip format, whereas the file-like pattern would let user construct and pass in a ZipFile or GzipFile class instance. I'm not familiar with fsspec, but it seems like the file-like pattern would support that as well.

from_buffer = True
ext = ""
is_zipped = path_or_buffer[:4].startswith(b'PK\x03\x04')
if is_zipped:
ext = ".zip"
brendan-ward marked this conversation as resolved.
Show resolved Hide resolved
path = buffer_to_virtual_file(path_or_buffer, ext=ext)
if is_zipped:
path = "/vsizip/" + path
brendan-ward marked this conversation as resolved.
Show resolved Hide resolved
else:
path = vsi_path(str(path_or_buffer))

if not "://" in path:
if not "/vsi" in path.lower() and not os.path.exists(path):
raise ValueError(f"'{path}' does not exist")

return ogr_read(
path,
layer=layer,
encoding=encoding,
columns=columns,
read_geometry=read_geometry,
force_2d=force_2d,
skip_features=skip_features,
max_features=max_features or 0,
where=where,
bbox=bbox,
fids=fids,
return_fids=return_fids,
)
try:
result = ogr_read(
path,
layer=layer,
encoding=encoding,
columns=columns,
read_geometry=read_geometry,
force_2d=force_2d,
skip_features=skip_features,
max_features=max_features or 0,
where=where,
bbox=bbox,
fids=fids,
return_fids=return_fids,
)
finally:
if from_buffer:
remove_virtual_file(path)

return result


def write(
Expand Down
62 changes: 62 additions & 0 deletions pyogrio/tests/test_raw_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,3 +299,65 @@ def test_write_unsupported(tmpdir, naturalearth_lowres):

with pytest.raises(DataSourceError, match="does not support write functionality"):
write(filename, geometry, field_data, driver="OpenFileGDB", **meta)


def assert_equal_result(result1, result2):
meta1, index1, geometry1, field_data1 = result1
meta2, index2, geometry2, field_data2 = result2

assert np.array_equal(meta1["fields"], meta2["fields"])
assert np.array_equal(index1, index2)
# assert np.array_equal(geometry1, geometry2)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor nit: remove commented line

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Turned it into a small explanation why we are using pygeos here

pygeos = pytest.importorskip("pygeos")
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
assert pygeos.equals_exact(
pygeos.from_wkb(geometry1), pygeos.from_wkb(geometry2), tolerance=0.00001
).all()
brendan-ward marked this conversation as resolved.
Show resolved Hide resolved
assert all([np.array_equal(f1, f2) for f1, f2 in zip(field_data1, field_data2)])


@pytest.mark.parametrize(
    "driver,ext",
    [
        ("GeoJSON", "geojson"),
        ("GPKG", "gpkg"),
    ],
)
def test_read_from_bytes(tmpdir, naturalearth_lowres, driver, ext):
    """Reading from an in-memory bytes object matches the original read."""
    expected = read(naturalearth_lowres)
    meta, _, geometry, field_data = expected

    # Re-write the data with the parametrized driver so the bytes buffer
    # holds a file of that format.
    filename = os.path.join(str(tmpdir), f"test.{ext}")
    write(filename, geometry, field_data, driver=driver, **meta)

    with open(filename, "rb") as f:
        raw_bytes = f.read()

    from_bytes = read(raw_bytes)
    assert_equal_result(expected, from_bytes)


def test_read_from_bytes_zipped(tmpdir, naturalearth_lowres_vsi):
    """Reading the fixture file's raw bytes matches reading its VSI path."""
    # The fixture provides both the on-disk path and the matching VSI path.
    archive_path, archive_vsi_path = naturalearth_lowres_vsi
    expected = read(archive_vsi_path)

    with open(archive_path, "rb") as f:
        raw_bytes = f.read()

    from_bytes = read(raw_bytes)
    assert_equal_result(expected, from_bytes)


@pytest.mark.parametrize(
    "driver,ext",
    [
        ("GeoJSON", "geojson"),
        ("GPKG", "gpkg"),
    ],
)
def test_read_from_file_like(tmpdir, naturalearth_lowres, driver, ext):
    """Reading from an open binary file object matches the original read."""
    expected = read(naturalearth_lowres)
    meta, _, geometry, field_data = expected

    # Re-write the data with the parametrized driver so the file object
    # exposes a file of that format.
    filename = os.path.join(str(tmpdir), f"test.{ext}")
    write(filename, geometry, field_data, driver=driver, **meta)

    # Pass the file object itself rather than its contents.
    with open(filename, "rb") as f:
        from_file_like = read(f)

    assert_equal_result(expected, from_file_like)