Skip to content

Commit

Permalink
ENH: support reading from in-memory buffers (#25)
Browse files Browse the repository at this point in the history
Co-authored-by: Martin Fleischmann <martin@martinfleischmann.net>
Co-authored-by: Brendan Ward <bcward@astutespruce.com>
  • Loading branch information
3 people authored Apr 2, 2022
1 parent a807404 commit 22f6878
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 23 deletions.
3 changes: 2 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### Major enhancements

- support for reading from file-like objects and in-memory buffers (#25)
- index of GeoDataFrame created by `read_dataframe` can now optionally be set
to the FID of the features that are read, as `int64` dtype. Note that some
drivers start FID numbering at 0 whereas others start numbering at 1.
Expand All @@ -18,7 +19,7 @@

### Potentially breaking changes

- Consolided error handling to better use GDAL error messages and specific
- Consolidated error handling to better use GDAL error messages and specific
exception classes (#39). Note that this is a breaking change only if you are
relying on specific error classes to be emitted.

Expand Down
11 changes: 11 additions & 0 deletions pyogrio/_ogr.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Contains declarations against GDAL / OGR API
from libc.stdint cimport int64_t
from libc.stdio cimport FILE


cdef extern from "cpl_conv.h":
Expand Down Expand Up @@ -34,6 +35,16 @@ cdef extern from "cpl_string.h":
char** CSLAddString(char **list, const char *string)


cdef extern from "cpl_vsi.h" nogil:

    # External typedefs: Cython only needs to know these are an integral type
    # and an opaque handle type; the real definitions come from cpl_vsi.h.
    # GDAL declares vsi_l_offset as an unsigned 64-bit offset, so using it for
    # the buffer length avoids limiting buffers to what fits in a C int.
    ctypedef unsigned long long vsi_l_offset
    ctypedef FILE VSILFILE

    # Expose the memory at `data` (`data_len` bytes) as a file named `path`.
    # `take_ownership` tells GDAL whether it becomes responsible for freeing
    # the memory.
    VSILFILE *VSIFileFromMemBuffer(const char *path, void *data,
                                   vsi_l_offset data_len, int take_ownership)
    # Close a handle returned by the VSI*L API.
    int VSIFCloseL(VSILFILE *fp)
    # Delete the file at `path`.
    int VSIUnlink(const char *path)


cdef extern from "ogr_core.h":
ctypedef enum OGRErr:
OGRERR_NONE # success
Expand Down
25 changes: 25 additions & 0 deletions pyogrio/_ogr.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import sys
from uuid import uuid4
import warnings

from pyogrio._err cimport exc_wrap_int, exc_wrap_ogrerr
Expand Down Expand Up @@ -122,6 +123,30 @@ def ogr_list_drivers():
return drivers


def buffer_to_virtual_file(bytesbuf, ext=''):
    """Map a bytes buffer to a GDAL virtual file under ``/vsimem/``.

    This (and remove_virtual_file) is originally copied from the Fiona project
    (https://github.com/Toblerity/Fiona/blob/c388e9adcf9d33e3bb04bf92b2ff210bbce452d9/fiona/ogrext.pyx#L1863-L1879)

    Parameters
    ----------
    bytesbuf : bytes
        The raw content to expose as a file. The buffer is handed to GDAL
        with ``take_ownership=0``, so the caller must keep `bytesbuf` alive
        until the virtual file has been removed again.
    ext : str, optional (default: '')
        Suffix appended to the generated file name. `ext` is empty or begins
        with a period and contains at most one period.

    Returns
    -------
    str
        The ``/vsimem/...`` path of the newly created virtual file.

    Raises
    ------
    OSError
        If the buffer could not be mapped, or the handle returned by GDAL
        could not be closed.
    """

    # a random hex name keeps concurrently mapped buffers from colliding
    vsi_filename = f"/vsimem/{uuid4().hex + ext}"
    # keep the encoded name in a local so the same C string can be reused
    # for cleanup below
    vsi_filename_bytes = vsi_filename.encode("UTF-8")

    vsi_handle = VSIFileFromMemBuffer(
        vsi_filename_bytes, <unsigned char *>bytesbuf, len(bytesbuf), 0
    )

    if vsi_handle == NULL:
        raise OSError('failed to map buffer to file')
    if VSIFCloseL(vsi_handle) != 0:
        # the virtual file was already created; remove it so a failed close
        # does not leave an entry behind that nobody knows the name of
        VSIUnlink(vsi_filename_bytes)
        raise OSError('failed to close mapped file handle')

    return vsi_filename


def remove_virtual_file(vsi_filename):
    """Delete the virtual file at `vsi_filename`.

    Parameters
    ----------
    vsi_filename : str
        Path of the virtual file, e.g. as returned by buffer_to_virtual_file.

    Returns
    -------
    int
        The status code returned by GDAL's ``VSIUnlink``.
    """
    encoded_name = vsi_filename.encode("UTF-8")
    return VSIUnlink(encoded_name)


cdef void set_proj_search_path(str path):
"""Set PROJ library data file search path for use in GDAL."""
cdef char **paths = NULL
Expand Down
26 changes: 22 additions & 4 deletions pyogrio/geopandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,24 @@
from pyogrio.raw import read, write


def _stringify_path(path):
"""
Convert path-like to a string if possible, pass-through other objects
"""
if isinstance(path, str):
return path

# checking whether path implements the filesystem protocol
if hasattr(path, "__fspath__"):
return path.__fspath__()

# pass-though other objects
return path


def read_dataframe(
path,
path_or_buffer,
/,
layer=None,
encoding=None,
columns=None,
Expand All @@ -26,8 +42,8 @@ def read_dataframe(
Parameters
----------
path : str
A dataset path or URI.
path_or_buffer : pathlib.Path or str, or bytes buffer
A dataset path or URI, or raw buffer.
layer : int or str, optional (default: first layer)
If an integer is provided, it corresponds to the index of the layer
with the data source. If a string is provided, it must match the name
Expand Down Expand Up @@ -85,8 +101,10 @@ def read_dataframe(
except ImportError:
raise ImportError("geopandas is required to use pyogrio.read_dataframe()")

path_or_buffer = _stringify_path(path_or_buffer)

meta, index, geometry, field_data = read(
path,
path_or_buffer,
layer=layer,
encoding=encoding,
columns=columns,
Expand Down
58 changes: 40 additions & 18 deletions pyogrio/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

with GDALEnv():
from pyogrio._io import ogr_read, ogr_read_info, ogr_list_layers, ogr_write
from pyogrio._ogr import buffer_to_virtual_file, remove_virtual_file


DRIVERS = {
Expand All @@ -17,7 +18,8 @@


def read(
path,
path_or_buffer,
/,
layer=None,
encoding=None,
columns=None,
Expand All @@ -37,8 +39,8 @@ def read(
Parameters
----------
path : pathlib.Path or str
A dataset path or URI.
path_or_buffer : pathlib.Path or str, or bytes buffer
A dataset path or URI, or raw buffer.
layer : int or str, optional (default: first layer)
If an integer is provided, it corresponds to the index of the layer
with the data source. If a string is provided, it must match the name
Expand Down Expand Up @@ -98,26 +100,46 @@ def read(
"geometry": "<geometry type>"
}
"""
path = vsi_path(str(path))
if hasattr(path_or_buffer, "read"):
path_or_buffer = path_or_buffer.read()

from_buffer = False
if isinstance(path_or_buffer, bytes):
from_buffer = True
ext = ""
is_zipped = path_or_buffer[:4].startswith(b'PK\x03\x04')
if is_zipped:
ext = ".zip"
path = buffer_to_virtual_file(path_or_buffer, ext=ext)
if is_zipped:
path = "/vsizip/" + path
else:
path = vsi_path(str(path_or_buffer))

if not "://" in path:
if not "/vsi" in path.lower() and not os.path.exists(path):
raise ValueError(f"'{path}' does not exist")

return ogr_read(
path,
layer=layer,
encoding=encoding,
columns=columns,
read_geometry=read_geometry,
force_2d=force_2d,
skip_features=skip_features,
max_features=max_features or 0,
where=where,
bbox=bbox,
fids=fids,
return_fids=return_fids,
)
try:
result = ogr_read(
path,
layer=layer,
encoding=encoding,
columns=columns,
read_geometry=read_geometry,
force_2d=force_2d,
skip_features=skip_features,
max_features=max_features or 0,
where=where,
bbox=bbox,
fids=fids,
return_fids=return_fids,
)
finally:
if from_buffer:
remove_virtual_file(path)

return result


def write(
Expand Down
64 changes: 64 additions & 0 deletions pyogrio/tests/test_raw_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,3 +299,67 @@ def test_write_unsupported(tmpdir, naturalearth_lowres):

with pytest.raises(DataSourceError, match="does not support write functionality"):
write(filename, geometry, field_data, driver="OpenFileGDB", **meta)


def assert_equal_result(result1, result2):
    """Assert that two ``(meta, index, geometry, field_data)`` results match.

    Field names, index and field values are compared exactly; geometries are
    compared with a small tolerance. The geometry and field-value checks are
    skipped (via ``pytest.importorskip``) when pygeos is not installed.
    """
    meta1, index1, geometry1, field_data1 = result1
    meta2, index2, geometry2, field_data2 = result2

    assert np.array_equal(meta1["fields"], meta2["fields"])
    assert np.array_equal(index1, index2)
    # a plain `assert np.array_equal(geometry1, geometry2)` doesn't work because
    # the WKB values are not exactly equal, therefore parsing with pygeos to compare
    # with tolerance
    pygeos = pytest.importorskip("pygeos")
    assert pygeos.equals_exact(
        pygeos.from_wkb(geometry1), pygeos.from_wkb(geometry2), tolerance=0.00001
    ).all()
    # zip() stops at the shorter input, so check the lengths explicitly to
    # make sure no field array is silently left out of the comparison
    assert len(field_data1) == len(field_data2)
    assert all(np.array_equal(f1, f2) for f1, f2 in zip(field_data1, field_data2))


@pytest.mark.parametrize("driver,ext", [("GeoJSON", "geojson"), ("GPKG", "gpkg")])
def test_read_from_bytes(tmpdir, naturalearth_lowres, driver, ext):
    """Reading a dataset from its raw bytes matches the data it was written from."""
    expected = read(naturalearth_lowres)
    meta, _, geometry, field_data = expected

    # write the reference data with the driver under test
    target = os.path.join(str(tmpdir), f"test.{ext}")
    write(target, geometry, field_data, driver=driver, **meta)

    with open(target, "rb") as src:
        raw_bytes = src.read()

    assert_equal_result(expected, read(raw_bytes))


def test_read_from_bytes_zipped(tmpdir, naturalearth_lowres_vsi):
    """Reading the raw bytes of the archive matches reading it via its VSI path."""
    archive_path, archive_vsi_path = naturalearth_lowres_vsi
    expected = read(archive_vsi_path)

    with open(archive_path, "rb") as src:
        raw_bytes = src.read()

    assert_equal_result(expected, read(raw_bytes))


@pytest.mark.parametrize("driver,ext", [("GeoJSON", "geojson"), ("GPKG", "gpkg")])
def test_read_from_file_like(tmpdir, naturalearth_lowres, driver, ext):
    """Reading from an open binary file object matches the data it was written from."""
    expected = read(naturalearth_lowres)
    meta, _, geometry, field_data = expected

    # write the reference data with the driver under test
    target = os.path.join(str(tmpdir), f"test.{ext}")
    write(target, geometry, field_data, driver=driver, **meta)

    # the file object itself (not its content) is handed to read()
    with open(target, "rb") as src:
        from_file_like = read(src)

    assert_equal_result(expected, from_file_like)

0 comments on commit 22f6878

Please sign in to comment.