-
-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
✨ PyogrioReaderIterDataPipe for reading vector OGR files #19
Changes from all commits
c29b5fd
62c4541
8d0d8dd
63114cd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,99 @@ | ||||||||
""" | ||||||||
DataPipes for :doc:`pyogrio <pyogrio:index>`. | ||||||||
""" | ||||||||
from typing import Any, Dict, Iterator, Optional, Tuple | ||||||||
|
||||||||
# pyogrio is an optional dependency. When it cannot be imported, bind the name
# to None so this module still imports; the datapipe constructor checks for
# None and raises a descriptive ModuleNotFoundError at construction time.
# Only ImportError is caught (ModuleNotFoundError is a subclass), so that
# KeyboardInterrupt, SystemExit and unrelated failures are not swallowed.
try:
    import pyogrio
except ImportError:
    pyogrio = None
from torchdata.datapipes import functional_datapipe | ||||||||
from torchdata.datapipes.iter import IterDataPipe | ||||||||
from torchdata.datapipes.utils import StreamWrapper | ||||||||
|
||||||||
|
||||||||
@functional_datapipe("read_from_pyogrio") | ||||||||
class PyogrioReaderIterDataPipe(IterDataPipe[Tuple[str, StreamWrapper]]): | ||||||||
""" | ||||||||
Takes vector files (e.g. FlatGeoBuf, GeoPackage, GeoJSON) from local disk | ||||||||
or URLs (as long as they can be read by pyogrio) and yields tuples of | ||||||||
filename and :py:class:`geopandas.GeoDataFrame` objects | ||||||||
(functional name: ``read_from_pyogrio``). | ||||||||
|
||||||||
Based on | ||||||||
https://github.com/pytorch/data/blob/v0.3.0/torchdata/datapipes/iter/load/iopath.py#L37-L83 | ||||||||
|
||||||||
Parameters | ||||||||
---------- | ||||||||
source_datapipe : IterDataPipe[str] | ||||||||
A DataPipe that contains filepaths or URL links to vector files such as | ||||||||
FlatGeoBuf, GeoPackage, GeoJSON, etc. | ||||||||
|
||||||||
kwargs : Optional | ||||||||
Extra keyword arguments to pass to | ||||||||
`pyogrio.read_dataframe <https://pyogrio.readthedocs.io/en/latest/api.html#geopandas-integration>`_. | ||||||||
Comment on lines
+33
to
+34
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ideally the intersphinx directive would look like this, but the permalink is not available for some reason. Closest I could get to
Suggested change
Edit: PR at geopandas/pyogrio#130 to resolve this. |
||||||||
|
||||||||
Yields | ||||||||
------ | ||||||||
stream_obj : Tuple[str, geopandas.GeoDataFrame] | ||||||||
A tuple consisting of the filename that was passed in, and a | ||||||||
:py:class:`geopandas.GeoDataFrame` object containing the vector data. | ||||||||
|
||||||||
Raises | ||||||||
------ | ||||||||
ModuleNotFoundError | ||||||||
If ``pyogrio`` is not installed. See | ||||||||
:doc:`install instructions for pyogrio <pyogrio:install>`, and ensure | ||||||||
that ``geopandas`` is installed too (e.g. via | ||||||||
``pip install pyogrio[geopandas]``) before using this class. | ||||||||
|
||||||||
Example | ||||||||
------- | ||||||||
>>> import pytest | ||||||||
>>> pyogrio = pytest.importorskip("pyogrio") | ||||||||
... | ||||||||
>>> from torchdata.datapipes.iter import IterableWrapper | ||||||||
>>> from zen3geo.datapipes import PyogrioReader | ||||||||
... | ||||||||
>>> # Read in GeoPackage data using DataPipe | ||||||||
>>> file_url: str = "https://github.com/geopandas/pyogrio/raw/v0.4.0a1/pyogrio/tests/fixtures/test_gpkg_nulls.gpkg" | ||||||||
>>> dp = IterableWrapper(iterable=[file_url]) | ||||||||
>>> dp_pyogrio = dp.read_from_pyogrio() | ||||||||
... | ||||||||
>>> # Loop or iterate over the DataPipe stream | ||||||||
>>> it = iter(dp_pyogrio) | ||||||||
>>> filename, geodataframe = next(it) | ||||||||
>>> filename | ||||||||
'https://github.com/geopandas/pyogrio/raw/v0.4.0a1/pyogrio/tests/fixtures/test_gpkg_nulls.gpkg' | ||||||||
>>> geodataframe | ||||||||
StreamWrapper< col_bool col_int8 ... col_float64 geometry | ||||||||
0 1.0 1.0 ... 1.5 POINT (0.00000 0.00000) | ||||||||
1 0.0 2.0 ... 2.5 POINT (1.00000 1.00000) | ||||||||
2 1.0 3.0 ... 3.5 POINT (2.00000 2.00000) | ||||||||
3 NaN NaN ... NaN POINT (4.00000 4.00000) | ||||||||
<BLANKLINE> | ||||||||
[4 rows x 12 columns]> | ||||||||
""" | ||||||||
|
||||||||
def __init__(self, source_datapipe: IterDataPipe[str], **kwargs: Any) -> None:
    """
    Store the source datapipe and the reader options.

    Parameters
    ----------
    source_datapipe : IterDataPipe[str]
        DataPipe whose items are handed, one at a time, to
        ``pyogrio.read_dataframe`` as its first argument.
    **kwargs
        Extra keyword arguments forwarded unchanged to
        ``pyogrio.read_dataframe`` for every item.

    Raises
    ------
    ModuleNotFoundError
        If ``pyogrio`` could not be imported when this module was loaded.
    """
    if pyogrio is None:
        raise ModuleNotFoundError(
            "Package `pyogrio` is required to be installed to use this datapipe. "
            "Please use `pip install pyogrio[geopandas]` or "
            "`conda install -c conda-forge pyogrio` "
            "to install the package"
        )
    self.source_datapipe: IterDataPipe[str] = source_datapipe
    # A ``**kwargs`` annotation describes each individual value, hence ``Any``
    # in the signature; the collected mapping itself is a ``Dict[str, Any]``.
    self.kwargs: Dict[str, Any] = kwargs
|
||||||||
def __iter__(self) -> Iterator[Tuple]:
    """
    Lazily read each item of the source datapipe with pyogrio.

    Yields
    ------
    Tuple
        A 2-tuple of the item taken from the source datapipe and the result
        of ``pyogrio.read_dataframe`` for that item wrapped in a
        ``StreamWrapper``.
    """
    for path in self.source_datapipe:
        # The stored keyword arguments are forwarded on every read.
        loaded = pyogrio.read_dataframe(path, **self.kwargs)
        wrapped = StreamWrapper(loaded)
        yield path, wrapped
|
||||||||
def __len__(self) -> int:
    """Return the length reported by the source datapipe."""
    upstream = self.source_datapipe
    return len(upstream)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
""" | ||
Tests for pyogrio datapipes. | ||
""" | ||
import pytest | ||
from torchdata.datapipes.iter import IterableWrapper | ||
|
||
from zen3geo.datapipes import PyogrioReader | ||
|
||
pyogrio = pytest.importorskip("pyogrio") | ||
|
||
# %% | ||
def test_pyogrio_reader():
    """
    Ensure that PyogrioReader works to read in a GeoPackage file and outputs a
    tuple made up of a filename and a geopandas.GeoDataFrame object.
    """
    file_url: str = "https://github.com/geopandas/pyogrio/raw/v0.4.0a1/pyogrio/tests/fixtures/test_gpkg_nulls.gpkg"
    dp = IterableWrapper(iterable=[file_url])

    # Check both ways of building the datapipe. Assigning both results to the
    # same name would overwrite the class-constructor datapipe before it is
    # ever checked.
    datapipes = (
        # Using class constructors
        PyogrioReader(source_datapipe=dp),
        # Using functional form (recommended)
        dp.read_from_pyogrio(),
    )

    for dp_pyogrio in datapipes:
        assert len(dp_pyogrio) == 1
        it = iter(dp_pyogrio)
        filename, geodataframe = next(it)

        assert isinstance(filename, str)
        assert geodataframe.shape == (4, 12)
        # The builtin any() over a DataFrame iterates its column labels, which
        # are always truthy; reduce over the boolean values instead.
        assert geodataframe.isna().any(axis=None)
        assert (geodataframe.geom_type == "Point").all()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need to bump this to the stable `pyogrio=0.4.0` version once that is released. Edit: see PR #21.