Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: support URI schemes (zip://, s3://) by converting to vsi paths #43

Merged
merged 7 commits into from
Feb 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions docs/source/introduction.md
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,52 @@ extension of the filename:
`.gpkg`: `GPKG`
`.json`: `GeoJSON`

## Reading from compressed files / archives

GDAL supports reading directly from an archive, such as a zipped folder, without
the need to manually unpack the archive first. This is especially useful when
the dataset, such as an ESRI Shapefile, consists of multiple files and is
distributed as a zipped archive.

GDAL handles this through the concept of [virtual file systems](https://gdal.org/user/virtual_file_systems.html)
using a `/vsiPREFIX/..` path (for example `/vsizip/..`). For convenience,
pyogrio also supports passing the path with the more common URI syntax
using `zip://..`:

```python
>>> read_dataframe("/vsizip/ne_10m_admin_0_countries.zip")
>>> read_dataframe("zip://ne_10m_admin_0_countries.zip")
```

## Reading from remote filesystems

GDAL supports several remote filesystems, such as S3, Google Cloud or Azure,
out of the box through the concept of virtual file systems. See
[GDAL's docs on network file systems](https://gdal.org/user/virtual_file_systems.html#network-based-file-systems)
for more details.
You can use GDAL's native `/vsi../` notation, but for convenience, pyogrio
also supports passing the path with the more common URI syntax:

```python
>>> read_dataframe("/vsis3/bucket/data.geojson")
>>> read_dataframe("s3://bucket/data.geojson")
```

It is also possible to combine multiple virtual filesystems, such as reading
a zipped folder (see section above) from a remote filesystem:

```python
>>> read_dataframe("/vsizip/vsis3/bucket/shapefile.zip")
>>> read_dataframe("zip+s3://bucket/shapefile.zip")
```

You can also read from a URL with this syntax:

```python
>>> read_dataframe("https://s3.amazonaws.com/bucket/data.geojson")
>>> read_dataframe("zip+https://s3.amazonaws.com/bucket/shapefile.zip")
```

## Configuration options

It is possible to set
Expand Down
8 changes: 6 additions & 2 deletions pyogrio/core.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from pyogrio._env import GDALEnv
from pyogrio.util import vsi_path


with GDALEnv():
from pyogrio._ogr import (
Expand Down Expand Up @@ -99,9 +101,10 @@ def read_bounds(
fids are global IDs read from the FID field of the dataset
bounds are ndarray of shape(4, n) containing ``xmin``, ``ymin``, ``xmax``, ``ymax``
"""
path = vsi_path(str(path))

return ogr_read_bounds(
str(path),
path,
layer=layer,
skip_features=skip_features,
max_features=max_features or 0,
Expand Down Expand Up @@ -140,7 +143,8 @@ def read_info(path, layer=None, encoding=None):
"features": <feature count>
}
"""
return ogr_read_info(str(path), layer=layer, encoding=encoding)
path = vsi_path(str(path))
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
return ogr_read_info(path, layer=layer, encoding=encoding)


def set_gdal_config_options(options):
Expand Down
8 changes: 1 addition & 7 deletions pyogrio/geopandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def read_dataframe(
Parameters
----------
path : str
path to file
A dataset path or URI.
layer : int or str, optional (default: first layer)
If an integer is provided, it corresponds to the index of the layer
with the data source. If a string is provided, it must match the name
Expand Down Expand Up @@ -85,12 +85,6 @@ def read_dataframe(
except ImportError:
raise ImportError("geopandas is required to use pyogrio.read_dataframe()")

path = str(path)

if not "://" in path:
if not "/vsi" in path.lower() and not os.path.exists(path):
raise ValueError(f"'{path}' does not exist")

meta, index, geometry, field_data = read(
path,
layer=layer,
Expand Down
10 changes: 8 additions & 2 deletions pyogrio/raw.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os

from pyogrio._env import GDALEnv
from pyogrio.util import vsi_path

with GDALEnv():
from pyogrio._io import ogr_read, ogr_read_info, ogr_list_layers, ogr_write
Expand Down Expand Up @@ -37,7 +38,7 @@ def read(
Parameters
----------
path : pathlib.Path or str
data source path
A dataset path or URI.
layer : int or str, optional (default: first layer)
If an integer is provided, it corresponds to the index of the layer
with the data source. If a string is provided, it must match the name
Expand Down Expand Up @@ -97,9 +98,14 @@ def read(
"geometry": "<geometry type>"
}
"""
path = vsi_path(str(path))

if not "://" in path:
if not "/vsi" in path.lower() and not os.path.exists(path):
raise ValueError(f"'{path}' does not exist")

return ogr_read(
str(path),
path,
layer=layer,
encoding=encoding,
columns=columns,
Expand Down
13 changes: 9 additions & 4 deletions pyogrio/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pyogrio import __gdal_version_string__, __version__, list_drivers


data_dir = Path(__file__).parent.resolve() / "fixtures"
_data_dir = Path(__file__).parent.resolve() / "fixtures"


def pytest_report_header(config):
Expand All @@ -18,9 +18,14 @@ def pytest_report_header(config):
)


@pytest.fixture
def data_dir():
return _data_dir


@pytest.fixture(scope="session")
def naturalearth_lowres():
return data_dir / Path("naturalearth_lowres/naturalearth_lowres.shp")
return _data_dir / Path("naturalearth_lowres/naturalearth_lowres.shp")


@pytest.fixture
Expand All @@ -34,10 +39,10 @@ def naturalearth_lowres_vsi(tmp_path, naturalearth_lowres):
filename = f"{naturalearth_lowres.stem}.{ext}"
out.write(naturalearth_lowres.parent / filename, filename)

return f"/vsizip/{path}/{naturalearth_lowres.name}"
return path, f"/vsizip/{path}/{naturalearth_lowres.name}"


@pytest.fixture(scope="session")
def test_fgdb_vsi():
return f"/vsizip/{data_dir}/test_fgdb.gdb.zip"
return f"/vsizip/{_data_dir}/test_fgdb.gdb.zip"

2 changes: 1 addition & 1 deletion pyogrio/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def test_list_layers(naturalearth_lowres, naturalearth_lowres_vsi, test_fgdb_vsi
)

assert array_equal(
list_layers(naturalearth_lowres_vsi), [["naturalearth_lowres", "Polygon"]]
list_layers(naturalearth_lowres_vsi[1]), [["naturalearth_lowres", "Polygon"]]
)

# Measured 3D is downgraded to 2.5D during read
Expand Down
2 changes: 1 addition & 1 deletion pyogrio/tests/test_geopandas_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def test_read_dataframe(naturalearth_lowres):


def test_read_dataframe_vsi(naturalearth_lowres_vsi):
df = read_dataframe(naturalearth_lowres_vsi)
df = read_dataframe(naturalearth_lowres_vsi[1])
assert len(df) == 177


Expand Down
147 changes: 147 additions & 0 deletions pyogrio/tests/test_path.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import os
import contextlib

import pytest

import pyogrio
import pyogrio.raw
from pyogrio.util import vsi_path


@contextlib.contextmanager
def change_cwd(path):
    """Temporarily switch the process working directory to ``path``.

    Parameters
    ----------
    path : str or path-like
        Directory to change into; it is converted with ``str()`` before
        being passed to ``os.chdir``.

    Yields
    ------
    None
        Control is handed to the ``with`` body while ``path`` is the
        current working directory.

    Notes
    -----
    The working directory that was active on entry is restored on exit,
    including when the ``with`` body raises.
    """
    curdir = os.getcwd()
    os.chdir(str(path))
    try:
        yield
    finally:
        # always go back, so a failing body does not leak the cwd change
        os.chdir(curdir)


@pytest.mark.parametrize(
    ("path", "expected"),
    [
        # --- local file paths: expected to come back unchanged ---
        ("data.gpkg", "data.gpkg"),
        ("/home/user/data.gpkg", "/home/user/data.gpkg"),
        (r"C:\User\Documents\data.gpkg", r"C:\User\Documents\data.gpkg"),
        ("file:///home/user/data.gpkg", "/home/user/data.gpkg"),
        # --- cloud storage URIs ---
        ("s3://testing/data.gpkg", "/vsis3/testing/data.gpkg"),
        ("gs://testing/data.gpkg", "/vsigs/testing/data.gpkg"),
        ("az://testing/data.gpkg", "/vsiaz/testing/data.gpkg"),
        ("adl://testing/data.gpkg", "/vsiadls/testing/data.gpkg"),
        ("adls://testing/data.gpkg", "/vsiadls/testing/data.gpkg"),
        ("hdfs://testing/data.gpkg", "/vsihdfs/testing/data.gpkg"),
        ("webhdfs://testing/data.gpkg", "/vsiwebhdfs/testing/data.gpkg"),
        # --- archive URIs, optionally with a "!member" suffix ---
        ("zip://data.zip", "/vsizip/data.zip"),
        ("tar://data.tar", "/vsitar/data.tar"),
        ("gzip://data.gz", "/vsigzip/data.gz"),
        ("tar://./my.tar!my.geojson", "/vsitar/./my.tar/my.geojson"),
        (
            "zip://home/data/shapefile.zip!layer.shp",
            "/vsizip/home/data/shapefile.zip/layer.shp",
        ),
        # --- combined "archive+remote" schemes ---
        ("zip+s3://testing/shapefile.zip", "/vsizip/vsis3/testing/shapefile.zip"),
        (
            "zip+https://s3.amazonaws.com/testing/shapefile.zip",
            "/vsizip/vsicurl/https://s3.amazonaws.com/testing/shapefile.zip",
        ),
    ],
)
def test_vsi_path(path, expected):
    """Check that ``vsi_path`` maps each input path/URI to the expected string."""
    converted = vsi_path(path)
    assert converted == expected


def test_vsi_path_unknown():
    """A URI with an unrecognized scheme is returned by ``vsi_path`` unchanged."""
    converted = vsi_path("s4://test/data.geojson")
    assert converted == "s4://test/data.geojson"


def test_vsi_handling_read_functions(naturalearth_lowres_vsi):
    """Every read entry point must accept a ``zip://`` URI.

    An entry point lacking the path handling would fail on a ``zip://``
    path, so each one is exercised with the same URI.
    """
    archive, _ = naturalearth_lowres_vsi
    zip_uri = "zip://" + str(archive)

    raw_result = pyogrio.raw.read(zip_uri)
    assert len(raw_result[2]) == 177

    info = pyogrio.read_info(zip_uri)
    assert info["features"] == 177

    bounds_result = pyogrio.read_bounds(zip_uri)
    assert len(bounds_result[0]) == 177

    frame = pyogrio.read_dataframe(zip_uri)
    assert len(frame) == 177


def test_path_absolute(data_dir):
    """Read the fixture shapefile through an absolute path.

    The same file is read once as a ``pathlib`` path and once as a plain
    string; both reads must return all 177 features.
    """
    # pathlib path
    path = data_dir / "naturalearth_lowres/naturalearth_lowres.shp"
    df = pyogrio.read_dataframe(path)
    # the comparison must be asserted, otherwise its result is discarded
    assert len(df) == 177

    # str path
    df = pyogrio.read_dataframe(str(path))
    assert len(df) == 177


def test_path_relative(data_dir):
    """Read the fixture shapefile through a path relative to the cwd."""
    with change_cwd(data_dir):
        df = pyogrio.read_dataframe("naturalearth_lowres/naturalearth_lowres.shp")
    # the comparison must be asserted, otherwise its result is discarded
    assert len(df) == 177


def test_uri_local_file(data_dir):
    """Read the fixture shapefile through a ``file://`` URI."""
    uri = "file://" + str(data_dir / "naturalearth_lowres/naturalearth_lowres.shp")
    df = pyogrio.read_dataframe(uri)
    # the comparison must be asserted, otherwise its result is discarded
    assert len(df) == 177


def test_zip_path(naturalearth_lowres_vsi):
    """Read a zipped shapefile via ``zip://`` (absolute, relative) and ``/vsizip/``."""
    archive, vsi_archive = naturalearth_lowres_vsi

    # absolute zip:// URI
    absolute_uri = "zip://" + str(archive)
    from_absolute = pyogrio.read_dataframe(absolute_uri)
    assert len(from_absolute) == 177

    # relative zip:// URI, read from inside the archive's directory
    with change_cwd(archive.parent):
        from_relative = pyogrio.read_dataframe("zip://" + archive.name)
    assert len(from_relative) == 177

    # absolute /vsizip/ path as supplied by the fixture
    from_vsi = pyogrio.read_dataframe(vsi_archive)
    assert len(from_vsi) == 177


@pytest.mark.network
def test_url():
df = pyogrio.read_dataframe(
"https://raw.githubusercontent.com/geopandas/pyogrio/main/pyogrio/tests/fixtures/naturalearth_lowres/naturalearth_lowres.shp"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we OK with having tests that require network?
I added a marker for now, so at least it is easy to skip them (but by default this marker doesn't do anything)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think it is fine. It is surely useful and we have similar one in geopandas as well.

)
assert len(df) == 177


@pytest.mark.network
def test_url_with_zip():
    """Read a remote zip archive through a ``zip+https://`` URI (needs network)."""
    url = "zip+https://s3.amazonaws.com/fiona-testing/coutwildrnp.zip"
    frame = pyogrio.read_dataframe(url)
    assert len(frame) == 67


@pytest.fixture
def aws_env_setup(monkeypatch):
    """Set ``AWS_NO_SIGN_REQUEST=YES`` in the environment for the test."""
    monkeypatch.setenv(name="AWS_NO_SIGN_REQUEST", value="YES")


@pytest.mark.network
def test_uri_s3(aws_env_setup):
    """Read a zip archive from S3 through a ``zip+s3://`` URI (needs network)."""
    uri = "zip+s3://fiona-testing/coutwildrnp.zip"
    frame = pyogrio.read_dataframe(uri)
    assert len(frame) == 67
1 change: 1 addition & 0 deletions pyogrio/tests/test_raw_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def test_read(naturalearth_lowres):


def test_vsi_read_layers(naturalearth_lowres_vsi):
_, naturalearth_lowres_vsi = naturalearth_lowres_vsi
assert array_equal(
list_layers(naturalearth_lowres_vsi), [["naturalearth_lowres", "Polygon"]]
)
Expand Down
Loading