Skip to content

Commit 19e0110

Browse files
authored
Fix non-empty meta when reading from shapefiles (#305)
1 parent d84e299 commit 19e0110

File tree

5 files changed

+99
-1
lines changed

5 files changed

+99
-1
lines changed

dask_geopandas/io/file.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def read_file(
114114
read_geometry = False
115115
meta = pyogrio.read_dataframe(
116116
path, layer=layer, columns=columns, read_geometry=read_geometry, max_features=5
117-
)
117+
).head(0)
118118

119119
# Define parts
120120
parts = []

dask_geopandas/tests/io/test_arrow.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,3 +214,15 @@ def test_index(tmp_path, naturalearth_lowres):
214214
assert "hilbert_distance" not in result.columns
215215
assert result.index.name is None
216216
assert result.index.compute()[0] == 0
217+
218+
219+
def test_read_meta_is_empty(tmp_path, naturalearth_lowres):
    """``read_feather`` on a two-file dataset must expose a zero-row ``_meta``."""
    source = geopandas.read_file(naturalearth_lowres)

    dataset_dir = tmp_path / "dataset"
    dataset_dir.mkdir()
    # Split the frame at row 100 so the dataset consists of two Feather files.
    first_chunk, second_chunk = source.iloc[:100], source.iloc[100:]
    first_chunk.to_feather(dataset_dir / "data.0.feather")
    second_chunk.to_feather(dataset_dir / "data.1.feather")

    loaded = dask_geopandas.read_feather(dataset_dir)
    assert len(loaded._meta) == 0
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import geopandas
2+
3+
import dask_geopandas
4+
5+
import pytest
6+
from geopandas.testing import assert_geodataframe_equal
7+
8+
try:
9+
import pyogrio # noqa: F401
10+
11+
PYOGRIO = True
12+
except ImportError:
13+
PYOGRIO = False
14+
15+
BACKENDS = ["arrow", "file", "parquet"]
16+
17+
18+
@pytest.fixture(params=BACKENDS)
def backend(request):
    """Parametrized fixture returning each name listed in ``BACKENDS``.

    The "file" backend is skipped when the module-level ``PYOGRIO`` flag is
    false, i.e. when importing pyogrio failed.
    """
    name = request.param
    if name == "file" and not PYOGRIO:
        pytest.skip("Unable to import pyogrio for file backend")
    return name
24+
25+
26+
def from_arrow_backend(path, tmp_path, npartitions):
    """Round-trip the file at ``path`` through a Feather dataset.

    The data is read with geopandas, split into ``npartitions`` partitions,
    and each partition is written as ``data.<i>.feather`` inside
    ``tmp_path / "dataset"``. The directory is then loaded with
    ``dask_geopandas.read_feather`` and that result is returned.
    """
    gdf = geopandas.read_file(path)
    dataset_dir = tmp_path / "dataset"
    dataset_dir.mkdir()
    partitioned = dask_geopandas.from_geopandas(gdf, npartitions=npartitions)
    for index, partition in enumerate(partitioned.partitions):
        target = dataset_dir / f"data.{index}.feather"
        partition.compute().to_feather(target)
    return dask_geopandas.read_feather(dataset_dir)
34+
35+
36+
def from_file_backend(path, tmp_path, npartitions):
    """Load ``path`` directly with ``dask_geopandas.read_file``.

    ``tmp_path`` is not used here; the parameter is accepted so this loader
    can be called with the same arguments as the other ``from_*_backend``
    helpers.
    """
    loaded = dask_geopandas.read_file(path, npartitions=npartitions)
    return loaded
38+
39+
40+
def from_parquet_backend(path, tmp_path, npartitions):
    """Round-trip the file at ``path`` through a Parquet dataset.

    The data is read with geopandas, partitioned into ``npartitions`` pieces,
    written with ``to_parquet`` to ``tmp_path / "dataset"``, and read back
    with ``dask_geopandas.read_parquet``.
    """
    gdf = geopandas.read_file(path)
    partitioned = dask_geopandas.from_geopandas(gdf, npartitions=npartitions)
    dataset_dir = tmp_path / "dataset"
    partitioned.to_parquet(dataset_dir)
    return dask_geopandas.read_parquet(dataset_dir)
47+
48+
49+
def get_from_backend(backend, data_path, tmp_path, npartitions=4):
    """Load ``data_path`` through the reader selected by ``backend``.

    Parameters
    ----------
    backend : str
        One of ``"arrow"``, ``"file"`` or ``"parquet"``.
    data_path
        Path of the source file, forwarded to the selected loader.
    tmp_path
        Scratch directory, forwarded to the selected loader.
    npartitions : int, default 4
        Number of partitions, forwarded to the selected loader.

    Returns
    -------
    The object returned by the matching ``from_*_backend`` helper.

    Raises
    ------
    ValueError
        If ``backend`` is not one of the supported names.
    """
    if backend == "arrow":
        return from_arrow_backend(data_path, tmp_path, npartitions)
    if backend == "file":
        return from_file_backend(data_path, tmp_path, npartitions)
    if backend == "parquet":
        return from_parquet_backend(data_path, tmp_path, npartitions)
    # Name the offending value so a mistyped backend is easy to spot in the
    # test output instead of surfacing as an empty ValueError.
    raise ValueError(
        f"Unknown backend {backend!r}; expected 'arrow', 'file' or 'parquet'"
    )
59+
60+
61+
def test_spatial_shuffle_integration(backend, naturalearth_lowres, tmp_path):
    """``spatial_shuffle`` must match re-indexing the data by Hilbert distance.

    Runs once per backend supplied by the ``backend`` fixture.
    """
    ddf = get_from_backend(backend, naturalearth_lowres, tmp_path)
    hilbert = ddf.hilbert_distance()
    materialized = ddf.compute()
    expected = materialized.set_index(hilbert.compute())

    shuffled = ddf.spatial_shuffle()
    # Sort because the index is shuffled
    actual_sorted = shuffled.compute().sort_index()
    assert_geodataframe_equal(actual_sorted, expected.sort_index())

dask_geopandas/tests/io/test_file.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,12 @@ def test_read_file_columns(naturalearth_lowres):
9292
assert_geoseries_equal(result.compute(), df["geometry"])
9393

9494

95+
def test_read_file_meta_is_empty(naturalearth_lowres):
    """``read_file`` must expose a zero-row ``_meta`` on its result."""
    loaded = dask_geopandas.read_file(naturalearth_lowres, npartitions=4)
    assert len(loaded._meta) == 0
99+
100+
95101
def test_read_file_layer(tmp_path):
96102
df_points = geopandas.GeoDataFrame(
97103
{

dask_geopandas/tests/io/test_parquet.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,3 +248,15 @@ def test_read_parquet_default_crs(tmp_path):
248248
result = dask_geopandas.read_parquet(filename)
249249
assert result.crs.equals(pyproj.CRS("OGC:CRS84"))
250250
assert result["other_geom"].crs.equals(pyproj.CRS("OGC:CRS84"))
251+
252+
253+
def test_read_parquet_meta_is_empty(tmp_path, naturalearth_lowres):
    """``read_parquet`` must expose a zero-row ``_meta`` after a round trip."""
    # Basic roundtrip: write a 4-partition dataset, then read it back.
    source = geopandas.read_file(naturalearth_lowres)
    partitioned = dask_geopandas.from_geopandas(source, npartitions=4)

    dataset_dir = tmp_path / "dataset"
    partitioned.to_parquet(dataset_dir)

    loaded = dask_geopandas.read_parquet(dataset_dir)
    assert len(loaded._meta) == 0

0 commit comments

Comments
 (0)