Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Calculate feature count ourselves if GDAL returns unknown count (-1), better support OSM data #271

Merged
merged 9 commits into from
Aug 22, 2023
9 changes: 8 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
# CHANGELOG

## 0.6.1 (???)
## 0.7.0 (???)

### Improvements

- Calculate feature count by iterating over features if GDAL returns an
unknown count for a data layer (e.g., OSM driver); this may have signficant
performance impacts for some data sources that would otherwise return an
unknown count (count is used in `read_info`, `read`, `read_dataframe`) (#271).

### Bug fixes

Expand Down
72 changes: 66 additions & 6 deletions pyogrio/_io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,50 @@ cdef get_driver(OGRDataSourceH ogr_dataset):
return driver


cdef get_feature_count(OGRLayerH ogr_layer):
"""Get the feature count of a layer.

If GDAL returns an unknown count (-1), this iterates over every feature
to calculate the count.

Parameters
----------
ogr_layer : pointer to open OGR layer

Returns
-------
int
count of features
"""

cdef int feature_count = OGR_L_GetFeatureCount(ogr_layer, 1)

# if GDAL refuses to give us the feature count, we have to loop over all
# features ourselves and get the count. This can happen for some drivers
# (e.g., OSM) or if a where clause is invalid but not rejected as error
if feature_count == -1:
# make sure layer is read from beginning
OGR_L_ResetReading(ogr_layer)

feature_count = 0
while True:
try:
exc_wrap_pointer(OGR_L_GetNextFeature(ogr_layer))
feature_count +=1

except NullPointerError:
# No more rows available, so stop reading
break

# driver may raise other errors, e.g., for OSM if node ids are not
# increasing, the default config option OSM_USE_CUSTOM_INDEXING=YES
# causes errors iterating over features
except CPLE_BaseError as exc:
raise DataLayerError(f"Could not iterate over features: {str(exc)}")

return feature_count


cdef set_metadata(GDALMajorObjectH obj, object metadata):
"""Set metadata on a dataset or layer

Expand Down Expand Up @@ -486,11 +530,28 @@ cdef apply_where_filter(OGRLayerH ogr_layer, str where):
if err != OGRERR_NONE:
try:
exc_check()
name = OGR_L_GetName(ogr_layer)
except CPLE_BaseError as exc:
raise ValueError(str(exc))

raise ValueError(f"Invalid SQL query for layer '{name}': '{where}'")
raise ValueError(f"Invalid SQL query for layer '{OGR_L_GetName(ogr_layer)}': '{where}'")

# attempt to detect invalid SQL query for GPKG by checking count and first
theroggy marked this conversation as resolved.
Show resolved Hide resolved
# feature; a count of -1 may signal invalid SQL or be because layer does
# not directly support getting feature count (e.g., OSM)
if OGR_L_GetFeatureCount(ogr_layer, 1) == -1:
OGR_L_ResetReading(ogr_layer)
try:
exc_wrap_pointer(OGR_L_GetNextFeature(ogr_layer))

except NullPointerError:
pass

except CPLE_BaseError as exc:
if 'failed to prepare SQL' in str(exc):
raise ValueError(f"Invalid SQL query for layer '{OGR_L_GetName(ogr_layer)}': '{where}'") from None

else:
raise DataLayerError(f"Could not iterate over features: {str(exc)}") from None


cdef apply_spatial_filter(OGRLayerH ogr_layer, bbox):
Expand Down Expand Up @@ -526,11 +587,10 @@ cdef validate_feature_range(OGRLayerH ogr_layer, int skip_features=0, int max_fe
skip_features : number of features to skip from beginning of available range
max_features : maximum number of features to read from available range
"""
feature_count = OGR_L_GetFeatureCount(ogr_layer, 1)
feature_count = get_feature_count(ogr_layer)
num_features = max_features

if feature_count <= 0:
# the count comes back as -1 if the where clause above is invalid but not rejected as error
if feature_count == 0:
name = OGR_L_GetName(ogr_layer)
warnings.warn(f"Layer '{name}' does not have any features to read")
return 0, 0
Expand Down Expand Up @@ -1327,7 +1387,7 @@ def ogr_read_info(
'fields': fields[:,2], # return only names
'dtypes': fields[:,3],
'geometry_type': get_geometry_type(ogr_layer),
'features': OGR_L_GetFeatureCount(ogr_layer, 1),
'features': get_feature_count(ogr_layer),
'driver': get_driver(ogr_dataset),
"capabilities": {
"random_read": OGR_L_TestCapability(ogr_layer, OLCRandomRead),
Expand Down
4 changes: 4 additions & 0 deletions pyogrio/tests/fixtures/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,7 @@ This was extracted from https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrogra
```bash
ogr2ogr test_mixed_surface.gpkg NHDPLUS_H_0308_HU4_GDB.gdb NHDWaterbody -where '"NHDPlusID" = 15000300070477' -select "NHDPlusID"
```

### OSM PBF test

This was downloaded from https://github.com/openstreetmap/OSM-binary/blob/master/resources/sample.pbf
Binary file added pyogrio/tests/fixtures/sample.osm.pbf
Binary file not shown.
18 changes: 17 additions & 1 deletion pyogrio/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
get_gdal_config_option,
get_gdal_data_path,
)
from pyogrio.errors import DataSourceError
from pyogrio.errors import DataSourceError, DataLayerError

from pyogrio._env import GDALEnv

Expand Down Expand Up @@ -268,6 +268,22 @@ def test_read_info_invalid_dataset_kwargs(naturalearth_lowres):
read_info(naturalearth_lowres, INVALID="YES")


def test_read_info_force_feature_count_exception(data_dir):
with pytest.raises(DataLayerError, match="Could not iterate over features"):
read_info(data_dir / "sample.osm.pbf")


def test_read_info_force_feature_count(data_dir):
# the sample OSM file has non-increasing node IDs which causes the default
# custom indexing to raise an exception iterating over features
set_gdal_config_options({"OSM_USE_CUSTOM_INDEXING": False})
meta = read_info(data_dir / "sample.osm.pbf")
assert meta["features"] == 8

# reset the config option
set_gdal_config_options({"OSM_USE_CUSTOM_INDEXING": None})


@pytest.mark.parametrize(
"name,value,expected",
[
Expand Down
Loading