-
Notifications
You must be signed in to change notification settings - Fork 23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
FIX: correctly use GDAL auto-decoding of shapefiles when encoding is set #384
Changes from 9 commits
2bfc862
2b38f94
e738f7b
3362dae
b991038
ecb56f6
994be50
901320f
4313fca
5c4657d
ef602d5
0e7e932
146967d
2bdf4dc
b8cc902
e7db258
eaecb18
a623501
d615fd7
d2b43f3
815620a
6eca265
34bacc3
e088311
58d1d3f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -134,6 +134,34 @@ cdef char** dict_to_options(object values): | |
return options | ||
|
||
|
||
cdef const char* override_threadlocal_config_option(const char* key, const char* value): | ||
"""Set the CPLSetThreadLocalConfigOption for key=value | ||
|
||
Parameters | ||
---------- | ||
key : const char* | ||
python string must decoded to UTF-8 before calling this | ||
value : const char* | ||
python string must decoded to UTF-8 before calling this | ||
|
||
Returns | ||
------- | ||
const char* | ||
value previously set for key, so that it can be later restored. Caller | ||
is responsible for freeing this via CPLFree() if not NULL. | ||
""" | ||
cdef const char *prev_value = CPLGetThreadLocalConfigOption(key, NULL) | ||
if prev_value != NULL: | ||
# strings returned from config options may be replaced via | ||
# CPLSetConfigOption() below; GDAL instructs us to save a copy | ||
# in a new string | ||
prev_value = CPLStrdup(prev_value) | ||
|
||
CPLSetThreadLocalConfigOption(key, value) | ||
|
||
return prev_value | ||
|
||
|
||
cdef void* ogr_open(const char* path_c, int mode, char** options) except NULL: | ||
cdef void* ogr_dataset = NULL | ||
|
||
|
@@ -486,13 +514,27 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer): | |
# read without recoding. Hence, it is up to you to supply the data in the | ||
# appropriate encoding. More info: | ||
# https://gdal.org/development/rfc/rfc23_ogr_unicode.html#oftstring-oftstringlist-fields | ||
# NOTE: for shapefiles, this always returns False for the layer returned | ||
# when executing SQL, even when it supports UTF-8 (patched below); | ||
# this may be fixed by https://github.com/OSGeo/gdal/pull/9649 (GDAL >=3.9.0?) | ||
return "UTF-8" | ||
|
||
driver = get_driver(ogr_dataset) | ||
if driver == "ESRI Shapefile": | ||
# Typically, OGR_L_TestCapability returns True for OLCStringsAsUTF8 for ESRI | ||
# Shapefile so this won't be reached. However, for old versions of GDAL and/or | ||
# if libraries are missing, this fallback could be needed. | ||
# OGR_L_TestCapability returns True for OLCStringsAsUTF8 (above) for | ||
# shapefiles when a .cpg file is present with a valid encoding, or GDAL | ||
# auto-detects the encoding from the code page of the .dbf file, or | ||
# SHAPE_ENCODING config option is set, or ENCODING layer creation option | ||
# is specified (shapefiles only). Otherwise, we can only assume that | ||
# shapefiles are in their default encoding of ISO-8859-1 (which may be | ||
# incorrect and must be overridden by user-provided encoding) | ||
|
||
# Always use the first layer to test capabilities until detection for | ||
# SQL results from shapefiles are fixed (above) | ||
# This block should only be used for unfixed versions of GDAL (<3.9.0?) | ||
if OGR_L_TestCapability(GDALDatasetGetLayer(ogr_dataset, 0), OLCStringsAsUTF8): | ||
return "UTF-8" | ||
|
||
return "ISO-8859-1" | ||
|
||
if driver == "OSM": | ||
|
@@ -1111,6 +1153,8 @@ def ogr_read( | |
cdef OGRLayerH ogr_layer = NULL | ||
cdef int feature_count = 0 | ||
cdef double xmin, ymin, xmax, ymax | ||
cdef const char *prev_shape_encoding = NULL | ||
cdef bint override_shape_encoding = False | ||
|
||
path_b = path.encode('utf-8') | ||
path_c = path_b | ||
|
@@ -1142,9 +1186,26 @@ def ogr_read( | |
raise ValueError("'max_features' must be >= 0") | ||
|
||
try: | ||
if encoding: | ||
override_shape_encoding = True | ||
|
||
# for shapefiles, SHAPE_ENCODING must be set before opening the file | ||
# to prevent automatic decoding to UTF-8 by GDAL, so we save previous | ||
# SHAPE_ENCODING so that it can be restored later | ||
# (we do this for all data sources where encoding is set because | ||
# we don't know the driver until after it is opened, which is too late) | ||
encoding_b = encoding.encode("UTF-8") | ||
encoding_c = encoding_b | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Most likely there is a good reason, but I wonder... why isn't this moved inside the function as well? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I originally wanted to keep There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Hmm, not entirely sure that is correct, cython docs say those are |
||
prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) | ||
|
||
dataset_options = dict_to_options(dataset_kwargs) | ||
ogr_dataset = ogr_open(path_c, 0, dataset_options) | ||
|
||
if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": | ||
# Because SHAPE_ENCODING is set above, GDAL will automatically decode | ||
# to UTF-8 | ||
encoding = "UTF-8" | ||
|
||
brendan-ward marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if sql is None: | ||
if layer is None: | ||
layer = get_default_layer(ogr_dataset) | ||
|
@@ -1156,7 +1217,14 @@ def ogr_read( | |
|
||
# Encoding is derived from the user, from the dataset capabilities / type, | ||
# or from the system locale | ||
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) | ||
if encoding: | ||
if get_driver(ogr_dataset) == "ESRI Shapefile": | ||
# Because SHAPE_ENCODING is set above, GDAL will automatically | ||
# decode shapefiles to UTF-8; ignore any encoding set by user | ||
encoding = "UTF-8" | ||
|
||
else: | ||
encoding = detect_encoding(ogr_dataset, ogr_layer) | ||
|
||
fields = get_fields(ogr_layer, encoding) | ||
|
||
|
@@ -1249,6 +1317,13 @@ def ogr_read( | |
GDALClose(ogr_dataset) | ||
ogr_dataset = NULL | ||
|
||
# reset SHAPE_ENCODING config parameter if temporarily set above | ||
if override_shape_encoding: | ||
CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) | ||
|
||
if prev_shape_encoding != NULL: | ||
CPLFree(<void*>prev_shape_encoding) | ||
|
||
return ( | ||
meta, | ||
fid_data, | ||
|
@@ -1285,6 +1360,8 @@ def ogr_open_arrow( | |
cdef char **fields_c = NULL | ||
cdef const char *field_c = NULL | ||
cdef char **options = NULL | ||
cdef const char *prev_shape_encoding = NULL | ||
cdef bint override_shape_encoding = False | ||
cdef ArrowArrayStream stream | ||
cdef ArrowSchema schema | ||
|
||
|
@@ -1328,6 +1405,18 @@ def ogr_open_arrow( | |
|
||
reader = None | ||
try: | ||
if encoding: | ||
override_shape_encoding = True | ||
|
||
# for shapefiles, SHAPE_ENCODING must be set before opening the file | ||
# to prevent automatic decoding to UTF-8 by GDAL, so we save previous | ||
# SHAPE_ENCODING so that it can be restored later | ||
# (we do this for all data sources where encoding is set because | ||
# we don't know the driver until after it is opened, which is too late) | ||
encoding_b = encoding.encode("UTF-8") | ||
encoding_c = encoding_b | ||
prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) | ||
|
||
dataset_options = dict_to_options(dataset_kwargs) | ||
ogr_dataset = ogr_open(path_c, 0, dataset_options) | ||
|
||
|
@@ -1342,7 +1431,14 @@ def ogr_open_arrow( | |
|
||
# Encoding is derived from the user, from the dataset capabilities / type, | ||
# or from the system locale | ||
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) | ||
if encoding: | ||
if get_driver(ogr_dataset) == "ESRI Shapefile": | ||
# Because SHAPE_ENCODING is set above, GDAL will automatically | ||
# decode shapefiles to UTF-8; ignore any encoding set by user | ||
encoding = "UTF-8" | ||
|
||
else: | ||
encoding = detect_encoding(ogr_dataset, ogr_layer) | ||
|
||
fields = get_fields(ogr_layer, encoding, use_arrow=True) | ||
|
||
|
@@ -1450,6 +1546,13 @@ def ogr_open_arrow( | |
GDALClose(ogr_dataset) | ||
ogr_dataset = NULL | ||
|
||
# reset SHAPE_ENCODING config parameter if temporarily set above | ||
if override_shape_encoding: | ||
CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) | ||
|
||
if prev_shape_encoding != NULL: | ||
CPLFree(<void*>prev_shape_encoding) | ||
|
||
def ogr_read_bounds( | ||
str path, | ||
object layer=None, | ||
|
@@ -1518,12 +1621,25 @@ def ogr_read_info( | |
cdef char **dataset_options = NULL | ||
cdef OGRDataSourceH ogr_dataset = NULL | ||
cdef OGRLayerH ogr_layer = NULL | ||
cdef const char *prev_shape_encoding = NULL | ||
cdef bint override_shape_encoding = False | ||
|
||
path_b = path.encode('utf-8') | ||
path_c = path_b | ||
|
||
|
||
try: | ||
if encoding: | ||
override_shape_encoding = True | ||
|
||
# for shapefiles, SHAPE_ENCODING must be set before opening the file | ||
# to prevent automatic decoding to UTF-8 by GDAL, so we save previous | ||
# SHAPE_ENCODING so that it can be restored later | ||
# (we do this for all data sources where encoding is set because | ||
# we don't know the driver until after it is opened, which is too late) | ||
encoding_b = encoding.encode("UTF-8") | ||
encoding_c = encoding_b | ||
prev_shape_encoding = override_threadlocal_config_option("SHAPE_ENCODING", encoding_c) | ||
|
||
dataset_options = dict_to_options(dataset_kwargs) | ||
ogr_dataset = ogr_open(path_c, 0, dataset_options) | ||
|
||
|
@@ -1533,7 +1649,13 @@ def ogr_read_info( | |
|
||
# Encoding is derived from the user, from the dataset capabilities / type, | ||
# or from the system locale | ||
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) | ||
if encoding and get_driver(ogr_dataset) == "ESRI Shapefile": | ||
# Because SHAPE_ENCODING is set above, GDAL will automatically | ||
# decode shapefiles to UTF-8; ignore any encoding set by user | ||
encoding = "UTF-8" | ||
|
||
else: | ||
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) | ||
|
||
fields = get_fields(ogr_layer, encoding) | ||
|
||
|
@@ -1566,6 +1688,13 @@ def ogr_read_info( | |
GDALClose(ogr_dataset) | ||
ogr_dataset = NULL | ||
|
||
# reset SHAPE_ENCODING config parameter if temporarily set above | ||
if override_shape_encoding: | ||
CPLSetThreadLocalConfigOption("SHAPE_ENCODING", prev_shape_encoding) | ||
|
||
if prev_shape_encoding != NULL: | ||
CPLFree(<void*>prev_shape_encoding) | ||
|
||
return meta | ||
|
||
|
||
|
@@ -1600,7 +1729,7 @@ cdef str get_default_layer(OGRDataSourceH ogr_dataset): | |
------- | ||
str | ||
the name of the default layer to be read. | ||
|
||
""" | ||
layers = get_layer_names(ogr_dataset) | ||
first_layer_name = layers[0][0] | ||
|
@@ -1632,7 +1761,7 @@ cdef get_layer_names(OGRDataSourceH ogr_dataset): | |
------- | ||
ndarray(n) | ||
array of layer names | ||
|
||
""" | ||
cdef OGRLayerH ogr_layer = NULL | ||
|
||
|
@@ -1862,8 +1991,9 @@ cdef create_ogr_dataset_layer( | |
# Setup layer creation options | ||
|
||
if driver == 'ESRI Shapefile': | ||
# Fiona only sets encoding for shapefiles; other drivers do not support | ||
# encoding as an option. | ||
# ENCODING option must be set for shapefiles to properly write *.cpg | ||
# file containing the encoding; this is not a supported option for | ||
# other drivers | ||
if encoding is None: | ||
encoding = "UTF-8" | ||
encoding_b = encoding.upper().encode('UTF-8') | ||
|
@@ -1988,10 +2118,18 @@ def ogr_write( | |
&ogr_dataset, &ogr_layer, | ||
) | ||
|
||
# Now the dataset and layer have been created, we can properly determine the | ||
# encoding. It is derived from the user, from the dataset capabilities / type, | ||
# or from the system locale | ||
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) | ||
|
||
if driver == 'ESRI Shapefile': | ||
# force encoding for remaining operations to be in UTF-8 (even if user | ||
# provides an encoding) because GDAL will automatically convert those to | ||
# the target encoding because ENCODING is set as a layer creation option | ||
encoding = "UTF-8" | ||
|
||
else: | ||
# Now the dataset and layer have been created, we can properly determine the | ||
# encoding. It is derived from the user, from the dataset capabilities / type, | ||
# or from the system locale | ||
encoding = encoding or detect_encoding(ogr_dataset, ogr_layer) | ||
|
||
### Create the fields | ||
field_types = None | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What does this last sentence mean exactly? Is that for writing?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, that was for writing. Will make more explicit.