Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: fix encoding issues on windows for some formats #361

Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
37f5964
ENH: improve handling of encoding on windows
theroggy Feb 23, 2024
e6d4169
Add test to check encoding used to write csv
theroggy Feb 23, 2024
a5b6891
Fix writing string data in correct encoding
theroggy Feb 23, 2024
18e65f7
Encoding detection when writing based on driver
theroggy Feb 23, 2024
b6b3f6a
encode string values to the detected encoding
theroggy Feb 23, 2024
698ad27
Rollback unneeded change
theroggy Feb 23, 2024
28b30d9
replace filecmp with manual check
theroggy Feb 23, 2024
b55a4cd
encoding can only be determined after creating the output layer
theroggy Feb 23, 2024
c22d6ad
Fix xlsx encoding
theroggy Feb 24, 2024
fcfdd31
Always encode field names in UTF8
theroggy Feb 24, 2024
3ced6e7
Add GeoJSONSeq to be utf8 for old gdal versions
theroggy Feb 24, 2024
4cc390d
Update CHANGES.md
theroggy Feb 24, 2024
4f6fe84
Update CHANGES.md
theroggy Feb 24, 2024
70c71c1
Try disabling the XLSX utf8 hardcoding use as it should be redundant
theroggy Feb 24, 2024
4814c00
Add logging regarding locale.getpreferredencoding
theroggy Feb 24, 2024
9d8680d
Move encoding detection to where the layer has already been further d…
theroggy Feb 24, 2024
d0fcd0b
Move detection to after where transaction is started
theroggy Feb 24, 2024
b0911b9
Rollback changes to debug XLSX issue in gdal
theroggy Feb 24, 2024
2e7f138
Add column name with special characters in csv tests
theroggy Feb 24, 2024
9723475
Update test_geopandas_io.py
theroggy Feb 24, 2024
78a30b4
Specify different encodings in csv tests
theroggy Feb 26, 2024
85cbbe8
Merge remote-tracking branch 'upstream/main' into ENH-improve-handlin…
theroggy Mar 1, 2024
e7141c5
Set default encoding for shapefile to UTF-8 for all platforms.
theroggy Mar 1, 2024
5c3004a
Merge remote-tracking branch 'upstream/main' into ENH-improve-handlin…
theroggy Mar 5, 2024
b3d31d8
Improve changelog entry for PR 366 (arrow metadata)
theroggy Mar 5, 2024
ea061e0
Add UTF-8 shapefiles to changelog
theroggy Mar 5, 2024
9255602
Centralize getpreferredencoding + improve inline doc
theroggy Mar 5, 2024
f836bd2
Add check that shp is written in UTF-8 by default on all platforms
theroggy Mar 5, 2024
350fe0d
Simplify detect_encoding again + improve doc
theroggy Mar 5, 2024
ba9b808
Add encoding test for shp writing
theroggy Mar 5, 2024
1cc34d5
Add more documentation
theroggy Mar 5, 2024
23ec6f7
Update _io.pyx
theroggy Mar 5, 2024
c6a9625
Merge branch 'main' into ENH-improve-handling-of-encoding-on-windows
brendan-ward Apr 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

- Fix error in `write_dataframe` if input has a date column and
non-consecutive index values (#325).
- Fix encoding issues on windows for some formats (e.g. ".csv") (#361).
theroggy marked this conversation as resolved.
Show resolved Hide resolved
- Raise exception in `read_arrow` or `read_dataframe(..., use_arrow=True)` if
a boolean column is detected due to error in GDAL reading boolean values (#335)
this has been fixed in GDAL >= 3.8.3.
Expand Down
52 changes: 45 additions & 7 deletions pyogrio/_io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ cdef get_metadata(GDALMajorObjectH obj):


cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
"""Attempt to detect the encoding of the layer.
"""Attempt to detect the encoding of the dataset or layer.
If it supports UTF-8, use that.
If it is a shapefile, it must otherwise be ISO-8859-1.

Expand All @@ -480,6 +480,22 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
return 'UTF-8'

driver = get_driver(ogr_dataset)
return get_encoding_for_driver(driver)


cdef get_encoding_for_driver(str driver):
"""Get the typical encoding for the driver, or None.

Parameters
----------
driver : str
driver to get the encoding for

Returns
-------
str or None
the typical encoding if this is applicable for the driver, or None
"""
if driver == 'ESRI Shapefile':
return 'ISO-8859-1'
theroggy marked this conversation as resolved.
Show resolved Hide resolved

Expand All @@ -488,6 +504,16 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
# per https://help.openstreetmap.org/questions/2172/what-encoding-does-openstreetmap-use
return "UTF-8"

if driver in ("XLSX", "ODS"):
# TestCapability for OLCStringsAsUTF8 for XLSX and ODS was False for new files
# being created for GDAL < 3.8.5. Once these versions of GDAL are no longer
# supported, this can be removed.
return "UTF-8"

if driver == "GeoJSONSeq":
# In old gdal versions, OLCStringsAsUTF8 wasn't advertised yet.
theroggy marked this conversation as resolved.
Show resolved Hide resolved
return "UTF-8"

return None


Expand Down Expand Up @@ -1700,6 +1726,12 @@ cdef create_ogr_dataset_layer(
the caller still needs to set up the layer definition, i.e. create the
fields).

Parameters
----------
encoding : str
Only used if `driver` is "ESRI Shapefile". If not None, it overrules the default
shapefile encoding.

Returns
-------
bool :
Expand Down Expand Up @@ -1791,6 +1823,8 @@ cdef create_ogr_dataset_layer(
if driver == 'ESRI Shapefile':
# Fiona only sets encoding for shapefiles; other drivers do not support
# encoding as an option.
if encoding is None:
theroggy marked this conversation as resolved.
Show resolved Hide resolved
encoding = get_encoding_for_driver(driver)
encoding_b = encoding.upper().encode('UTF-8')
encoding_c = encoding_b
layer_options = CSLSetNameValue(layer_options, "ENCODING", encoding_c)
Expand Down Expand Up @@ -1906,22 +1940,27 @@ def ogr_write(
gdal_tz_offsets = {}

### Setup up dataset and layer
if not encoding:
encoding = locale.getpreferredencoding()

layer_created = create_ogr_dataset_layer(
theroggy marked this conversation as resolved.
Show resolved Hide resolved
path, layer, driver, crs, geometry_type, encoding,
dataset_kwargs, layer_kwargs, append,
dataset_metadata, layer_metadata,
&ogr_dataset, &ogr_layer,
)

# Now the dataset and layer have been created, we can properly determine the
# encoding. It is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = (
encoding
or detect_encoding(ogr_dataset, ogr_layer)
or locale.getpreferredencoding()
theroggy marked this conversation as resolved.
Show resolved Hide resolved
)

### Create the fields
field_types = None
if num_fields > 0:
field_types = infer_field_types([field.dtype for field in field_data])

### Create the fields
if layer_created:
for i in range(num_fields):
field_type, field_subtype, width, precision = field_types[i]
Expand Down Expand Up @@ -2020,7 +2059,6 @@ def ogr_write(
OGR_F_SetFieldNull(ogr_feature, field_idx)

elif field_type == OFTString:
# TODO: encode string using approach from _get_internal_encoding which checks layer capabilities
if (
field_value is None
or (isinstance(field_value, float) and isnan(field_value))
Expand All @@ -2032,7 +2070,7 @@ def ogr_write(
field_value = str(field_value)

try:
value_b = field_value.encode("UTF-8")
value_b = field_value.encode(encoding)
theroggy marked this conversation as resolved.
Show resolved Hide resolved
OGR_F_SetFieldString(ogr_feature, field_idx, value_b)

except AttributeError:
Expand Down
44 changes: 44 additions & 0 deletions pyogrio/tests/test_geopandas_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,25 @@ def spatialite_available(path):
return False


def test_read_csv_encoding(tmp_path):
# Write csv test file. Depending on the os this will be written in a different
# encoding: for linux and macos this is utf-8, for windows it is cp1252.
theroggy marked this conversation as resolved.
Show resolved Hide resolved
csv_path = tmp_path / "test.csv"
with open(csv_path, "w") as csv:
csv.write("näme,city\n")
csv.write("Wilhelm Röntgen,Zürich\n")

# Read csv. The data should be read with the same default encoding as the csv file
# was written in, but should have been converted to utf-8 in the dataframe returned.
# Hence, the asserts below, with strings in utf-8, should be OK.
df = read_dataframe(csv_path)

assert len(df) == 1
assert df.columns.tolist() == ["näme", "city"]
assert df.city.tolist() == ["Zürich"]
assert df.näme.tolist() == ["Wilhelm Röntgen"]


def test_read_dataframe(naturalearth_lowres_all_ext):
df = read_dataframe(naturalearth_lowres_all_ext)

Expand Down Expand Up @@ -782,6 +801,31 @@ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow):
assert df.iloc[0].geometry.area > area_canada


def test_write_csv_encoding(tmp_path):
"""Test if write_dataframe uses the default encoding correctly."""
# Write csv test file. Depending on the os this will be written in a different
# encoding: for linux and macos this is utf-8, for windows it is cp1252.
csv_path = tmp_path / "testg.csv"

with open(csv_path, "w") as csv:
csv.write("näme,city\n")
csv.write("Wilhelm Röntgen,Zürich\n")

# Write csv test file with the same data using write_dataframe. It should use the
# same encoding as above.
df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
csv_pyogrio_path = tmp_path / "test_pyogrio.csv"
write_dataframe(df, csv_pyogrio_path)

# If the files written are binary identical, they were written using the same
# encoding.
with open(csv_path, "r") as csv:
csv_str = csv.read()
with open(csv_pyogrio_path, "r") as csv_pyogrio:
csv_pyogrio_str = csv_pyogrio.read()
assert csv_str == csv_pyogrio_str


@pytest.mark.parametrize("ext", ALL_EXTS)
def test_write_dataframe(tmp_path, naturalearth_lowres, ext):
input_gdf = read_dataframe(naturalearth_lowres)
Expand Down