Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: fix encoding issues on windows for some formats #361

Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
37f5964
ENH: improve handling of encoding on windows
theroggy Feb 23, 2024
e6d4169
Add test to check encoding used to write csv
theroggy Feb 23, 2024
a5b6891
Fix writing string data in correct encoding
theroggy Feb 23, 2024
18e65f7
Encoding detection when writing based on driver
theroggy Feb 23, 2024
b6b3f6a
encode string values to the detected encoding
theroggy Feb 23, 2024
698ad27
Rollback unneeded change
theroggy Feb 23, 2024
28b30d9
replace filecmp with manual check
theroggy Feb 23, 2024
b55a4cd
encoding can only be determined after creating the output layer
theroggy Feb 23, 2024
c22d6ad
Fix xlsx encoding
theroggy Feb 24, 2024
fcfdd31
Always encode field names in UTF8
theroggy Feb 24, 2024
3ced6e7
Add GeoJSONSeq to be utf8 for old gdal versions
theroggy Feb 24, 2024
4cc390d
Update CHANGES.md
theroggy Feb 24, 2024
4f6fe84
Update CHANGES.md
theroggy Feb 24, 2024
70c71c1
Try disabling the XLSX utf8 hardcoding use as it should be redundant
theroggy Feb 24, 2024
4814c00
Add logging regarding locale.getpreferredencoding
theroggy Feb 24, 2024
9d8680d
Move encoding detection to where the layer has already been further d…
theroggy Feb 24, 2024
d0fcd0b
Move detection to after where transaction is started
theroggy Feb 24, 2024
b0911b9
Rollback changes to debug XLSX issue in gdal
theroggy Feb 24, 2024
2e7f138
Add column name with special characters in csv tests
theroggy Feb 24, 2024
9723475
Update test_geopandas_io.py
theroggy Feb 24, 2024
78a30b4
Specify different encodings in csv tests
theroggy Feb 26, 2024
85cbbe8
Merge remote-tracking branch 'upstream/main' into ENH-improve-handlin…
theroggy Mar 1, 2024
e7141c5
Set default encoding for shapefile to UTF-8 for all platforms.
theroggy Mar 1, 2024
5c3004a
Merge remote-tracking branch 'upstream/main' into ENH-improve-handlin…
theroggy Mar 5, 2024
b3d31d8
Improve changelog entry for PR 366 (arrow metadata)
theroggy Mar 5, 2024
ea061e0
Add UTF-8 shapefiles to changelog
theroggy Mar 5, 2024
9255602
Centralize getpreferredencoding + improve inline doc
theroggy Mar 5, 2024
f836bd2
Add check that shp is written in UTF-8 by default on all platforms
theroggy Mar 5, 2024
350fe0d
Simplify detect_encoding again + improve doc
theroggy Mar 5, 2024
ba9b808
Add encoding test for shp writing
theroggy Mar 5, 2024
1cc34d5
Add more documentation
theroggy Mar 5, 2024
23ec6f7
Update _io.pyx
theroggy Mar 5, 2024
c6a9625
Merge branch 'main' into ENH-improve-handling-of-encoding-on-windows
brendan-ward Apr 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

- Fix error in `write_dataframe` if input has a date column and
non-consecutive index values (#325).
- Fix encoding issues on windows for some formats (e.g. ".csv") (#361).
theroggy marked this conversation as resolved.
Show resolved Hide resolved
- Raise exception in `read_arrow` or `read_dataframe(..., use_arrow=True)` if
a boolean column is detected due to error in GDAL reading boolean values (#335)
this has been fixed in GDAL >= 3.8.3.
Expand Down
52 changes: 45 additions & 7 deletions pyogrio/_io.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ cdef get_metadata(GDALMajorObjectH obj):


cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
"""Attempt to detect the encoding of the layer.
"""Attempt to detect the encoding of the dataset or layer.
If it supports UTF-8, use that.
If it is a shapefile, it must otherwise be ISO-8859-1.

Expand All @@ -480,6 +480,22 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
return 'UTF-8'

driver = get_driver(ogr_dataset)
return get_encoding_for_driver(driver)


cdef get_encoding_for_driver(str driver):
"""Get the typical encoding for the driver, or None.

Parameters
----------
driver : str
driver to get the encoding for

Returns
-------
str or None
the typical encoding if this is applicable for the driver, or None
"""
if driver == 'ESRI Shapefile':
return 'ISO-8859-1'
theroggy marked this conversation as resolved.
Show resolved Hide resolved

Expand All @@ -488,6 +504,16 @@ cdef detect_encoding(OGRDataSourceH ogr_dataset, OGRLayerH ogr_layer):
# per https://help.openstreetmap.org/questions/2172/what-encoding-does-openstreetmap-use
return "UTF-8"

if driver in ("XLSX", "ODS"):
# TestCapability for OLCStringsAsUTF8 for XLSX and ODS was False for new files
# being created for GDAL < 3.8.5. Once these versions of GDAL are no longer
# supported, this can be removed.
return "UTF-8"

if driver == "GeoJSONSeq":
# In old gdal versions, OLCStringsAsUTF8 wasn't advertised yet.
theroggy marked this conversation as resolved.
Show resolved Hide resolved
return "UTF-8"

return None


Expand Down Expand Up @@ -1700,6 +1726,12 @@ cdef create_ogr_dataset_layer(
the caller still needs to set up the layer definition, i.e. create the
fields).

Parameters
----------
encoding : str
Only used if `driver` is "ESRI Shapefile". If not None, it overrules the default
shapefile encoding.

Returns
-------
bool :
Expand Down Expand Up @@ -1791,6 +1823,8 @@ cdef create_ogr_dataset_layer(
if driver == 'ESRI Shapefile':
# Fiona only sets encoding for shapefiles; other drivers do not support
# encoding as an option.
if encoding is None:
theroggy marked this conversation as resolved.
Show resolved Hide resolved
encoding = get_encoding_for_driver(driver)
encoding_b = encoding.upper().encode('UTF-8')
encoding_c = encoding_b
layer_options = CSLSetNameValue(layer_options, "ENCODING", encoding_c)
Expand Down Expand Up @@ -1906,22 +1940,27 @@ def ogr_write(
gdal_tz_offsets = {}

### Setup up dataset and layer
if not encoding:
encoding = locale.getpreferredencoding()

layer_created = create_ogr_dataset_layer(
theroggy marked this conversation as resolved.
Show resolved Hide resolved
path, layer, driver, crs, geometry_type, encoding,
dataset_kwargs, layer_kwargs, append,
dataset_metadata, layer_metadata,
&ogr_dataset, &ogr_layer,
)

# Now the dataset and layer have been created, we can properly determine the
# encoding. It is derived from the user, from the dataset capabilities / type,
# or from the system locale
encoding = (
encoding
or detect_encoding(ogr_dataset, ogr_layer)
or locale.getpreferredencoding()
theroggy marked this conversation as resolved.
Show resolved Hide resolved
)

### Create the fields
field_types = None
if num_fields > 0:
field_types = infer_field_types([field.dtype for field in field_data])

### Create the fields
if layer_created:
for i in range(num_fields):
field_type, field_subtype, width, precision = field_types[i]
Expand Down Expand Up @@ -2020,7 +2059,6 @@ def ogr_write(
OGR_F_SetFieldNull(ogr_feature, field_idx)

elif field_type == OFTString:
# TODO: encode string using approach from _get_internal_encoding which checks layer capabilities
if (
field_value is None
or (isinstance(field_value, float) and isnan(field_value))
Expand All @@ -2032,7 +2070,7 @@ def ogr_write(
field_value = str(field_value)

try:
value_b = field_value.encode("UTF-8")
value_b = field_value.encode(encoding)
theroggy marked this conversation as resolved.
Show resolved Hide resolved
OGR_F_SetFieldString(ogr_feature, field_idx, value_b)

except AttributeError:
Expand Down
44 changes: 44 additions & 0 deletions pyogrio/tests/test_geopandas_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,25 @@ def spatialite_available(path):
return False


def test_read_csv_encoding(tmp_path):
# Write csv test file. Depending on the os this will be written in a different
# encoding: for linux and macos this is utf-8, for windows it is cp1252.
theroggy marked this conversation as resolved.
Show resolved Hide resolved
csv_path = tmp_path / "test.csv"
with open(csv_path, "w") as csv:
csv.write("näme,city\n")
csv.write("Wilhelm Röntgen,Zürich\n")

# Read csv. The data should be read with the same default encoding as the csv file
# was written in, but should have been converted to utf-8 in the dataframe returned.
# Hence, the asserts below, with strings in utf-8, should be OK.
df = read_dataframe(csv_path)

assert len(df) == 1
assert df.columns.tolist() == ["näme", "city"]
assert df.city.tolist() == ["Zürich"]
assert df.näme.tolist() == ["Wilhelm Röntgen"]


def test_read_dataframe(naturalearth_lowres_all_ext):
df = read_dataframe(naturalearth_lowres_all_ext)

Expand Down Expand Up @@ -782,6 +801,31 @@ def test_read_sql_dialect_sqlite_gpkg(naturalearth_lowres, use_arrow):
assert df.iloc[0].geometry.area > area_canada


def test_write_csv_encoding(tmp_path):
"""Test if write_dataframe uses the default encoding correctly."""
# Write csv test file. Depending on the os this will be written in a different
# encoding: for linux and macos this is utf-8, for windows it is cp1252.
csv_path = tmp_path / "testg.csv"

with open(csv_path, "w") as csv:
csv.write("näme,city\n")
csv.write("Wilhelm Röntgen,Zürich\n")

# Write csv test file with the same data using write_dataframe. It should use the
# same encoding as above.
df = pd.DataFrame({"näme": ["Wilhelm Röntgen"], "city": ["Zürich"]})
csv_pyogrio_path = tmp_path / "test_pyogrio.csv"
write_dataframe(df, csv_pyogrio_path)

# If the files written are binary identical, they were written using the same
# encoding.
with open(csv_path, "r") as csv:
csv_str = csv.read()
with open(csv_pyogrio_path, "r") as csv_pyogrio:
csv_pyogrio_str = csv_pyogrio.read()
assert csv_str == csv_pyogrio_str


@pytest.mark.parametrize("ext", ALL_EXTS)
def test_write_dataframe(tmp_path, naturalearth_lowres, ext):
input_gdf = read_dataframe(naturalearth_lowres)
Expand Down