From f50e14844bb38f9c2a59234f896bfdbc0a9afbf5 Mon Sep 17 00:00:00 2001
From: Lukas Riedel <34276446+peanutfun@users.noreply.github.com>
Date: Mon, 8 Jul 2024 11:48:20 +0200
Subject: [PATCH] Explicitly convert `event_name` to strings in
 `Impact.from_hdf5` (#894)

* Try converting non-string impact event names from H5
* Only write H5 file if Impact.event_name is strings
* Fix bug where only the first event name was checked
* Update CHANGELOG.md

---------

Co-authored-by: emanuel-schmid <schmide@ethz.ch>
---
 CHANGELOG.md                       |  2 ++
 climada/engine/impact.py           | 26 ++++++++++++++++++--------
 climada/engine/test/test_impact.py | 19 +++++++++++++++----
 3 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 203f7230d5..c76e7ba2af 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -24,6 +24,8 @@ CLIMADA tutorials. [#872](https://github.com/CLIMADA-project/climada_python/pull
 - Improved error messages produced by `ImpactCalc.impact()` in case impact function in the exposures is not found in impf_set [#863](https://github.com/CLIMADA-project/climada_python/pull/863)
 - Update the Holland et al. 2010 TC windfield model and introduce `model_kwargs` parameter to adjust model parameters [#846](https://github.com/CLIMADA-project/climada_python/pull/846)
 - Changed module structure: `climada.hazard.Hazard` has been split into the modules `base`, `io` and `plot` [#871](https://github.com/CLIMADA-project/climada_python/pull/871)
+- `Impact.from_hdf5` now calls `str` on `event_name` data that is not strings, and issue a warning then [#894](https://github.com/CLIMADA-project/climada_python/pull/894)
+- `Impact.write_hdf5` now throws an error if `event_name` is does not contain strings exclusively [#894](https://github.com/CLIMADA-project/climada_python/pull/894)
 
 ### Fixed
 
diff --git a/climada/engine/impact.py b/climada/engine/impact.py
index e9afda5903..97bbd6d90f 100644
--- a/climada/engine/impact.py
+++ b/climada/engine/impact.py
@@ -937,11 +937,6 @@ def write_hdf5(self, file_path: Union[str, Path], dense_imp_mat: bool=False):
 
         The impact matrix can be stored in a sparse or dense format.
 
-        Notes
-        -----
-        This writer does not support attributes with variable types. Please make sure
-        that ``event_name`` is a list of equally-typed values, e.g., all ``str``.
-
         Parameters
         ----------
         file_path : str or Path
@@ -950,6 +945,11 @@ def write_hdf5(self, file_path: Union[str, Path], dense_imp_mat: bool=False):
             If ``True``, write the impact matrix as dense matrix that can be more easily
             interpreted by common H5 file readers but takes up (vastly) more space.
             Defaults to ``False``.
+
+        Raises
+        ------
+        TypeError
+            If :py:attr:`event_name` does not contain strings exclusively.
         """
         # Define writers for all types (will be filled later)
         type_writers = dict()
@@ -983,7 +983,7 @@ def write(group: h5py.Group, name: str, value: Any):
 
         def _str_type_helper(values: Collection):
             """Return string datatype if we assume 'values' contains strings"""
-            if isinstance(next(iter(values)), str):
+            if all((isinstance(val, str) for val in values)):
                 return h5py.string_dtype()
             return None
 
@@ -1037,6 +1037,8 @@ def write_csr(group, name, value):
             # Now write all attributes
             # NOTE: Remove leading underscore to write '_tot_value' as regular attribute
             for name, value in self.__dict__.items():
+                if name == "event_name" and _str_type_helper(value) is None:
+                    raise TypeError("'event_name' must be a list of strings")
                 write(file, name.lstrip("_"), value)
 
     def write_sparse_csr(self, file_name):
@@ -1240,10 +1242,18 @@ def from_hdf5(cls, file_path: Union[str, Path]):
             ).intersection(file.keys())
             kwargs.update({attr: file[attr][:] for attr in array_attrs})
 
-            # Special handling for 'event_name' because it's a list of strings
+            # Special handling for 'event_name' because it should be a list of strings
             if "event_name" in file:
                 # pylint: disable=no-member
-                kwargs["event_name"] = list(file["event_name"].asstr()[:])
+                try:
+                    event_name = file["event_name"].asstr()[:]
+                except TypeError:
+                    LOGGER.warning(
+                        "'event_name' is not stored as strings. Trying to decode "
+                        "values with 'str()' instead."
+                    )
+                    event_name = map(str, file["event_name"][:])
+                kwargs["event_name"] = list(event_name)
 
         # Create the impact object
         return cls(**kwargs)
diff --git a/climada/engine/test/test_impact.py b/climada/engine/test/test_impact.py
index a0b458ca55..0e70e993b0 100644
--- a/climada/engine/test/test_impact.py
+++ b/climada/engine/test/test_impact.py
@@ -779,7 +779,8 @@ def test_select_event_identity_pass(self):
         ent.exposures.assign_centroids(hazard)
 
         # Compute the impact over the whole exposures
-        imp = ImpactCalc(ent.exposures, ent.impact_funcs, hazard).impact(save_mat=True, assign_centroids=False)
+        imp = ImpactCalc(ent.exposures, ent.impact_funcs, hazard).impact(
+            save_mat=True, assign_centroids=False)
 
         sel_imp = imp.select(event_ids=imp.event_id,
                              event_names=imp.event_name,
@@ -1019,10 +1020,11 @@ def test_write_hdf5_without_imp_mat(self):
 
     def test_write_hdf5_type_fail(self):
         """Test that writing attributes with varying types results in an error"""
-        self.impact.event_name = [1, "a", 1.0, "b", "c", "d"]
-        with self.assertRaises(TypeError) as cm:
+        self.impact.event_name = ["a", 1, 1.0, "b", "c", "d"]
+        with self.assertRaisesRegex(
+            TypeError, "'event_name' must be a list of strings"
+        ):
             self.impact.write_hdf5(self.filepath)
-        self.assertIn("No conversion path for dtype", str(cm.exception))
 
     def test_cycle_hdf5(self):
         """Test writing and reading the same object"""
@@ -1120,6 +1122,15 @@ def test_read_hdf5_full(self):
         impact = Impact.from_hdf5(self.filepath)
         npt.assert_array_equal(impact.imp_mat.toarray(), [[0, 1, 2], [3, 0, 0]])
 
+        # Check with non-string event_name
+        event_name = [1.2, 2]
+        with h5py.File(self.filepath, "r+") as file:
+            del file["event_name"]
+            file.create_dataset("event_name", data=event_name)
+        with self.assertLogs("climada.engine.impact", "WARNING") as cm:
+            impact = Impact.from_hdf5(self.filepath)
+        self.assertIn("'event_name' is not stored as strings", cm.output[0])
+        self.assertListEqual(impact.event_name, ["1.2", "2.0"])
 
 # Execute Tests
 if __name__ == "__main__":