Merge pull request #14 from openscm/bug-fix

znicholls · web-flow · commit ddd4d8bd0961 · 2025-04-12T17:06:00.000+02:00
Fix bug in update index levels
diff --git a/changelog/14.fix.md b/changelog/14.fix.md
@@ -0,0 +1,3 @@
+Fixed [pandas_openscm.index_manipulation.update_levels][].
+By default, it now removes unused levels before applying the updates, so the update functions are no longer applied to values that do not appear in the index.
+The same fixes are propagated to [pandas_openscm.accessors.DataFramePandasOpenSCMAccessor.update_index_levels][] and [pandas_openscm.index_manipulation.update_index_levels_func][].
diff --git a/src/pandas_openscm/accessors.py b/src/pandas_openscm/accessors.py
@@ -626,7 +626,10 @@ def to_long_data(self, time_col_name: str = "time") -> pd.DataFrame:
         return ts_to_long_data(self._df, time_col_name=time_col_name)
 
     def update_index_levels(
-        self, updates: dict[Any, Callable[[Any], Any]]
+        self,
+        updates: dict[Any, Callable[[Any], Any]],
+        copy: bool = True,
+        remove_unused_levels: bool = True,
     ) -> pd.DataFrame:
         """
         Update the index levels
@@ -639,13 +642,28 @@ def update_index_levels(
             Each key is the index level to which the updates will be applied.
             Each value is a function which updates the levels to their new values.
 
+        copy
+            Should the [pd.DataFrame][pandas.DataFrame] be copied before returning?
+
+        remove_unused_levels
+            Remove unused levels before applying the update
+
+            Specifically, call
+            [pd.MultiIndex.remove_unused_levels][pandas.MultiIndex.remove_unused_levels].
+
+            This avoids trying to update levels that aren't being used.
+
         Returns
         -------
         :
             [pd.DataFrame][pandas.DataFrame] with updates applied to its index
         """
-        # Have to copy as the index is replaced in place
-        return update_index_levels_func(self._df, updates=updates, copy=True)
+        return update_index_levels_func(
+            self._df,
+            updates=updates,
+            copy=copy,
+            remove_unused_levels=remove_unused_levels,
+        )
 
 
 def register_pandas_accessor(namespace: str = "openscm") -> None:
diff --git a/src/pandas_openscm/index_manipulation.py b/src/pandas_openscm/index_manipulation.py
@@ -309,7 +309,10 @@ def update_index_from_candidates(
 
 
 def update_index_levels_func(
-    df: pd.DataFrame, updates: dict[Any, Callable[[Any], Any]], copy: bool = True
+    df: pd.DataFrame,
+    updates: dict[Any, Callable[[Any], Any]],
+    copy: bool = True,
+    remove_unused_levels: bool = True,
 ) -> pd.DataFrame:
     """
     Update the index levels of a [pd.DataFrame][pandas.DataFrame]
@@ -328,6 +331,11 @@ def update_index_levels_func(
     copy
         Should `df` be copied before returning?
 
+    remove_unused_levels
+        Call `df.index.remove_unused_levels` before updating the levels
+
+        This avoids trying to update levels that aren't being used.
+
     Returns
     -------
     :
@@ -344,13 +352,17 @@ def update_index_levels_func(
         )
         raise TypeError(msg)
 
-    df.index = update_levels(df.index, updates=updates)
+    df.index = update_levels(
+        df.index, updates=updates, remove_unused_levels=remove_unused_levels
+    )
 
     return df
 
 
 def update_levels(
-    ini: pd.MultiIndex, updates: dict[Any, Callable[[Any], Any]]
+    ini: pd.MultiIndex,
+    updates: dict[Any, Callable[[Any], Any]],
+    remove_unused_levels: bool = True,
 ) -> pd.MultiIndex:
     """
     Update the levels of a [pd.MultiIndex][pandas.MultiIndex]
@@ -366,6 +378,11 @@ def update_levels(
         Each key is the level to which the updates will be applied.
         Each value is a function which updates the levels to their new values.
 
+    remove_unused_levels
+        Call `ini.remove_unused_levels` before updating the levels
+
+        This avoids trying to update levels that aren't being used.
+
     Returns
     -------
     :
@@ -376,6 +393,9 @@ def update_levels(
     KeyError
         A level in `updates` is not a level in `ini`
     """
+    if remove_unused_levels:
+        ini = ini.remove_unused_levels()  # type: ignore
+
     levels: list[pd.Index[Any]] = list(ini.levels)
     codes: list[list[int]] = list(ini.codes)
 
diff --git a/tests/integration/index_manipulation/test_integration_index_manipulation_update_levels.py b/tests/integration/index_manipulation/test_integration_index_manipulation_update_levels.py
@@ -123,6 +123,74 @@ def test_update_index_levels_missing_level():
         update_levels(start, updates=updates)
 
 
+def test_doesnt_trip_over_dropped_levels(setup_pandas_accessor):
+    def update_func(in_v: int) -> int:
+        if in_v < 0:
+            msg = f"Value must be non-negative, received {in_v}"
+            raise ValueError(msg)
+
+        return in_v * -1
+
+    start = pd.MultiIndex.from_tuples(
+        [
+            ("sa", "va", "kg", 0),
+            ("sb", "vb", "m", 1),
+            ("sa", "va", "kg", 2),
+            ("sa", "vb", "kg", -2),
+        ],
+        names=["scenario", "variable", "unit", "run_id"],
+    )
+
+    updates = {"run_id": update_func}
+
+    res = update_levels(start[:-1], updates=updates)
+
+    exp = pd.MultiIndex.from_tuples(
+        [
+            ("sa", "va", "kg", 0),
+            ("sb", "vb", "m", -1),
+            ("sa", "va", "kg", -2),
+        ],
+        names=["scenario", "variable", "unit", "run_id"],
+    )
+    pd.testing.assert_index_equal(res, exp)
+
+    # If you turn the drop off, you get an error
+    exp_error_no_removal = pytest.raises(
+        ValueError, match=re.escape("Value must be non-negative, received -2")
+    )
+    with exp_error_no_removal:
+        # Even though we're not using the levels,
+        # they still get mapped if we don't remove them
+        update_levels(start[:-1], updates=updates, remove_unused_levels=False)
+
+    # Same thing but from a DataFrame
+    start_df = pd.DataFrame(
+        np.zeros((start.shape[0], 3)), columns=[2010, 2020, 2030], index=start
+    )
+
+    res_df = update_index_levels_func(start_df.iloc[:-1, :], updates=updates)
+
+    exp_df = pd.DataFrame(
+        np.zeros((exp.shape[0], 3)), columns=start_df.columns, index=exp
+    )
+
+    pd.testing.assert_frame_equal(res_df, exp_df)
+    with exp_error_no_removal:
+        update_index_levels_func(
+            start_df.iloc[:-1, :], updates=updates, remove_unused_levels=False
+        )
+
+    # Lastly, test the accessor
+    pd.testing.assert_frame_equal(
+        start_df.iloc[:-1, :].openscm.update_index_levels(updates), exp_df
+    )
+    with exp_error_no_removal:
+        start_df.iloc[:-1, :].openscm.update_index_levels(
+            updates, remove_unused_levels=False
+        )
+
+
 def test_accessor(setup_pandas_accessor):
     start = pd.DataFrame(
         np.arange(2 * 4).reshape((4, 2)),

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+Fixed [pandas_openscm.index_manipulation.update_levels][].`
	`2`	`+By default, it now removes unused levels before applying the updates, so the update functions are no longer applied to values that do not appear in the index.`
	`3`	`+The same fixes are propagated to [pandas_openscm.accessors.DataFramePandasOpenSCMAccessor.update_index_levels][] and [pandas_openscm.index_manipulation.update_index_levels_func][].`