Skip to content

Commit ddd4d8b

Browse files
authored
Merge pull request #14 from openscm/bug-fix
Fix bug in update index levels
2 parents b2132f0 + c92ddce commit ddd4d8b

File tree

4 files changed

+115
-6
lines changed

4 files changed

+115
-6
lines changed

changelog/14.fix.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fixed up [pandas_openscm.index_manipulation.update_levels][].
2+
It now drops unused levels by default first, to avoid applying the updates to values that aren't being used.
3+
The same fixes are propagated to [pandas_openscm.accessors.DataFramePandasOpenSCMAccessor.update_index_levels][] and [pandas_openscm.index_manipulation.update_index_levels_func][].

src/pandas_openscm/accessors.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -626,7 +626,10 @@ def to_long_data(self, time_col_name: str = "time") -> pd.DataFrame:
626626
return ts_to_long_data(self._df, time_col_name=time_col_name)
627627

628628
def update_index_levels(
629-
self, updates: dict[Any, Callable[[Any], Any]]
629+
self,
630+
updates: dict[Any, Callable[[Any], Any]],
631+
copy: bool = True,
632+
remove_unused_levels: bool = True,
630633
) -> pd.DataFrame:
631634
"""
632635
Update the index levels
@@ -639,13 +642,28 @@ def update_index_levels(
639642
Each key is the index level to which the updates will be applied.
640643
Each value is a function which updates the levels to their new values.
641644
645+
copy
646+
Should the [pd.DataFrame][pandas.DataFrame] be copied before returning?
647+
648+
remove_unused_levels
649+
Remove unused levels before applying the update
650+
651+
Specifically, call
652+
[pd.MultiIndex.remove_unused_levels][pandas.MultiIndex.remove_unused_levels].
653+
654+
This avoids trying to update levels that aren't being used.
655+
642656
Returns
643657
-------
644658
:
645659
[pd.DataFrame][pandas.DataFrame] with updates applied to its index
646660
"""
647-
# Have to copy as the index is replaced in place
648-
return update_index_levels_func(self._df, updates=updates, copy=True)
661+
return update_index_levels_func(
662+
self._df,
663+
updates=updates,
664+
copy=copy,
665+
remove_unused_levels=remove_unused_levels,
666+
)
649667

650668

651669
def register_pandas_accessor(namespace: str = "openscm") -> None:

src/pandas_openscm/index_manipulation.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,10 @@ def update_index_from_candidates(
309309

310310

311311
def update_index_levels_func(
312-
df: pd.DataFrame, updates: dict[Any, Callable[[Any], Any]], copy: bool = True
312+
df: pd.DataFrame,
313+
updates: dict[Any, Callable[[Any], Any]],
314+
copy: bool = True,
315+
remove_unused_levels: bool = True,
313316
) -> pd.DataFrame:
314317
"""
315318
Update the index levels of a [pd.DataFrame][pandas.DataFrame]
@@ -328,6 +331,11 @@ def update_index_levels_func(
328331
copy
329332
Should `df` be copied before returning?
330333
334+
remove_unused_levels
335+
Call `df.index.remove_unused_levels` before updating the levels
336+
337+
This avoids trying to update levels that aren't being used.
338+
331339
Returns
332340
-------
333341
:
@@ -344,13 +352,17 @@ def update_index_levels_func(
344352
)
345353
raise TypeError(msg)
346354

347-
df.index = update_levels(df.index, updates=updates)
355+
df.index = update_levels(
356+
df.index, updates=updates, remove_unused_levels=remove_unused_levels
357+
)
348358

349359
return df
350360

351361

352362
def update_levels(
353-
ini: pd.MultiIndex, updates: dict[Any, Callable[[Any], Any]]
363+
ini: pd.MultiIndex,
364+
updates: dict[Any, Callable[[Any], Any]],
365+
remove_unused_levels: bool = True,
354366
) -> pd.MultiIndex:
355367
"""
356368
Update the levels of a [pd.MultiIndex][pandas.MultiIndex]
@@ -366,6 +378,11 @@ def update_levels(
366378
Each key is the level to which the updates will be applied.
367379
Each value is a function which updates the levels to their new values.
368380
381+
remove_unused_levels
382+
Call `ini.remove_unused_levels` before updating the levels
383+
384+
This avoids trying to update levels that aren't being used.
385+
369386
Returns
370387
-------
371388
:
@@ -376,6 +393,9 @@ def update_levels(
376393
KeyError
377394
A level in `updates` is not a level in `ini`
378395
"""
396+
if remove_unused_levels:
397+
ini = ini.remove_unused_levels() # type: ignore
398+
379399
levels: list[pd.Index[Any]] = list(ini.levels)
380400
codes: list[list[int]] = list(ini.codes)
381401

tests/integration/index_manipulation/test_integration_index_manipulation_update_levels.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,74 @@ def test_update_index_levels_missing_level():
123123
update_levels(start, updates=updates)
124124

125125

126+
def test_doesnt_trip_over_droped_levels(setup_pandas_accessor):
127+
def update_func(in_v: int) -> int:
128+
if in_v < 0:
129+
msg = f"Value must be greater than zero, received {in_v}"
130+
raise ValueError(msg)
131+
132+
return in_v * -1
133+
134+
start = pd.MultiIndex.from_tuples(
135+
[
136+
("sa", "va", "kg", 0),
137+
("sb", "vb", "m", 1),
138+
("sa", "va", "kg", 2),
139+
("sa", "vb", "kg", -2),
140+
],
141+
names=["scenario", "variable", "unit", "run_id"],
142+
)
143+
144+
updates = {"run_id": update_func}
145+
146+
res = update_levels(start[:-1], updates=updates)
147+
148+
exp = pd.MultiIndex.from_tuples(
149+
[
150+
("sa", "va", "kg", 0),
151+
("sb", "vb", "m", -1),
152+
("sa", "va", "kg", -2),
153+
],
154+
names=["scenario", "variable", "unit", "run_id"],
155+
)
156+
pd.testing.assert_index_equal(res, exp)
157+
158+
# If you turn the drop off, you get an error
159+
exp_error_no_removal = pytest.raises(
160+
ValueError, match=re.escape("Value must be greater than zero, received -2")
161+
)
162+
with exp_error_no_removal:
163+
# Even though we're not using the levels,
164+
# they still get mapped if we don't remove them
165+
update_levels(start[:-1], updates=updates, remove_unused_levels=False)
166+
167+
# Same thing but from a DataFrame
168+
start_df = pd.DataFrame(
169+
np.zeros((start.shape[0], 3)), columns=[2010, 2020, 2030], index=start
170+
)
171+
172+
res_df = update_index_levels_func(start_df.iloc[:-1, :], updates=updates)
173+
174+
exp_df = pd.DataFrame(
175+
np.zeros((exp.shape[0], 3)), columns=start_df.columns, index=exp
176+
)
177+
178+
pd.testing.assert_frame_equal(res_df, exp_df)
179+
with exp_error_no_removal:
180+
update_index_levels_func(
181+
start_df.iloc[:-1, :], updates=updates, remove_unused_levels=False
182+
)
183+
184+
# Lastly, test the accessor
185+
pd.testing.assert_frame_equal(
186+
start_df.iloc[:-1, :].openscm.update_index_levels(updates), exp_df
187+
)
188+
with exp_error_no_removal:
189+
start_df.iloc[:-1, :].openscm.update_index_levels(
190+
updates, remove_unused_levels=False
191+
)
192+
193+
126194
def test_accessor(setup_pandas_accessor):
127195
start = pd.DataFrame(
128196
np.arange(2 * 4).reshape((4, 2)),

0 commit comments

Comments
 (0)