Skip to content

Commit 73dfc30

Browse files
authored
PERF: Improve performance of MultiIndex._verify_integrity (#51873)
* PERF: Improve performance of MultiIndex._verify_integrity * Update
1 parent c9b560c commit 73dfc30

File tree

2 files changed

+32
-8
lines changed

2 files changed

+32
-8
lines changed

doc/source/whatsnew/v2.1.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ Performance improvements
117117
- Performance improvement when parsing strings to ``boolean[pyarrow]`` dtype (:issue:`51730`)
118118
- Performance improvement when searching an :class:`Index` sliced from other indexes (:issue:`51738`)
119119
- Performance improvement in :meth:`Series.combine_first` (:issue:`51777`)
120+
- Performance improvement in :meth:`MultiIndex.set_levels` and :meth:`MultiIndex.set_codes` when ``verify_integrity=True`` (:issue:`51873`)
120121
- Performance improvement in :func:`factorize` for object columns not containing strings (:issue:`51921`)
121122

122123
.. ---------------------------------------------------------------------------

pandas/core/indexes/multi.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -379,14 +379,21 @@ def _validate_codes(self, level: list, code: list):
379379
code = np.where(null_mask[code], -1, code) # type: ignore[assignment]
380380
return code
381381

382-
def _verify_integrity(self, codes: list | None = None, levels: list | None = None):
382+
def _verify_integrity(
383+
self,
384+
codes: list | None = None,
385+
levels: list | None = None,
386+
levels_to_verify: list[int] | range | None = None,
387+
):
383388
"""
384389
Parameters
385390
----------
386391
codes : optional list
387392
Codes to check for validity. Defaults to current codes.
388393
levels : optional list
389394
Levels to check for validity. Defaults to current levels.
395+
levels_to_verify : optional list
396+
Positional indices of the levels to verify. Defaults to all levels.
390397
391398
Raises
392399
------
@@ -403,14 +410,19 @@ def _verify_integrity(self, codes: list | None = None, levels: list | None = Non
403410
# nlevels matches nor that sortorder matches the actual sortorder.
404411
codes = codes or self.codes
405412
levels = levels or self.levels
413+
if levels_to_verify is None:
414+
levels_to_verify = range(len(levels))
406415

407416
if len(levels) != len(codes):
408417
raise ValueError(
409418
"Length of levels and codes must match. NOTE: "
410419
"this index is in an inconsistent state."
411420
)
412421
codes_length = len(codes[0])
413-
for i, (level, level_codes) in enumerate(zip(levels, codes)):
422+
for i in levels_to_verify:
423+
level = levels[i]
424+
level_codes = codes[i]
425+
414426
if len(level_codes) != codes_length:
415427
raise ValueError(
416428
f"Unequal code lengths: {[len(code_) for code_ in codes]}"
@@ -435,10 +447,14 @@ def _verify_integrity(self, codes: list | None = None, levels: list | None = Non
435447
f"with lexsort_depth {_lexsort_depth(self.codes, self.nlevels)}"
436448
)
437449

438-
codes = [
439-
self._validate_codes(level, code) for level, code in zip(levels, codes)
440-
]
441-
new_codes = FrozenList(codes)
450+
result_codes = []
451+
for i in range(len(levels)):
452+
if i in levels_to_verify:
453+
result_codes.append(self._validate_codes(levels[i], codes[i]))
454+
else:
455+
result_codes.append(codes[i])
456+
457+
new_codes = FrozenList(result_codes)
442458
return new_codes
443459

444460
@classmethod
@@ -824,6 +840,7 @@ def _set_levels(
824840
new_levels = FrozenList(
825841
ensure_index(lev, copy=copy)._view() for lev in levels
826842
)
843+
level_numbers = list(range(len(new_levels)))
827844
else:
828845
level_numbers = [self._get_level_number(lev) for lev in level]
829846
new_levels_list = list(self._levels)
@@ -832,7 +849,9 @@ def _set_levels(
832849
new_levels = FrozenList(new_levels_list)
833850

834851
if verify_integrity:
835-
new_codes = self._verify_integrity(levels=new_levels)
852+
new_codes = self._verify_integrity(
853+
levels=new_levels, levels_to_verify=level_numbers
854+
)
836855
self._codes = new_codes
837856

838857
names = self.names
@@ -990,11 +1009,13 @@ def _set_codes(
9901009
if level is not None and len(codes) != len(level):
9911010
raise ValueError("Length of codes must match length of levels.")
9921011

1012+
level_numbers: list[int] | range
9931013
if level is None:
9941014
new_codes = FrozenList(
9951015
_coerce_indexer_frozen(level_codes, lev, copy=copy).view()
9961016
for lev, level_codes in zip(self._levels, codes)
9971017
)
1018+
level_numbers = range(len(new_codes))
9981019
else:
9991020
level_numbers = [self._get_level_number(lev) for lev in level]
10001021
new_codes_list = list(self._codes)
@@ -1006,7 +1027,9 @@ def _set_codes(
10061027
new_codes = FrozenList(new_codes_list)
10071028

10081029
if verify_integrity:
1009-
new_codes = self._verify_integrity(codes=new_codes)
1030+
new_codes = self._verify_integrity(
1031+
codes=new_codes, levels_to_verify=level_numbers
1032+
)
10101033

10111034
self._codes = new_codes
10121035

0 commit comments

Comments
 (0)