From 39048f95c5048b95505abc3afaec3bf386cbdf10 Mon Sep 17 00:00:00 2001 From: keewis Date: Sat, 30 Jan 2021 00:05:57 +0100 Subject: [PATCH] speed up the repr for big MultiIndex objects (#4846) * print the repr of a multiindex using only a subset of the coordinate values * don't index if we have fewer items than available width * don't try to shorten arrays which are way too short * col_width seems to be the maximum number of elements, not characters * add an asv benchmark * Apply suggestions from code review Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- asv_bench/benchmarks/repr.py | 18 ++++++++++++++++++ xarray/core/formatting.py | 11 +++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 asv_bench/benchmarks/repr.py diff --git a/asv_bench/benchmarks/repr.py b/asv_bench/benchmarks/repr.py new file mode 100644 index 00000000000..b218c0be870 --- /dev/null +++ b/asv_bench/benchmarks/repr.py @@ -0,0 +1,18 @@ +import pandas as pd + +import xarray as xr + + +class ReprMultiIndex: + def setup(self, key): + index = pd.MultiIndex.from_product( + [range(10000), range(10000)], names=("level_0", "level_1") + ) + series = pd.Series(range(100000000), index=index) + self.da = xr.DataArray(series) + + def time_repr(self): + repr(self.da) + + def time_repr_html(self): + self.da._repr_html_() diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 282620e3569..0c1be1cc175 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -300,11 +300,18 @@ def _summarize_coord_multiindex(coord, col_width, marker): def _summarize_coord_levels(coord, col_width, marker="-"): + if len(coord) > 100 and col_width < len(coord): + n_values = col_width + indices = list(range(0, n_values)) + list(range(-n_values, 0)) + subset = coord[indices] + else: + subset = coord + return "\n".join( summarize_variable( - lname, 
coord.get_level_variable(lname), col_width, marker=marker + lname, subset.get_level_variable(lname), col_width, marker=marker ) - for lname in coord.level_names + for lname in subset.level_names )