Skip to content

Commit df307c9

Browse files
Issue #1548 fix performance ipf many wells (#1552)
Fixes #1548 # Description - Fix issue with poor performance ``GridAgnosticWell.from_imod5_data`` when there are >10K wells in an IPF by moving an index explicitly beforehand. This requires one loop over all well groups, after which we can just use pandas/xarray functionality to do required data transformations. - Properly type annotate helper functions in imod.mf6.wel that accept a sequence of pandas groupby objects. # Checklist <!--- Before requesting review, please go through this checklist: --> - [x] Links to correct issue - [x] Update changelog, if changes affect users - [x] PR title starts with ``Issue #nr``, e.g. ``Issue #737`` - [ ] Unit tests were added - [ ] **If feature added**: Added/extended example
1 parent 67d33bc commit df307c9

File tree

4 files changed

+25
-29
lines changed

4 files changed

+25
-29
lines changed

docs/api/changelog.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ Fixed
5252
unstructured discretization.
5353
- Fixed bug in :func:`imod.formats.prj.open_projectfile_data` which caused an
5454
error when a periods keyword was used having an upper case.
55+
- Poor performance of :meth:`imod.mf6.Well.from_imod5_data` and
56+
:meth:`imod.mf6.LayeredWell.from_imod5_data` when the ``imod5_data`` contained
57+
a well system with a large number of wells (>10k).
5558
- :meth:`imod.mf6.River.from_imod5_data`,
5659
:meth:`imod.mf6.Drainage.from_imod5_data`,
5760
:meth:`imod.mf6.GeneralHeadBoundary.from_imod5_data` can now deal with

imod/mf6/wel.py

Lines changed: 19 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import warnings
77
from collections.abc import Iterable
88
from datetime import datetime
9-
from typing import Any, Callable, Optional, Self, Tuple, Union, cast
9+
from typing import Any, Callable, Optional, Self, Sequence, Tuple, Union, cast
1010

1111
import cftime
1212
import numpy as np
@@ -16,7 +16,6 @@
1616
import xugrid as xu
1717

1818
import imod
19-
import imod.mf6.utilities
2019
from imod.common.interfaces.ipointdatapackage import IPointDataPackage
2120
from imod.common.utilities.grid import broadcast_to_full_domain
2221
from imod.common.utilities.layer import create_layered_top
@@ -82,36 +81,25 @@ def mask_2D(package: GridAgnosticWell, domain_2d: GridDataArray) -> GridAgnostic
8281

8382

8483
def _df_groups_to_da_rates(
85-
unique_well_groups: pd.api.typing.DataFrameGroupBy,
84+
unique_well_groups: Sequence[pd.api.typing.DataFrameGroupBy],
8685
) -> xr.DataArray:
8786
# Convert dataframes all groups to DataArrays
88-
is_steady_state = "time" not in unique_well_groups[0].columns
89-
if is_steady_state:
90-
da_groups = [
91-
xr.DataArray(df_group["rate"].sum()) for df_group in unique_well_groups
92-
]
87+
columns = list(unique_well_groups[0].columns)
88+
columns.remove("rate")
89+
is_transient = "time" in columns
90+
gb_and_summed = pd.concat(unique_well_groups).groupby(columns).sum()
91+
if is_transient:
92+
index_names = ["time", "index"]
9393
else:
94-
da_groups = [
95-
xr.DataArray(
96-
df_group["rate"], dims=("time"), coords={"time": df_group["time"]}
97-
)
98-
for df_group in unique_well_groups
99-
]
100-
# Groupby time and sum to aggregate wells with the exact same x, y, and
101-
# filter top/bottom.
102-
da_groups = [da_group.groupby("time").sum() for da_group in da_groups]
103-
# Assign index coordinates
104-
da_groups = [
105-
da_group.expand_dims(dim="index").assign_coords(index=[i])
106-
for i, da_group in enumerate(da_groups)
107-
]
108-
# Concatenate datarrays along index dimension
109-
return xr.concat(da_groups, dim="index")
94+
index_names = ["index"]
95+
# Unset multi-index, then set index to index_names
96+
df_temp = gb_and_summed.reset_index().set_index(index_names)
97+
return df_temp["rate"].to_xarray()
11098

11199

112100
def _prepare_well_rates_from_groups(
113101
pkg_data: dict,
114-
unique_well_groups: pd.api.typing.DataFrameGroupBy,
102+
unique_well_groups: Sequence[pd.api.typing.DataFrameGroupBy],
115103
start_times: StressPeriodTimesType,
116104
) -> xr.DataArray:
117105
"""
@@ -690,8 +678,12 @@ def from_imod5_data(
690678
# Associated wells need additional grouping by id
691679
if pkg_data["has_associated"]:
692680
colnames_group.append("id")
693-
wel_index, unique_well_groups = zip(*df.groupby(colnames_group))
694-
681+
wel_index, well_groups_untagged = zip(*df.groupby(colnames_group))
682+
# Explicitly assign an index to each group, so that the
683+
# DataArray of rates can be created with a unique index.
684+
unique_well_groups = [
685+
group.assign(index=i) for i, group in enumerate(well_groups_untagged)
686+
]
695687
# Unpack wel indices by zipping
696688
varnames = [("x", float), ("y", float)] + cls._depth_colnames
697689
index_values = zip(*wel_index)

imod/tests/test_mf6/test_utilities/test_resampling.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ def initialize_timeseries(times: list[datetime], rates: list[float]) -> pd.DataF
1717
timeseries["id"] = "ID"
1818
timeseries["filt_top"] = 20
1919
timeseries["filt_bot"] = 10
20+
timeseries["index"] = 0
2021

2122
return timeseries
2223

@@ -195,7 +196,7 @@ def test_mean_timeseries():
195196
dummy_times = [datetime(1989, 1, 1)]
196197
expected_rates = np.mean(rates)
197198
expected_timeseries = initialize_timeseries(dummy_times, expected_rates)
198-
col_order = ["x", "y", "id", "filt_top", "filt_bot", "rate"]
199+
col_order = ["x", "y", "id", "filt_top", "filt_bot", "index", "rate"]
199200
expected_timeseries = expected_timeseries[col_order]
200201

201202
pd.testing.assert_frame_equal(

imod/util/expand_repetitions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def resample_timeseries(
104104
# The entries before the start of the well timeseries do not have data yet,
105105
# so we fill them in here. Keep rate to zero and pad the location columns with
106106
# the first entry.
107-
location_columns = ["x", "y", "id", "filt_top", "filt_bot"]
107+
location_columns = ["x", "y", "id", "filt_top", "filt_bot", "index"]
108108
time_before_start_input = (
109109
intermediate_df["time"].values < well_rate["time"].values[0]
110110
)

0 commit comments

Comments
 (0)