8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -9,8 +9,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed

- `aimmd.distributed`: Improve `aimmd.distributed.pathmovers.PathMover` class inheritance structure
- `aimmd.distributed`: The density adaption is now part of the `aimmd.distributed.SPSelector` classes; the additional density adaption scheme "lazzeri" has been added.
- `aimmd.distributed.Brain`: use tqdm progress bars
- `aimmd.distributed.pathmovers`: Shooting pathmovers now take the `aimmd.distributed.MDEngineSpec` dataclass as input argument to specify all MD engine/propagation options together, instead of specifying each argument separately as before (see the sketch after this list)
- Complete rewrite of `aimmd.distributed.CommittorSimulation`; the corresponding example notebook has been updated and improved.
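
A hypothetical sketch of the new bundled call pattern (the dataclass fields below are illustrative stand-ins; the actual fields are defined in `aimmd.distributed.dataclasses`):

```python
from dataclasses import dataclass, field

# Illustrative stand-in only: the real aimmd.distributed.MDEngineSpec lives in
# aimmd/distributed/dataclasses.py and its fields may differ.
@dataclass
class MDEngineSpecSketch:
    engine_cls: type  # MD engine class used for propagation
    engine_kwargs: dict = field(default_factory=dict)  # kwargs for the engine class
    walltime_per_part: float = 0.01  # walltime (in hours) per trajectory part

# one bundled spec instead of passing every engine/propagation option separately
spec = MDEngineSpecSketch(engine_cls=object)
```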

### Removed

- `aimmd.distributed`: The density collection `BrainTask` has been removed as it is no longer needed after the rework of density collection (see Changed above).

## [0.9.3] - 2025-08-03

### Added
237 changes: 237 additions & 0 deletions aimmd/base/density_collection.py
@@ -0,0 +1,237 @@
# This file is part of aimmd
#
# aimmd is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# aimmd is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with aimmd. If not, see <https://www.gnu.org/licenses/>.
"""
This file contains the helper classes to collect (and correct for) the density
of configurations on trajectories when projected onto the selection coordinate.

In the standard case we will attempt to flatten the distribution of (shooting)
configurations along the committor.
"""
import asyncio
import logging

import numpy as np


logger = logging.getLogger(__name__)


class DensityCollector:
# TODO: expand docstring
"""
Keep track of density of configurations on trajectories projected to probabilities.
"""

def __init__(self, n_dim: int, bins: int = 10) -> None:
self.n_dim = n_dim
self.bins = bins
self.density_histogram = np.zeros(tuple(bins for _ in range(n_dim)))
self._trajectories = []
self._weights = []
self._n_samp = 0

# need to know the number of forbidden bins, i.e. where sum_prob > 1
bounds = np.arange(0., 1., 1./bins)
# this magic line creates all possible combinations of probs
# as an array of shape (bins**n_dim, n_dim)
combs = np.array(np.meshgrid(*tuple(bounds for _ in range(self.n_dim)))
).T.reshape(-1, self.n_dim)
sums = np.sum(combs, axis=1)
n_forbidden_bins = len(np.where(sums > 1)[0])
self._n_allowed_bins = bins**self.n_dim - n_forbidden_bins
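# [e.g., for n_dim=1 no bin is forbidden, while for n_dim=2 the bins in
# the upper-right triangle of probability space (where p_1 + p_2 > 1)
# are excluded from the count of allowed bins]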

def reset(self) -> None:
# TODO: document this!
self.density_histogram = np.zeros(tuple(self.bins for _ in range(self.n_dim)))
self._trajectories = []
self._weights = []
self._n_samp = 0

# TODO: cleanup everything below!

def add_density_for_trajectories(self, model, trajectories, weights=None):
# TODO: rewrite docstring when we know what this function does!
"""
Evaluate the density on the given trajectories.

Only **add** the counts for the added trajectories according to the
current models predictions to the existing histogram in probability
space.
Additionally store trajectories/descriptors for later reevaluation.

See self.reevaluate_density() to only recreate the density estimate
without adding new trajectories.

Parameters:
-----------
model - aimmd.base.RCModel predicting commitment probabilities
trajectories - iterator/iterable of trajectories to evaluate
counts - None or list of weights for the trajectories,
i.e. we will add every trajectory with weight=counts,
if None, every trajectory has equal weight,
can also be a list of np.arrays (each of the same length as
the corresponding trajectory), i.e. supports different weights
per configuration.

"""
if weights is None:
# give each point an equal weight of 1
weights = [np.ones(shape=(len(t), )) for t in trajectories]
# now predict for the newly added
preds = [model(tra) for tra in trajectories]
for pred, tra, w in zip(preds, trajectories, weights):
# and add to histogram
histo, _ = np.histogramdd(sample=pred,
bins=self.bins,
range=[(0., 1.)
for _ in range(self.n_dim)],
weights=w,
)
self.density_histogram += histo
self._n_samp += pred.shape[0]
# store trajectories and weights for reevaluation
self._trajectories.append(tra)
self._weights.append(w)

def reevaluate_density(self, model):
# TODO: rewrite the docstring when we know what this function does!
"""
Reevaluate the density for all stored trajectories.

Replace the density histogram with a new density estimate for all
stored trajectories using the current model's predictions.

Parameters:
-----------
model - aimmd.base.RCModel predicting commitment probabilities

"""
# keep a ref to current trajs and weights, then reset self
trajs, weights = self._trajectories, self._weights
self.reset()
# and finally (re)add all trajectories using the current model
self.add_density_for_trajectories(model=model, trajectories=trajs,
weights=weights)

def get_counts(self, probabilities):
# TODO: Docstring!
"""
Return the current histogram counts for the given probability vectors.

Parameters:
-----------
probabilities - numpy.ndarray, shape=(n_points, self.n_dim)

Returns:
--------
counts - numpy.ndarray, shape=(n_points,), values of the density
histogram at the given points in probability space

"""
# we take the min to make sure we are always in the
# histogram range, even if p = 1
idxs = tuple(np.intp(
np.minimum(np.floor(probabilities[:, i] * self.bins),
self.bins - 1)
)
for i in range(self.n_dim)
)
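# [e.g., with bins=10: p = 0.42 -> floor(4.2) = bin index 4, while
# p = 1.0 -> floor(10.0) = 10 is clipped to the last index, 9]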
return self.density_histogram[idxs]

def get_correction(self, probabilities):
# TODO: Docstring!
"""
Return the 'flattening factor' for the observed density of points.

The factor is calculated as total_count / counts[probabilities],
i.e. the factor is 1 / rho(probabilities).
Note that we replace the potential infinite values appearing if
the counts in a bin are zero by a large but finite value derived
from the idea of add-one-smoothing.
I.e. instead of our estimated rho = 0 we use 0 < rho << 1 when
calculating 1 / rho.
"""
dens = self.get_counts(probabilities)
if (norm := np.sum(self.density_histogram)) == 0.:
return np.ones_like(dens) # we don't have any density yet
with np.errstate(divide="ignore"):
# ignore errors through division by zero
factor = norm / dens
# Now replace all potential infs by a large (but finite) value
if np.any(np.isinf(factor)):
# weight_1_samp is the average weight per sample configuration
weight_1_samp = norm / self._n_samp
# the bit below is similar to laplace add-one smoothing, just
# that we use weight_1_samp instead of adding a one, because we
# do not know if the average weight per sample \approx 1
# [in laplace add-one smoothing we would use:
# (norm + self._n_allowed_bins) / (dens + 1)
# here dens = 0 (since we got the inf)]
# and self._n_allowed_bins becomes self._n_allowed_bins * weight_1_samp,
# i.e., we add one average sample into each allowed bin
# (as in laplace add-one smoothing)
factor[factor == np.inf] = norm/weight_1_samp + self._n_allowed_bins
# NOTE: this is what we used before
#factor[factor == np.inf] = ((norm + self._n_allowed_bins)
# / weight_1_samp
# )
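# [worked example with illustrative numbers: for bins=10, n_dim=1
# (10 allowed bins) and 100 samples of unit weight we have
# norm = 100 and weight_1_samp = 1, so an empty bin gets
# factor = 100/1 + 10 = 110, i.e., as if one average-weight sample
# sat in the bin while the total weight grew by one such sample
# per allowed bin]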

return factor


# TODO: maybe just merge the two classes into one and give the async versions of
# the methods an "_async" suffix in their name?! (and add a check for an async model?)
# ...this would make it more readable and we would also get rid of the override
# mismatch for the methods
class DensityCollectorAsync(DensityCollector):
# NOTE: Take the docstrings of the superclass (i.e. the non-async version)
__doc__ = ("Async version of the DensityCollector \n"
+ "Parent class docstring: \n"
+ str(DensityCollector.__doc__)
)
# NOTE: "steal" docstrings also for the methods we overwrite, they can
# (and should) be taken from DensityCollector

async def add_density_for_trajectories(self, model, trajectories, weights=None):
if weights is None:
# give each point an equal weight of 1
weights = [np.ones(shape=(len(t), )) for t in trajectories]
# now predict for the newly added
preds = await asyncio.gather(*(model(tra) for tra in trajectories))
for pred, tra, w in zip(preds, trajectories, weights):
# and add to histogram
histo, _ = np.histogramdd(sample=pred,
bins=self.bins,
range=[(0., 1.)
for _ in range(self.n_dim)],
weights=w,
)
self.density_histogram += histo
self._n_samp += pred.shape[0]
# store trajectories and weights for reevaluation
self._trajectories.append(tra)
self._weights.append(w)

add_density_for_trajectories.__doc__ = DensityCollector.add_density_for_trajectories.__doc__

async def reevaluate_density(self, model):
# keep a ref to current trajs and weights, then reset self
trajs, weights = self._trajectories, self._weights
self.reset()
# and finally (re)add all trajectories using the current model
await self.add_density_for_trajectories(model=model, trajectories=trajs,
weights=weights)

reevaluate_density.__doc__ = DensityCollector.reevaluate_density.__doc__
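
A minimal usage sketch of the new classes (the `toy_model` callable and the random trajectories are hypothetical stand-ins for an `aimmd.base.RCModel` and real trajectory objects):

```python
import asyncio

import numpy as np

def toy_model(traj):
    # stand-in for an RCModel: map each configuration to committor
    # probabilities of shape (len(traj), n_dim)
    return np.clip(traj.mean(axis=1, keepdims=True), 0., 1.)

rng = np.random.default_rng(42)
trajectories = [rng.random(size=(100, 3)) for _ in range(5)]

dc = DensityCollector(n_dim=1, bins=10)
dc.add_density_for_trajectories(model=toy_model, trajectories=trajectories)

# correction factor ~ norm / counts, i.e. ~ 1 / rho(p)
query = np.array([[0.05], [0.5], [0.95]])
print(dc.get_counts(query), dc.get_correction(query))

# the async version awaits the model instead of calling it directly
async def toy_model_async(traj):
    return toy_model(traj)

dca = DensityCollectorAsync(n_dim=1, bins=10)
asyncio.run(dca.add_density_for_trajectories(model=toy_model_async,
                                             trajectories=trajectories))
```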
33 changes: 19 additions & 14 deletions aimmd/base/storage.py
@@ -22,11 +22,12 @@
import h5py
import asyncmd
import numpy as np
from pkg_resources import parse_version
#from pkg_resources import parse_version # pkg_resources is deprecated
from packaging.version import parse as parse_version

from . import _H5PY_PATH_DICT
from .trainset import TrainSet
from ..distributed.pathmovers import ModelDependentPathMover
from ..distributed.pathmovers import PathMover
from .. import __version__


@@ -78,6 +79,8 @@ def write(self, byts):


# buffered version, seems to be a bit faster(?)
# TODO: inherit from BytesIO/FileIO or the corresponding ABC
#class BytesStreamtoH5pyBuffered(io.BytesIO):
class BytesStreamtoH5pyBuffered:
"""
'Translate' from python bytes objects to arrays of uint8. Buffered Version.
@@ -168,6 +171,8 @@ def write(self, byts):
return add_len


# TODO: inherit from BytesIO/FileIO or the corresponding ABC
#class H5pytoBytesStream(io.BytesIO):
class H5pytoBytesStream:
"""
'Translate' from arrays of uint8s to python bytes objects.
@@ -441,23 +446,23 @@ def __init__(self, grp, modelstore, mcstep=None):

def save(self, mcstep):
mover = mcstep.mover
if isinstance(mover, ModelDependentPathMover):
if isinstance(mover, PathMover):
mover_modelstore = mover.modelstore
# check if they use the same hdf5 group; the RCModelshelf objects
# do not need to be the same (and most often are not)
if mover.modelstore is not None:
# movers that have been pickled can have modelstore=None
if mover_modelstore._group != self._modelstore._group:
logger.error("saving a mcstep with a 'foreign' modelstore")
mcstep.mover.modelstore = None
mcstep.mover.modelstore = None
super().save(obj=mcstep, overwrite=False, buffsize=2**22)
if isinstance(mover, ModelDependentPathMover):
if isinstance(mover, PathMover):
# reset the modelstore of the mc.mover in case we use it somewhere else
mcstep.mover.modelstore = mover_modelstore

def load(self):
mcstep = super().load(buffsize=2**22)
if isinstance(mcstep.mover, ModelDependentPathMover):
if isinstance(mcstep.mover, PathMover):
mcstep.mover.modelstore = self._modelstore
return mcstep

@@ -635,7 +640,7 @@ def save(self, obj, buffsize=2 ** 22):
obj.modelstore = None
mover_modelstores = []
for mover in obj.movers:
if isinstance(mover, ModelDependentPathMover):
if isinstance(mover, PathMover):
if mover.modelstore._group != self.modelstore._group:
logger.error("Saving a mover in the PathSamplingChain with"
" a different modelstore.")
@@ -653,7 +658,7 @@ def save(self, obj, buffsize=2 ** 22):
obj.mcstep_collection = pcs_step_collection
obj.modelstore = pcs_modelstore
for mover, mover_ms in zip(obj.movers, mover_modelstores):
if isinstance(mover, ModelDependentPathMover):
if isinstance(mover, PathMover):
mover.modelstore = mover_ms
return

Expand All @@ -664,7 +669,7 @@ def load(self, buffsize=2 ** 22):
# the step will not have a modelstore set anymore because it has
# been pickled)
for mover in obj.movers:
if isinstance(mover, ModelDependentPathMover):
if isinstance(mover, PathMover):
mover.modelstore = self.modelstore
obj.modelstore = self.modelstore
return obj
@@ -866,13 +871,13 @@ def __init__(self, fname, mode='a', copy_asyncmd_caches=False):
self._dirname = os.path.dirname(os.path.abspath(os.path.join(os.getcwd(), fname)))
if ("w" in mode) or ("a" in mode and not fexists):
# first creation of file: write aimmd compatibility version string
self._store.attrs["storage_version"] = np.string_(
self._store.attrs["storage_version"] = np.bytes_(
self._compatibility_version
)
self._store.attrs["aimmd_version"] = np.string_(__version__)
self._store.attrs["aimmd_version"] = np.bytes_(__version__)
# save the current (i.e. where the file is when we opened it) dirname to attrs
# we need this to be able to save tensorflow models properly
self._store.attrs["dirname"] = np.string_(self._dirname)
self._store.attrs["dirname"] = np.bytes_(self._dirname)
else:
store_version = parse_version(
self._store.attrs["storage_version"].decode("ASCII")
@@ -894,7 +899,7 @@ def __init__(self, fname, mode='a', copy_asyncmd_caches=False):
# i.e. try to not break backwards compatibility
if mode != "r":
# storage open with write intent, so just add the attr for dirname
self._store.attrs["dirname"] = np.string_(self._dirname)
self._store.attrs["dirname"] = np.bytes_(self._dirname)
logger.debug("Converted 'old' storage to 'new' format by adding "
+ "the 'dirname' attr.")
else:
@@ -915,7 +920,7 @@ def __init__(self, fname, mode='a', copy_asyncmd_caches=False):
+ "if you did not copy the KerasRCmodel directory yourself.")
else:
# we can just change the path
self._store.attrs["dirname"] = np.string_(self._dirname)
self._store.attrs["dirname"] = np.bytes_(self._dirname)
# but we warn because the folder with the models must be copied
# TODO/FIXME: automatically copy the folder from old to new location?
logger.warn("The directory containing the storage changed, we updated it in the storage."
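
A minimal sketch of the two drop-in replacements used in this file (assuming the `packaging` package is installed and NumPy >= 2, where the `np.string_` alias was removed):

```python
import numpy as np
from packaging.version import parse as parse_version

# packaging.version.parse keeps the comparison semantics of the
# deprecated pkg_resources.parse_version
assert parse_version("0.9.3") < parse_version("0.10.0")

# np.bytes_ is the current spelling of the removed np.string_ alias;
# both create fixed bytes strings, e.g. for hdf5 attrs
attr_val = np.bytes_("0.9.3")
assert attr_val.decode("ASCII") == "0.9.3"
```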
3 changes: 2 additions & 1 deletion aimmd/distributed/__init__.py
@@ -15,6 +15,7 @@
along with AIMMD. If not, see <https://www.gnu.org/licenses/>.
"""
from . import config
from .committors import CommittorSimulation, CommittorConfiguration, CommittorEngineSpec
from .dataclasses import MDEngineSpec
from .committors import CommittorSimulation, CommittorConfiguration
# TODO: what PathSampling stuff do we want to have available at the top level?
from .pathsampling import Brain