8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -9,8 +9,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed

- `aimmd.distributed`: Improve `aimmd.distributed.pathmovers.PathMover` class inheritance structure
- `aimmd.distributed`: The density adaption is now part of the `aimmd.distributed.SPSelector` classes; the additional density adaption scheme "lazzeri" has been added.
- `aimmd.distributed.Brain`: use tqdm progress bars
- `aimmd.distributed.pathmovers`: Shooting pathmovers now take the `aimmd.distributed.MDEngineSpec` dataclass as input argument to specify all MD engine/propagation options together, instead of specifying each argument separately as before (see the sketch after this list)
- Complete rewrite of `aimmd.distributed.CommittorSimulation`; the corresponding example notebook has been updated and improved.
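
A hypothetical sketch of the new bundled call pattern (the dataclass fields below are illustrative stand-ins; the actual fields are defined in `aimmd.distributed.dataclasses`):

```python
from dataclasses import dataclass, field

# Illustrative stand-in only: the real aimmd.distributed.MDEngineSpec lives in
# aimmd/distributed/dataclasses.py and its fields may differ.
@dataclass
class MDEngineSpecSketch:
    engine_cls: type  # MD engine class used for propagation
    engine_kwargs: dict = field(default_factory=dict)  # kwargs for the engine class
    walltime_per_part: float = 0.01  # walltime (in hours) per trajectory part

# one bundled spec instead of passing every engine/propagation option separately
spec = MDEngineSpecSketch(engine_cls=object)
```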

### Removed

- `aimmd.distributed`: The density collection `BrainTask` has been removed as it is no longer needed after the rework of density collection (see Changed above).

## [0.9.3] - 2025-08-03

### Added
237 changes: 237 additions & 0 deletions aimmd/base/density_collection.py
@@ -0,0 +1,237 @@
# This file is part of aimmd
#
# aimmd is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# aimmd is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with aimmd. If not, see <https://www.gnu.org/licenses/>.
"""
This file contains the helper classes to collect (and correct for) the density
of configurations on trajectories when projected onto the selection coordinate.

In the standard case we will attempt to flatten the distribution of (shooting)
configurations along the committor.
"""
import asyncio
import logging

import numpy as np


logger = logging.getLogger(__name__)


class DensityCollector:
# TODO: expand docstring
"""
Keep track of density of configurations on trajectories projected to probabilities.
"""

def __init__(self, n_dim: int, bins: int = 10) -> None:
self.n_dim = n_dim
self.bins = bins
self.density_histogram = np.zeros(tuple(bins for _ in range(n_dim)))
self._trajectories = []
self._weights = []
self._n_samp = 0

# need to know the number of forbidden bins, i.e. where sum_prob > 1
bounds = np.arange(0., 1., 1./bins)
# this magic line creates all possible combinations of probs
# as an array of shape (bins**n_dim, n_dim)
combs = np.array(np.meshgrid(*tuple(bounds for _ in range(self.n_dim)))
).T.reshape(-1, self.n_dim)
sums = np.sum(combs, axis=1)
n_forbidden_bins = len(np.where(sums > 1)[0])
self._n_allowed_bins = bins**self.n_dim - n_forbidden_bins
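# [e.g., for n_dim=1 no bin is forbidden, while for n_dim=2 the bins in
# the upper-right triangle of probability space (where p_1 + p_2 > 1)
# are excluded from the count of allowed bins]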

def reset(self) -> None:
# TODO: document this!
self.density_histogram = np.zeros(tuple(self.bins for _ in range(self.n_dim)))
self._trajectories = []
self._weights = []
self._n_samp = 0

# TODO: cleanup everything below!

def add_density_for_trajectories(self, model, trajectories, weights=None):
# TODO: rewrite docstring when we know what this function does!
"""
Evaluate the density on the given trajectories.

Only **add** the counts for the added trajectories according to the
current models predictions to the existing histogram in probability
space.
Additionally store trajectories/descriptors for later reevaluation.

See self.reevaluate_density() to only recreate the density estimate
without adding new trajectories.

Parameters:
-----------
model - aimmd.base.RCModel predicting commitment probabilities
trajectories - iterator/iterable of trajectories to evaluate
counts - None or list of weights for the trajectories,
i.e. we will add every trajectory with weight=counts,
if None, every trajectory has equal weight,
can also be a list of np.arrays (each of the same length as
the corresponding trajectory), i.e. supports different weights
per configuration.

"""
if weights is None:
# give each point an equal weight of 1
weights = [np.ones(shape=(len(t), )) for t in trajectories]
# now predict for the newly added
preds = [model(tra) for tra in trajectories]
for pred, tra, w in zip(preds, trajectories, weights):
# and add to histogram
histo, _ = np.histogramdd(sample=pred,
bins=self.bins,
range=[(0., 1.)
for _ in range(self.n_dim)],
weights=w,
)
self.density_histogram += histo
self._n_samp += pred.shape[0]
# store trajectories and weights for reevaluation
self._trajectories.append(tra)
self._weights.append(w)

def reevaluate_density(self, model):
# TODO: rewrite the docstring when we know what this function does!
"""
Reevaluate the density for all stored trajectories.

Replace the density histogram with a new density estimate for all
stored trajectories using the current model's predictions.

Parameters:
-----------
model - aimmd.base.RCModel predicting commitment probabilities

"""
# keep a ref to current trajs and weights, then reset self
trajs, weights = self._trajectories, self._weights
self.reset()
# and finally (re)add all trajectories using the current model
self.add_density_for_trajectories(model=model, trajectories=trajs,
weights=weights)

def get_counts(self, probabilities):
# TODO: Docstring!
"""
Return the current histogram counts for the given probability vectors.

Parameters:
-----------
probabilities - numpy.ndarray, shape=(n_points, self.n_dim)

Returns:
--------
counts - numpy.ndarray, shape=(n_points,), values of the density
histogram at the given points in probability space

"""
# we take the min to make sure we are always in the
# histogram range, even if p = 1
idxs = tuple(np.intp(
np.minimum(np.floor(probabilities[:, i] * self.bins),
self.bins - 1)
)
for i in range(self.n_dim)
)
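# [e.g., with bins=10: p = 0.42 -> floor(4.2) = bin index 4, while
# p = 1.0 -> floor(10.0) = 10 is clipped to the last index, 9]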
return self.density_histogram[idxs]

def get_correction(self, probabilities):
# TODO: Docstring!
"""
Return the 'flattening factor' for the observed density of points.

The factor is calculated as total_count / counts[probabilities],
i.e. the factor is 1 / rho(probabilities).
Note that we replace the potential infinite values appearing if
the counts in a bin are zero by a large but finite value derived
from the idea of add-one-smoothing.
I.e. instead of our estimated rho = 0 we use 0 < rho << 1 when
calculating 1 / rho.
"""
dens = self.get_counts(probabilities)
if (norm := np.sum(self.density_histogram)) == 0.:
return np.ones_like(dens) # we don't have any density yet
with np.errstate(divide="ignore"):
# ignore errors through division by zero
factor = norm / dens
# Now replace all potential infs by a large (but finite) value
if np.any(np.isinf(factor)):
# weight_1_samp is the average weight per sample configuration
weight_1_samp = norm / self._n_samp
# the bit below is similar to laplace add-one smoothing, just
# that we use weight_1_samp instead of adding a one, because we
# do not know if the average weight per sample \approx 1
# [in laplace add-one smoothing we would use:
# (norm + self._n_allowed_bins) / (dens + 1)
# here dens = 0 (since we got the inf)]
# and self._n_allowed_bins becomes self._n_allowed_bins * weight_1_samp,
# i.e., we add one average sample into each allowed bin
# (as in laplace add-one smoothing)
factor[factor == np.inf] = norm/weight_1_samp + self._n_allowed_bins
# NOTE: this is what we used before
#factor[factor == np.inf] = ((norm + self._n_allowed_bins)
# / weight_1_samp
# )
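# [worked example with illustrative numbers: for bins=10, n_dim=1
# (10 allowed bins) and 100 samples of unit weight we have
# norm = 100 and weight_1_samp = 1, so an empty bin gets
# factor = 100/1 + 10 = 110, i.e., as if one average-weight sample
# sat in the bin while the total weight grew by one such sample
# per allowed bin]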

return factor


# TODO: maybe just merge the two classes into one and give the async versions of
# the methods an "_async" suffix in their name?! (and add a check for an async model?)
# ...this would make it more readable and we would also get rid of the override
# mismatch for the methods
class DensityCollectorAsync(DensityCollector):
# NOTE: Take the docstrings of the superclass (i.e. the non-async version)
__doc__ = ("Async version of the DensityCollector \n"
+ "Parent class docstring: \n"
+ str(DensityCollector.__doc__)
)
# NOTE: "steal" docstrings also for the methods we overwrite, they can
# (and should) be taken from DensityCollector

async def add_density_for_trajectories(self, model, trajectories, weights=None):
if weights is None:
# give each point an equal weight of 1
weights = [np.ones(shape=(len(t), )) for t in trajectories]
# now predict for the newly added
preds = await asyncio.gather(*(model(tra) for tra in trajectories))
for pred, tra, w in zip(preds, trajectories, weights):
# and add to histogram
histo, _ = np.histogramdd(sample=pred,
bins=self.bins,
range=[(0., 1.)
for _ in range(self.n_dim)],
weights=w,
)
self.density_histogram += histo
self._n_samp += pred.shape[0]
# store trajectories and weights for reevaluation
self._trajectories.append(tra)
self._weights.append(w)

add_density_for_trajectories.__doc__ = DensityCollector.add_density_for_trajectories.__doc__

async def reevaluate_density(self, model):
# keep a ref to current trajs and weights, then reset self
trajs, weights = self._trajectories, self._weights
self.reset()
# and finally (re)add all trajectories using the current model
await self.add_density_for_trajectories(model=model, trajectories=trajs,
weights=weights)

reevaluate_density.__doc__ = DensityCollector.reevaluate_density.__doc__
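
A minimal usage sketch of the new classes (the `toy_model` callable and the random trajectories are hypothetical stand-ins for an `aimmd.base.RCModel` and real trajectory objects):

```python
import asyncio

import numpy as np

def toy_model(traj):
    # stand-in for an RCModel: map each configuration to committor
    # probabilities of shape (len(traj), n_dim)
    return np.clip(traj.mean(axis=1, keepdims=True), 0., 1.)

rng = np.random.default_rng(42)
trajectories = [rng.random(size=(100, 3)) for _ in range(5)]

dc = DensityCollector(n_dim=1, bins=10)
dc.add_density_for_trajectories(model=toy_model, trajectories=trajectories)

# correction factor ~ norm / counts, i.e. ~ 1 / rho(p)
query = np.array([[0.05], [0.5], [0.95]])
print(dc.get_counts(query), dc.get_correction(query))

# the async version awaits the model instead of calling it directly
async def toy_model_async(traj):
    return toy_model(traj)

dca = DensityCollectorAsync(n_dim=1, bins=10)
asyncio.run(dca.add_density_for_trajectories(model=toy_model_async,
                                             trajectories=trajectories))
```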
33 changes: 19 additions & 14 deletions aimmd/base/storage.py
@@ -22,11 +22,12 @@
import h5py
import asyncmd
import numpy as np
from pkg_resources import parse_version
#from pkg_resources import parse_version # pkg_resources is deprecated
from packaging.version import parse as parse_version

from . import _H5PY_PATH_DICT
from .trainset import TrainSet
from ..distributed.pathmovers import ModelDependentPathMover
from ..distributed.pathmovers import PathMover
from .. import __version__


@@ -78,6 +79,8 @@ def write(self, byts):


# buffered version, seems to be a bit faster(?)
# TODO: inherit from BytesIO/FileIO or the corresponding ABC
#class BytesStreamtoH5pyBuffered(io.BytesIO):
class BytesStreamtoH5pyBuffered:
"""
'Translate' from python bytes objects to arrays of uint8. Buffered Version.
@@ -168,6 +171,8 @@ def write(self, byts):
return add_len


# TODO: inherit from BytesIO/FileIO or the corresponding ABC
#class H5pytoBytesStream(io.BytesIO):
class H5pytoBytesStream:
"""
'Translate' from arrays of uint8s to python bytes objects.
@@ -441,23 +446,23 @@ def __init__(self, grp, modelstore, mcstep=None):

def save(self, mcstep):
mover = mcstep.mover
if isinstance(mover, ModelDependentPathMover):
if isinstance(mover, PathMover):
mover_modelstore = mover.modelstore
# check if they use the same hdf5 group; the RCModelshelf objects
# do not need to be the same (and most often are not)
if mover.modelstore is not None:
# movers that have been pickled can have modelstore=None
if mover_modelstore._group != self._modelstore._group:
logger.error("saving a mcstep with a 'foreign' modelstore")
mcstep.mover.modelstore = None
mcstep.mover.modelstore = None
super().save(obj=mcstep, overwrite=False, buffsize=2**22)
if isinstance(mover, ModelDependentPathMover):
if isinstance(mover, PathMover):
# reset the modelstore of the mc.mover in case we use it somewhere else
mcstep.mover.modelstore = mover_modelstore

def load(self):
mcstep = super().load(buffsize=2**22)
if isinstance(mcstep.mover, ModelDependentPathMover):
if isinstance(mcstep.mover, PathMover):
mcstep.mover.modelstore = self._modelstore
return mcstep

@@ -635,7 +640,7 @@ def save(self, obj, buffsize=2 ** 22):
obj.modelstore = None
mover_modelstores = []
for mover in obj.movers:
if isinstance(mover, ModelDependentPathMover):
if isinstance(mover, PathMover):
if mover.modelstore._group != self.modelstore._group:
logger.error("Saving a mover in the PathSamplingChain with"
" a different modelstore.")
@@ -653,7 +658,7 @@ def save(self, obj, buffsize=2 ** 22):
obj.mcstep_collection = pcs_step_collection
obj.modelstore = pcs_modelstore
for mover, mover_ms in zip(obj.movers, mover_modelstores):
if isinstance(mover, ModelDependentPathMover):
if isinstance(mover, PathMover):
mover.modelstore = mover_ms
return

Expand All @@ -664,7 +669,7 @@ def load(self, buffsize=2 ** 22):
# the step will not have a modelstore set anymore because it has
# been pickled)
for mover in obj.movers:
if isinstance(mover, ModelDependentPathMover):
if isinstance(mover, PathMover):
mover.modelstore = self.modelstore
obj.modelstore = self.modelstore
return obj
@@ -866,13 +871,13 @@ def __init__(self, fname, mode='a', copy_asyncmd_caches=False):
self._dirname = os.path.dirname(os.path.abspath(os.path.join(os.getcwd(), fname)))
if ("w" in mode) or ("a" in mode and not fexists):
# first creation of file: write aimmd compatibility version string
self._store.attrs["storage_version"] = np.string_(
self._store.attrs["storage_version"] = np.bytes_(
self._compatibility_version
)
self._store.attrs["aimmd_version"] = np.string_(__version__)
self._store.attrs["aimmd_version"] = np.bytes_(__version__)
# save the current (i.e. where the file is when we opened it) dirname to attrs
# we need this to be able to save tensorflow models properly
self._store.attrs["dirname"] = np.string_(self._dirname)
self._store.attrs["dirname"] = np.bytes_(self._dirname)
else:
store_version = parse_version(
self._store.attrs["storage_version"].decode("ASCII")
@@ -894,7 +899,7 @@ def __init__(self, fname, mode='a', copy_asyncmd_caches=False):
# i.e. try to not break backwards compatibility
if mode != "r":
# storage open with write intent, so just add the attr for dirname
self._store.attrs["dirname"] = np.string_(self._dirname)
self._store.attrs["dirname"] = np.bytes_(self._dirname)
logger.debug("Converted 'old' storage to 'new' format by adding "
+ "the 'dirname' attr.")
else:
@@ -915,7 +920,7 @@ def __init__(self, fname, mode='a', copy_asyncmd_caches=False):
+ "if you did not copy the KerasRCmodel directory yourself.")
else:
# we can just change the path
self._store.attrs["dirname"] = np.string_(self._dirname)
self._store.attrs["dirname"] = np.bytes_(self._dirname)
# but we warn because the folder with the models must be copied
# TODO/FIXME: automatically copy the folder from old to new location?
logger.warn("The directory containing the storage changed, we updated it in the storage."
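
A minimal sketch of the two drop-in replacements used in this file (assuming the `packaging` package is installed and NumPy >= 2, where the `np.string_` alias was removed):

```python
import numpy as np
from packaging.version import parse as parse_version

# packaging.version.parse keeps the comparison semantics of the
# deprecated pkg_resources.parse_version
assert parse_version("0.9.3") < parse_version("0.10.0")

# np.bytes_ is the current spelling of the removed np.string_ alias;
# both create fixed bytes strings, e.g. for hdf5 attrs
attr_val = np.bytes_("0.9.3")
assert attr_val.decode("ASCII") == "0.9.3"
```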
3 changes: 2 additions & 1 deletion aimmd/distributed/__init__.py
@@ -15,6 +15,7 @@
along with AIMMD. If not, see <https://www.gnu.org/licenses/>.
"""
from . import config
from .committors import CommittorSimulation, CommittorConfiguration, CommittorEngineSpec
from .dataclasses import MDEngineSpec
from .committors import CommittorSimulation, CommittorConfiguration
# TODO: what PathSampling stuff do we want to have available at the top level?
from .pathsampling import Brain