Merge pull request #147 from SEDenmarkLab/dev-lena-benchmarks
Additional benchmarks for SI
esalx authored Oct 15, 2024
2 parents 01e5a32 + 6a69a52 commit 21f2bf9
Showing 4 changed files with 236 additions and 0 deletions.
88 changes: 88 additions & 0 deletions benchmarks/archive_collection_comparison.py
@@ -0,0 +1,88 @@
# This script shows how the read/write timing measurements for the different collection backends were performed
import molli as ml
import timeit

N = 3

# ml.aux.assert_molli_version_min("1.0.0b2")

_clib = ml.ConformerLibrary("bpa_test.clib")

with _clib.reading():
ensembles = {k: v for k, v in _clib.items()}


def _dir_col(path, overwrite=False):
return ml.storage.Collection(
path,
backend=ml.storage.DirCollectionBackend,
value_decoder=lambda x: ml.ConformerEnsemble.loads_mol2(x.decode()),
value_encoder=lambda x: ml.ConformerEnsemble.dumps_mol2(x).encode(),
ext=".mol2",
readonly=False,
overwrite=overwrite,
)


def _zip_col(path, overwrite=False):
return ml.storage.Collection(
path,
backend=ml.storage.ZipCollectionBackend,
value_decoder=lambda x: ml.ConformerEnsemble.loads_mol2(x.decode()),
value_encoder=lambda x: ml.ConformerEnsemble.dumps_mol2(x).encode(),
ext=".mol2",
readonly=False,
overwrite=overwrite,
)


def _tar_col(path, overwrite=False):
return ml.storage.Collection(
path,
backend=ml.storage.TarCollectionBackend,
value_decoder=lambda x: ml.ConformerEnsemble.loads_mol2(x.decode()),
value_encoder=lambda x: ml.ConformerEnsemble.dumps_mol2(x).encode(),
ext=".mol2",
readonly=False,
overwrite=overwrite,
)


def _ukv_col(path, overwrite=False):
return ml.ConformerLibrary(
path,
readonly=False,
overwrite=overwrite,
)


# Note: bpa_test_deflate5.zip is not included here because the compressed format cannot be written into
for prep, path in (
(_ukv_col, "bpa_test.clib"),
(_tar_col, "bpa_test.tar"),
(_zip_col, "bpa_test.zip"),
# (_dir_col, "bpa_test"),
):
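    # The setup runs before each of the 5 timed repeats, re-creating the
    # collection with overwrite=True so every run writes the full ensemble
    # set into a fresh archive.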
clib_write_times = timeit.Timer(
stmt="""with library.writing():\n for k, v in ensembles.items(): library[k]=v""",
setup="""library = prep(path, overwrite=True)""",
globals=globals(),
).repeat(5, number=1)

print("Writing times", path, min(clib_write_times), clib_write_times, flush=True)

# Note: bpa_test_deflate5.zip is written from the compressed "bpa_test" directory created after the first loop
for prep, path in (
(_ukv_col, "bpa_test.clib"),
(_tar_col, "bpa_test.tar"),
(_zip_col, "bpa_test.zip"),
# (_zip_col, "bpa_test_deflate5.zip"),
# (_dir_col, "bpa_test"),
):
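    # Each repeat reopens the existing archive and decodes every ensemble
    # under the reading() context; the minimum of the 5 runs is reported
    # alongside the full list below.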
clib_read_times = timeit.Timer(
stmt="""with library.reading():\n for k, v in library.items(): pass""",
setup="""library = prep(path, overwrite=False)""",
globals=globals(),
).repeat(5, number=1)

print("Read times", path, min(clib_read_times), clib_read_times, flush=True)
66 changes: 66 additions & 0 deletions benchmarks/hd5f_collection_comparison.py
@@ -0,0 +1,66 @@
import molli as ml
import timeit
import h5py
import numpy as np

_clib = ml.ConformerLibrary("bpa_test.clib")

with _clib.reading():
ensembles = {k: v for k, v in _clib.items()}

path = "bpa_test.hdf5"


def write_hdf5():
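    # Each ensemble becomes its own HDF5 group holding "coords",
    # "atomic_charges", "weights", "atoms" (element numbers), and "bonds"
    # (atom index pairs plus bond type) datasets.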
with h5py.File(path, "w") as f:
for entry_key, ensemble in ensembles.items():
group = f.create_group(entry_key)
group.create_dataset(
name="coords",
data=ensemble.coords,
dtype="float32",
)

group.create_dataset(
name="atomic_charges", data=ensemble.atomic_charges, dtype="float32"
)
group.create_dataset(name="weights", data=ensemble.weights, dtype="float32")
group.create_dataset(
name="atoms",
data=[int(a.element) for a in ensemble.atoms],
dtype="int16",
)
atom_idxs = {a: i for i, a in enumerate(ensemble.atoms)}
group.create_dataset(
name="bonds",
data=[
[atom_idxs[bond.a1], atom_idxs[bond.a2], int(bond.btype)]
for bond in ensemble.bonds
],
dtype="int16",
)


def read_hdf5():
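    # Rebuild full ConformerEnsemble objects (atoms, coordinates, weights,
    # charges, and bonds) from each group so that the timed work includes
    # object reconstruction, not just raw I/O.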
with h5py.File(path, "r") as f:
for entry_key in f.keys():
group = f[entry_key]
ens = ml.ConformerEnsemble(
[int(i) for i in group["atoms"][:]],
n_conformers=group["coords"].shape[0],
name=entry_key,
coords=group["coords"][:],
weights=group["weights"][:],
atomic_charges=group["atomic_charges"][:],
)
for a1, a2, bt in group["bonds"][:]:
ens.connect(int(a1), int(a2), btype=ml.BondType(bt))


# Measure writing speed
write_times = timeit.repeat(write_hdf5, repeat=5, number=1)
print(f"Writing time: min {min(write_times):.6f} seconds of {write_times}")

# Measure reading speed
read_times = timeit.repeat(read_hdf5, repeat=5, number=1)
print(f"Reading time: min {min(read_times):.6f} seconds of {read_times}")
1 change: 1 addition & 0 deletions molli/storage/__init__.py
@@ -2,6 +2,7 @@
CollectionBackendBase,
DirCollectionBackend,
ZipCollectionBackend,
TarCollectionBackend,
UkvCollectionBackend,
MlibCollectionBackend,
)
81 changes: 81 additions & 0 deletions molli/storage/backends.py
@@ -1,6 +1,8 @@
import io
from fasteners import InterProcessLock, InterProcessReaderWriterLock
from glob import glob
from zipfile import ZipFile, is_zipfile
from tarfile import TarFile, TarInfo, is_tarfile
from .ukvfile import UKVFile

import abc
@@ -317,6 +319,85 @@ def _truncate(self, key: bytes) -> bytes:
self._zipfile.remove(key)


class TarCollectionBackend(CollectionBackendBase):
    """A Collection backend that stores entries as members of a tar archive."""
def __init__(
self,
path,
*,
overwrite: bool = False,
readonly: bool = True,
ext: str = ".mol2",
mode: Literal["r", "w", "a", "x"] = "r",
bufsize=0,
) -> None:
self.ext = ext
super().__init__(path, mode=mode, bufsize=bufsize, readonly=readonly)

with self._lock.write_lock():
if not self._path.is_file():
with TarFile(
self._path,
mode="x",
):
pass
elif overwrite:
with TarFile(
self._path,
mode="w",
):
pass

def lock_acquire(self):
self._plock = InterProcessReaderWriterLock(rwlock(self._path))
self._plock.acquire()

def lock_release(self):
self._tarfile.close()
self._plock.release()

def begin_read(self):
if is_tarfile(str(self._path)):
self._tarfile = TarFile(self._path, mode="r")

def end_read(self):
self._tarfile.close()

def begin_write(self):
self._tarfile = TarFile(self._path, mode="a")

def end_write(self):
self._tarfile.close()

def get_path(self, key: str):
return f"{key}"

def update_keys(self):
self._keys = {
name for name in self._tarfile.getnames() if name.endswith(self.ext)
}

def _write(self, key: str, value: bytes):
tarinfo = TarInfo(name=f"{self.get_path(key)}{self.ext}")
tarinfo.size = len(value)
self._tarfile.addfile(tarinfo, io.BytesIO(value))

    def _read(self, key: str) -> bytes:
        try:
            f = self._tarfile.extractfile(key)
            return f.read()
        except KeyError:
            print(f"No such file or directory: {key}")

    def _truncate(self, key: bytes) -> bytes:
        # TarFile offers no in-place member removal, so reopening the archive
        # in "w" mode truncates it entirely.
        with TarFile(
            self._path,
            mode="w",
        ):
            pass


@deprecated(
"Mlib file format was significantly updated and replaced by UKV file format. Please consider"
" using `molli recollect` to update your Collection format."
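For reference, a minimal usage sketch of the new TarCollectionBackend, mirroring the _tar_col helper in benchmarks/archive_collection_comparison.py (the source library path and the copy/read pattern are taken from that script):

import molli as ml

# Open (or create) a tar-backed Collection of mol2-encoded ConformerEnsembles.
tar_col = ml.storage.Collection(
    "bpa_test.tar",
    backend=ml.storage.TarCollectionBackend,
    value_decoder=lambda x: ml.ConformerEnsemble.loads_mol2(x.decode()),
    value_encoder=lambda x: ml.ConformerEnsemble.dumps_mol2(x).encode(),
    ext=".mol2",
    readonly=False,
    overwrite=True,
)

# Copy ensembles from an existing ConformerLibrary into the tar archive ...
src = ml.ConformerLibrary("bpa_test.clib")
with src.reading(), tar_col.writing():
    for key, ensemble in src.items():
        tar_col[key] = ensemble

# ... and read them back.
with tar_col.reading():
    for key, ensemble in tar_col.items():
        pass  # each value is decoded back into a ConformerEnsemble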