From 70d484e6d2e7f9932d6a668dd2d825f8fcfe97e6 Mon Sep 17 00:00:00 2001
From: KazuriCode
Date: Tue, 30 Jul 2024 09:03:17 -0500
Subject: [PATCH 1/6] initial commit: comparing ukv vs zip and tar

---
 .../archive_collection_comparison_timing.py   | 100 ++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 molli/benchmarks/archive_collection_comparison_timing.py

diff --git a/molli/benchmarks/archive_collection_comparison_timing.py b/molli/benchmarks/archive_collection_comparison_timing.py
new file mode 100644
index 0000000..57a4ba6
--- /dev/null
+++ b/molli/benchmarks/archive_collection_comparison_timing.py
@@ -0,0 +1,100 @@
+# This script times writing and reading of conformer collections across different storage backends
+import molli as ml
+import timeit
+
+N = 3
+
+# ml.aux.assert_molli_version_min("1.0.0b2")
+
+_clib = ml.ConformerLibrary("bpa_test.clib")
+
+with _clib.reading():
+    ensembles = {k: v for k, v in _clib.items()}
+
+
+def _dir_col(path, overwrite=False):
+    return ml.storage.Collection(
+        path,
+        backend=ml.storage.DirCollectionBackend,
+        value_decoder=lambda x: ml.ConformerEnsemble.loads_mol2(x.decode()),
+        value_encoder=lambda x: ml.ConformerEnsemble.dumps_mol2(x).encode(),
+        ext=".mol2",
+        readonly=False,
+        overwrite=overwrite,
+    )
+
+
+def _zip_col(path, overwrite=False):
+    return ml.storage.Collection(
+        path,
+        backend=ml.storage.ZipCollectionBackend,
+        value_decoder=lambda x: ml.ConformerEnsemble.loads_mol2(x.decode()),
+        value_encoder=lambda x: ml.ConformerEnsemble.dumps_mol2(x).encode(),
+        ext=".mol2",
+        readonly=False,
+        overwrite=overwrite,
+    )
+
+
+def _7zip_col(path, overwrite=False):
+    return ml.storage.Collection(
+        path,
+        backend=ml.storage.SevenZipCollectionBackend,
+        value_decoder=lambda x: ml.ConformerEnsemble.loads_mol2(x.decode()),
+        value_encoder=lambda x: ml.ConformerEnsemble.dumps_mol2(x).encode(),
+        ext=".mol2",
+        readonly=False,
+        overwrite=overwrite,
+    )
+
+
+def _tar_col(path, overwrite=False):
+    return ml.storage.Collection(
+        path,
+        backend=ml.storage.TarCollectionBackend,
+        value_decoder=lambda x: ml.ConformerEnsemble.loads_mol2(x.decode()),
+        value_encoder=lambda x: ml.ConformerEnsemble.dumps_mol2(x).encode(),
+        ext=".mol2",
+        readonly=False,
+        overwrite=overwrite,
+    )
+
+
+def _ukv_col(path, overwrite=False):
+    return ml.ConformerLibrary(
+        path,
+        readonly=False,
+        overwrite=overwrite,
+    )
+
+
+# Note: bpa_test_deflate5.zip is not included here because the compressed format cannot be written into
+for prep, path in (
+    (_ukv_col, "bpa_test.clib"),
+    (_tar_col, "bpa_test.tar"),
+    (_zip_col, "bpa_test.zip"),
+    # (_dir_col, "bpa_test"),
+):
+    clib_write_times = timeit.Timer(
+        stmt="""with library.writing():\n    for k, v in ensembles.items(): library[k]=v""",
+        setup="""library = prep(path, overwrite=True)""",
+        globals=globals(),
+    ).repeat(5, number=1)
+
+    print("Writing times", path, min(clib_write_times), clib_write_times, flush=True)
+
+# Note: bpa_test_deflate5.zip is created by compressing the "bpa_test" directory written by the first loop
+for prep, path in (
+    (_ukv_col, "bpa_test.clib"),
+    (_tar_col, "bpa_test.tar"),
+    (_zip_col, "bpa_test.zip"),
+    # (_zip_col, "bpa_test_deflate5.zip"),
+    # (_dir_col, "bpa_test"),
+):
+    clib_read_times = timeit.Timer(
+        stmt="""with library.reading():\n    for k, v in library.items(): pass""",
+        setup="""library = prep(path, overwrite=False)""",
+        globals=globals(),
+    ).repeat(5, number=1)
+
+    print("Read times", path, min(clib_read_times), clib_read_times, flush=True)
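
The commented-out (_zip_col, "bpa_test_deflate5.zip") entry in the read loop above expects a deflate-compressed archive prepared outside the benchmark. A minimal sketch of one way to build it from the "bpa_test" directory collection using only the standard library; the helper name and the flat "<key>.mol2" layout are assumptions for illustration, not part of the patch:

from pathlib import Path
from zipfile import ZIP_DEFLATED, ZipFile


def make_deflate5_zip(src_dir: str = "bpa_test", dst: str = "bpa_test_deflate5.zip") -> None:
    # Pack every <key>.mol2 file from the directory collection into a
    # deflate-compressed zip (compression level 5), keeping a flat layout
    # so entries can still be looked up by "<key>.mol2".
    with ZipFile(dst, mode="w", compression=ZIP_DEFLATED, compresslevel=5) as zf:
        for mol2 in sorted(Path(src_dir).glob("*.mol2")):
            zf.write(mol2, arcname=mol2.name)


if __name__ == "__main__":
    make_deflate5_zip()

With that archive in place, uncommenting the entry lets the read loop compare deflate-compressed zip access against the uncompressed formats.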

From 3ca5dd6f5eb349bd4654dcbd6186444b42667794 Mon Sep 17 00:00:00 2001
From: KazuriCode
Date: Tue, 30 Jul 2024 09:04:55 -0500
Subject: [PATCH 2/6] archives vs ukv comparison: removed 7zip

---
 .../archive_collection_comparison_timing.py   | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/molli/benchmarks/archive_collection_comparison_timing.py b/molli/benchmarks/archive_collection_comparison_timing.py
index 57a4ba6..9df92a2 100644
--- a/molli/benchmarks/archive_collection_comparison_timing.py
+++ b/molli/benchmarks/archive_collection_comparison_timing.py
@@ -36,18 +36,6 @@ def _zip_col(path, overwrite=False):
     )
 
 
-def _7zip_col(path, overwrite=False):
-    return ml.storage.Collection(
-        path,
-        backend=ml.storage.SevenZipCollectionBackend,
-        value_decoder=lambda x: ml.ConformerEnsemble.loads_mol2(x.decode()),
-        value_encoder=lambda x: ml.ConformerEnsemble.dumps_mol2(x).encode(),
-        ext=".mol2",
-        readonly=False,
-        overwrite=overwrite,
-    )
-
-
 def _tar_col(path, overwrite=False):
     return ml.storage.Collection(
         path,

From 0fc516d5d11a62adc723ba6563fd1bf6a207a3da Mon Sep 17 00:00:00 2001
From: KazuriCode
Date: Tue, 30 Jul 2024 09:17:14 -0500
Subject: [PATCH 3/6] added necessary backends

---
 molli/storage/__init__.py |  1 +
 molli/storage/backends.py | 81 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

diff --git a/molli/storage/__init__.py b/molli/storage/__init__.py
index 61298c4..6199a83 100644
--- a/molli/storage/__init__.py
+++ b/molli/storage/__init__.py
@@ -2,6 +2,7 @@
     CollectionBackendBase,
     DirCollectionBackend,
     ZipCollectionBackend,
+    TarCollectionBackend,
     UkvCollectionBackend,
     MlibCollectionBackend,
 )
diff --git a/molli/storage/backends.py b/molli/storage/backends.py
index 1aabe46..fcea267 100644
--- a/molli/storage/backends.py
+++ b/molli/storage/backends.py
@@ -1,6 +1,8 @@
+import io
 from fasteners import InterProcessLock, InterProcessReaderWriterLock
 from glob import glob
 from zipfile import ZipFile, is_zipfile
+from tarfile import TarFile, TarInfo, is_tarfile
 from .ukvfile import UKVFile
 
 import abc
@@ -317,6 +319,85 @@ def _truncate(self, key: bytes) -> bytes:
         self._zipfile.remove(key)
 
 
+class TarCollectionBackend(CollectionBackendBase):
+    def __init__(
+        self,
+        path,
+        *,
+        overwrite: bool = False,
+        readonly: bool = True,
+        ext: str = ".mol2",
+        mode: Literal["r", "w", "a", "x"] = "r",
+        bufsize=0,
+    ) -> None:
+        self.ext = ext
+        super().__init__(path, mode=mode, bufsize=bufsize, readonly=readonly)
+
+        with self._lock.write_lock():
+            if not self._path.is_file():
+                with TarFile(
+                    self._path,
+                    mode="x",
+                ):
+                    pass
+            elif overwrite:
+                with TarFile(
+                    self._path,
+                    mode="w",
+                ):
+                    pass
+
+    def lock_acquire(self):
+        self._plock = InterProcessReaderWriterLock(rwlock(self._path))
+        self._plock.acquire()
+
+    def lock_release(self):
+        self._tarfile.close()
+        self._plock.release()
+
+    def begin_read(self):
+        if is_tarfile(str(self._path)):
+            self._tarfile = TarFile(self._path, mode="r")
+
+    def end_read(self):
+        self._tarfile.close()
+
+    def begin_write(self):
+        self._tarfile = TarFile(self._path, mode="a")
+
+    def end_write(self):
+        self._tarfile.close()
+
+    def get_path(self, key: str):
+        return f"{key}"
+
+    def update_keys(self):
+        self._keys = {
+            name for name in self._tarfile.getnames() if name.endswith(self.ext)
+        }
+
+    def _write(self, key: str, value: bytes):
+        tarinfo = TarInfo(name=f"{self.get_path(key)}{self.ext}")
+        tarinfo.size = len(value)
+        self._tarfile.addfile(tarinfo, io.BytesIO(value))
+
+    def _read(self, key: str) -> bytes:
+        try:
+            f = self._tarfile.extractfile(key)
+            return f.read()
+        except KeyError:
+ print(f"No such file or directory: {key}") + + def _truncate(self, key: bytes) -> bytes: + with TarFile( + self._path, + mode="w", + ): + pass + + @deprecated( "Mlib file format was significantly updated and replaced by UKV file format. Please consider" " using `molli recollect` to update your Collection format." From fdba3f87d26bf21c2a7e790cdfc0639e26b18356 Mon Sep 17 00:00:00 2001 From: KazuriCode Date: Tue, 30 Jul 2024 09:22:21 -0500 Subject: [PATCH 4/6] added hdf5 comparison, updated file names --- ...ng.py => archive_collection_comparison.py} | 0 molli/benchmarks/hd5f_collection_comparison | 59 +++++++++++++++++++ 2 files changed, 59 insertions(+) rename molli/benchmarks/{archive_collection_comparison_timing.py => archive_collection_comparison.py} (100%) create mode 100644 molli/benchmarks/hd5f_collection_comparison diff --git a/molli/benchmarks/archive_collection_comparison_timing.py b/molli/benchmarks/archive_collection_comparison.py similarity index 100% rename from molli/benchmarks/archive_collection_comparison_timing.py rename to molli/benchmarks/archive_collection_comparison.py diff --git a/molli/benchmarks/hd5f_collection_comparison b/molli/benchmarks/hd5f_collection_comparison new file mode 100644 index 0000000..d00fe11 --- /dev/null +++ b/molli/benchmarks/hd5f_collection_comparison @@ -0,0 +1,59 @@ +import molli as ml +import timeit +import h5py +import numpy as np + +_clib = ml.ConformerLibrary("bpa_test.clib") + +with _clib.reading(): + ensembles = {k: v for k, v in _clib.items()} + +path = "bpa_test.hdf5" + + +def write_hdf5(): + with h5py.File(path, "w") as f: + for entry_key, ensemble in ensembles.items(): + group = f.create_group(entry_key) + group.create_dataset( + name="coords", + data=ensemble.coords, + ) + + group.create_dataset(name="atomic_charges", data=ensemble.atomic_charges) + group.create_dataset(name="weights", data=ensemble.weights) + group.create_dataset( + name="atoms", data=[int(a.element) for a in ensemble.atoms] + ) + atom_idxs = {a: i for i, a in enumerate(ensemble.atoms)} + group.create_dataset( + name="bonds", + data=[ + [atom_idxs[bond.a1], atom_idxs[bond.a2], int(bond.btype)] + for bond in ensemble.bonds + ], + ) + + +def read_hdf5(): + with h5py.File(path, "r") as f: + for entry_key in f.keys(): + group = f[entry_key] + ens = ml.ConformerEnsemble( + [int(i) for i in group["atoms"][:]], + name=entry_key, + coords=group["coords"][:], + weights=group["weights"][:], + atomic_charges=group["atomic_charges"][:], + ) + for a1, a2, bt in group["bonds"][:]: + ens.connect(int(a1), int(a2), btype=ml.BondType(bt)) + + +# Measure writing speed +write_times = timeit.repeat(write_hdf5, repeat=5, number=1) +print(f"Writing time: {min(write_times):.6f} seconds") + +# Measure reading speed +read_times = timeit.repeat(read_hdf5, repeat=5, number=1) +print(f"Reading time: {min(read_times):.6f} seconds") From a528e68b69de4ae0907b128fc367764c1808ba80 Mon Sep 17 00:00:00 2001 From: KazuriCode Date: Tue, 30 Jul 2024 15:41:19 -0500 Subject: [PATCH 5/6] moved files and improved performance --- .../archive_collection_comparison.py | 0 .../hd5f_collection_comparison.py | 16 +++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) rename {molli/benchmarks => benchmarks}/archive_collection_comparison.py (100%) rename molli/benchmarks/hd5f_collection_comparison => benchmarks/hd5f_collection_comparison.py (76%) diff --git a/molli/benchmarks/archive_collection_comparison.py b/benchmarks/archive_collection_comparison.py similarity index 100% rename from 

From a528e68b69de4ae0907b128fc367764c1808ba80 Mon Sep 17 00:00:00 2001
From: KazuriCode
Date: Tue, 30 Jul 2024 15:41:19 -0500
Subject: [PATCH 5/6] moved files and improved performance

---
 .../archive_collection_comparison.py          |  0
 .../hd5f_collection_comparison.py             | 16 +++++++++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)
 rename {molli/benchmarks => benchmarks}/archive_collection_comparison.py (100%)
 rename molli/benchmarks/hd5f_collection_comparison => benchmarks/hd5f_collection_comparison.py (76%)

diff --git a/molli/benchmarks/archive_collection_comparison.py b/benchmarks/archive_collection_comparison.py
similarity index 100%
rename from molli/benchmarks/archive_collection_comparison.py
rename to benchmarks/archive_collection_comparison.py
diff --git a/molli/benchmarks/hd5f_collection_comparison b/benchmarks/hd5f_collection_comparison.py
similarity index 76%
rename from molli/benchmarks/hd5f_collection_comparison
rename to benchmarks/hd5f_collection_comparison.py
index d00fe11..7a045dc 100644
--- a/molli/benchmarks/hd5f_collection_comparison
+++ b/benchmarks/hd5f_collection_comparison.py
@@ -18,12 +18,17 @@ def write_hdf5():
             group.create_dataset(
                 name="coords",
                 data=ensemble.coords,
+                dtype="float32",
             )
 
-            group.create_dataset(name="atomic_charges", data=ensemble.atomic_charges)
-            group.create_dataset(name="weights", data=ensemble.weights)
             group.create_dataset(
-                name="atoms", data=[int(a.element) for a in ensemble.atoms]
+                name="atomic_charges", data=ensemble.atomic_charges, dtype="float32"
+            )
+            group.create_dataset(name="weights", data=ensemble.weights, dtype="float32")
+            group.create_dataset(
+                name="atoms",
+                data=[int(a.element) for a in ensemble.atoms],
+                dtype="int16",
             )
             atom_idxs = {a: i for i, a in enumerate(ensemble.atoms)}
             group.create_dataset(
@@ -32,6 +37,7 @@ def write_hdf5():
                     [atom_idxs[bond.a1], atom_idxs[bond.a2], int(bond.btype)]
                     for bond in ensemble.bonds
                 ],
+                dtype="int16",
             )
 
 
@@ -52,8 +58,8 @@ def read_hdf5():
 
 # Measure writing speed
 write_times = timeit.repeat(write_hdf5, repeat=5, number=1)
-print(f"Writing time: {min(write_times):.6f} seconds")
+print(f"Writing time: min {min(write_times):.6f} seconds of {write_times}")
 
 # Measure reading speed
 read_times = timeit.repeat(read_hdf5, repeat=5, number=1)
-print(f"Reading time: {min(read_times):.6f} seconds")
+print(f"Reading time: min {min(read_times):.6f} seconds of {read_times}")

From da11b37ddead718b9da67c30fd4c2b5f41c18bee Mon Sep 17 00:00:00 2001
From: KazuriCode
Date: Fri, 9 Aug 2024 14:55:35 -0500
Subject: [PATCH 6/6] fix in read_hdf5()

---
 benchmarks/hd5f_collection_comparison.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/hd5f_collection_comparison.py b/benchmarks/hd5f_collection_comparison.py
index 7a045dc..60f88a7 100644
--- a/benchmarks/hd5f_collection_comparison.py
+++ b/benchmarks/hd5f_collection_comparison.py
@@ -47,6 +47,7 @@ def read_hdf5():
             group = f[entry_key]
             ens = ml.ConformerEnsemble(
                 [int(i) for i in group["atoms"][:]],
+                n_conformers=group["coords"].shape[0],
                 name=entry_key,
                 coords=group["coords"][:],
                 weights=group["weights"][:],
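
Because the revised write_hdf5() down-casts coordinates, charges, and weights to float32 and element numbers to int16, a quick round-trip check against the source library is a reasonable sanity test before comparing the timings. A sketch assuming the same bpa_test.clib input used by the benchmarks; the function name and tolerance are illustrative:

import h5py
import numpy as np

import molli as ml


def verify_hdf5_roundtrip(
    clib_path: str = "bpa_test.clib",
    hdf5_path: str = "bpa_test.hdf5",
    atol: float = 1e-4,
) -> None:
    # Compare the float32 coordinates stored in the HDF5 file against the
    # original ensembles; float32 keeps roughly seven significant digits, so
    # an absolute tolerance around 1e-4 is ample for Cartesian coordinates.
    lib = ml.ConformerLibrary(clib_path)
    with lib.reading(), h5py.File(hdf5_path, "r") as f:
        for key, ens in lib.items():
            assert np.allclose(f[key]["coords"][:], ens.coords, atol=atol), key
    print("Coordinate round trip verified within", atol)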