Skip to content

[On hold] Force mask #43

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion openqdc/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def datasets():
table = PrettyTable(["Name", "Forces", "Level of theory"])
for dataset in AVAILABLE_DATASETS:
empty_dataset = AVAILABLE_DATASETS[dataset].no_init()
has_forces = False if not empty_dataset.__force_methods__ else True
has_forces = False if not empty_dataset.force_methods else True
table.add_row([dataset, has_forces, ",".join(empty_dataset.__energy_methods__)])
table.align = "l"
print(table)
Expand Down
36 changes: 25 additions & 11 deletions openqdc/datasets/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import pickle as pkl
from copy import deepcopy
from itertools import compress
from os.path import join as p_join
from typing import Dict, List, Optional, Union

Expand Down Expand Up @@ -89,12 +90,11 @@ class BaseDataset:
Base class for datasets in the openQDC package.
"""

__energy_methods__ = []
__force_methods__ = []
energy_target_names = []
force_target_names = []
__energy_methods__ = []
__force_mask__ = []
__isolated_atom_energies__ = []

__energy_unit__ = "hartree"
__distance_unit__ = "ang"
__forces_unit__ = "hartree/ang"
Expand Down Expand Up @@ -158,7 +158,7 @@ def _convert_data(self):
f"Converting {self.__name__} data to the following units:\n\
Energy: {self.energy_unit},\n\
Distance: {self.distance_unit},\n\
Forces: {self.force_unit if self.__force_methods__ else 'None'}"
Forces: {self.force_unit if self.force_methods else 'None'}"
)
for key in self.data_keys:
self.data[key] = self._convert_on_loading(self.data[key], key)
Expand Down Expand Up @@ -222,7 +222,7 @@ def _precompute_E(self):
)

def _precompute_F(self):
if len(self.__force_methods__) == 0:
if len(self.force_methods) == 0:
return NOT_DEFINED
converted_force_data = self.convert_forces(self.data["forces"])
force_mean = np.nanmean(converted_force_data, axis=0)
Expand Down Expand Up @@ -270,7 +270,7 @@ def preprocess_path(self):
@property
def data_keys(self):
keys = list(self.data_types.keys())
if len(self.__force_methods__) == 0:
if len(self.force_methods) == 0:
keys.remove("forces")
return keys

Expand Down Expand Up @@ -302,6 +302,20 @@ def atoms_per_molecules(self):
except: # noqa
return None

@property
def energy_methods(self):
return self.__class__.__energy_methods__

@property
def force_methods(self):
return list(compress(self.energy_methods, self.force_mask))

@property
def force_mask(self):
if len(self.__class__.__force_mask__) == 0:
self.__class__.__force_mask__ = [False] * len(self.energy_methods)
return self.__class__.__force_mask__

def _set_units(self, en, ds):
old_en, old_ds = self.energy_unit, self.distance_unit
en = en if en is not None else old_en
Expand All @@ -311,16 +325,16 @@ def _set_units(self, en, ds):
self.set_energy_unit(en)
# if ds is not None:
self.set_distance_unit(ds)
if self.__force_methods__:
if self.force_methods:
self.__forces_unit__ = self.energy_unit + "/" + self.distance_unit
self.__class__.__fn_forces__ = get_conversion(old_en + "/" + old_ds, self.__forces_unit__)

def _set_isolated_atom_energies(self):
if self.__energy_methods__ is None:
if self.energy_methods is None:
logger.error("No energy methods defined for this dataset.")
f = get_conversion("hartree", self.__energy_unit__)
self.__isolated_atom_energies__ = f(
np.array([IsolatedAtomEnergyFactory.get_matrix(en_method) for en_method in self.__energy_methods__])
np.array([IsolatedAtomEnergyFactory.get_matrix(en_method) for en_method in self.energy_methods])
)

def convert_energy(self, x):
Expand Down Expand Up @@ -399,7 +413,7 @@ def read_preprocess(self, overwrite_local_cache=False):
f"Dataset {self.__name__} with the following units:\n\
Energy: {self.energy_unit},\n\
Distance: {self.distance_unit},\n\
Forces: {self.force_unit if self.__force_methods__ else 'None'}"
Forces: {self.force_unit if self.force_methods else 'None'}"
)
self.data = {}
for key in self.data_keys:
Expand Down Expand Up @@ -595,7 +609,7 @@ def get_statistics(self, normalization: str = "formation", return_none: bool = T
if normalization not in POSSIBLE_NORMALIZATION:
raise NormalizationNotAvailableError(normalization)
selected_stats = stats[normalization]
if len(self.__force_methods__) == 0 and not return_none:
if len(self.force_methods) == 0 and not return_none:
selected_stats.update(
{
"forces": {
Expand Down
7 changes: 1 addition & 6 deletions openqdc/datasets/potential/ani.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,6 @@ class ANI1CCX(ANI1):
"NPNO-CCSD(T):cc-pVTZ Correlation Energy",
"TPNO-CCSD(T):cc-pVDZ Correlation Energy",
]

__force_methods__ = []
force_target_names = []

def __smiles_converter__(self, x):
Expand Down Expand Up @@ -150,10 +148,7 @@ class ANI1X(ANI1):
"wB97x:def2-TZVPP Atomic Forces",
]

__force_methods__ = [
"wb97x/6-31g(d)",
"wb97x/cc-pvtz",
]
__force_mask__ = [False, False, False, False, False, False, True, True]

def convert_forces(self, x):
return super().convert_forces(x) * 0.529177249 # correct the Dataset error
Expand Down
5 changes: 1 addition & 4 deletions openqdc/datasets/potential/comp6.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,7 @@ class COMP6(BaseDataset):
# "WB97M-D3(BJ):def2-tzvp",
# "WB97M:def2-tzvp",
]

__force_methods__ = [
"wb97x/6-31g*",
]
__force_mask__ = [True, False, False, False, False, False, False]

force_target_names = [
"Gradient",
Expand Down
8 changes: 4 additions & 4 deletions openqdc/datasets/potential/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ class Dummy(BaseDataset):

__name__ = "dummy"
__energy_methods__ = ["I_solved_the_schrodinger_equation_by_hand", "PM6"]
__force_methods__ = ["I_made_up_random_forces", "writing_1_to_every_coordinate"]
__force_mask__ = [False, True]
__energy_unit__ = "kcal/mol"
__distance_unit__ = "ang"
__forces_unit__ = "kcal/mol/ang"

energy_target_names = [f"energy{i}" for i in range(len(__energy_methods__))]

force_target_names = [f"forces{i}" for i in range(len(__force_methods__))]
force_target_names = [f"forces{i}" for i in range(len(__force_mask__))]
__isolated_atom_energies__ = []
__average_n_atoms__ = None

Expand Down Expand Up @@ -75,8 +75,8 @@ def setup_dummy(self):
) # (sum(n_atoms), 5)
name = [f"dummy_{i}" for i in range(len(self))]
subset = ["dummy" for i in range(len(self))]
energies = np.random.rand(len(self), len(self.__energy_methods__))
forces = np.concatenate([np.random.randn(size, 3, len(self.__force_methods__)) * 100 for size in n_atoms])
energies = np.random.rand(len(self), len(self.energy_methods))
forces = np.concatenate([np.random.randn(size, 3, len(self.force_methods)) * 100 for size in n_atoms])
self.data = dict(
n_atoms=n_atoms,
position_idx_range=position_idx_range,
Expand Down
8 changes: 1 addition & 7 deletions openqdc/datasets/potential/gdml.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,7 @@ class GDML(BaseDataset):
"PBE-TS Energy",
]

__force_methods__ = [
"ccsd/cc-pvdz",
"ccsd(t)/cc-pvdz",
# "pbe/mbd", # MD22
# "pbe+mbd/tight", #MD22
"pbe/vdw-ts", # MD17
]
__force_mask__ = [True, True, True]

force_target_names = [
"CCSD Gradient",
Expand Down
4 changes: 1 addition & 3 deletions openqdc/datasets/potential/iso_17.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,7 @@ class ISO17(BaseDataset):
"PBE-TS Energy",
]

__force_methods__ = [
"pbe/vdw-ts",
]
__force_mask__ = [True]

force_target_names = [
"PBE-TS Gradient",
Expand Down
2 changes: 1 addition & 1 deletion openqdc/datasets/potential/qm7x.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ class QM7X(BaseDataset):

energy_target_names = ["ePBE0", "eMBD"]

__force_methods__ = ["pbe0/mbd", "dft3b"]
__force_mask__ = [True, True]

force_target_names = ["pbe0FOR", "vdwFOR"]

Expand Down
4 changes: 1 addition & 3 deletions openqdc/datasets/potential/sn2_rxn.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,7 @@ class SN2RXN(BaseDataset):
"DSD-BLYP-D3(BJ):def2-TZVP Atomization Energy",
]

__force_methods__ = [
"dsd-blyp-d3(bj)/def2-tzvp",
]
__force_mask__ = [True]

force_target_names = [
"DSD-BLYP-D3(BJ):def2-TZVP Gradient",
Expand Down
4 changes: 1 addition & 3 deletions openqdc/datasets/potential/solvated_peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,7 @@ class SolvatedPeptides(BaseDataset):
"revPBE-D3(BJ):def2-TZVP Atomization Energy",
]

__force_methods__ = [
"revpbe-d3(bj)/def2-tzvp",
]
__force_mask__ = [True]

force_target_names = [
"revPBE-D3(BJ):def2-TZVP Gradient",
Expand Down
2 changes: 1 addition & 1 deletion openqdc/datasets/potential/spice.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class Spice(BaseDataset):

__name__ = "spice"
__energy_methods__ = ["wb97m-d3bj/def2-tzvppd"]
__force_methods__ = ["wb97m-d3bj/def2-tzvppd"]
__force_mask__ = [True]
__energy_unit__ = "hartree"
__distance_unit__ = "bohr"
__forces_unit__ = "hartree/bohr"
Expand Down
5 changes: 1 addition & 4 deletions openqdc/datasets/potential/transition1x.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,7 @@ class Transition1X(BaseDataset):
"wB97x_6-31G(d).energy",
]

__force_methods__ = [
"wb97x/6-31G(d)",
]

__force_mask__ = [True]
force_target_names = [
"wB97x_6-31G(d).forces",
]
Expand Down