Mordred Barysz matrix#618
Conversation
| See skfp/fingerprints/data/mordred-community_bsd_license.txt for the license text. | ||
| """ | ||
|
|
||
| _BARYSZ_PROPERTIES = ["Z", "m", "v", "se", "pe", "are", "p", "i"] |
There was a problem hiding this comment.
Change names to follow convention from autocorrelation
| "VR3", | ||
| ] | ||
|
|
||
| _CARBON = Atom(6) |
|
|
||
| Requires a connected molecule (single fragment). | ||
| """ | ||
| return _barysz_values(mol_regular, n_frags), FEATURE_NAMES |
There was a problem hiding this comment.
Just compute them directly here
| len(_BARYSZ_PROPERTIES) * len(_BARYSZ_ATTRIBUTES), np.nan, dtype=np.float32 | ||
| ) | ||
|
|
||
| values: list[float | np.floating] = [] |
There was a problem hiding this comment.
I don't think you need to type this
| carbon_value = prop_func(_CARBON) | ||
|
|
||
| property_values = np.asarray( | ||
| [prop_func(atom) for atom in mol.GetAtoms()], dtype=float |
There was a problem hiding this comment.
Maybe np.float32, since we are using it everywhere?
| matrix[j, i] = weight | ||
|
|
||
| matrix = floyd_warshall(matrix, directed=False) | ||
| with np.errstate(divide="ignore", invalid="ignore"): |
There was a problem hiding this comment.
You can decorate the whole function probably
| PROPERTY_FUNCS: dict[str, Callable[[Atom], float]] = { | ||
| "Z": get_atomic_number, | ||
| "m": get_mass, | ||
| "v": get_van_der_waals_volume, | ||
| "se": get_sanderson_electronegativity, | ||
| "pe": get_pauling_electronegativity, | ||
| "are": get_allred_rochow_electronegativity, | ||
| "p": get_polarizability, | ||
| "i": get_ionization_potential, | ||
| } |
There was a problem hiding this comment.
Good idea, but use readable names. Also add this to autocorrelation
| _CARBON = Atom(6) | ||
|
|
||
| FEATURE_NAMES = [ | ||
| f"{attr}_Dz{prop}" for prop in _BARYSZ_PROPERTIES for attr in _BARYSZ_ATTRIBUTES |
There was a problem hiding this comment.
Once you rename properties to be more readable, also remember to rename overall Mordred feature names
| _SMILES = { | ||
| "Benzene": "c1ccccc1", | ||
| "Hexane": "CCCCCC", | ||
| "Caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C", | ||
| "Aspirin": "CC(=O)Oc1ccccc1C(=O)O", | ||
| "Limonene": "CC1=CCC(=CC1)C(C)=C", | ||
| } | ||
|
|
||
| with open(Path(__file__).parent / "references" / "barysz_matrix.json") as f: | ||
| _REFERENCE = json.load(f) | ||
|
|
||
| _PROPERTIES = ["Z", "m", "v", "se", "pe", "are", "p", "i"] | ||
| _ATTRIBUTES = [ | ||
| "SpAbs", | ||
| "SpMax", | ||
| "SpDiam", | ||
| "SpAD", | ||
| "SpMAD", | ||
| "LogEE", | ||
| "SM1", | ||
| "VE1", | ||
| "VE2", | ||
| "VE3", | ||
| "VR1", | ||
| "VR2", | ||
| "VR3", | ||
| ] |
| _SMILES = { | ||
| "Benzene": "c1ccccc1", | ||
| "Hexane": "CCCCCC", | ||
| "Caffeine": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C", | ||
| "Aspirin": "CC(=O)Oc1ccccc1C(=O)O", | ||
| "Limonene": "CC1=CCC(=CC1)C(C)=C", | ||
| } | ||
|
|
||
| with open(Path(__file__).parent / "references" / "barysz_matrix.json") as f: | ||
| _REFERENCE = json.load(f) | ||
|
|
||
| _PROPERTIES = ["Z", "m", "v", "se", "pe", "are", "p", "i"] | ||
| _ATTRIBUTES = [ | ||
| "SpAbs", | ||
| "SpMax", | ||
| "SpDiam", | ||
| "SpAD", | ||
| "SpMAD", | ||
| "LogEE", | ||
| "SM1", | ||
| "VE1", | ||
| "VE2", | ||
| "VE3", | ||
| "VR1", | ||
| "VR2", | ||
| "VR3", | ||
| ] | ||
|
|
||
|
|
||
| @pytest.fixture(scope="module") | ||
| def computed_values(): | ||
| computed = {} | ||
| for name, smiles in _SMILES.items(): | ||
| mol = preprocess_mol(MolFromSmiles(smiles)) | ||
| values, feature_names = calc(mol, n_frags=1) | ||
| computed[name] = dict(zip(feature_names, values, strict=True)) | ||
| return computed | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("molecule", list(_SMILES)) |
There was a problem hiding this comment.
Use those properties directly. You can use tuples in pytest.mark.parametrize also
Changes
This pull request introduces the
barysz_matrixdescriptor to the NewMordred fingerprint calculation and extends the atomic properties registry used across descriptors.New Features:
Added the barysz_matrix descriptor, which computes 104 spectral features (13 attributes × 8 atomic properties). The Barysz matrix is a weighted distance matrix where edge weights between bonded atoms are inversely proportional to their atomic property values and bond order, normalized against the corresponding carbon–carbon value. Shortest paths are resolved via the Floyd–Warshall algorithm. Spectral attributes (SpAbs, SpMax, SpDiam, SpAD, SpMAD, LogEE, SM1, VE1–VE3, VR1–VR3) are computed for each of the 8 atomic properties (Z, m, v, se, pe, are, p, i). All features are set to NaN for disconnected molecules.
Added PROPERTY_FUNCS dictionary to atomic_properties.py, mapping short property names ("Z", "m", "v", etc.) to their corresponding atom getter functions. This replaces the legacy _AUTOCORRELATION_PROPERTY_FUNCS pattern and provides a shared registry for descriptors that need lookup by short name.
Checklist before requesting a review
make test-coverage)make docsand seedocs/_build/index.html)