Skip to content

Commit e9ab358

Browse files
committed
Fixed lingo, shortestpath, topological-torsion and atom-pair fingerprints
Fixed a bug in the circular fingerprint (by default it was generating FCFP2 fingerprints instead of ECFP6). Added atompairs (CDK), rdk-descriptor (RDKit) and spectrophore (Open Babel) fingerprints. Renamed the substructure fingerprint to cdk-substructure. Removed the signature fingerprint. When a fingerprint is generated without specifying a fingerprint size or depth, the default parameters are now always used. The size and depth settings are now supported by many more fingerprints. Tried to fix the heteroencoder fingerprint, but an error occurs when it loads the model.
1 parent 3bc2da3 commit e9ab358

File tree

8 files changed

+207
-101
lines changed

8 files changed

+207
-101
lines changed

Example/test_fingerprint.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,9 @@
88
import numpy as np
99
from PyFingerprint.fingerprint import get_fingerprint, get_fingerprints
1010

11-
cdktypes = ['standard', 'extended', 'graph', 'maccs', 'pubchem', 'estate', 'hybridization', 'lingo',
12-
'klekota-roth', 'shortestpath', 'signature', 'substructure']
13-
rdktypes = ['rdkit', 'morgan', 'rdk-maccs', 'topological-torsion', 'avalon', 'atom-pair']
14-
babeltypes = ['fp2', 'fp3', 'fp4']
11+
cdktypes = ['standard', 'extended', 'graph', 'maccs', 'pubchem', 'estate', 'hybridization', 'lingo', 'klekota-roth', 'shortestpath', 'cdk-substructure', 'circular', 'atompairs']
12+
rdktypes = ['rdkit', 'morgan', 'rdk-maccs', 'topological-torsion', 'avalon', 'atom-pair', 'rdk-descriptor']
13+
babeltypes = ['fp2', 'fp3', 'fp4', 'spectrophore']
1514
vectypes = ['mol2vec', 'heteroencoder']
1615

1716
smi = 'CCCCN'

PyFingerprint/babel.py

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,44 @@
33
Created on Thu Nov 15 10:49:26 2018
44
55
@author: hcji
6+
7+
Updated on Thu Oct 12 20:04:22 2023
8+
9+
@author: Jnelen
610
"""
711

8-
from openbabel import pybel
12+
from openbabel import openbabel, pybel
913

10-
def ob_fingerprint(smi, fp_type='FP2', nbit=1024):
14+
import numpy as np
15+
16+
def convertToFP(bits, nbit):
17+
fp = np.zeros(nbit)
18+
for i in bits:
19+
fp[i] = 1
20+
return list(fp)
1121

22+
def ob_fingerprint(smi, fp_type='FP2', depth=0):
1223

1324
mol = pybel.readstring("smi", smi)
25+
1426
if fp_type == 'fp2':
15-
fp = mol.calcfp('FP2')
27+
bits = mol.calcfp('FP2').bits
28+
nbit = 1024
29+
fp = convertToFP(bits, nbit)
1630
elif fp_type == 'fp3':
17-
fp = mol.calcfp('FP3')
31+
bits = mol.calcfp('FP3').bits
32+
nbit = 55
33+
fp = convertToFP(bits, nbit)
1834
elif fp_type == 'fp4':
19-
fp = mol.calcfp('FP4')
20-
bits = fp.bits
21-
bits = [x for x in bits if x < nbit]
22-
return bits, nbit
35+
bits = mol.calcfp('FP4').bits
36+
nbit = 307
37+
fp = convertToFP(bits, nbit)
38+
elif fp_type == 'spectrophore':
39+
mol.addh()
40+
mol.make3D()
41+
spectrophoreCalculator = openbabel.OBSpectrophore()
42+
spectrophoreCalculator.SetNormalization(depth)
43+
fp = list(spectrophoreCalculator.GetSpectrophore(mol.OBMol))
44+
else:
45+
raise IOError('invalid fingerprint type')
46+
return fp

PyFingerprint/cdk.py

Lines changed: 66 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
Created on Wed Nov 14 12:46:18 2018
44
55
@author: hcji
6+
7+
Updated on Thu Oct 12 20:04:22 2023
8+
9+
@author: Jnelen
610
"""
711

812
import os
@@ -11,7 +15,7 @@
1115
import PyFingerprint
1216

1317
if not isJVMStarted():
14-
cdk_path = os.path.join(PyFingerprint.__path__[0], 'CDK', 'cdk-2.2.jar')
18+
cdk_path = os.path.join(PyFingerprint.__path__[0], 'CDK', 'cdk-2.9.jar')
1519
startJVM(getDefaultJVMPath(), "-ea", "-Djava.class.path=%s" % cdk_path)
1620
cdk = JPackage('org').openscience.cdk
1721

@@ -28,50 +32,88 @@ def cdk_parser_smiles(smi):
2832

2933
def get_fingerprinter(name, size, depth):
3034
### This was getting made every time!
31-
_fingerprinters = {"standard":lambda : cdk.fingerprint.Fingerprinter(size, depth)
32-
, "extended":lambda : cdk.fingerprint.ExtendedFingerprinter(size, depth)
33-
, "graph":lambda : cdk.fingerprint.GraphOnlyFingerprinter(size, depth)
34-
, "maccs":lambda : cdk.fingerprint.MACCSFingerprinter()
35-
, "pubchem":lambda : cdk.fingerprint.PubchemFingerprinter(cdk.silent.SilentChemObjectBuilder.getInstance())
36-
, "estate":lambda : cdk.fingerprint.EStateFingerprinter()
37-
, "hybridization":lambda : cdk.fingerprint.HybridizationFingerprinter(size, depth)
38-
, "lingo":lambda : cdk.fingerprint.LingoFingerprinter(depth)
39-
, "klekota-roth":lambda : cdk.fingerprint.KlekotaRothFingerprinter()
40-
, "shortestpath":lambda : cdk.fingerprint.ShortestPathFingerprinter(size)
41-
, "signature": lambda : cdk.fingerprint.SignatureFingerprinter(depth)
42-
, "circular": lambda : cdk.fingerprint.CircularFingerprinter()
43-
, "substructure": lambda : cdk.fingerprint.SubstructureFingerprinter()
44-
}
35+
## Checking if the depth is specified. If not, we use the default CDK values where appropriate
36+
if depth == None:
37+
_fingerprinters = {"standard":lambda : cdk.fingerprint.Fingerprinter(size, 7)
38+
, "atompairs": lambda : cdk.fingerprint.AtomPairs2DFingerprinter()
39+
, "extended":lambda : cdk.fingerprint.ExtendedFingerprinter(size, 7)
40+
, "graph":lambda : cdk.fingerprint.GraphOnlyFingerprinter(size, 7)
41+
, "maccs":lambda : cdk.fingerprint.MACCSFingerprinter()
42+
, "pubchem":lambda : cdk.fingerprint.PubchemFingerprinter(cdk.silent.SilentChemObjectBuilder.getInstance())
43+
, "estate":lambda : cdk.fingerprint.EStateFingerprinter()
44+
, "hybridization":lambda : cdk.fingerprint.HybridizationFingerprinter(size, 7)
45+
, "lingo":lambda : cdk.fingerprint.LingoFingerprinter()
46+
, "klekota-roth":lambda : cdk.fingerprint.KlekotaRothFingerprinter()
47+
, "shortestpath":lambda : cdk.fingerprint.ShortestPathFingerprinter(size)
48+
, "signature": lambda : cdk.fingerprint.SignatureFingerprinter()
49+
## circular fingerprint defaults to ECFP6: https://github.com/cdk/cdk/blob/125505c5ea1f69b692183bb0aae65816e7cb44e7/descriptor/fingerprint/src/main/java/org/openscience/cdk/fingerprint/CircularFingerprinter.java
50+
, "circular": lambda : cdk.fingerprint.CircularFingerprinter(4,size)
51+
, "cdk-substructure": lambda : cdk.fingerprint.SubstructureFingerprinter()
52+
}
53+
## Use the user-specified settings for the fingerprint generation
54+
else:
55+
_fingerprinters = {"standard":lambda : cdk.fingerprint.Fingerprinter(size, depth)
56+
, "atompairs": lambda : cdk.fingerprint.AtomPairs2DFingerprinter()
57+
, "extended":lambda : cdk.fingerprint.ExtendedFingerprinter(size, depth)
58+
, "graph":lambda : cdk.fingerprint.GraphOnlyFingerprinter(size, depth)
59+
, "maccs":lambda : cdk.fingerprint.MACCSFingerprinter()
60+
, "pubchem":lambda : cdk.fingerprint.PubchemFingerprinter(cdk.silent.SilentChemObjectBuilder.getInstance())
61+
, "estate":lambda : cdk.fingerprint.EStateFingerprinter()
62+
, "hybridization":lambda : cdk.fingerprint.HybridizationFingerprinter(size, depth)
63+
, "lingo":lambda : cdk.fingerprint.LingoFingerprinter(depth)
64+
, "klekota-roth":lambda : cdk.fingerprint.KlekotaRothFingerprinter()
65+
, "shortestpath":lambda : cdk.fingerprint.ShortestPathFingerprinter(size)
66+
, "signature": lambda : cdk.fingerprint.SignatureFingerprinter(depth)
67+
, "circular": lambda : cdk.fingerprint.CircularFingerprinter(depth, size)
68+
, "cdk-substructure": lambda : cdk.fingerprint.SubstructureFingerprinter()
69+
}
70+
4571
if name not in _fingerprinters:
4672
raise IOError('invalid fingerprint type')
4773

4874
return _fingerprinters[name]()
4975

50-
def cdk_fingerprint(smi, fp_type="standard", size=1024, depth=6):
51-
if fp_type == 'maccs':
52-
nbit = 166
53-
elif fp_type == 'estate':
76+
def cdk_fingerprint(smi, fp_type="standard", size=1024, depth=None):
77+
78+
mol = cdk_parser_smiles(smi)
79+
## Sanitize input molecules, as is recommended for most fingerprints (especially shortestpath)
80+
cdk.tools.manipulator.AtomContainerManipulator.percieveAtomTypesAndConfigureAtoms(mol)
81+
cdk.tools.manipulator.AtomContainerManipulator.convertImplicitToExplicitHydrogens(mol)
82+
83+
if fp_type == 'estate':
5484
nbit = 79
55-
elif fp_type == 'cdk':
85+
elif fp_type == 'maccs':
86+
nbit = 166
87+
elif fp_type == 'cdk-substructure':
5688
nbit = 307
89+
elif fp_type == 'atompairs':
90+
nbit = 780
5791
elif fp_type == 'pubchem':
5892
nbit = 881
5993
elif fp_type == 'klekota-roth':
60-
nbit = 4860
61-
elif fp_type in ['lingo', 'signature']:
94+
nbit = 4860
95+
elif fp_type == 'signature':
6296
nbit = None
97+
print("Signature_FP")
98+
fingerprinter = cdk.fingerprint.SignatureFingerprinter()
99+
mol = cdk_parser_smiles(smi)
100+
print(fingerprinter.getSize())
101+
print(fingerprinter.getBitFingerprint(mol).getSetbits())
102+
print(fingerprinter.getBitFingerprint(mol).size())
103+
print(fingerprinter.getRawFingerprint(mol))
104+
63105
else:
64106
nbit = size
65107

66-
mol = cdk_parser_smiles(smi)
108+
67109

68110
# Pull from cache if it exists
69111
if (fp_type, size, depth) in fp_map:
70112
fingerprinter = fp_map[(fp_type, size, depth)]
71113
else:
72114
fingerprinter = get_fingerprinter(fp_type, size, depth)
73115
fp_map[(fp_type, size, depth)] = fingerprinter
74-
116+
75117
fp_obj = fingerprinter.getBitFingerprint(mol)
76118
bits = list(fp_obj.getSetbits())
77119
return bits, nbit

PyFingerprint/fingerprint.py

Lines changed: 39 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
Created on Wed Feb 2 08:01:30 2022
44
55
@author: jihon
6+
7+
Updated on Thu Oct 12 20:04:22 2023
8+
9+
@author: Jnelen
610
"""
711

812

@@ -18,22 +22,21 @@
1822
hc_enable = False
1923

2024

21-
cdktypes = ['standard', 'extended', 'graph', 'maccs', 'pubchem', 'estate', 'hybridization', 'lingo',
22-
'klekota-roth', 'shortestpath', 'signature', 'substructure', 'circular']
23-
rdktypes = ['rdkit', 'morgan', 'rdk-maccs', 'topological-torsion', 'avalon', 'atom-pair']
24-
babeltypes = ['fp2', 'fp3', 'fp4']
25+
cdktypes = ['standard', 'extended', 'graph', 'maccs', 'pubchem', 'estate', 'hybridization', 'lingo', 'klekota-roth', 'shortestpath', 'cdk-substructure', 'circular', 'atompairs']
26+
rdktypes = ['rdkit', 'morgan', 'rdk-maccs', 'topological-torsion', 'avalon', 'atom-pair', 'rdk-descriptor']
27+
babeltypes = ['fp2', 'fp3', 'fp4', 'spectrophore']
2528
vectypes = ['mol2vec', 'heteroencoder']
2629

27-
2830
class fingerprint:
2931

30-
def __init__(self, bits: list, values: list, n: int):
31-
self.bits = bits
32-
self.values = values
33-
self.n = n
32+
def __init__(self, fp: list):
33+
self.fp = np.array(fp)
34+
self.bits = np.nonzero(self.fp)[0]
35+
self.values = self.fp[self.bits]
36+
self.n = len(self.fp)
3437

3538
def __str__(self):
36-
return (self.bits, self.n)
39+
return str(list(self.fp))[1:-1]
3740

3841
def check(self):
3942
for i in self.bits:
@@ -46,55 +49,50 @@ def check(self):
4649
raise TypeError
4750

4851
def to_numpy(self):
49-
if self.n is None:
50-
return None
51-
elif self.n > 10024:
52-
return None
53-
v = np.zeros(self.n)
54-
for i, k in enumerate(self.bits):
55-
v[k] = self.values[i]
56-
return v
52+
return self.fp
53+
54+
def getActiveBits(self):
55+
return self.bits
56+
57+
def numActiveBits(self):
58+
return len(self.getActiveBits())
59+
60+
def getActiveValues(self):
61+
return dict(zip(self.getActiveBits(), self.values))
62+
63+
def to_str(self):
64+
return str(list(self.fp))[1:-1]
5765

5866

5967
def get_fingerprint(smi: str, fp_type: str, nbit=None, depth=None):
60-
if nbit is None:
61-
nbit = 1024
62-
if depth is None:
63-
depth = 6
6468
if fp_type in cdktypes:
65-
bits, n = cdk_fingerprint(smi, fp_type, size = nbit, depth = depth)
66-
values = [1] * len(bits)
69+
if nbit is None:
70+
nbit = 1024
71+
bits, n = cdk_fingerprint(smi, fp_type, size=nbit, depth=depth)
72+
fp = np.zeros(n)
73+
for i, k in enumerate(bits):
74+
fp[k] = 1
6775
elif fp_type in rdktypes:
68-
bits, n = rdk_fingerprint(smi, fp_type, size = nbit)
69-
values = [1] * len(bits)
76+
fp = list(rdk_fingerprint(smi, fp_type, size=nbit))
77+
7078
elif fp_type in babeltypes:
71-
if nbit is None:
72-
nbit = 307
73-
bits, n = ob_fingerprint(smi, fp_type)
74-
values = [1] * len(bits)
79+
fp = ob_fingerprint(smi, fp_type)
7580
elif fp_type == 'mol2vec':
76-
values = list(mol2vec_fingerprint(smi))
77-
n = len(values)
78-
bits = list(np.arange(n))
81+
fp = list(mol2vec_fingerprint(smi))
7982
elif fp_type == 'heteroencoder':
8083
if not hc_enable:
8184
raise IOError('heteroencoder is not enabled')
82-
values = list(hc_fingerprint(smi))
83-
n = len(values)
84-
bits = list(np.arange(n))
85+
fp = list(hc_fingerprint(smi))
8586
else:
8687
raise IOError('invalid fingerprint type')
87-
return fingerprint(bits, values, n)
88+
return fingerprint(fp)
8889

8990

9091
def get_fingerprints(smlist: list, fp_type: str, nbit=None, depth=None):
9192
if fp_type not in vectypes:
9293
output = [get_fingerprint(smi, fp_type, nbit, depth) for smi in smlist]
9394
elif fp_type == 'mol2vec':
94-
vecs = mol2vec_fingerprints(smlist)
95-
n = vecs.shape[1]
96-
bits = list(np.arange(n))
97-
output = [fingerprint(bits, vecs[i,:], n) for i in range(vecs.shape[0])]
95+
output = [get_fingerprint(smi, fp_type, nbit, depth) for smi in smlist]
9896
elif fp_type == 'heteroencoder':
9997
if not hc_enable:
10098
raise IOError('heteroencoder is not enabled')

PyFingerprint/heteroencoder.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2012,15 +2012,15 @@ def transform(self, mols, misses=False):
20122012
'''
20132013

20142014
def hc_fingerprint(smi: str):
2015-
model_name = os.path.join(PyFingerprint.__path__[0], 'Heteroencoder', 'heteroencoder_model.pkl')
2015+
model_name = os.path.join(PyFingerprint.__path__[0], 'Heteroencoder', 'heteroencoder_model')
20162016
model = DDC(model_name = model_name)
20172017
molb = Chem.rdchem.Mol.ToBinary(Chem.MolFromSmiles(smi))
20182018
vec = model.transform(model.vectorize([molb]))
20192019
return vec[0,0,:]
20202020

20212021

20222022
def hc_fingerprints(smlist: list):
2023-
model_name = os.path.join(PyFingerprint.__path__[0], 'Heteroencoder', 'heteroencoder_model.pkl')
2023+
model_name = os.path.join(PyFingerprint.__path__[0], 'Heteroencoder', 'heteroencoder_model')
20242024
model = DDC(model_name = model_name)
20252025
molb = [Chem.rdchem.Mol.ToBinary(Chem.MolFromSmiles(smi)) for smi in smlist]
20262026
vecs = model.transform(model.vectorize(molb))

0 commit comments

Comments
 (0)