-
Notifications
You must be signed in to change notification settings - Fork 7
/
process_pdbbind_data.py
104 lines (76 loc) · 3.96 KB
/
process_pdbbind_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os, sys, time, glob, warnings
from os.path import join, basename, dirname
from tqdm import trange
import numpy as np
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from drughive.molecules import MolParser
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('path', type=str, help='Path to extract. Could be single file or directory.')
parser.add_argument('--clear', action='store_true')
args = parser.parse_args()
data_path = args.path
grid_width = 24 # angstroms
pad = 5 # angstroms
dist_thresh = (2**0.5)*(grid_width/2 + pad)
folders = [x for x in os.listdir(data_path) if not any([p in x for p in ['readme','index', '.txt']])]
if args.clear:
fs_remove = glob.glob(join(data_path, '**', '*.npy'), recursive=True) ## TODO: delete
for f in fs_remove:
os.remove(f)
else:
pocket_only = True
for i in trange(len(folders)):
f = folders[i]
# load ligand
m = glob.glob(os.path.join(os.path.join(data_path,f),'*_ligand.mol2')) # ligand
if len(m) == 0:
continue
mfile = m[0]
pdb_id = basename(dirname(mfile))
try:
lig_parse = MolParser(fname=mfile)
rdlig = lig_parse.get_rdmol(sanitize=False, removeHs=False)
rdlig.UpdatePropertyCache(strict=False)
Chem.SanitizeMol(rdlig, sanitizeOps=Chem.rdmolops.SanitizeFlags.SANITIZE_SETAROMATICITY, catchErrors=True)
atom_types_lig = lig_parse.get_atom_types(rdlig)
atom_types_lig = lig_parse.types_list2vec(atom_types_lig)
atom_coords_lig = rdlig.get_coords()
idxs = atom_types_lig[:,0] != 1
atom_types_lig = atom_types_lig[idxs]
atom_coords_lig = atom_coords_lig[idxs]
center_lig = atom_coords_lig.mean(axis=0)
atom_coords_lig -= center_lig
np.save(os.path.join(mfile.replace('.mol2','.coords.npy')), arr=atom_coords_lig.astype(np.float16))
np.save(os.path.join(mfile.replace('.mol2','.types.npy')), arr=atom_types_lig.astype(np.int16))
#load protein
if pocket_only:
m = glob.glob(os.path.join(os.path.join(data_path,f),'*_pocket.pdb')) # protein pocket only
else:
m = glob.glob(os.path.join(os.path.join(data_path,f),'*_protein.pdb')) # full protein
if len(m) == 0:
continue
mfile = m[0]
prot_parse = MolParser(fname=mfile)
rdprot = prot_parse.get_rdmol(remove_hetero=True, removeHs=False)
rdprot.UpdatePropertyCache(strict=False)
Chem.SanitizeMol(rdprot, sanitizeOps=Chem.rdmolops.SanitizeFlags.SANITIZE_SETAROMATICITY, catchErrors=True)
atom_types_prot = prot_parse.get_atom_types(rdprot)
atom_types_prot = lig_parse.types_list2vec(atom_types_prot)
atom_coords_prot = rdprot.get_coords()
atom_coords_prot -= center_lig
idxs = atom_types_prot[:,0] != 1
atom_types_prot = atom_types_prot[idxs]
atom_coords_prot = atom_coords_prot[idxs]
cnorm = np.linalg.norm(atom_coords_prot, axis=-1)
atom_coords_prot = atom_coords_prot[cnorm < dist_thresh]
atom_types_prot = atom_types_prot[cnorm < dist_thresh]
np.save(os.path.join(mfile.replace('.pdb','.coords.npy')), arr=atom_coords_prot.astype(np.float16))
np.save(os.path.join(mfile.replace('.pdb','.types.npy')), arr=atom_types_prot.astype(np.int16))
except Exception as e:
warnings.warn(f'PDB ID: {pdb_id} failed')
raise e