-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
276 lines (214 loc) · 7.38 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
from Bio.PDB.Residue import Residue
from Bio.PDB.Polypeptide import is_aa
from Bio.PDB.PDBIO import Select, PDBIO
from Bio.PDB import PDBParser
def is_nucleic(residue: Residue) -> bool:
"""
Checks if the given residue is a nucleic acid.
Parameters
----------
residue : Bio.PDB.Residue.Residue
The residue to check.
Returns
-------
bool
True if the residue is a nucleic acid, False otherwise.
"""
return residue.get_resname() in ["DA", "DC", "DG", "DT", "A", "C", "G", "U"]
# Combine Complex and Positive Select also remove hydrogen atoms
class ComplexPositiveSelect(Select):
def accept_chain(self, chain):
# Remove chains with no residues and hetero atoms
if len(chain) == 0 or chain.id == " ":
return 0
else:
return 1
def accept_residue(self, residue):
if (is_aa(residue) or is_nucleic(residue)) and residue.id[1] > 0:
return 1
else:
return 0
def clean_pdb(pdb_path: str, out_path: str, select=ComplexPositiveSelect()) -> None:
pdb_id: str = pdb_path.split("/")[-1].split(".")[0]
pdb_parser: PDBParser = PDBParser()
structure: Structure = pdb_parser.get_structure(pdb_id, pdb_path)
io: PDBIO = PDBIO()
io.set_structure(structure)
io.save(out_path, select)
# def remove_neg_resSeq(traj: md.Trajectory) -> md.Trajectory:
# """
# Removes atoms with negative resSeq values from a mdtraj.Trajectory object.
# Parameters
# ----------
# traj : mdtraj.Trajectory
# The input trajectory.
# Returns
# -------
# mdtraj.Trajectory
# The output trajectory containing only atoms with positive resSeq values.
# """
# # Select all atoms with positive resSeq values
# positive_resSeq_atoms = traj.topology.select("resSeq < 1000 and resSeq > 0")
# # Create a new trajectory containing only the atoms with positive resSeq values
# traj_no_neg_resSeq = traj.atom_slice(positive_resSeq_atoms)
# return traj_no_neg_resSeq
# def remove_water(traj: md.Trajectory) -> md.Trajectory:
# """
# Removes water molecules from a mdtraj.Trajectory object.
# Parameters
# ----------
# traj : mdtraj.Trajectory
# The input trajectory.
# Returns
# -------
# mdtraj.Trajectory
# The output trajectory containing only non-water atoms.
# """
# # Select all non-water atoms
# non_water_atoms = traj.topology.select("not water")
# # Create a new trajectory containing only the non-water atoms
# traj_no_water = traj.atom_slice(non_water_atoms)
# return traj_no_water
# def remove_hydrogen(traj: md.Trajectory) -> md.Trajectory:
# """
# Removes hydrogen atoms from a mdtraj.Trajectory object.
# Parameters
# ----------
# traj : mdtraj.Trajectory
# The input trajectory.
# Returns
# -------
# mdtraj.Trajectory
# The output trajectory containing only non-hydrogen atoms.
# """
# # Select all non-hydrogen atoms
# non_hydrogen_atoms = traj.topology.select("not element H")
# # Create a new trajectory containing only the non-hydrogen atoms
# traj_no_hydrogen = traj.atom_slice(non_hydrogen_atoms)
# return traj_no_hydrogen
# # def remove_extra_chains(traj):
# # """
# # Removes chains that are not protein or ssDNA from a mdtraj.Trajectory object.
# # Parameters
# # ----------
# # traj : mdtraj.Trajectory
# # The input trajectory.
# # Returns
# # -------
# # mdtraj.Trajectory
# # The output trajectory containing only protein and ssDNA chains.
# # """
# # # Create a list of chains that are not protein or ssDNA
# # chains_to_remove = []
# # for chain in traj.topology.chains:
# # if (
# # ChainTypeHelper.get_chain_type(chain) != "protein"
# # and ChainTypeHelper.get_chain_type(chain) != "ssDNA"
# # ):
# # chains_to_remove.append(chain)
# # # Remove the chains that are not protein or ssDNA
# # traj_no_extra_chains = remove_chains(traj, chains_to_remove)
# # return traj_no_extra_chains
# def remove_chains(traj, chains_to_remove):
# """
# Removes chains from a mdtraj.Trajectory object.
# Parameters
# ----------
# traj : mdtraj.Trajectory
# The input trajectory.
# chains_to_remove : list[mdtraj.Chain]
# The list of chains to remove.
# Returns
# -------
# mdtraj.Trajectory
# The output trajectory containing only the chains that were not removed.
# """
# atom_indices_not_to_remove = []
# for chain in traj.topology.chains:
# if chain not in chains_to_remove:
# for atom in chain.atoms:
# atom_indices_not_to_remove.append(atom.index)
# traj_no_chains = traj.atom_slice(atom_indices_not_to_remove)
# return traj_no_chains
# def prepare_traj(
# traj: md.Trajectory,
# # _remove_extra_chains=True,
# _remove_water=True,
# _remove_hydrogen=True,
# _remove_negative_indices=True,
# ):
# """
# Prepares a mdtraj.Trajectory object for analysis.
# Parameters
# ----------
# traj : mdtraj.Trajectory
# The input trajectory.
# _remove_extra_chains : bool
# Whether to remove chains that are not protein or ssDNA.
# _remove_water : bool
# Whether to remove water molecules.
# _remove_hydrogen : bool
# Whether to remove hydrogen atoms.
# Returns
# -------
# mdtraj.Trajectory
# The output trajectory.
# """
# # if _remove_extra_chains:
# # traj = remove_extra_chains(traj)
# if _remove_water:
# traj = remove_water(traj)
# if _remove_hydrogen:
# traj = remove_hydrogen(traj)
# if _remove_negative_indices:
# traj = remove_neg_resSeq(traj)
# return traj
# def clean_pdb(
# pdb_path: str,
# output_path: str,
# # _remove_extra_chains=True,
# _remove_water=True,
# _remove_hydrogen=True,
# _remove_negative_indices=True,
# ) -> None:
# """
# Cleans a pdb file and saves the cleaned version.
# Parameters
# ----------
# pdb_path : str
# The path to the pdb file to clean.
# output_path : str
# The path to save the cleaned pdb file.
# _remove_extra_chains : bool
# Whether to remove chains that are not protein or ssDNA.
# _remove_water : bool
# Whether to remove water molecules.
# _remove_hydrogen : bool
# Whether to remove hydrogen atoms.
# _remove_negative_indices : bool
# Whether to remove atoms with negative resSeq values.
# """
# # Open the pdb file
# traj = md.load_pdb(pdb_path)
# # Prepare the trajectory
# traj = prepare_traj(
# traj,
# # _remove_extra_chains=_remove_extra_chains,
# _remove_water=_remove_water,
# _remove_hydrogen=_remove_hydrogen,
# _remove_negative_indices=_remove_negative_indices,
# )
# # Save the trajectory
# traj.save(output_path)
# def main():
# data_folder = "./data/ssDNA_binding_proteins_complex"
# file_name = "8DFA.pdb"
# file_path = os.path.join(data_folder, file_name)
# # Load the trajectory
# traj = md.load_pdb(file_path)
# # Prepare the trajectory
# traj = prepare_traj(traj)
# # Save the trajectory
# traj.save("./data/8DFA_clean.pdb")
# if __name__ == "__main__":
# main()