-
Notifications
You must be signed in to change notification settings - Fork 0
/
exe2_pdb.py
147 lines (129 loc) · 7.16 KB
/
exe2_pdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# -*- coding: utf-8 -*-
from Bio.Data.SCOPData import protein_letters_3to1
from Bio.PDB.MMCIFParser import MMCIFParser # Tip: This module might be useful for parsing...
import numpy as np
############# Exercise 2: Protein Data Bank #############
# General remark: In our exercise every structure will have EXACTLY ONE model.
# This is true for nearly all X-Ray structures. NMR structures have several models.
class PDB_Parser:
CIF_PARSER = MMCIFParser() # parser object for reading in structure in CIF format
def __init__(self, path):
"""
Initialize every PDB_Parser with a path to a structure-file in CIF format.
An example file is included in the repository (7ahl.cif).
Tip: Store the parsed structure in an object variable instead of parsing it
again & again ...
"""
self.structure = MMCIFParser().get_structure('7AHL', path)
def get_chain(self, chain_id):
return self.structure[0][chain_id]
# return next(c for c in self.structure.get_chains() if c.id == chain_id)
# 2.8 Chains
def get_number_of_chains(self):
"""
Input:
self: Use Biopython.PDB structure which has been stored in an object variable
Return:
Number of chains in this structure as integer.
"""
return len(list(self.structure.get_chains()))
# 2.9 Sequence
def get_sequence(self, chain_id):
"""
Input:
self: Use Biopython.PDB structure which has been stored in an object variable
chain_id : String (usually in ['A','B', 'C' ...]. The number of chains
depends on the specific protein and the resulting structure)
Return:
Return the amino acid sequence (single-letter alphabet!) of a given chain (chain_id)
in a Biopython.PDB structure as a string.
"""
return ''.join([protein_letters_3to1[r.get_resname()] for r in self.get_chain(chain_id).get_residues()
if r.get_resname() in protein_letters_3to1])
# 2.10 Water molecules
def get_number_of_water_molecules(self, chain_id):
"""
Input:
self: Use Biopython.PDB structure which has been stored in an object variable
chain_id : String (usually in ['A','B', 'C' ...]. The number of chains
depends on the specific protein and the resulting structure)
Return:
Return the number of water molecules of a given chain (chain_id)
in a Biopython.PDB structure as an integer.
"""
return len([r for r in self.get_chain(chain_id).get_residues() if r.get_resname() == 'HOH'])
# 2.11 C-alpha distance
def get_ca_distance(self, chain_id_1, index_1, chain_id_2, index_2):
"""
Input:
self: Use Biopython.PDB structure which has been stored in an object variable
chain_id_1 : String (usually in ['A','B', 'C' ...]. The number of chains
depends on the specific protein and the resulting structure)
index_1 : index of a residue in a given chain in a Biopython.PDB structure
chain_id_2 : String (usually in ['A','B', 'C' ...]. The number of chains
depends on the specific protein and the resulting structure)
index_2 : index of a residue in a given chain in a Biopython.PDB structure
chain_id_1 and index_1 describe precisely one residue in a PDB structure,
chain_id_2 and index_2 describe the second residue.
Return:
Return the C-alpha (!) distance between the two residues, described by
chain_id_1/index_1 and chain_id_2/index_2. Round the returned value via int().
The reason for using two different chains as an input is that also the distance
between residues of different chains can be interesting.
Different chains in a PDB structure can either occur between two different proteins
(Heterodimers) or between different copies of the same protein (Homodimers).
"""
c1 = self.get_chain(chain_id_1)
c2 = self.get_chain(chain_id_2)
return int(np.linalg.norm(c1[index_1]['CA'].coord - c2[index_2]['CA'].coord))
# 2.12 Contact Map
def get_contact_map(self, chain_id):
"""
Input:
self: Use Biopython.PDB structure which has been stored in an object variable
chain_id : String (usually in ['A','B', 'C' ...]. The number of chains
depends on the specific protein and the resulting structure)
Return:
Return a complete contact map (see description in exercise sheet)
for a given chain in a Biopython.PDB structure as numpy array.
The values in the matrix describe the c-alpha distance between all residues
in a chain of a Biopython.PDB structure.
Only integer values of the distance have to be given (see below).
"""
r = [ri for ri in self.get_chain(chain_id).get_residues() if 'CA' in ri]
rn = len(r)
contact_map = np.zeros((rn, rn), dtype=np.float32)
for i, r1 in enumerate(r):
for j, r2 in enumerate(r):
if i < j:
contact_map[i][j] = contact_map[j][i] = self.get_ca_distance(chain_id, r1.id[1], chain_id, r2.id[1])
return contact_map.astype(np.int64) # return rounded (integer) values
# 2.13 B-Factors
def get_bfactors(self, chain_id):
"""
Input:
self: Use Biopython.PDB structure which has been stored in an object variable
chain_id : String (usually in ['A','B', 'C' ...]. The number of chains
depends on the specific protein and the resulting structure)
Return:
Return the B-Factors for all residues in a chain of a Biopython.PDB structure.
The B-Factors describe the mobility of an atom or a residue.
In a Biopython.PDB structure B-Factors are given for each atom in a residue.
Calculate the mean B-Factor for a residue by averaging over the B-Factor
of all atoms in a residue.
Sometimes B-Factors are not available for a certain residue;
(e.g. the residue was not resolved); insert np.nan for those cases.
Finally normalize your B-Factors using Standard scores (zero mean, unit variance).
You have to use np.nanmean, np.nanvar etc. if you have nan values in your array.
The returned data structure has to be a numpy array rounded again to integer.
"""
r = list(self.get_chain(chain_id).get_residues())
b = np.array([np.nanmean([a.bfactor for a in ri.get_atoms()]) for ri in r if ri.get_resname() != 'HOH'])
b -= b.mean()
b /= b.std()
return b.astype(np.int64) # return rounded (integer) values
def main():
print('PDB parser class.')
return None
if __name__ == '__main__':
main()