-
Notifications
You must be signed in to change notification settings - Fork 0
/
Inparanoid.py
244 lines (204 loc) · 7.21 KB
/
Inparanoid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#!/usr/bin/env python
from graph_database import graph_database
from uniprot_queries import uniprot_queries
from chebi_from_string import chebi_from_string
from urllib2 import Request, urlopen, URLError
import xml.etree.ElementTree as ET
import sys, os, re, glob, multiprocessing
from activepool import ActivePool
import httplib
class inparanoid():
'''
Search for orthologs of the species proteins
'''
def __init__(self, gd, fasta_repo, blastp_repo, inparanoid_repo):
self.gd = gd
self.fasta_repo = fasta_repo
self.blastp_repo = blastp_repo
self.inparanoid_repo = inparanoid_repo
def research_orthologs(self, uniID, species):
'''
research orthologs of a specific UniProtID in the inparanoid database available on the web
save the info in the neo4j database
'''
url="http://inparanoid.sbc.su.se/cgi-bin/gene_search.cgi?idtype=all;all_or_selection=all;scorelimit=0.05;rettype=xml;id=" + uniID
request = Request(url)
for n in range(10):
try:
response = urlopen(request)
txml = response.read()
root = ET.fromstring(txml)
for cluster in root.iter('cluster'):
c = cluster.attrib['nr'] #Cluster ID
for p in cluster.iter('protein'):
pid = p.attrib['prot_id']
spec = p.attrib['speclong']
if pid != uniID:
if spec in species:
self.gd.add_orthology_relationship(pid, uniID, c)
break
except URLError, e:
print 'Warning : No ortholog for %s ' % uniID
break
except httplib.IncompleteRead:
pass
return None
def extract_fasta(self, specie):
'''
Extract the fasta files for all the proteins present in the database
'''
uniprot_IDs = self.gd.obtain_uniIDs_from_specie(specie)
url = 'https://www.uniprot.org/uniprot/'
for uniID in uniprot_IDs:
if uniID != None:
uni_url = url + uniID + '.fasta'
request = Request(uni_url)
for n in range(10):
try:
response = urlopen(request, timeout=150).read()
specie_name = specie.replace(' ', '_')
with open(self.fasta_repo + '/' + specie_name + '.fasta', 'a') as uniID_file:
uniID_file.write(response)
break
except URLError, httplib.BadStatusLine:
pass
return None
def run_makeBlastDb(self, file, s, pool):
'''
'''
name = multiprocessing.current_process().name
with s:
pool.makeActive(name)
sys.stderr.write("#################################")
print "Now running: %s" %str(pool)
runmakedb = "makeblastdb -in " + file + " -dbtype prot -title " + file + "_DB"
print runmakedb
os.system(runmakedb)
pool.makeInactive(name)
return None
def run_blastp(self, file, db, sm, pool):
'''
Run blast mith multiprocessing for a search
on a species database
'''
name = multiprocessing.current_process().name
print file, db
specie1 = file.split('/')[-1].split('.')[0]
specie2 = db.split('/')[-1].split('.')[0]
print specie1, specie2
with sm:
pool.makeActive(name)
sys.stderr.write("#################################")
print "Now running: %s" %str(pool)
rundoblast = "blastp -query " + file + " -db " + db + " -outfmt \"7 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen\" -seg yes -out "+self.blastp_repo+'/'+specie1+"-"+specie2
print rundoblast
os.system(rundoblast)
pool.makeInactive(name)
return None
def doblast(self, species, cpu=1):
'''
Run blast on all the fasta files extracted from the
proteins present in the database
Need to install blast: sudo apt-get install ncbi-blast+
'''
print "Run the blasts (%s CPUs)" %str(cpu)
#curentpath=os.getcwd()
#os.chdir(self.fasta_repo)
listfa = glob.glob(self.fasta_repo + '/*.fasta')
print listfa
jobs = []
pool = ActivePool()
sm = multiprocessing.Semaphore(cpu)
for file in listfa:
if not os.path.isfile(file+".phr"):
p = multiprocessing.Process(target=self.run_makeBlastDb, args=(file, sm, pool))
jobs.append(p)
for j in jobs:
j.start()
for j in jobs:
j.join()
sys.stderr.write('Now running: %s\n' % str(pool))
jobs = []
pool = ActivePool()
sm = multiprocessing.Semaphore(cpu)
for specie1 in species:
i = self.fasta_repo + '/' + specie1.replace(' ', '_') + '.fasta'
for specie2 in species:
j = self.fasta_repo + '/' + specie2.replace(' ', '_') + '.fasta'
# if specie1 != specie2:
check_rel = self.gd.check_species_relationship(specie1, specie2, 'Inparanoid_orthology')
if check_rel == False:
p = multiprocessing.Process(target = self.run_blastp, args = (j, i, sm, pool))
jobs.append(p)
else:
print "Ortholog relations between %s and %s are already in the database" %(specie1, specie2)
for j in jobs:
j.start()
for j in jobs:
j.join()
sys.stderr.write('Now running: %s\n' % str(pool))
return None
def doinparanoid(self, cpu=1):
'''
Run inparanoid on the blast results
'''
print "Run inparanoid"
runinp="perl Inparanoid/SCRIPTS_INPARANOID.pl "+self.blastp_repo + " " + self.fasta_repo + " " + self.inparanoid_repo + " " + str(cpu)
print runinp
os.system(runinp)
return None
def parse_inparanoid(self, fname):
'''
Parse the inparanoid results, stored in sql tables
'''
orthologs = {}
previous_group = 'NA'
with open(fname) as file:
file = file.read()
for content in file.split('\n')[:-1]:
tmp = content.split('\t')
if not previous_group == tmp[0]:
orthologs[tmp[0]] = {}
previous_group = tmp[0]
orthologs[tmp[0]][tmp[4]] = tmp[2]
return orthologs
def add_blast_orthology_relationship(self, orthologs):
'''
Add the orthology relationships present in the orthology dictionary
into the neo4j graph database
'''
for og in orthologs.keys():
for seq in orthologs[og]:
seq1 = seq.rstrip()
tabseq1 = seq1.split('|')
specie1 = orthologs[og][seq].replace('.fasta', '').replace('_', ' ')
for seq_2 in orthologs[og]:
seq2 = seq_2.rstrip()
tabseq2 = seq2.split('|')
specie2 = orthologs[og][seq_2].replace('.fasta', '').replace('_', ' ')
if seq1 != seq2:
is_protein1 = self.gd.check_protein(tabseq1[1])
is_protein2 = self.gd.check_protein(tabseq2[1])
if (is_protein1 == True) and (is_protein2 == True):
is_relationship = self.gd.check_relationship_link(tabseq1[1], tabseq2[1], 'Orthology_relationship')
if is_relationship == False:
self.gd.create_blastp_orthology_relationship(tabseq1[1], tabseq2[1])
return None
if __name__ == '__main__':
inparanoid_repo = '/home/salva/citrus2/BrasionSynth/inportho/'
species = ['Citrus Clementina', 'Citrus sinensis', 'Arabidopsis thaliana', 'Physcomitrella patens']
uniprot = uniprot_queries('Arabidopsis thaliana', '3702')
chebi = chebi_from_string()
chebi.chebi_connect()
gd = graph_database(chebi, uniprot, '', '', 'neo4j', 'salva')
gd.connect()
inparanoid = inparanoid(gd, '/home/salva/citrus2/data/', '/home/salva/citrus2/BrasionSynth/blastp/', '/home/salva/citrus2/BrasionSynth/inportho/')
#for specie in species:
# inparanoid.extract_fasta(specie)
inparanoid.doblast(species)
inparanoid.doinparanoid()
resfiles = glob.glob(inparanoid_repo + '/sqltable/sqltable*')
for fname in resfiles:
orthologs = inparanoid.parse_inparanoid(fname)
inparanoid.add_blast_orthology_relationship(orthologs)
inparanoid.research_orthologs('Q8LEB2', 'Arabidopsis thaliana')