-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathselect_enriched.py
85 lines (64 loc) · 3.33 KB
/
select_enriched.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from matplotlib import pyplot as plt
import subprocess
import statistics
import os
host_pathogen = 'domain'
domain_or_motif = 'domain'
cutoff = 5
interactions_with_obs_freq = {}
interactions_with_rand_freq = {}
# From the results of the wilcoxon test and effect size, take domains where p-value > 0.05 and effect size > 0.5
cat = subprocess.Popen('cat outputs/' + host_pathogen+'-'+domain_or_motif+'_wilcoxon_test.txt', stdout=subprocess.PIPE, shell=True)
tr = subprocess.Popen('tr \'\",\' \' \'', stdin=cat.stdout, stdout=subprocess.PIPE, shell=True)
awk = subprocess.Popen('LC_NUMERIC=C awk \'{if ($3 + 0 > 0.5 && $2 + 0 < 0.05) {print $1}}\' ',stdout=subprocess.PIPE,stdin=tr.stdout,shell=True)
domains = awk.communicate()[0].decode().split('\n')[1:]
# Take only the enriched motif/domains from the host/pathogen (values above 1)
motif_enrichment = subprocess.Popen('LC_NUMERIC=C awk \'{if ($2 + 0 > 1) {print $1}}\' outputs/host_domain_enrichment.txt',stdout=subprocess.PIPE,shell=True)
motif_enrichment_only_enriched = motif_enrichment.communicate()[0].decode().split('\n')[1:]
domain_enrichment = subprocess.Popen('LC_NUMERIC=C awk \'{if ($2 + 0 > 1) {print $1}}\' outputs/pathogen_domain_enrichment.txt',stdout=subprocess.PIPE,shell=True)
domain_enrichment_only_enriched = domain_enrichment.communicate()[0].decode().split('\n')[1:]
with open('outputs/domain-domain_observed_freq.txt','rt') as f_obs:
for line in f_obs:
record_obs = line.split()
if int(record_obs[0]) >= cutoff:
interactions_with_obs_freq[record_obs[-2]] = record_obs[-1].replace(',','.')
with open('outputs/domain-domain_random_freq.txt', 'rt') as f_rand:
for line in f_rand:
record_rand = line.split(',')
all_rand_freq = record_rand[1:]
rand_freq = statistics.fmean([float(i) for i in all_rand_freq])
interactions_with_rand_freq[record_rand[0]] = rand_freq
# Compute the fobs/frand for the selected domains.
for i in domains:
try:
if i in interactions_with_obs_freq:
motif = i.split('-')[0]
domain = i.split('-')[1]
n=0
if motif in motif_enrichment_only_enriched and domain in domain_enrichment_only_enriched:
prot1 = subprocess.Popen("grep -nrw " + motif + " Pfam_domains_combinations/", stdout=subprocess.PIPE,
shell=True)
prot2 = subprocess.Popen("grep -nrw " + domain + " Pfam_domains_combinations/", stdout=subprocess.PIPE,
shell=True)
#num_prot = subprocess.Popen("grep -nrw " + i + " Pfam_domains_pathogen/", stdout=subprocess.PIPE, shell=True)
protein_with_motif = prot1.communicate()[0].decode().split("\n")
protein_with_domain = prot2.communicate()[0].decode().split("\n")
unique_motif = []
unique_domain = []
for j in protein_with_motif:
if j != '' and j.split('/')[1].split('.')[0] not in unique_motif:
unique_motif.append(j.split('/')[1].split('.')[0])
for j in protein_with_domain:
if j != '' and j.split('/')[1].split('.')[0] not in unique_domain:
unique_domain.append(j.split('/')[1].split('.')[0])
for j in set(unique_domain):
if j in unique_motif:
print(j)
n+=1
if i in interactions_with_rand_freq and n >= 1:
obs_freq = float(interactions_with_obs_freq[i])
rand_freq = float(interactions_with_rand_freq[i])
with open('outputs/domain_domain_enrichment.txt','at') as f:
f.write(i+'\t'+str(obs_freq/rand_freq)+'\n')
except:
continue