forked from emreg00/toolbox
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_gdsc.py
175 lines (161 loc) · 6.7 KB
/
parse_gdsc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import parse_ncbi, parse_drugbank
def main():
base_dir = "/home/emre/arastirma/data/drug/sensitivity/gdsc/"
#file_name = base_dir + "gdsc_compounds_conc_w5.csv"
#compound_to_concentrations = get_compounds(file_name)
#print len(compound_to_concentrations), compound_to_concentrations["GNF-2"]
geneid_to_names, name_to_geneid = parse_ncbi.get_geneid_symbol_mapping(base_dir + "../../../proteome/ncbi/geneid_to_symbol.txt")
file_target = base_dir + "gdsc_en_output_w5.csv"
file_response = base_dir + "gdsc_manova_output_w5.csv"
get_gsdc_info(file_target, file_response, name_to_geneid)
return
def get_gsdc_info(file_target, file_response, name_to_geneid, name_to_drug=None, drug_to_geneids=None):
log_info = False
# Parse files
compound_to_targets = get_targets(file_target)
#print len(compound_to_targets), compound_to_targets["GNF-2"]
compound_to_gene_to_values = get_drug_response(file_response)
#print len(compound_to_gene_to_values), len(compound_to_gene_to_values["GNF-2"]), compound_to_gene_to_values["GNF-2"]["FGFR3"]
# Map genes to geneids
##name_to_geneid["PI3KB"] = "5287"
##name_to_geneid["SRC"] = "6714"
name_to_geneid["PDGFR"] = "5159" # PDGFRB
name_to_geneid["ABL"] = "25" # ABL1
#name_to_geneid["ATM"] = "472"
name_to_geneid["FAK"] = "5747" # PTK2
name_to_geneid["MTORC1"] = "84335" # AKT1S1
name_to_geneid["MTORC2"] = "253260" # RICTOR
name_to_geneid["BCLW"] = "599" # BCL2L2
name_to_geneid["BCLXL"] = "598" # BCL2L1
name_to_geneid["MEK2"] = "5605" # MAP2K2
name_to_geneid["MEK1"] = "5604" # MAP2K1
name_to_geneid["RAF"] = "5894" # RAF1
name_to_geneid["HER2"] = "2064" # ERBB2
name_to_geneid["CHK1"] = "1111" # CHEK1
name_to_geneid["CHK2"] = "11200" # CHEK2
name_to_geneid["FAM123B"] = name_to_geneid["AMER1"]
name_to_geneid["MYCL1"] = name_to_geneid["MYCL"]
compound_to_geneids = {}
compounds_not_found = {}
for compound, targets in compound_to_targets.iteritems():
compound = compound.lower()
targets = targets.replace(" AND ", " : ")
if targets in ("SRC FAMILY ", "SRC-FAMILY"):
targets = "SRC"
elif targets == "P53-MDM2 INTERACTION":
targets = "TP53 : MDM2"
elif targets == "BCL2 FAMILY":
targets = "BCL2"
elif targets == "SERUM/GLUCOCORTICOID REGULATED KINASE 1 ":
targets = "SGK1"
elif targets == "CASPASE 3 ACTIVATOR":
targets = "CASP3"
elif targets == "TBK1 : PDK1 : IKK ":
targets = "TBK1 : PDK1 : IKBKB"
elif targets == "AURORA B":
targets = "AURKB"
elif targets == "AMPK AGONIST":
targets = "PRKAA1 : PRKAA2"
elif targets == "SPLEEN TYROSINE KINASE":
targets = "SYK"
elif targets == "ATM (IC50 13 NM) (ATR >>10 MM)":
targets = "ATM : ATR"
elif targets == "PRKC":
targets = "PRKCA : PRKCB : PRKCD : PRKCE : PRKCG : PRKCI"
elif targets == "PROLYL4HYDROXYLASE.":
targets = "P4HA1"
elif targets == "DIHYDROFOLATE REDUCTASE (DHFR)":
targets = "DHFR"
elif targets in ("BCR-ABL", "BCRABL ONLY (ALLOSTERIC NONATP COMPETETIVE)"):
targets = "BCR : ABL1"
elif targets == "G-SECRETASE":
targets = "APP : PSENEN : APH1A : APH1B"
else:
idx = targets.find("/")
if idx != -1:
prefix = targets[:idx-1]
targets = prefix + targets[idx-1] + " : " + prefix + targets[idx+1]
targets = targets.replace(" ", "")
targets = targets.split(":")
geneids = set()
for target in targets:
target = target.strip().replace("-", "")
if target in name_to_geneid:
geneids.add(name_to_geneid[target])
else:
compounds_not_found.setdefault(compound, set()).add(target)
if compound == "bibw2992":
geneids.add("2066")
if compound in name_to_drug:
drug = name_to_drug[compound]
if drug in drug_to_geneids:
geneids |= drug_to_geneids[drug]
if len(geneids) == 0:
continue
compound_to_geneids[compound] = geneids
compound_to_gene_to_values_mod = {}
gene_to_geneid = {}
genes_not_found = set()
for compound, gene_to_values in compound_to_gene_to_values.iteritems():
compound = compound.lower()
for gene, values in gene_to_values.iteritems():
if gene not in name_to_geneid:
genes_not_found.add(gene)
else:
compound_to_gene_to_values_mod.setdefault(compound, {})[gene] = values
gene_to_geneid[gene] = name_to_geneid[gene]
compound_to_gene_to_values = compound_to_gene_to_values_mod
if log_info:
print "Not found:"
for k, v in compounds_not_found.iteritems():
print k, v
print len(compound_to_geneids), compound_to_geneids["erlotinib"]
print genes_not_found
print "Identical targets:"
for i, compound in enumerate(compound_to_geneids):
geneids = compound_to_geneids[compound]
for j, compound2 in enumerate(compound_to_geneids):
if i < j:
geneids2 = compound_to_geneids[compound2]
if geneids == geneids2: #len(geneids & geneids2) > 0:
print compound, compound2, geneids & geneids2
return compound_to_geneids, compound_to_gene_to_values, gene_to_geneid
def get_compounds(file_name):
compound_to_concentrations = {}
f = open(file_name)
line = f.readline()
for line in f:
words = line.strip().split(",")
compound, c_min, c_max = words
compound_to_concentrations[compound] = (float(c_min), float(c_max))
f.close()
return compound_to_concentrations
def get_targets(file_name):
compound_to_targets = {}
f = open(file_name)
line = f.readline()
for line in f:
words = line.strip().split(",")
compound = words[2]
if compound in compound_to_targets:
continue
#print compound
targets = words[3].upper()
compound_to_targets[compound] = targets
f.close()
return compound_to_targets
def get_drug_response(file_name):
compound_to_gene_to_values = {}
f = open(file_name)
line = f.readline()
for line in f:
words = line.strip().split(",")
compound = words[0]
gene = words[1]
p_value = float(words[6])
effect = float(words[7])
compound_to_gene_to_values.setdefault(compound, {})[gene] = (p_value, effect)
f.close()
return compound_to_gene_to_values
if __name__ == "__main__":
main()