forked from emreg00/toolbox
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_drugrepurposing.py
79 lines (44 loc) · 1.53 KB
/
parse_drugrepurposing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
##############################################################################
# DrugRepurposing.info parser
#
# eg 22/01/2016
##############################################################################
from bs4 import BeautifulSoup
def main():
    """Print the parsed entries whose ``new`` field mentions a cardiac term.

    Parses the saved "Drug Repurposing Info.html" page under the hard-coded
    data directory and, for every row whose lower-cased ``new`` field
    contains "cardia", "heart" or "ischem", prints the drug, that lower-cased
    field and the status, separated by single spaces.

    Each parsed row must hold exactly six values; the tuple unpacking raises
    ValueError otherwise.
    NOTE(review): ``old``/``new`` presumably hold the original and the
    repurposed indication -- confirm against the HTML form.
    """
    base_dir = "../../data/drugrepurposing.info/"
    drug_to_values = read_repurposing_data(base_dir + "Drug Repurposing Info.html")
    keywords = ("cardia", "heart", "ischem")
    for drug, old, new, name, company, status in drug_to_values:
        # The parser stores None for an <input> that has no value attribute;
        # treat that as empty text instead of failing on None.lower().
        new = (new or "").lower()
        if any(keyword in new for keyword in keywords):
            # A single pre-formatted argument prints the same line whether
            # ``print`` is the Python 2 statement or the Python 3 function.
            print("%s %s %s" % (drug, new, status))
def read_repurposing_data(file_name):
    """Extract the form values of each table row from a saved HTML page.

    Every ``<tr>`` in the document is visited in order. For each row, the
    ``value`` attribute of every ``<input>`` and the text of every
    ``<option>`` carrying a ``selected`` attribute are collected in document
    order. Scanning stops at the first row that contains a ``<td>`` whose
    first CSS class starts with "recaptcha"; that row contributes nothing.

    Args:
        file_name: Path of the HTML file to parse.

    Returns:
        A list with one entry per row that yielded at least one value. Each
        entry is the list of collected values with the last one removed. An
        ``<input>`` without a ``value`` attribute contributes ``None``.
    """
    # BeautifulSoup reads the handle while building the tree, so the file can
    # be closed as soon as the constructor returns.
    with open(file_name) as html_doc:
        soup = BeautifulSoup(html_doc, "lxml")

    drug_to_values = []
    for row in soup.find_all('tr'):
        values = _row_form_values(row)
        if values is None:
            # Reached the reCAPTCHA row: nothing after it is parsed.
            break
        if values:
            # NOTE(review): the last collected value is always dropped --
            # presumably a trailing non-data control; confirm against the page.
            drug_to_values.append(values[:-1])
    return drug_to_values


def _row_form_values(row):
    """Return the form values found under *row*, or None for the reCAPTCHA row."""
    values = []
    for node in row.descendants:
        tag_name = node.name
        if tag_name == "td":
            classes = node.get('class')
            if classes is not None and classes[0].startswith("recaptcha"):
                return None
        elif tag_name == "input":
            values.append(node.get('value'))
        elif tag_name == "option" and node.get('selected') is not None:
            values.append(str(node.get_text()))
    return values
# Run the parser only when this file is executed as a script, so importing
# the module for read_repurposing_data() has no side effects.
if __name__ == "__main__":
    main()