forked from emreg00/toolbox
-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_utilities.py
79 lines (59 loc) · 2.07 KB
/
text_utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
try:
    from toolbox.external.negex import negex  # provides sortRules, negTagger
except ImportError:
    # Negex is an optional dependency: is_negated() falls back to plain keyword
    # matching when it is called without rules. Only ImportError is caught so that
    # unrelated failures (e.g. KeyboardInterrupt) are not silently swallowed.
    # The parenthesised form prints the same text under Python 2 and Python 3.
    print("Import error: Negex. Using keyword matching instead")

# Negation cues, padded with a space on both sides so that only whole words
# match; as a consequence a cue at the very start or end of a text is not found.
KEYWORDS_NEGATIVE = [ " not ", " no ", " except ", " exception ", " inappropriate ", " without ", " absence " ]

# Word prefixes (leading space only) searched for by is_symptomatic(); any word
# starting with one of these matches, e.g. " symptom" matches " symptomatic".
KEYWORDS_SYMPTOMATIC = [ " protect", " maint", " manage", " symptom", " relie", " palliati", " alleviat" ]
def get_negex_rules(file_name):
    """Load NegEx rules from a text file.

    Reads every line of ``file_name`` and passes the list of lines to
    ``negex.sortRules``; whatever that call returns is returned unchanged.

    Raises IOError/OSError if the file cannot be opened, and NameError if the
    optional ``negex`` module failed to import.
    """
    # Context manager guarantees the handle is closed even if sortRules raises.
    with open(file_name) as rule_file:
        return negex.sortRules(rule_file.readlines())
def is_negated(txt, phrase, rules = None):
    """Tell whether ``phrase`` appears in a negated context within ``txt``.

    With ``rules`` given, the decision is delegated to ``negex.negTagger`` and
    the result is True when its negation flag equals "negated".
    Without ``rules``, the text is scanned for any entry of KEYWORDS_NEGATIVE;
    note that ``phrase`` plays no role in this fallback.
    """
    if rules is None:
        # Keyword fallback: only the found/not-found flag matters here.
        found, _ = in_keywords(txt, KEYWORDS_NEGATIVE)
        return found
    tagger = negex.negTagger(sentence = txt, phrases = [ phrase ], rules = rules, negP = False)
    return tagger.getNegationFlag() == "negated"
def is_symptomatic(txt):
    """Search ``txt`` for any of the KEYWORDS_SYMPTOMATIC prefixes.

    Returns the ``(flag, index)`` tuple produced by ``in_keywords`` -- not a
    bare boolean -- so callers must unpack it or test the first element.
    """
    match = in_keywords(txt, keywords=KEYWORDS_SYMPTOMATIC)
    return match
def in_keywords(txt, keywords):
    """Find the first keyword that occurs as a substring of ``txt``.

    Returns a ``(flag, index)`` tuple:
      * ``(True, i)`` where ``i`` is the position in ``keywords`` of the first
        keyword contained in ``txt``;
      * ``(False, i)`` when nothing matches, where ``i`` is the position of the
        last keyword inspected (``len(keywords) - 1``), which is ``-1`` for an
        empty ``keywords``.
    """
    # Pre-seeding the index keeps the no-match return well defined when
    # ``keywords`` is empty (the loop variable would otherwise be unbound).
    i = -1
    for i, keyword in enumerate(keywords):
        if keyword in txt:
            return True, i
    return False, i
def convert_to_R_string(txt):
    """Return ``txt`` with spaces, hyphens and slashes turned into "." and with
    commas and apostrophes removed.

    The substitutions are applied in the listed order by ``replace_chars``.
    """
    # ``replace_chars`` is a module-level function, not a str method; calling it
    # as ``txt.replace_chars`` raised AttributeError on every invocation.
    return replace_chars(txt, mapping=[(" ", "."), (",", ""), ("'", ""), ("-", "."), ("/", ".")])
def replace_chars(txt, mapping=[(" ", "_"), (",", ""), ("'", ""), ("-", "_"), ("/", "_")]):
    """Apply a sequence of substring substitutions to ``txt``.

    ``mapping`` is an iterable of ``(old, new)`` pairs applied one after the
    other, so a later pair sees the output of the earlier ones. By default
    spaces, hyphens and slashes become "_" while commas and apostrophes are
    dropped. The default list is only read, never modified.
    """
    result = txt
    for old, new in mapping:
        result = result.replace(old, new)
    return result
def tokenize_disease_name(disease, exact=True):
    """Split a disease name into normalised phrases or words.

    The name is lower-cased and cut into phrases at ",", " and " and "-".
    Within each phrase every word is normalised:
      * a trailing "'s" or "^s" is removed;
      * the last word of the phrase additionally loses one trailing "s";
      * the generic words "disease", "disorder" and "syndrome" are dropped,
        as are words that become empty after normalisation.

    With ``exact=True`` (default) one space-joined string per phrase is
    returned (an empty string for a phrase with no remaining words); with
    ``exact=False`` all remaining words are returned as one flat list.
    """
    disease = disease.lower()
    disease_mod = disease.replace(" and ", ", ")
    # Chain on disease_mod: replacing on ``disease`` again would throw away the
    # " and " substitution made on the previous line.
    disease_mod = disease_mod.replace("-", ", ")
    values = []
    for phrase in disease_mod.split(","):
        words = phrase.strip().split()
        last = len(words) - 1
        inner_values = []
        for i, token in enumerate(words):
            if token.endswith(("'s", "^s")):
                token = token[:-2]
            # endswith() is safe on an empty token, unlike token[-1].
            if i == last and token.endswith("s"):
                token = token[:-1]
            if not token or token in ("disease", "disorder", "syndrome"):
                continue
            inner_values.append(token)
        if exact:
            values.append(" ".join(inner_values))
        else:
            values += inner_values
    return values