npe.py
# !python -m spacy download en_core_web_sm
# NLTK also needs the 'punkt', 'stopwords', and 'wordnet' resources,
# e.g. via nltk.download('punkt').
import string

import spacy
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()

# Tokens to drop: English stopwords plus punctuation.
stop = set(stopwords.words('english') + list(string.punctuation))

# Term filters: keep terms of at least min_term_len characters and, when
# must_contain_alpha is set, only terms containing at least one letter.
must_contain_alpha = True
min_term_len = 3
def noun_phrase_by_sentence(text):
    """Extract lemmatized noun-phrase terms from text, grouped per sentence."""
    # Split the text into sentences, then break each sentence on newlines and
    # on double spaces to separate loosely formatted fragments.
    sentences = []
    for s in sent_tokenize(text):
        for ss in s.split('\n'):
            for sss in ss.split('  '):
                if len(sss.strip()) > 0:
                    sentences.append(sss.strip())

    np_list = []
    for sentence in sentences:
        np_sentence_list = []
        doc = nlp(sentence)
        for noun_chunk in doc.noun_chunks:
            # Lowercase the chunk and drop stopwords and punctuation.
            sent_stop = [i for i in word_tokenize(noun_chunk.text.lower()) if i not in stop]
            # Keep only sufficiently long terms that contain at least one letter.
            cleaned_sent_stop = []
            for ss in sent_stop:
                if len(ss) >= min_term_len and (not must_contain_alpha or any(c.isalpha() for c in ss)):
                    cleaned_sent_stop.append(ss)
            if len(cleaned_sent_stop) > 0:
                lemmatized = [lemmatizer.lemmatize(word) for word in cleaned_sent_stop]
                # Record each individual term plus the full phrase joined with "_".
                for n in lemmatized:
                    np_sentence_list.append(n)
                np_sentence_list.append("_".join(lemmatized))
        if len(np_sentence_list) > 0:
            np_list.append(list(set(np_sentence_list)))
    return np_list
def noun_phrase_by_passage(text):
    """Extract lemmatized noun-phrase terms from text, grouped per passage."""
    # Split the text into passages on newlines, then on double spaces to
    # separate loosely formatted fragments.
    passages = []
    for p in text.split('\n'):
        for pp in p.split('  '):
            if len(pp.strip()) > 0:
                passages.append(pp.strip())

    np_list = []
    for passage in passages:
        np_passage_list = []
        doc = nlp(passage)
        for noun_chunk in doc.noun_chunks:
            # Lowercase the chunk and drop stopwords and punctuation.
            sent_stop = [i for i in word_tokenize(noun_chunk.text.lower()) if i not in stop]
            # Keep only sufficiently long terms that contain at least one letter.
            cleaned_sent_stop = []
            for ss in sent_stop:
                if len(ss) >= min_term_len and (not must_contain_alpha or any(c.isalpha() for c in ss)):
                    cleaned_sent_stop.append(ss)
            if len(cleaned_sent_stop) > 0:
                lemmatized = [lemmatizer.lemmatize(word) for word in cleaned_sent_stop]
                # Record each individual term plus the full phrase joined with "_".
                for n in lemmatized:
                    np_passage_list.append(n)
                np_passage_list.append("_".join(lemmatized))
        if len(np_passage_list) > 0:
            np_list.append(list(set(np_passage_list)))
    return np_list
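

# A minimal usage sketch, assuming the spaCy model and NLTK resources noted at
# the top of this file are installed; the sample text below is illustrative only.
if __name__ == "__main__":
    sample = (
        "The quick brown fox jumps over the lazy dog.\n"
        "Natural language processing extracts noun phrases from raw text."
    )
    print("By sentence:", noun_phrase_by_sentence(sample))
    print("By passage:", noun_phrase_by_passage(sample))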