Skip to content

Commit 9f93069

Browse files
Preliminary NER
1 parent c2b626c commit 9f93069

File tree

3 files changed

+112
-0
lines changed

3 files changed

+112
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
*.pyc
.venv
.DS_Store
stanford-ner/*

nlprouter.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
import sys
2+
3+
4+
_, user_id, webpage_data = sys.argv

webpage_process.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import os
2+
3+
from bs4 import BeautifulSoup
4+
from nltk.tag.stanford import StanfordNERTagger
5+
import pydash as _
6+
7+
8+
path_sner_model = os.getenv(
9+
'STANFORD_NER_MODEL',
10+
os.path.realpath('./stanford-ner/models/english.all.3class.distsim.crf.ser.gz')
11+
)
12+
path_sner_jar = os.getenv(
13+
'STANFORD_NER_JAR',
14+
os.path.realpath('./stanford-ner/tagger/stanford-ner.jar')
15+
)
16+
stanford_tagger = StanfordNERTagger(path_sner_model, path_sner_jar)
17+
18+
"""
19+
Strips out HTML tags
20+
"""
21+
def cleanse_tags(webpage_data):
22+
return BeautifulSoup(webpage_data, "html.parser").get_text()
23+
24+
"""
25+
Mapper that works on each word token, tagging it as usual with a
26+
Named Entity Recognition Tagger
27+
@param webpage_data:string
28+
@returns [(string, tag)]
29+
"""
30+
def ner_tagging(webpage_data):
31+
ner_tuple_list = stanford_tagger.tag(webpage_data.split())
32+
33+
return ner_tuple_list
34+
35+
"""
36+
Filters a Stanford NER tuple, grouping together neighboring words
37+
with the same categories and removing useless categorized words.
38+
@param ner_tuple_list:[(string, tag)]
39+
@returns [(string, tag)]
40+
"""
41+
def reduce_neighbors(ner_tuple_list=[]):
42+
43+
def reducer(filtered_list, ner_tuple):
44+
word, tag = ner_tuple
45+
46+
if tag == 'O':
47+
return filtered_list
48+
49+
if not filtered_list:
50+
filtered_list.append((word, tag))
51+
return filtered_list
52+
53+
recent_filtered_word, recent_filtered_tag = filtered_list[-1]
54+
55+
if recent_filtered_tag == tag:
56+
filtered_list.pop()
57+
filtered_list.append(("%s %s" % (recent_filtered_word, word), tag))
58+
else:
59+
filtered_list.append((word, tag))
60+
61+
return filtered_list
62+
63+
return _.reduce_(ner_tuple_list, reducer, [])
64+
65+
"""
66+
Pick the most 'important' items of count size from ner_tuple_list,
67+
shortening it to count size. Only unique items are obtained.
68+
"""
69+
def pick_most_important(ner_tuple_list=[], count=0):
70+
71+
def reducer(word_tagfrequency_kv, ner_tuple):
72+
word, tag = ner_tuple
73+
74+
if word not in word_tagfrequency_kv:
75+
word_tagfrequency_kv[word] = {
76+
"tag": tag,
77+
"count": 1
78+
}
79+
return word_tagfrequency_kv
80+
81+
word_tagfrequency_kv[word] = {
82+
"tag": tag,
83+
"count": word_tagfrequency_kv[word]["count"] + 1
84+
}
85+
return word_tagfrequency_kv
86+
87+
word_tagfrequency_kv = _.reduce_(ner_tuple_list, reducer, {})
88+
89+
sorted_list = sorted(
90+
word_tagfrequency_kv.items(),
91+
key=lambda (key, value): value["count"],
92+
reverse=True
93+
)
94+
sorted_ner_tuple_list = _.map_(
95+
sorted_list,
96+
lambda (key,value): (key, value["tag"])
97+
)
98+
99+
sorted_length = len(sorted_ner_tuple_list)
100+
if sorted_length > count:
101+
return _.drop_right(
102+
sorted_ner_tuple_list,
103+
sorted_length - count
104+
)
105+
return sorted_ner_tuple_list
106+

0 commit comments

Comments
 (0)