1
import os

from bs4 import BeautifulSoup
from nltk.tag.stanford import StanfordNERTagger
import pydash as _
6
+
7
+
8
# Locations of the Stanford NER classifier model and jar.
# Both can be overridden through environment variables; otherwise the
# bundled ./stanford-ner/ checkout relative to the working directory is used.
path_sner_model = os.getenv(
    'STANFORD_NER_MODEL',
    os.path.realpath('./stanford-ner/models/english.all.3class.distsim.crf.ser.gz')
)
path_sner_jar = os.getenv(
    'STANFORD_NER_JAR',
    os.path.realpath('./stanford-ner/tagger/stanford-ner.jar')
)

# Single module-level tagger instance, shared by ner_tagging().
stanford_tagger = StanfordNERTagger(path_sner_model, path_sner_jar)
17
+
18
+ """
19
+ Strips out HTML tags
20
+ """
21
+ def cleanse_tags (webpage_data ):
22
+ return BeautifulSoup (webpage_data , "html.parser" ).get_text ()
23
+
24
+ """
25
+ Mapper that works on each word token, tagging it as usual with a
26
+ Named Entity Recognition Tagger
27
+ @param webpage_data:string
28
+ @returns [(string, tag)]
29
+ """
30
+ def ner_tagging (webpage_data ):
31
+ ner_tuple_list = stanford_tagger .tag (webpage_data .split ())
32
+
33
+ return ner_tuple_list
34
+
35
+ """
36
+ Filters a Stanford NER tuple, grouping together neighboring words
37
+ with the same categories and removing useless categorized words.
38
+ @param ner_tuple_list:[(string, tag)]
39
+ @returns [(string, tag)]
40
+ """
41
+ def reduce_neighbors (ner_tuple_list = []):
42
+
43
+ def reducer (filtered_list , ner_tuple ):
44
+ word , tag = ner_tuple
45
+
46
+ if tag == 'O' :
47
+ return filtered_list
48
+
49
+ if not filtered_list :
50
+ filtered_list .append ((word , tag ))
51
+ return filtered_list
52
+
53
+ recent_filtered_word , recent_filtered_tag = filtered_list [- 1 ]
54
+
55
+ if recent_filtered_tag == tag :
56
+ filtered_list .pop ()
57
+ filtered_list .append (("%s %s" % (recent_filtered_word , word ), tag ))
58
+ else :
59
+ filtered_list .append ((word , tag ))
60
+
61
+ return filtered_list
62
+
63
+ return _ .reduce_ (ner_tuple_list , reducer , [])
64
+
65
+ """
66
+ Pick the most 'important' items of count size from ner_tuple_list,
67
+ shortening it to count size. Only unique items are obtained.
68
+ """
69
+ def pick_most_important (ner_tuple_list = [], count = 0 ):
70
+
71
+ def reducer (word_tagfrequency_kv , ner_tuple ):
72
+ word , tag = ner_tuple
73
+
74
+ if word not in word_tagfrequency_kv :
75
+ word_tagfrequency_kv [word ] = {
76
+ "tag" : tag ,
77
+ "count" : 1
78
+ }
79
+ return word_tagfrequency_kv
80
+
81
+ word_tagfrequency_kv [word ] = {
82
+ "tag" : tag ,
83
+ "count" : word_tagfrequency_kv [word ]["count" ] + 1
84
+ }
85
+ return word_tagfrequency_kv
86
+
87
+ word_tagfrequency_kv = _ .reduce_ (ner_tuple_list , reducer , {})
88
+
89
+ sorted_list = sorted (
90
+ word_tagfrequency_kv .items (),
91
+ key = lambda (key , value ): value ["count" ],
92
+ reverse = True
93
+ )
94
+ sorted_ner_tuple_list = _ .map_ (
95
+ sorted_list ,
96
+ lambda (key ,value ): (key , value ["tag" ])
97
+ )
98
+
99
+ sorted_length = len (sorted_ner_tuple_list )
100
+ if sorted_length > count :
101
+ return _ .drop_right (
102
+ sorted_ner_tuple_list ,
103
+ sorted_length - count
104
+ )
105
+ return sorted_ner_tuple_list
106
+
0 commit comments