-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathtextextract.go
324 lines (240 loc) · 8.02 KB
/
textextract.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
package articletext
/*
The file contains a logic of a package to find a DOM node containing majority of a text
in HTML document
Author: Roman Gelembjuk <roman@gelembjuk.com>
*/
import (
"math"
"regexp"
"strconv"
"strings"
"unicode/utf8"
"github.com/PuerkitoBio/goquery"
"gopkg.in/neurosnap/sentences.v1"
"gopkg.in/neurosnap/sentences.v1/data"
)
type TextDescription struct {
CountSentences int
AverageWords int
CountLongSentences int
CountGoodSentences int
CountCorrectSentences int
}
var tokenizer *sentences.DefaultSentenceTokenizer
func init() {
// prepare tokenizer
b, _ := data.Asset("data/english.json")
// load the training data
training, _ := sentences.LoadTraining(b)
// create the default sentence tokenizer
tokenizer = sentences.NewSentenceTokenizer(training)
}
func getPrimarySelection(s *goquery.Selection) *goquery.Selection {
// prepare a selection for search. Add some descriptions for DOM nodes
// to find most correct node with a text
describeDocumentNode(s)
// now find a node with a text and return it
return findSelectionWithPrimaryText(s)
}
/*
* This is the core function. It checks a selection object and finds if this is a text node
* or it is needed to go deeper , inside a node that has most of text
*/
func findSelectionWithPrimaryText(s *goquery.Selection) *goquery.Selection {
// if no children then return a text from this node
if s.Children().Length() == 0 {
return s
}
// variable to find a node with longest text inside it
sort_by_count_sentences := 0
// a node with longest text inside it
var sort_by_text_node *goquery.Selection = nil
// keep count of nodes containing more 2 sentences
count_of_nodes_with_sentences := 0
max_count_of_correct_sentences := 0
// calcuate count of real symbols
node_full_text_len := utf8.RuneCountInString(s.Text())
top_total_count_of_correct_sentences := getNumbericAttribute(s, "totalcountofcorrectsentences")
// all subnodes lengths
tlengths := []int{}
densityes := []int{}
s.Children().Each(func(i int, sec *goquery.Selection) {
totalcountofcorrectsentences := getNumbericAttribute(sec, "totalcountofcorrectsentences")
if totalcountofcorrectsentences > 1 {
count_of_nodes_with_sentences++
if totalcountofcorrectsentences > max_count_of_correct_sentences {
max_count_of_correct_sentences = totalcountofcorrectsentences
}
}
// node text length
tlen := utf8.RuneCountInString(sec.Text())
html, _ := sec.Html()
hlen := utf8.RuneCountInString(html)
if tlen == 0 {
// process next subnode
return
}
tlengths = append(tlengths, tlen)
density := (hlen / tlen)
densityes = append(densityes, density)
// check if this block is better then previous
// choose better block only if previous is empty or
// has less then 10 real sentences
if totalcountofcorrectsentences > sort_by_count_sentences && sort_by_count_sentences < 10 {
sort_by_count_sentences = totalcountofcorrectsentences
sort_by_text_node = sec
}
})
// if any nide with a text was found
if sort_by_count_sentences > 0 {
// calculate mean deviation
lvar := getMeanDeviation(tlengths)
// get relative value of a mean deviation agains full text length in a node
lvarproc := (100 * lvar) / float64(node_full_text_len)
// during tests we found that if this value is less 5
// the a node is what we are looking for
// it is the node with "main" text of a page
if lvarproc < 15 && len(tlengths) > 3 ||
(count_of_nodes_with_sentences > 2 &&
float32(max_count_of_correct_sentences) < float32(top_total_count_of_correct_sentences)*0.8) {
// we found that a text is equally distributed between subnodes
// no need to go deeper
return s
}
// go deeper inside a node with most of text
return findSelectionWithPrimaryText(sort_by_text_node)
}
// no subnodes found. return a node itself
return s
}
// describe a text inside a node and add description as pseudo attributes
func describeDocumentNode(s *goquery.Selection) *goquery.Selection {
var totalcountofgoodsentences int
var totalcountofcorrectsentences int
var maxcountofflatsentences int
countchildren := s.Children().Length()
var sd TextDescription
if countchildren > 0 {
// for each child node check if to remove or not
s.Children().Each(func(i int, sec *goquery.Selection) {
// go deeper recursively
describeDocumentNode(sec)
// aggregate data to set to a node
totalcountofgoodsentences += getNumbericAttribute(sec, "totalcountofgoodsentences")
totalcountofcorrectsentences += getNumbericAttribute(sec, "totalcountofcorrectsentences")
countsentences := getNumbericAttribute(sec, "maxcountofflatsentences")
if countsentences > maxcountofflatsentences {
maxcountofflatsentences = countsentences
}
})
// describe sentences in this html tag only, drop child nodes
secclone := getSelectionWihoutChildren(s)
sd = describeSentences(secclone)
totalcountofgoodsentences += sd.CountGoodSentences
totalcountofcorrectsentences += sd.CountCorrectSentences
if sd.CountGoodSentences > maxcountofflatsentences {
maxcountofflatsentences = sd.CountGoodSentences
}
} else {
// no child nodes
//fmt.Println(s.Text())
sd = describeSentences(s)
totalcountofgoodsentences = sd.CountGoodSentences
maxcountofflatsentences = sd.CountGoodSentences
totalcountofcorrectsentences = sd.CountCorrectSentences
}
//fmt.Printf("set totalcountofgoodsentences ")
// set attributes for the node
s.SetAttr("countsentences", strconv.Itoa(sd.CountSentences))
s.SetAttr("averagewords", strconv.Itoa(sd.AverageWords))
s.SetAttr("countgoodsentences", strconv.Itoa(sd.CountGoodSentences))
s.SetAttr("countlongsentences", strconv.Itoa(sd.CountLongSentences))
s.SetAttr("totalcountofgoodsentences", strconv.Itoa(totalcountofgoodsentences))
s.SetAttr("totalcountofcorrectsentences", strconv.Itoa(totalcountofcorrectsentences))
s.SetAttr("maxcountofflatsentences", strconv.Itoa(maxcountofflatsentences))
return s
}
/*
*
*/
func describeSentences(s *goquery.Selection) TextDescription {
var d TextDescription
var text string
// get text of this node and then split for sentences
if s.Children().Length() > 0 {
text = getTextFromHtml(s)
} else {
text = s.Text()
}
sentences := tokenizer.Tokenize(text)
d.CountSentences = len(sentences)
//fmt.Println("==============================================")
for _, s := range sentences {
sentence := s.Text
if len(sentence) == 0 {
continue
}
c := len(get_words_from(sentence))
//fmt.Println(sentence)
d.AverageWords += c
if c > 3 {
// presume normal sentence usually has more 3 words
d.CountLongSentences++
if c < 25 {
// but a sentence should not have nore 25 words. We will not
// consider such sentence as a good one
d.CountGoodSentences++
}
lastsymbol := sentence[len(sentence)-1:]
if strings.ContainsAny(lastsymbol, ".?!") {
d.CountCorrectSentences++
}
}
}
if d.CountSentences > 0 {
d.AverageWords = int(d.AverageWords / d.CountSentences)
}
return d
}
func get_words_from(text string) []string {
words := regexp.MustCompile("[^\\s]+")
return words.FindAllString(text, -1)
}
// The function to calculate a Mean Deviation for a list of integer values
func getMeanDeviation(list []int) float64 {
if len(list) < 1 {
return 0.0
}
sum := 0
for i := range list {
sum += list[i]
}
// calculate arithmetic mean
avg := float64(sum / len(list))
number1 := 0.0
for i := range list {
number1 += math.Abs(float64(list[i]) - avg)
}
// calculate mean deviation
meandeviation := number1 / float64(len(list))
return meandeviation
}
// this function returns text in a selection with ignoring child nodes
// for "xxx <a>YYY</a>" result wil be "xxx "
func getSelectionWihoutChildren(s *goquery.Selection) *goquery.Selection {
clone := s.Clone()
// remove all child nodes in this selection
clone.Children().Each(func(i int, sec *goquery.Selection) {
sec.Remove()
})
return clone
}
func getNumbericAttribute(s *goquery.Selection, attr string) int {
a, f := s.Attr(attr)
if f {
ai, _ := strconv.Atoi(a)
return ai
}
return 0
}