-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathurl.go
120 lines (65 loc) · 1.4 KB
/
url.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
package article_spider
import (
"github.com/PuerkitoBio/goquery"
"strings"
)
// url drives the crawl of the detail URLs configured on a Spider.
// It keeps no state of its own; every method works through the
// wrapped spider.
type url struct {
	// s is the spider whose form, counters and channels the methods use.
	s *Spider
}
// NewUrl returns a url crawler bound to the given spider.
func NewUrl(s *Spider) *url {
	crawler := new(url)
	crawler.s = s
	return crawler
}
// Start processes every detail URL configured on the spider's form and
// blocks until all of them have been handled.
//
// It records the number of URLs in total, then for each URL sends on
// detailCoroutineChan (GetDetail receives from it again when it finishes),
// registers the worker on detailWait and launches GetDetail in its own
// goroutine. Once every worker has reported done, the spider's cancel
// function is called.
func (u url) Start() {
	spider := u.s
	targets := spider.form.DetailUrls
	spider.total = len(targets)

	for _, target := range targets {
		// Take a slot before spawning; presumably the channel is buffered
		// and its capacity bounds the number of concurrent workers.
		spider.detailCoroutineChan <- true
		spider.detailWait.Add(1)
		go u.GetDetail(target)
	}

	spider.detailWait.Wait()
	spider.cancel()
}
// GetDetail fetches one detail page, follows the configured chain of
// intermediate links (if any), resolves the detail fields from the final
// page and pushes the outcome onto the spider's result.
//
// It is written to run as a worker goroutine launched by Start: on every
// exit path it bumps the progress counter, receives from
// detailCoroutineChan to give back the slot Start took, and marks itself
// done on detailWait.
//
// detailUrl is the URL of the page to fetch; it is also handed to
// ResolveSelector together with the final HTML.
//
// Nothing is returned. Failures are reported through the spider's notice,
// except where noted below, and abort processing of this URL only.
func (u url) GetDetail(detailUrl string) {
	defer func() {
		// Count this URL as processed BEFORE signalling completion: once
		// Done has been called Start's Wait may return, so the progress
		// update has to be in place by then.
		// NOTE(review): this is still a plain ++ on shared state; with more
		// than one worker in flight the increments can race. Confirm whether
		// Spider offers a lock or switch the counter to an atomic type.
		u.s.currentIndex++
		<-u.s.detailCoroutineChan
		u.s.detailWait.Done()
	}()

	// Do no work at all if the spider's context is already cancelled.
	select {
	case <-u.s.cxt.Done():
		return
	default:
	}

	html, err := u.s.form.GetHtml(detailUrl)
	if err != nil {
		// NOTE(review): unlike the failures further down, this fetch error
		// is dropped without a notice. Confirm whether that is intended.
		return
	}

	// Intermediate links (intermediate pages): each selector locates a link
	// on the current page, and the page behind that link becomes the current
	// page for the next selector. Ranging over an empty list is a no-op, so
	// no separate length check is needed.
	for _, selector := range u.s.form.MiddleSelector {
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
		if err != nil {
			u.s.notice.Error(err.Error())
			return
		}

		href, found := doc.Find(selector).Attr("href")
		if !found {
			// The selection carries no href attribute, so the chain cannot
			// be followed; give up on this URL without a notice.
			return
		}

		// GetHref presumably normalises the link into a fetchable URL;
		// verify against its definition.
		html, err = u.s.form.GetHtml(u.s.form.GetHref(href))
		if err != nil {
			u.s.notice.Error(err.Error())
			return
		}
	}

	res, err := u.s.form.ResolveSelector(html, u.s.form.DetailFields, detailUrl)
	if err != nil {
		u.s.notice.Error(err.Error())
		return
	}
	u.s.result.Push(res)
}