Skip to content

Commit e839e7e

Browse files
authored
Merge pull request #26 from tech-engine/dev
Feature: Added css & xpath node selectors
2 parents ae64909 + 4019a03 commit e839e7e

File tree

6 files changed

+221
-0
lines changed

6 files changed

+221
-0
lines changed

go.mod

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ require (
1515
google.golang.org/api v0.136.0
1616
)
1717

18+
require github.com/antchfx/xpath v1.3.1 // indirect
19+
1820
require (
1921
cloud.google.com/go v0.110.6 // indirect
2022
cloud.google.com/go/compute v1.23.0 // indirect
@@ -23,6 +25,8 @@ require (
2325
cloud.google.com/go/iam v1.1.1 // indirect
2426
cloud.google.com/go/longrunning v0.5.1 // indirect
2527
cloud.google.com/go/storage v1.33.0 // indirect
28+
github.com/andybalholm/cascadia v1.3.2
29+
github.com/antchfx/htmlquery v1.3.2
2630
github.com/davecgh/go-spew v1.1.1 // indirect
2731
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
2832
github.com/golang/protobuf v1.5.3 // indirect

go.sum

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ cloud.google.com/go/storage v1.33.0/go.mod h1:Hhh/dogNRGca7IWv1RC2YqEn0c0G77ctA/
1717
firebase.google.com/go v3.13.0+incompatible h1:3TdYC3DDi6aHn20qoRkxwGqNgdjtblwVAyRLQwGn/+4=
1818
firebase.google.com/go v3.13.0+incompatible/go.mod h1:xlah6XbEyW6tbfSklcfe5FHJIwjt8toICdV5Wh9ptHs=
1919
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
20+
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
21+
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
22+
github.com/antchfx/htmlquery v1.3.2 h1:85YdttVkR1rAY+Oiv/nKI4FCimID+NXhDn82kz3mEvs=
23+
github.com/antchfx/htmlquery v1.3.2/go.mod h1:1mbkcEgEarAokJiWhTfr4hR06w/q2ZZjnYLrDt6CTUk=
24+
github.com/antchfx/xpath v1.3.1 h1:PNbFuUqHwWl0xRjvUPjJ95Agbmdj2uzzIwmQKgu4oCk=
25+
github.com/antchfx/xpath v1.3.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
2026
github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
2127
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
2228
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
@@ -139,6 +145,7 @@ golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTk
139145
golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU=
140146
golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
141147
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
148+
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
142149
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
143150
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
144151
golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@@ -152,6 +159,9 @@ golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwY
152159
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
153160
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
154161
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
162+
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
163+
golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
164+
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
155165
golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs=
156166
golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg=
157167
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
@@ -163,6 +173,7 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJ
163173
golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
164174
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
165175
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
176+
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
166177
golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E=
167178
golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y=
168179
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -175,17 +186,22 @@ golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7w
175186
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
176187
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
177188
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
189+
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
190+
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
178191
golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4=
179192
golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
180193
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
181194
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
195+
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
196+
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
182197
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
183198
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
184199
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
185200
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
186201
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
187202
golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ=
188203
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
204+
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
189205
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
190206
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
191207
golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4=
@@ -197,6 +213,7 @@ golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3
197213
golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
198214
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
199215
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
216+
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
200217
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
201218
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
202219
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=

pkg/core/ports.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"net/url"
88

99
"github.com/tech-engine/goscrapy/internal/fsm"
10+
"golang.org/x/net/html"
1011
)
1112

1213
type IEngine[OUT any] interface {
@@ -50,6 +51,7 @@ type IResponseReader interface {
5051
Cookies() []*http.Cookie
5152
Request() *http.Request
5253
Meta(string) (any, bool)
54+
ISelector
5355
}
5456

5557
type IJob interface {
@@ -64,3 +66,15 @@ type IOutput[OUT any] interface {
6466
}
6567

6668
type ResponseCallback func(context.Context, IResponseReader)
69+
type ISelectorGetter interface {
70+
Get() *html.Node
71+
GetAll() []*html.Node
72+
Text(...string) []string
73+
Attr(string) []string
74+
}
75+
76+
type ISelector interface {
77+
Css(string) ISelector
78+
Xpath(string) ISelector
79+
ISelectorGetter
80+
}

pkg/scheduler/response.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ import (
66
"net/http"
77

88
"github.com/tech-engine/goscrapy/internal/fsm"
9+
"github.com/tech-engine/goscrapy/pkg/core"
10+
"golang.org/x/net/html"
911
)
1012

1113
func NewResponse() *response {
@@ -19,6 +21,7 @@ type response struct {
1921
cookies []*http.Cookie
2022
request *http.Request
2123
meta *fsm.FixedSizeMap[string, any]
24+
nodes Selectors
2225
}
2326

2427
// response implementing core.IResponseReader
@@ -61,6 +64,7 @@ func (r *response) Reset() {
6164
// because there is no guarantee that we will get the same req-res pair back from the pools,
6265
// we must set meta=nil upon releasing req-res to their respective pools, otherwise we will have corrupt data.
6366
r.meta = nil
67+
r.nodes = nil
6468
}
6569

6670
// response implementing engine.ResponseWriter
@@ -87,3 +91,40 @@ func (r *response) WriteCookies(cookies []*http.Cookie) {
8791
// WriteMeta stores meta on the response, replacing whatever meta map was
// previously set. The pointer is kept as-is; the map is not copied.
func (r *response) WriteMeta(meta *fsm.FixedSizeMap[string, any]) {
	r.meta = meta
}
94+
95+
func (r *response) Css(selector string) core.ISelector {
96+
97+
if r.nodes == nil {
98+
if nodes, err := NewSelector(r.body); err == nil {
99+
r.nodes = nodes
100+
}
101+
}
102+
103+
return r.nodes.Css(selector)
104+
}
105+
106+
func (r *response) Xpath(xpath string) core.ISelector {
107+
108+
if r.nodes == nil {
109+
if nodes, err := NewSelector(r.body); err == nil {
110+
r.nodes = nodes
111+
}
112+
}
113+
return r.nodes.Xpath(xpath)
114+
}
115+
116+
func (r *response) Text(def ...string) []string {
117+
return r.nodes.Text(def...)
118+
}
119+
120+
func (r *response) Attr(attrName string) []string {
121+
return r.nodes.Attr(attrName)
122+
}
123+
124+
func (r *response) Get() *html.Node {
125+
return r.nodes.Get()
126+
}
127+
128+
func (r *response) GetAll() []*html.Node {
129+
return r.nodes.GetAll()
130+
}

pkg/scheduler/selectors.go

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
package scheduler
2+
3+
import (
4+
"io"
5+
"strings"
6+
7+
"github.com/andybalholm/cascadia"
8+
"github.com/antchfx/htmlquery"
9+
"github.com/tech-engine/goscrapy/pkg/core"
10+
"golang.org/x/net/html"
11+
)
12+
13+
// Selectors is a list of HTML nodes that can be queried further with
// Css/Xpath or read with Text, Attr, Get and GetAll.
type Selectors []*html.Node
14+
15+
func NewSelector(r io.Reader) (Selectors, error) {
16+
root, err := html.Parse(r)
17+
if err != nil {
18+
return nil, err
19+
}
20+
return Selectors([]*html.Node{root}), nil
21+
}
22+
23+
// Css selector - select element by id, class, nodename etc.
24+
func (nodes Selectors) Css(selector string) core.ISelector {
25+
sel, err := cascadia.ParseWithPseudoElement(selector)
26+
if err != nil {
27+
return Selectors([]*html.Node{})
28+
}
29+
30+
selected := make(Selectors, 0, len(nodes))
31+
for _, node := range nodes {
32+
selected = append(selected, cascadia.QueryAll(node, sel)...)
33+
}
34+
35+
return selected
36+
}
37+
38+
// Xpath selector - select element using an xpath expression.
39+
func (nodes Selectors) Xpath(xpath string) core.ISelector {
40+
selected := make(Selectors, 0, len(nodes))
41+
for _, node := range nodes {
42+
matches, err := htmlquery.QueryAll(node, xpath)
43+
if err != nil {
44+
continue
45+
}
46+
selected = append(selected, matches...)
47+
}
48+
return selected
49+
}
50+
51+
// Extracts all the text of a node and it's descendents.
52+
func (nodes Selectors) Text(def ...string) []string {
53+
texts := make([]string, 0, len(nodes))
54+
for _, node := range nodes {
55+
text := strings.TrimSpace(htmlquery.InnerText(node))
56+
if text == "" && len(def) > 0 {
57+
texts = append(texts, def[0])
58+
continue
59+
}
60+
texts = append(texts, text)
61+
}
62+
return texts
63+
}
64+
65+
// Extracts attribute values
66+
func (nodes Selectors) Attr(attrName string) []string {
67+
attrs := make([]string, 0, len(nodes))
68+
for _, node := range nodes {
69+
for _, attr := range node.Attr {
70+
if attr.Key == attrName {
71+
attrs = append(attrs, attr.Val)
72+
}
73+
}
74+
}
75+
return attrs
76+
}
77+
78+
// Get the first matched node
79+
func (nodes Selectors) Get() *html.Node {
80+
if len(nodes) <= 0 {
81+
return nil
82+
}
83+
return nodes[0]
84+
}
85+
86+
// Gets all the matched nodes
87+
func (nodes Selectors) GetAll() []*html.Node {
88+
return nodes
89+
}

pkg/scheduler/selectors_test.go

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
package scheduler
2+
3+
import (
4+
"strings"
5+
"testing"
6+
7+
"github.com/stretchr/testify/assert"
8+
)
9+
10+
func TestSelectors(t *testing.T) {
11+
html := `
12+
<html>
13+
<body>
14+
<div id="main" class="content">
15+
<h1>Title</h1>
16+
<p class="intro">Introduction paragraph 1</p>
17+
<a href="http://example.com">Example Link</a>
18+
<p>This is test paragraph</p>
19+
<p class="intro" data-mg="test">Introduction paragraph 3</p>
20+
</div>
21+
</body>
22+
</html>
23+
`
24+
25+
selector, err := NewSelector(strings.NewReader(html))
26+
27+
assert.NoError(t, err)
28+
29+
cssSelector := selector.Css("p.intro")
30+
31+
cssNodes := cssSelector.GetAll()
32+
assert.Len(t, cssNodes, 2, "expected nodes=2, got=%s", len(cssNodes))
33+
34+
cssNodesTexts := cssSelector.Text()
35+
assert.Equal(t, "Introduction paragraph 1", cssNodesTexts[0], "expected paragraph text=Introduction paragraph 1, got=%s", cssNodesTexts[0])
36+
37+
xpathSelector := selector.Xpath("//p[@data-mg='test']")
38+
39+
xpathNodes := xpathSelector.GetAll()
40+
assert.Len(t, xpathNodes, 1, "expected xpath nodes=1, got=%s", len(xpathNodes))
41+
42+
xpathNodesTexts := xpathSelector.Text()
43+
assert.Len(t, xpathNodesTexts, 1, "expected xpathNodesTexts=1, got=%s", len(xpathNodesTexts))
44+
assert.Equal(t, "Introduction paragraph 3", xpathNodesTexts[0], "expected paragraph text=Introduction paragraph 3, got=%s", xpathNodesTexts[0])
45+
46+
attrValues := selector.Css("a").Attr("href")
47+
assert.Len(t, xpathNodesTexts, 1, "expected attrValues=1, got=%s", len(attrValues))
48+
assert.Equal(t, "http://example.com", attrValues[0], "expected href=http://example.com, got=%s", attrValues[0])
49+
50+
noCssElements := selector.Css("p.box").GetAll()
51+
assert.Empty(t, noCssElements, "expected element=0, got=%s", len(noCssElements))
52+
53+
noXpathElements := selector.Xpath("//p[@class='test']").GetAll()
54+
assert.Empty(t, noXpathElements, "expected element=0, got=%s", len(noXpathElements))
55+
56+
}

0 commit comments

Comments
 (0)