validate.go

package agents

import (
	_ "embed"
	"encoding/json"
	"fmt"
	"regexp"
	"time"
)

//go:embed crawler-user-agents.json
var crawlersJson []byte

// Crawler contains information about one crawler.
type Crawler struct {
	// An identifier for the crawler.
	Id string `json:"id"`
	// Regexp matching the crawler's User Agent string.
	Pattern string `json:"pattern"`
	// Discovery date.
	AdditionDate time.Time `json:"addition_date"`
	// Official URL of the robot.
	URL string `json:"url"`
	// Examples of full User Agent strings.
	Instances []string `json:"instances"`
}
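
// For orientation, each record in crawler-user-agents.json carries the same
// fields as the struct above, with the date serialized as "2006/01/02".
// The entry below is illustrative only (a made-up bot, not taken from the
// dataset):
//
//	{
//	  "id": "example-bot",
//	  "pattern": "examplebot",
//	  "addition_date": "2024/01/02",
//	  "url": "https://example.com/bot",
//	  "instances": ["Mozilla/5.0 (compatible; examplebot/1.0)"]
//	}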

// Private type needed to convert addition_date from/to the format used in JSON.
type jsonCrawler struct {
	Id           string   `json:"id"`
	Pattern      string   `json:"pattern"`
	AdditionDate string   `json:"addition_date"`
	URL          string   `json:"url"`
	Instances    []string `json:"instances"`
}

const timeLayout = "2006/01/02"

func (c Crawler) MarshalJSON() ([]byte, error) {
	jc := jsonCrawler{
		Id:           c.Id, // must be copied, or the record fails UnmarshalJSON's non-empty id check on round trip
		Pattern:      c.Pattern,
		AdditionDate: c.AdditionDate.Format(timeLayout),
		URL:          c.URL,
		Instances:    c.Instances,
	}
	return json.Marshal(jc)
}

func (c *Crawler) UnmarshalJSON(b []byte) error {
	var jc jsonCrawler
	if err := json.Unmarshal(b, &jc); err != nil {
		return err
	}

	c.Id = jc.Id
	c.Pattern = jc.Pattern
	c.URL = jc.URL
	c.Instances = jc.Instances

	if c.Id == "" {
		return fmt.Errorf("empty id in record %s", string(b))
	}
	if c.Pattern == "" {
		return fmt.Errorf("empty pattern in record %s", string(b))
	}

	if jc.AdditionDate != "" {
		tim, err := time.ParseInLocation(timeLayout, jc.AdditionDate, time.UTC)
		if err != nil {
			return err
		}
		c.AdditionDate = tim
	}

	return nil
}
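
// The helper below is a hypothetical sketch, not part of the original file.
// It round-trips a single record to show that MarshalJSON and UnmarshalJSON
// agree on the "2006/01/02" date layout; all values are made up.
func exampleRoundTrip() error {
	in := Crawler{
		Id:           "example-bot", // hypothetical record
		Pattern:      "examplebot",
		AdditionDate: time.Date(2024, 1, 2, 0, 0, 0, 0, time.UTC),
		URL:          "https://example.com/bot",
		Instances:    []string{"Mozilla/5.0 (compatible; examplebot/1.0)"},
	}
	b, err := json.Marshal(in) // calls Crawler.MarshalJSON; the date becomes "2024/01/02"
	if err != nil {
		return err
	}
	var out Crawler
	if err := json.Unmarshal(b, &out); err != nil { // calls (*Crawler).UnmarshalJSON
		return err
	}
	if !out.AdditionDate.Equal(in.AdditionDate) {
		return fmt.Errorf("round trip changed the date: %v != %v", out.AdditionDate, in.AdditionDate)
	}
	return nil
}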

// Crawlers is the list of crawlers, built from the contents of
// crawler-user-agents.json. It panics at init time if the embedded
// JSON is invalid.
var Crawlers = func() []Crawler {
	var crawlers []Crawler
	if err := json.Unmarshal(crawlersJson, &crawlers); err != nil {
		panic(err)
	}
	return crawlers
}()

// regexps holds one compiled regular expression per crawler pattern,
// in the same order as Crawlers.
var regexps = func() []*regexp.Regexp {
	regexps := make([]*regexp.Regexp, len(Crawlers))
	for i, crawler := range Crawlers {
		regexps[i] = regexp.MustCompile(crawler.Pattern)
	}
	return regexps
}()

// IsCrawler reports whether the User Agent string matches any of the
// crawler patterns.
func IsCrawler(userAgent string) bool {
	for _, re := range regexps {
		if re.MatchString(userAgent) {
			return true
		}
	}
	return false
}

// MatchingCrawlers finds all crawlers matching the User Agent string and
// returns the list of their indices in Crawlers.
func MatchingCrawlers(userAgent string) []int {
	indices := []int{}
	for i, re := range regexps {
		if re.MatchString(userAgent) {
			indices = append(indices, i)
		}
	}
	return indices
}
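
// A minimal usage sketch (hypothetical, not part of the original file):
// classify an incoming User Agent, then look up every matching record.
func exampleUsage(userAgent string) {
	if !IsCrawler(userAgent) {
		fmt.Println("not a known crawler")
		return
	}
	for _, i := range MatchingCrawlers(userAgent) {
		c := Crawlers[i]
		fmt.Printf("matched %s (%s)\n", c.Id, c.URL)
	}
}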