-
Notifications
You must be signed in to change notification settings - Fork 9
/
charset.go
222 lines (180 loc) · 4.8 KB
/
charset.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
package spider
import (
"net/http"
"regexp"
"strings"
"unicode/utf8"
"github.com/suosi-inc/chardet"
"github.com/x-funs/go-fun"
)
// Values stored in CharsetRes.CharsetPos; each one names the source the
// reported charset was taken from.
const (
	// CharsetPosHeader marks a charset taken from the Content-Type header.
	CharsetPosHeader = "header"

	// CharsetPosHtml marks a charset taken from a <meta> tag in the body.
	CharsetPosHtml = "html"

	// CharsetPosGuess marks a charset produced by CharsetGuess.
	CharsetPosGuess = "guess"

	// CharsetPosValid marks a body that passed the UTF-8 validity check.
	CharsetPosValid = "valid"
)
// Regular expressions used to locate charset declarations. All of them are
// case-insensitive.
const (
	// RegexCharset matches a "charset=<name>" parameter and captures the name
	// in group 1. The value may optionally be wrapped in single or double
	// quotes (e.g. `charset="utf-8"`), which is legal in a Content-Type header.
	RegexCharset = `(?i)charset=\s*["']?([a-z][_\-0-9a-z]*)`

	// RegexCharsetHtml4 matches a <meta> tag carrying http-equiv=content-type
	// and captures the tag's whole attribute list in group 1; the charset is
	// then pulled out of that list with RegexCharset.
	RegexCharsetHtml4 = `(?i)<meta\s+([^>]*http-equiv=("|')?content-type("|')?[^>]*)>`

	// RegexCharsetHtml5 matches <meta charset=...> and captures the charset
	// name in group 1.
	RegexCharsetHtml5 = `(?i)<meta\s+charset\s*=\s*["']?([a-z][_\-0-9a-z]*)[^>]*>`
)

// Patterns are compiled once at package scope so callers never pay for
// compilation on the parsing path.
var (
	regexCharsetPattern      = regexp.MustCompile(RegexCharset)
	regexCharsetHtml4Pattern = regexp.MustCompile(RegexCharsetHtml4)
	regexCharsetHtml5Pattern = regexp.MustCompile(RegexCharsetHtml5)
)
// CharsetRes is the outcome of charset detection.
type CharsetRes struct {
	// Charset is the detected charset name; empty when nothing was detected.
	Charset string

	// CharsetPos records where Charset came from (one of the CharsetPos*
	// constants); empty when nothing was detected.
	CharsetPos string
}
// Charset determines the charset of an HTTP response from its body and
// headers, falling back to a guess when no declaration is found.
//
// Detection order:
//  1. If body is valid UTF-8 the result is "UTF-8" with CharsetPosValid.
//  2. Otherwise the Content-Type header and HTML <meta> tags are consulted
//     via CharsetFromHeaderHtml.
//  3. If that yields nothing, CharsetGuess is tried and, on success, reported
//     with CharsetPosGuess.
//
// The returned CharsetRes is the zero value when every step fails.
func Charset(body []byte, headers *http.Header) CharsetRes {
	// A body that is already valid UTF-8 wins over any declaration.
	if utf8.Valid(body) {
		return CharsetRes{Charset: "UTF-8", CharsetPos: CharsetPosValid}
	}

	// Look for an explicit declaration in the header or the markup.
	res := CharsetFromHeaderHtml(body, headers)
	if res.Charset != "" {
		return res
	}

	// Nothing declared: fall back to content-based guessing.
	if guessed := CharsetGuess(body); guessed != "" {
		res.Charset = guessed
		res.CharsetPos = CharsetPosGuess
	}
	return res
}
// CharsetFromHeaderHtml combines the charset declared in the Content-Type
// header with the one declared in the HTML body.
//
// Resolution rules:
//   - neither present: the zero CharsetRes is returned;
//   - only one present: that one is used;
//   - both present and equal: the header is reported;
//   - both present and different: the HTML value is used when the header
//     charset starts with "ISO" or "WINDOWS", otherwise the header wins.
func CharsetFromHeaderHtml(body []byte, headers *http.Header) CharsetRes {
	fromHeader := CharsetFromHeader(headers)
	fromHtml := CharsetFromHtml(body)

	useHeader := CharsetRes{Charset: fromHeader, CharsetPos: CharsetPosHeader}
	useHtml := CharsetRes{Charset: fromHtml, CharsetPos: CharsetPosHtml}

	switch {
	case fromHeader == "" && fromHtml == "":
		// Nothing declared anywhere.
		return CharsetRes{}
	case fromHtml == "":
		// Header only.
		return useHeader
	case fromHeader == "":
		// HTML only.
		return useHtml
	case fromHeader == fromHtml:
		// Both agree; report the header as the source.
		return useHeader
	case strings.HasPrefix(fromHeader, "ISO"), strings.HasPrefix(fromHeader, "WINDOWS"):
		// Both present but different: for these header values the HTML
		// declaration takes precedence.
		return useHtml
	default:
		return useHeader
	}
}
// CharsetFromHeader 解析 HTTP header 中的 charset
func CharsetFromHeader(headers *http.Header) string {
var charset string
if headers != nil {
contentType := headers.Get("Content-Type")
if !fun.Blank(contentType) {
matches := regexCharsetPattern.FindStringSubmatch(contentType)
if len(matches) > 1 {
charset = matches[1]
}
}
}
return convertCharset(charset)
}
// CharsetFromHtml extracts the charset declared by a <meta> tag in body and
// returns it normalized by convertCharset, or "" when none is declared.
//
// Two declaration forms are recognized, and only the first match of each is
// examined:
//   - HTML4: <meta http-equiv="Content-Type" content="...; charset=xxx">
//   - HTML5: <meta charset="xxx">
//
// When both forms are present, the tag that appears first in the document
// wins.
func CharsetFromHtml(body []byte) string {
	if len(body) == 0 {
		return ""
	}

	html := fun.String(body)

	// HTML4 form: locate the tag, then pull the charset out of its
	// attribute list. pos4 stays -1 unless a charset was actually found.
	var charset4 string
	pos4 := -1
	if loc := regexCharsetHtml4Pattern.FindStringSubmatchIndex(html); loc != nil {
		attrs := html[loc[2]:loc[3]]
		if m := regexCharsetPattern.FindStringSubmatch(attrs); len(m) > 1 {
			charset4 = m[1]
			pos4 = loc[0]
		}
	}

	// HTML5 form: group 1 is the charset name itself.
	var charset5 string
	pos5 := -1
	if loc := regexCharsetHtml5Pattern.FindStringSubmatchIndex(html); loc != nil {
		charset5 = html[loc[2]:loc[3]]
		pos5 = loc[0]
	}

	switch {
	case charset4 == "":
		// Only HTML5 (or neither; convertCharset("") yields "").
		return convertCharset(charset5)
	case charset5 == "":
		// Only HTML4.
		return convertCharset(charset4)
	case pos4 < pos5:
		// Both present: compare the offsets of the tags themselves, not of
		// the charset text, so an unrelated earlier occurrence of the name
		// (title, comment, URL) cannot influence the choice.
		return convertCharset(charset4)
	default:
		return convertCharset(charset5)
	}
}
// CharsetGuess guesses the charset of body using the chardet HTML detector.
// It returns the best candidate's charset name in upper case, or "" when the
// detector reports an error.
func CharsetGuess(body []byte) string {
	best, err := chardet.NewHtmlDetector().DetectBest(body)
	if err != nil {
		return ""
	}
	return strings.ToUpper(best.Charset)
}
// convertCharset normalizes a charset name: surrounding whitespace is
// trimmed, the name is upper-cased, and a few families are folded onto one
// canonical spelling:
//   - "UTF8" / "UTF_8"          -> "UTF-8"
//   - anything starting "GB"    -> "GBK"
//   - anything starting "BIG5"  -> "Big5"
//   - anything starting "SHIFT" -> "SHIFT_JIS"
//
// Any other name is returned upper-cased; an empty input yields "".
func convertCharset(charset string) string {
	name := strings.ToUpper(strings.TrimSpace(charset))

	switch {
	case name == "UTF8", name == "UTF_8":
		return "UTF-8"
	case strings.HasPrefix(name, "GB"):
		// Covers gb2312, gbk, gb18030, ...
		return "GBK"
	case strings.HasPrefix(name, "BIG5"):
		// Covers big5-hkscs and similar variants.
		return "Big5"
	case strings.HasPrefix(name, "SHIFT"):
		// Covers shift-jis / shift_jis.
		return "SHIFT_JIS"
	default:
		return name
	}
}