Skip to content

Commit 9c26b2b

Browse files
committed
update detections
1 parent 961e1b0 commit 9c26b2b

File tree

5 files changed

+153
-3
lines changed

5 files changed

+153
-3
lines changed

chardet/api.go

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
package chardet
22

3-
import "slices"
3+
import (
4+
"bytes"
5+
"slices"
6+
7+
"github.com/wlynxg/chardet/consts"
8+
)
49

510
// detectFunc is the signature shared by every detector in this package: it
// receives the raw bytes and returns an encoding name or an error.
type detectFunc func([]byte) (string, error)
611

@@ -9,14 +14,35 @@ func prefer(f detectFunc) {
914
}
1015

1116
// detectFuncList holds the detectors that DetectEncoding tries, in order,
// stopping at the first one that returns a nil error.
var detectFuncList = []detectFunc{
17+
// The dylib-based detector is currently left out of the list.
//DetectEncodingByUChardetDylib,
1218
DetectEncodingByUChardetCmd,
1319
DetectEncodingByWlynxgChardet,
1420
DetectEncodingByGogsChardet,
1521
}
1622

23+
// Encoding names that DetectEncoding returns when the input starts with the
// corresponding byte-order mark (BOM).
const (
24+
UTF8WithBOM string = "utf-8-bom"
25+
UTF16LEWithBOM string = "utf-16le-bom"
26+
UTF16BEWithBOM string = "utf-16be-bom"
27+
UTF32LEWithBOM string = "utf-32le-bom"
28+
UTF32BEWithBOM string = "utf-32be-bom"
29+
)
30+
1731
func DetectEncoding(dat []byte) (v string, err error) {
18-
for _, df := range detectFuncList {
19-
v, err = df(dat)
32+
// A leading byte-order mark identifies the encoding outright, so check for
// one before running any detector. The 4-byte UTF-32 marks are tested
// before the 2-byte UTF-16 marks: the UTF-32LE BOM (FF FE 00 00) starts
// with the UTF-16LE BOM (FF FE), so testing UTF-16LE first would make the
// UTF-32LE case unreachable.
switch {
case bytes.HasPrefix(dat, []byte(consts.UTF8BOM)):
	return UTF8WithBOM, nil // EF BB BF UTF-8 with BOM
case bytes.HasPrefix(dat, []byte(consts.UTF32LEBOM)):
	return UTF32LEWithBOM, nil // FF FE 00 00 UTF-32, little-endian BOM
case bytes.HasPrefix(dat, []byte(consts.UTF32BEBOM)):
	return UTF32BEWithBOM, nil // 00 00 FE FF UTF-32, big-endian BOM
case bytes.HasPrefix(dat, []byte(consts.UTF16LEBOM)):
	return UTF16LEWithBOM, nil // FF FE UTF-16, little endian BOM
case bytes.HasPrefix(dat, []byte(consts.UTF16BEBOM)):
	return UTF16BEWithBOM, nil // FE FF UTF-16, big endian BOM
}
44+
for _, f := range detectFuncList {
45+
v, err = f(dat)
2046
if err == nil {
2147
break
2248
}

chardet/chardet.uchardet.dylib.go

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
package chardet
2+
3+
import (
4+
"errors"
5+
"runtime"
6+
"unsafe"
7+
8+
"github.com/ebitengine/purego"
9+
)
10+
11+
func DetectEncodingByUChardetDylib(dat []byte) (string, error) {
12+
if lib == 0 {
13+
return "", errors.New("no uchardet dylib found")
14+
}
15+
dec := NewChardet()
16+
defer dec.Release()
17+
if dec.Handle(dat) == 0 {
18+
if v := dec.End(); v > "" {
19+
return v, nil
20+
}
21+
}
22+
return "", errors.New("detect failed by uchardet")
23+
}
24+
25+
var (
26+
lib uintptr
27+
uchardetNew func() unsafe.Pointer
28+
uchardetDelete func(det unsafe.Pointer)
29+
uchardetHandleData func(det unsafe.Pointer, data unsafe.Pointer, len uintptr) int
30+
uchardetDataEnd func(det unsafe.Pointer)
31+
uchardetGetCharset func(det unsafe.Pointer) *byte // 返回 C 字符串 (char*)
32+
uchardetReset func(det unsafe.Pointer)
33+
)
34+
35+
func init() {
36+
var err error
37+
var name = uchardetLib()
38+
lib, err = purego.Dlopen(name, purego.RTLD_NOW|purego.RTLD_GLOBAL)
39+
if err != nil {
40+
return
41+
}
42+
purego.RegisterLibFunc(&uchardetNew, lib, "uchardet_new")
43+
purego.RegisterLibFunc(&uchardetDelete, lib, "uchardet_delete")
44+
purego.RegisterLibFunc(&uchardetHandleData, lib, "uchardet_handle_data")
45+
purego.RegisterLibFunc(&uchardetDataEnd, lib, "uchardet_data_end")
46+
purego.RegisterLibFunc(&uchardetGetCharset, lib, "uchardet_get_charset")
47+
purego.RegisterLibFunc(&uchardetReset, lib, "uchardet_reset")
48+
}
49+
50+
// Chardet wraps one uchardet detector instance created by NewChardet.
type Chardet struct {
51+
det unsafe.Pointer // detector pointer returned by uchardetNew; set to nil by Release
52+
}
53+
54+
func NewChardet() *Chardet {
55+
if uchardetNew == nil {
56+
return nil
57+
}
58+
return &Chardet{
59+
det: uchardetNew(),
60+
}
61+
}
62+
func (c *Chardet) Release() {
63+
if c.det != nil {
64+
uchardetDelete(c.det)
65+
c.det = nil
66+
}
67+
}
68+
func (c *Chardet) Handle(buf []byte) int {
69+
if c.det == nil || len(buf) == 0 {
70+
return -1 // 或其他错误指示
71+
}
72+
dataPtr := unsafe.Pointer(&buf[0])
73+
dlen := uintptr(len(buf))
74+
return uchardetHandleData(c.det, dataPtr, dlen)
75+
}
76+
func (c *Chardet) End() string {
77+
uchardetDataEnd(c.det)
78+
cString := uchardetGetCharset(c.det)
79+
return cstrToString(cString)
80+
}
81+
82+
// uchardetLib returns the uchardet shared-library path or name to load for
// the current operating system, or "" when none is configured for it.
func uchardetLib() string {
	goos := runtime.GOOS
	if goos == "darwin" {
		// NOTE(review): this looks like the Apple Silicon Homebrew prefix;
		// other installs may keep the library elsewhere. Confirm.
		return "/opt/homebrew/lib/libuchardet.dylib"
	}
	if goos == "linux" {
		return "libuchardet.so"
	}
	return ""
}
92+
// cstrToSlice returns a byte slice that views the NUL-terminated C string at
// cptr, excluding the terminator. The slice aliases the original memory; it
// is not a copy. A nil cptr yields a nil slice.
func cstrToSlice(cptr *byte) []byte {
	if cptr == nil {
		return nil
	}
	base := unsafe.Pointer(cptr)
	n := 0
	// Walk forward one byte at a time until the terminating NUL.
	for *(*byte)(unsafe.Add(base, n)) != 0 {
		n++
	}
	return unsafe.Slice(cptr, n)
}
102+
func cstrToString(cptr *byte) string {
103+
return string(cstrToSlice(cptr))
104+
}

go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@ toolchain go1.24.3
66

77
require (
88
github.com/alecthomas/kong v1.11.0
9+
github.com/ebitengine/purego v0.9.1
910
github.com/endeveit/enca v0.0.0-20160315071803-00fe968221ab
1011
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f
1112
github.com/gonejack/charamel v1.0.2
13+
github.com/jupiterrider/ffi v0.5.1
1214
github.com/wlynxg/chardet v1.0.1
1315
golang.org/x/text v0.26.0
1416
)

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ github.com/alecthomas/kong v1.11.0 h1:y++1gI7jf8O7G7l4LZo5ASFhrhJvzc+WgF/arranEm
44
github.com/alecthomas/kong v1.11.0/go.mod h1:p2vqieVMeTAnaC83txKtXe8FLke2X07aruPWXyMPQrU=
55
github.com/alecthomas/repr v0.4.0 h1:GhI2A8MACjfegCPVq9f1FLvIBS+DrQ2KQBFZP1iFzXc=
66
github.com/alecthomas/repr v0.4.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4=
7+
github.com/ebitengine/purego v0.9.1 h1:a/k2f2HQU3Pi399RPW1MOaZyhKJL9w/xFpKAg4q1s0A=
8+
github.com/ebitengine/purego v0.9.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
79
github.com/endeveit/enca v0.0.0-20160315071803-00fe968221ab h1:8sh8Pynho3gYrdzdbe796TbjWmKbrDasgcvvD9vaCH0=
810
github.com/endeveit/enca v0.0.0-20160315071803-00fe968221ab/go.mod h1:p9sYlSrwy19GJyed1EXDwdZeL4rVBd1tPoPgDvs7U1Q=
911
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs=
@@ -12,6 +14,8 @@ github.com/gonejack/charamel v1.0.2 h1:X71K3fX2Tjz8EnS601mS439VN1BHyK4DAgNZDbGLX
1214
github.com/gonejack/charamel v1.0.2/go.mod h1:RQJBTqDLll8x8xAJvJAFhQSoOphw8NKZ+paIcDS0aLk=
1315
github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
1416
github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
17+
github.com/jupiterrider/ffi v0.5.1 h1:l7ANXU+Ex33LilVa283HNaf/sTzCrrht7D05k6T6nlc=
18+
github.com/jupiterrider/ffi v0.5.1/go.mod h1:x7xdNKo8h0AmLuXfswDUBxUsd2OqUP4ekC8sCnsmbvo=
1519
github.com/wlynxg/chardet v1.0.1 h1:xyN64+w82gH7K1oLBqV7G1a6quVCATWYMmBcwz4gghY=
1620
github.com/wlynxg/chardet v1.0.1/go.mod h1:HLQMNsa0w4MkH2e7waQaFD+Yh85riFFTLhFtP8fsdbQ=
1721
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=

transcode.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ import (
1313
"golang.org/x/text/encoding"
1414
"golang.org/x/text/encoding/htmlindex"
1515
"golang.org/x/text/encoding/simplifiedchinese"
16+
"golang.org/x/text/encoding/unicode"
17+
"golang.org/x/text/encoding/unicode/utf32"
1618
"golang.org/x/text/transform"
1719

1820
"github.com/gonejack/transcode/chardet"
@@ -149,6 +151,18 @@ func detectEncoding(r *bufio.Reader) (string, error) {
149151
return chardet.DetectEncoding(hdr)
150152
}
151153
func parseEncoding(encoding string) (enc encoding.Encoding, err error) {
154+
switch strings.ToLower(encoding) {
155+
case chardet.UTF8WithBOM:
156+
return unicode.UTF8BOM, nil
157+
case chardet.UTF16LEWithBOM:
158+
return unicode.UTF16(unicode.LittleEndian, unicode.UseBOM), nil
159+
case chardet.UTF16BEWithBOM:
160+
return unicode.UTF16(unicode.BigEndian, unicode.UseBOM), nil
161+
case chardet.UTF32LEWithBOM:
162+
return utf32.UTF32(utf32.LittleEndian, utf32.UseBOM), nil
163+
case chardet.UTF32BEWithBOM:
164+
return utf32.UTF32(utf32.BigEndian, utf32.UseBOM), nil
165+
}
152166
enc, err = htmlindex.Get(encoding)
153167
if err != nil {
154168
err = fmt.Errorf("invalid encoding: %s", encoding)

0 commit comments

Comments
 (0)