Skip to content

Commit f06c2d5

Browse files
committed
better charset detection
1 parent 4510a8b commit f06c2d5

File tree

7 files changed

+380
-13
lines changed

7 files changed

+380
-13
lines changed

chardet/chardet.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
package chardet
2+
3+
import (
4+
gogschardet "github.com/gogs/chardet"
5+
"github.com/wlynxg/chardet"
6+
)
7+
8+
func detectEncoding(dat []byte) (string, error) {
9+
r1 := chardet.Detect(dat)
10+
if r1.Encoding > "" {
11+
return r1.Encoding, nil
12+
}
13+
r2, err := gogschardet.NewTextDetector().DetectBest(dat)
14+
if err != nil {
15+
return "", err
16+
}
17+
return r2.Charset, nil
18+
}

chardet/chardet_golang.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
package chardet
2+
3+
func DetectEncoding(dat []byte) (string, error) {
4+
return detectEncoding(dat)
5+
}

chardet/chardet_uchardet.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
//go:build uchardet
2+
3+
package chardet
4+
5+
import (
6+
"github.com/gonejack/transcode/chardet/uchardet"
7+
)
8+
9+
func DetectEncoding(dat []byte) (string, error) {
10+
dec := uchardet.NewChardet()
11+
defer dec.Release()
12+
if dec.Handle(dat) == 0 {
13+
if v := dec.End(); v > "" {
14+
return v, nil
15+
}
16+
}
17+
return detectEncoding(dat)
18+
}

chardet/uchardet/uchardet.go

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
package uchardet
2+
3+
// https://github.com/centny/uchardet
4+
5+
/*
6+
#include <stdio.h>
7+
#include <stdlib.h>
8+
#include <string.h>
9+
#include <uchardet/uchardet.h>
10+
// Apple Silicon (M1/M2/M3)
11+
#cgo darwin,arm64 CPPFLAGS: -I/opt/homebrew/opt/uchardet/include
12+
#cgo darwin,arm64 LDFLAGS: -L/opt/homebrew/opt/uchardet/lib -luchardet
13+
14+
// Intel macOS (x86_64)
15+
#cgo darwin,amd64 CPPFLAGS: -I/usr/local/include
16+
#cgo darwin,amd64 LDFLAGS: -L/usr/local/lib -luchardet
17+
18+
// Linux (assumes installed in /usr/local)
19+
#cgo linux CPPFLAGS: -I/usr/local/include
20+
#cgo linux LDFLAGS: -L/usr/local/lib -luchardet
21+
*/
22+
import "C"
23+
import "unsafe"
24+
25+
// Chardet is the binding uchardet_t on libuchardet
26+
type Chardet struct {
27+
det C.uchardet_t
28+
}
29+
30+
// NewChardet is the default creator to create Chardet
31+
func NewChardet() *Chardet {
32+
return &Chardet{
33+
det: C.uchardet_new(),
34+
}
35+
}
36+
37+
// Release will free the Chardet
38+
func (c *Chardet) Release() {
39+
C.uchardet_delete(c.det)
40+
}
41+
42+
// Handle will process the data slice
43+
func (c *Chardet) Handle(buf []byte) int {
44+
var data = (*C.char)(unsafe.Pointer(&buf[0]))
45+
var dlen = C.size_t(len(buf))
46+
return int(C.uchardet_handle_data(c.det, data, dlen))
47+
}
48+
49+
// Reset encoding detector.
50+
func (c *Chardet) Reset() {
51+
C.uchardet_reset(c.det)
52+
}
53+
54+
// End is ending the process and return the encoding name
55+
func (c *Chardet) End() string {
56+
C.uchardet_data_end(c.det)
57+
return cstring(C.uchardet_get_charset(c.det))
58+
}
59+
60+
func cstring(cs *C.char) string {
61+
clen := C.strlen(cs)
62+
if clen < 1 {
63+
return ""
64+
}
65+
buf := make([]byte, clen+1)
66+
C.strcpy((*C.char)(unsafe.Pointer(&buf[0])), cs)
67+
return string(buf[:clen])
68+
}

go.mod

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,17 @@
11
module github.com/gonejack/transcode
22

3-
go 1.21
3+
go 1.24.0
4+
5+
toolchain go1.24.3
46

57
require (
68
github.com/alecthomas/kong v0.8.1
79
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f
10+
github.com/wlynxg/chardet v1.0.0
811
golang.org/x/text v0.14.0
912
)
13+
14+
require (
15+
go.uber.org/multierr v1.11.0 // indirect
16+
go.uber.org/zap v1.27.0 // indirect
17+
)

go.sum

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,28 @@
11
github.com/alecthomas/assert/v2 v2.1.0 h1:tbredtNcQnoSd3QBhQWI7QZ3XHOVkw1Moklp2ojoH/0=
22
github.com/alecthomas/assert/v2 v2.1.0/go.mod h1:b/+1DI2Q6NckYi+3mXyH3wFb8qG37K/DuK80n7WefXA=
3-
github.com/alecthomas/kong v0.8.0 h1:ryDCzutfIqJPnNn0omnrgHLbAggDQM2VWHikE1xqK7s=
4-
github.com/alecthomas/kong v0.8.0/go.mod h1:n1iCIO2xS46oE8ZfYCNDqdR0b0wZNrXAIAqro/2132U=
53
github.com/alecthomas/kong v0.8.1 h1:acZdn3m4lLRobeh3Zi2S2EpnXTd1mOL6U7xVml+vfkY=
64
github.com/alecthomas/kong v0.8.1/go.mod h1:n1iCIO2xS46oE8ZfYCNDqdR0b0wZNrXAIAqro/2132U=
75
github.com/alecthomas/repr v0.1.0 h1:ENn2e1+J3k09gyj2shc0dHr/yjaWSHRlrJ4DPMevDqE=
86
github.com/alecthomas/repr v0.1.0/go.mod h1:2kn6fqh/zIyPLmm3ugklbEi5hg5wS435eygvNfaDQL8=
7+
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
8+
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
99
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f h1:3BSP1Tbs2djlpprl7wCLuiqMaUh5SJkkzI2gDs+FgLs=
1010
github.com/gogs/chardet v0.0.0-20211120154057-b7413eaefb8f/go.mod h1:Pcatq5tYkCW2Q6yrR2VRHlbHpZ/R4/7qyL1TCF7vl14=
1111
github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM=
1212
github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
13+
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
14+
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
15+
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
16+
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
17+
github.com/wlynxg/chardet v1.0.0 h1:2gEgdmy/at4xIC+mOfNf1OFsb4LtnG9IcumfRXii/d0=
18+
github.com/wlynxg/chardet v1.0.0/go.mod h1:DgEUcneT6QieJ9qEhtRFOHWOjSNLPAo8lwUhjNopcFE=
19+
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
20+
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
21+
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
22+
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
23+
go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8=
24+
go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E=
1325
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
1426
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
27+
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
28+
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

0 commit comments

Comments
 (0)