-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathdetect.go
100 lines (80 loc) · 2.8 KB
/
detect.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
package icu
// #cgo pkg-config: icu-i18n
// #include "c_bridge.h"
// #include "stdlib.h"
import "C"
import (
"fmt"
"sync"
"unsafe"
)
// Constants shared by the detector code in this file.
const (
U_ZERO_ERROR = 0 // ICU common constant error code which means that no error occurred
MatchDataBufferSize = 25 // Size of the buffer for detection results (max count of returned guesses per detect call); it is the length of CharsetDetector.resBuffer and is passed to C.detectCharset as the buffer capacity
)
// CharsetDetector provides ICU charset detection functionality.
//
// It bundles the native ICU detector handle, a fixed-size buffer that is
// handed to the C bridge to receive detection results, and a mutex that is
// held for the duration of GuessCharset and Close. Instances are created
// with NewCharsetDetector and released with Close.
type CharsetDetector struct {
ptr *C.UCharsetDetector // ICU struct needed for detection; obtained from C.ucsdet_open
resBuffer [MatchDataBufferSize]C.MatchData // Result buffer passed to C.detectCharset and read back by GuessCharset
gMutex sync.Mutex // Mutex used to guarantee thread safety for ICU calls
}
// Match is the Go equivalent of the MatchData C structure (see c_bridge.h).
// GuessCharset produces one Match for every entry it reads from the
// detector's result buffer.
type Match struct {
Charset string // Go copy of the C entry's charset string
Language string // Go copy of the C entry's language string
Confidence int // The C entry's confidence value converted to int
}
// NewCharsetDetector creates a new charset detector by calling ICU's
// ucsdet_open. A successfully created detector must be released with Close,
// because it owns native ICU resources.
//
// It returns a nil detector and a non-nil error when ICU reports a failure
// status or returns a NULL detector handle.
func NewCharsetDetector() (*CharsetDetector, error) {
	// Use the C type for the status. UErrorCode is a 32-bit C enum; letting ICU
	// write it through a pointer to a Go int (64-bit on most platforms) fills
	// only part of the variable, which misreads negative codes and is wrong on
	// big-endian targets.
	var status C.UErrorCode = U_ZERO_ERROR
	ptr := C.ucsdet_open(&status)

	// ICU defines failure as a code greater than U_ZERO_ERROR; negative codes
	// are warnings and do not indicate that the call failed.
	if status > U_ZERO_ERROR {
		return nil, fmt.Errorf("ICU Error code returned: %d", int(status))
	}
	// Never hand out a detector without a native handle.
	if ptr == nil {
		return nil, fmt.Errorf("ICU returned a NULL charset detector")
	}

	det := new(CharsetDetector)
	det.ptr = ptr
	return det, nil
}
// GuessCharset runs charset detection on input through the C bridge function
// detectCharset and returns one Match per guess the bridge reports, in the
// order the bridge wrote them into the detector's result buffer.
//
// An error is returned when input is empty, when the detector has no native
// handle, when input is too large to describe with a C int, or when the
// bridge reports a non-zero status code. At most MatchDataBufferSize matches
// are returned.
func (det *CharsetDetector) GuessCharset(input []byte) (matches []Match, err error) {
	// As described in c_bridge.h, detection operations are not thread safe and
	// must be called sequentially. The mutex also protects the shared result
	// buffer, which is reused by every call.
	det.gMutex.Lock()
	defer det.gMutex.Unlock()

	inputLen := len(input)
	if inputLen == 0 {
		return nil, fmt.Errorf("Input data len is 0")
	}
	// Do not pass a nil native handle into C.
	if det.ptr == nil {
		return nil, fmt.Errorf("charset detector has no native handle")
	}
	// The length is handed to C as an int; refuse inputs that would silently
	// wrap around in the conversion below.
	const maxCInt = 1<<31 - 1
	if inputLen > maxCInt {
		return nil, fmt.Errorf("input data len %d exceeds the maximum of %d bytes", inputLen, maxCInt)
	}

	// The bridge writes the status through an int*, so the variable must be a
	// C.int; a Go int may be wider and would be only partially written.
	var status C.int

	// Perform detection. The return value is the number of matches; the
	// matches themselves are written into the result buffer.
	guessCount := C.detectCharset(
		unsafe.Pointer(det.ptr),
		unsafe.Pointer(&input[0]),
		C.int(inputLen),
		&status,
		&det.resBuffer[0],
		C.int(MatchDataBufferSize))
	if status != U_ZERO_ERROR {
		return nil, fmt.Errorf("ICU Error code returned: %d", int(status))
	}

	// Clamp the reported count to the buffer bounds so an unexpected value
	// from the bridge cannot cause a panic or an out-of-range read.
	count := int(guessCount)
	if count < 0 {
		count = 0
	}
	if count > MatchDataBufferSize {
		count = MatchDataBufferSize
	}

	// Copy the C strings into Go memory; the buffer entries are overwritten by
	// the next detection call.
	result := make([]Match, count)
	for i := range result {
		entry := &det.resBuffer[i]
		result[i] = Match{
			Charset:    C.GoString(entry.charset),
			Language:   C.GoString(entry.language),
			Confidence: int(entry.confidence),
		}
	}
	return result, nil
}
// Close frees the native ICU detector held by det. It takes the detector's
// mutex, so it does not run concurrently with another call that holds it.
// Calling Close more than once is safe: the handle is cleared after it is
// released, so later calls do nothing.
func (det *CharsetDetector) Close() {
	det.gMutex.Lock()
	defer det.gMutex.Unlock()

	if det.ptr == nil {
		return
	}
	C.ucsdet_close(det.ptr)
	// Drop the stale handle so a repeated Close cannot free it twice.
	det.ptr = nil
}