-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathjaro.go
99 lines (91 loc) · 2.58 KB
/
jaro.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
package stringosim
import ()
type JaroSimilarityOptions struct {
Threshold float64
PValue float64
LValue float64
CaseInsensitive bool
}
var DefaultJaroSimilarityOptions = JaroSimilarityOptions{
CaseInsensitive: false,
Threshold: 0.7,
PValue: 0.1,
LValue: 4,
}
func Jaro(s []rune, t []rune, options ...JaroSimilarityOptions) float64 {
opt := DefaultJaroSimilarityOptions
for _, option := range options {
opt = option
}
lenMatched, numT, _ := jaroHelper(s, t, opt)
if lenMatched == 0 {
return 0.0
}
lenS := len(s)
lenT := len(t)
return jaroFormula(lenMatched, numT, lenS, lenT)
}
func JaroWinkler(s []rune, t []rune, options ...JaroSimilarityOptions) float64 {
opt := DefaultJaroSimilarityOptions
for _, option := range options {
opt = option
}
lenMatched, numT, prefixLen := jaroHelper(s, t, opt)
if lenMatched == 0 {
return 0.0
}
lenS := len(s)
lenT := len(t)
jaroDis := jaroFormula(lenMatched, numT, lenS, lenT)
if jaroDis < opt.Threshold {
return jaroDis
}
p := opt.PValue
if p*float64(prefixLen) > 1.0 {
p = 1.0 / float64(prefixLen)
}
return jaroDis + (1.0-jaroDis)*p*float64(prefixLen)
}
func jaroHelper(s []rune, t []rune, option JaroSimilarityOptions) (int, int, int) {
lenS := len(s)
lenT := len(t)
checkS := make([]rune, 0, len(s))
checkT := make([]rune, 0, len(t))
matchedT := make([]bool, len(t))
maxDis := Max(lenS, lenT) / 2
for is, cs := range s {
for it := Max(0, is-maxDis); it <= Min(lenT-1, is+maxDis); it++ {
if !matchedT[it] && SameRune(cs, t[it], option.CaseInsensitive) {
matchedT[it] = true
checkS = append(checkS, cs)
break
}
}
}
for it, ct := range t {
if matchedT[it] {
checkT = append(checkT, ct)
}
}
minLen := Min(lenS, lenT)
prefixLen := 0
for i := 0; i < minLen; i++ {
if !SameRune(s[i], t[i], option.CaseInsensitive) {
prefixLen = i
break
}
}
numTranspositions := 0
for i, cs := range checkS {
if !SameRune(cs, checkT[i], option.CaseInsensitive) {
numTranspositions++
}
}
if len(checkS) == 0 {
return 0, 0, 0
}
return len(checkS), numTranspositions / 2, prefixLen
}
func jaroFormula(m, t, s1, s2 int) float64 {
return (float64(m)/float64(s1) + float64(m)/float64(s2) + float64(m-t)/float64(m)) / 3.0
}