Skip to content

Commit

Permalink
Merge pull request #27 from adrg/metrics-improvements
Browse files Browse the repository at this point in the history
Fix support for multibyte characters
  • Loading branch information
adrg authored Sep 27, 2023
2 parents 3a48a17 + f86c21b commit f9dd097
Show file tree
Hide file tree
Showing 9 changed files with 66 additions and 28 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
go: ['1.16']
go: ['1.17']
steps:
- name: Setup
uses: actions/setup-go@v4
with:
go-version: ${{ matrix.go }}

- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Dependencies
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Initialize CodeQL
uses: github/codeql-action/init@v2
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ module github.com/adrg/strutil

go 1.14

require github.com/stretchr/testify v1.8.2
require github.com/stretchr/testify v1.8.4
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSS
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
Expand Down
10 changes: 4 additions & 6 deletions internal/stringutil/stringutil.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
package stringutil

import "unicode/utf8"

// CommonPrefix returns the common prefix of the specified strings. An empty
// string is returned if the parameters have no prefix in common.
func CommonPrefix(first, second string) string {
if utf8.RuneCountInString(first) > utf8.RuneCountInString(second) {
first, second = second, first
fRunes, sRunes := []rune(first), []rune(second)
if len(fRunes) > len(sRunes) {
fRunes, sRunes = sRunes, fRunes
}

var commonLen int
sRunes := []rune(second)
for i, r := range first {
for i, r := range fRunes {
if r != sRunes[i] {
break
}
Expand Down
8 changes: 8 additions & 0 deletions internal/stringutil/stringutil_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@ func TestCommonPrefix(t *testing.T) {
{"a", stringutil.CommonPrefix("aab", "ab")},
{"aa", stringutil.CommonPrefix("aab", "aaab")},
{"aa", stringutil.CommonPrefix("aaab", "aab")},
{"忧郁的乌龟", stringutil.CommonPrefix("忧郁的乌龟", "忧郁的乌龟")},
{"忧郁的", stringutil.CommonPrefix("忧郁的", "忧郁的乌龟")},
{"忧郁的", stringutil.CommonPrefix("忧郁的乌龟", "忧郁的")},
{"", stringutil.CommonPrefix("忧郁的乌龟", "郁的乌龟")},
{"", stringutil.CommonPrefix("郁的乌龟", "忧郁的乌龟")},
{"\u2019", stringutil.CommonPrefix("\u2019a", "\u2019b")},
{"a\u2019bc", stringutil.CommonPrefix("a\u2019bcd", "a\u2019bce")},
{"abc", stringutil.CommonPrefix("abc\u2019d", "abc\u2020d")},
})
}

Expand Down
17 changes: 10 additions & 7 deletions metrics/jaro.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,22 +62,25 @@ func (m *Jaro) Compare(a, b string) float64 {
}

// matchingRunes returns the runes of a which also occur in b no further than
// limit positions away from their position in a. The result preserves the
// order in which the runes appear in a.
//
// Positions are counted in runes, not bytes: both terms are converted to rune
// slices up front, so the index of a rune in a lines up with the indices used
// for b even when the terms contain multibyte characters. Ranging over the
// string itself would yield byte offsets and skew the search window.
//
// Each rune of b can be matched at most once. The returned slice is never
// nil; it is empty when the terms have no matching runes.
func matchingRunes(a, b string, limit int) []rune {
	var (
		runesA      = []rune(a)
		runesB      = []rune(b)
		runesCommon = []rune{}
		lenB        = len(runesB)
	)

	for i, r := range runesA {
		// Clamp the search window [i-limit, i+limit] to the bounds of b.
		start := i - limit
		if start < 0 {
			start = 0
		}
		end := i + limit + 1
		if end > lenB {
			end = lenB
		}

		for j := start; j < end; j++ {
			if r == runesB[j] && runesB[j] != -1 {
				runesCommon = append(runesCommon, runesB[j])
				// Mark the rune as consumed so that it cannot be matched
				// again by a later rune of a.
				runesB[j] = -1
				break
			}
		}
	}

	return runesCommon
}

func transpositions(a, b []rune) int {
Expand Down
18 changes: 9 additions & 9 deletions metrics/levenshtein.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package metrics

import (
"strings"
"unicode/utf8"

"github.com/adrg/strutil/internal/mathutil"
)
Expand Down Expand Up @@ -56,8 +55,15 @@ func (m *Levenshtein) Distance(a, b string) int {
}

func (m *Levenshtein) distance(a, b string) (int, int) {
// Lower terms if case insensitive comparison is specified.
if !m.CaseSensitive {
a = strings.ToLower(a)
b = strings.ToLower(b)
}
runesA, runesB := []rune(a), []rune(b)

// Check if both terms are empty.
lenA, lenB := utf8.RuneCountInString(a), utf8.RuneCountInString(b)
lenA, lenB := len(runesA), len(runesB)
if lenA == 0 && lenB == 0 {
return 0, 0
}
Expand All @@ -71,12 +77,6 @@ func (m *Levenshtein) distance(a, b string) (int, int) {
return m.DeleteCost * lenA, maxLen
}

// Lower terms if case insensitive comparison is specified.
if !m.CaseSensitive {
a = strings.ToLower(a)
b = strings.ToLower(b)
}

// Initialize cost slice.
prevCol := make([]int, lenB+1)
for i := 0; i <= lenB; i++ {
Expand All @@ -92,7 +92,7 @@ func (m *Levenshtein) distance(a, b string) (int, int) {
insCost := col[j] + m.InsertCost

subCost := prevCol[j]
if a[i] != b[j] {
if runesA[i] != runesB[j] {
subCost += m.ReplaceCost
}

Expand Down
29 changes: 29 additions & 0 deletions metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ func TestHamming(t *testing.T) {
require.Equal(t, 0, h.Distance("", ""))
require.Equal(t, "0.75", sf(h.Compare("text", "test")))
require.Equal(t, "0.50", sf(h.Compare("once", "one")))
require.Equal(t, "1.00", sf(h.Compare("ab\u2019c", "ab\u2019c")))
require.Equal(t, "0.75", sf(h.Compare("ab\u2019d", "ab\u2019c")))
require.Equal(t, "0.75", sf(h.Compare("ab\u2018c", "ab\u2019c")))
h.CaseSensitive = false
require.Equal(t, "0.50", sf(h.Compare("one", "ONCE")))
}
Expand All @@ -25,6 +28,9 @@ func TestJaccard(t *testing.T) {
j := metrics.NewJaccard()
require.Equal(t, "1.00", sf(j.Compare("", "")))
require.Equal(t, "0.00", sf(j.Compare("a", "b")))
require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c")))
require.Equal(t, "0.50", sf(j.Compare("ab\u2019d", "ab\u2019c")))
require.Equal(t, "0.20", sf(j.Compare("ab\u2018c", "ab\u2019c")))
require.Equal(t, "0.43", sf(j.Compare("night", "alright")))
j.NgramSize = 0
require.Equal(t, "0.43", sf(j.Compare("night", "alright")))
Expand All @@ -37,6 +43,9 @@ func TestJaro(t *testing.T) {
j := metrics.NewJaro()
require.Equal(t, "1.00", sf(j.Compare("", "")))
require.Equal(t, "0.00", sf(j.Compare("test", "")))
require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c")))
require.Equal(t, "0.83", sf(j.Compare("ab\u2019d", "ab\u2019c")))
require.Equal(t, "0.83", sf(j.Compare("ab\u2018c", "ab\u2019c")))
require.Equal(t, "0.00", sf(j.Compare("a", "b")))
require.Equal(t, "0.78", sf(j.Compare("sort", "shirt")))
require.Equal(t, "0.64", sf(j.Compare("sort", "report")))
Expand All @@ -48,6 +57,9 @@ func TestJaroWinkler(t *testing.T) {
j := metrics.NewJaroWinkler()
require.Equal(t, "1.00", sf(j.Compare("", "")))
require.Equal(t, "0.00", sf(j.Compare("test", "")))
require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c")))
require.Equal(t, "0.88", sf(j.Compare("ab\u2019d", "ab\u2019c")))
require.Equal(t, "0.87", sf(j.Compare("ab\u2018c", "ab\u2019c")))
require.Equal(t, "0.80", sf(j.Compare("sort", "shirt")))
require.Equal(t, "0.94", sf(j.Compare("charm", "charmed")))
j.CaseSensitive = false
Expand All @@ -59,11 +71,19 @@ func TestLevenshtein(t *testing.T) {
require.Equal(t, 0, l.Distance("", ""))
require.Equal(t, 4, l.Distance("test", ""))
require.Equal(t, 4, l.Distance("", "test"))
require.Equal(t, 0, l.Distance("ab\u2019c", "ab\u2019c"))
require.Equal(t, 1, l.Distance("ab\u2019d", "ab\u2019c"))
require.Equal(t, 1, l.Distance("ab\u2018c", "ab\u2019c"))
require.Equal(t, "0.40", sf(l.Compare("book", "brick")))
require.Equal(t, "0.75", sf(l.Compare("ab\u2019d", "ab\u2019c")))
require.Equal(t, "0.75", sf(l.Compare("ab\u2018c", "ab\u2019c")))
l.CaseSensitive = false
require.Equal(t, "0.80", sf(l.Compare("hello", "jello")))
l.ReplaceCost = 2
require.Equal(t, "0.60", sf(l.Compare("hello", "JELLO")))
require.Equal(t, "1.00", sf(l.Compare("ab\u2019c", "ab\u2019c")))
require.Equal(t, "0.50", sf(l.Compare("ab\u2019d", "ab\u2019c")))
require.Equal(t, "0.50", sf(l.Compare("ab\u2018c", "ab\u2019c")))
}

func TestOperlapCoefficient(t *testing.T) {
Expand All @@ -72,6 +92,9 @@ func TestOperlapCoefficient(t *testing.T) {
require.Equal(t, "0.75", sf(o.Compare("night", "alright")))
require.Equal(t, "0.00", sf(o.Compare("aa", "")))
require.Equal(t, "0.00", sf(o.Compare("bb", "")))
require.Equal(t, "1.00", sf(o.Compare("ab\u2019c", "ab\u2019c")))
require.Equal(t, "0.67", sf(o.Compare("ab\u2019d", "ab\u2019c")))
require.Equal(t, "0.33", sf(o.Compare("ab\u2018c", "ab\u2019c")))
o.NgramSize = 0
require.Equal(t, "0.75", sf(o.Compare("night", "alright")))
require.Equal(t, "1.00", sf(o.Compare("aa", "aaaa")))
Expand All @@ -87,6 +110,9 @@ func TestSmithWatermanGotoh(t *testing.T) {
require.Equal(t, "0.00", sf(s.Compare("test", "")))
require.Equal(t, "0.00", sf(s.Compare("", "test")))
require.Equal(t, "0.88", sf(s.Compare("a pink kitten", "a kitten")))
require.Equal(t, "1.00", sf(s.Compare("ab\u2019c", "ab\u2019c")))
require.Equal(t, "0.75", sf(s.Compare("ab\u2019d", "ab\u2019c")))
require.Equal(t, "0.50", sf(s.Compare("ab\u2018c", "ab\u2019c")))
s.Substitution = nil
require.Equal(t, "0.88", sf(s.Compare("a pink kitten", "a kitten")))
s.CaseSensitive = false
Expand All @@ -103,6 +129,9 @@ func TestSorensenDice(t *testing.T) {
require.Equal(t, "1.00", sf(s.Compare("", "")))
require.Equal(t, "0.00", sf(s.Compare("a", "b")))
require.Equal(t, "0.60", sf(s.Compare("night", "alright")))
require.Equal(t, "1.00", sf(s.Compare("ab\u2019c", "ab\u2019c")))
require.Equal(t, "0.67", sf(s.Compare("ab\u2019d", "ab\u2019c")))
require.Equal(t, "0.33", sf(s.Compare("ab\u2018c", "ab\u2019c")))
s.NgramSize = 0
require.Equal(t, "0.60", sf(s.Compare("night", "alright")))
s.CaseSensitive = false
Expand Down

0 comments on commit f9dd097

Please sign in to comment.