From df6d072b97865f5ce4a213aa7ceb5e8e390a0879 Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Thu, 27 Oct 2022 15:48:11 +0300 Subject: [PATCH 01/11] Add multibyte character support for strutil.CommonPrefix --- internal/stringutil/stringutil.go | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/internal/stringutil/stringutil.go b/internal/stringutil/stringutil.go index 522ca77..2cdcaf0 100644 --- a/internal/stringutil/stringutil.go +++ b/internal/stringutil/stringutil.go @@ -1,17 +1,15 @@ package stringutil -import "unicode/utf8" - // CommonPrefix returns the common prefix of the specified strings. An empty // string is returned if the parameters have no prefix in common. func CommonPrefix(first, second string) string { - if utf8.RuneCountInString(first) > utf8.RuneCountInString(second) { - first, second = second, first + fRunes, sRunes := []rune(first), []rune(second) + if len(fRunes) > len(sRunes) { + fRunes, sRunes = sRunes, fRunes } var commonLen int - sRunes := []rune(second) - for i, r := range first { + for i, r := range fRunes { if r != sRunes[i] { break } From 40005b54e6afb3262d60d17f7d48f40724d42dfc Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Thu, 27 Oct 2022 15:50:45 +0300 Subject: [PATCH 02/11] Improve strutil.CommonPrefix test cases --- internal/stringutil/stringutil_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/stringutil/stringutil_test.go b/internal/stringutil/stringutil_test.go index fd917dd..5022181 100644 --- a/internal/stringutil/stringutil_test.go +++ b/internal/stringutil/stringutil_test.go @@ -17,6 +17,11 @@ func TestCommonPrefix(t *testing.T) { {"a", stringutil.CommonPrefix("aab", "ab")}, {"aa", stringutil.CommonPrefix("aab", "aaab")}, {"aa", stringutil.CommonPrefix("aaab", "aab")}, + {"忧郁的乌龟", stringutil.CommonPrefix("忧郁的乌龟", "忧郁的乌龟")}, + {"忧郁的", stringutil.CommonPrefix("忧郁的", "忧郁的乌龟")}, + {"忧郁的", stringutil.CommonPrefix("忧郁的乌龟", "忧郁的")}, + {"", stringutil.CommonPrefix("忧郁的乌龟", "郁的乌龟")}, + {"", stringutil.CommonPrefix("郁的乌龟", "忧郁的乌龟")}, }) } From 92cf5325e59f269fe142715f87fccaa65c04a760 Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Thu, 27 Oct 2022 16:13:51 +0300 Subject: [PATCH 03/11] Add multibyte character support for metrics.Jaro --- metrics/jaro.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/metrics/jaro.go b/metrics/jaro.go index 0dbb544..7b394b2 100644 --- a/metrics/jaro.go +++ b/metrics/jaro.go @@ -62,22 +62,25 @@ func (m *Jaro) Compare(a, b string) float64 { } func matchingRunes(a, b string, limit int) []rune { - common := []rune{} - runesB := []rune(b) - lenB := len(runesB) - - for i, r := range a { + var ( + runesA = []rune(a) + runesB = []rune(b) + runesCommon = []rune{} + lenB = len(runesB) + ) + + for i, r := range runesA { end := mathutil.Min(i+limit+1, lenB) for j := mathutil.Max(0, i-limit); j < end; j++ { if r == runesB[j] && runesB[j] != -1 { - common = append(common, runesB[j]) + runesCommon = append(runesCommon, runesB[j]) runesB[j] = -1 break } } } - return common + return runesCommon } func transpositions(a, b []rune) int { From 634ece20b06473f5b9112403b2001dbd0d6b4631 Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Thu, 27 Oct 2022 16:15:38 +0300 Subject: [PATCH 04/11] Add multibyte character support for metrics.Levenshtein --- metrics/levenshtein.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metrics/levenshtein.go b/metrics/levenshtein.go index 006aadc..2f4f191 100644 --- a/metrics/levenshtein.go +++ b/metrics/levenshtein.go @@ -76,6 +76,7 @@ func (m *Levenshtein) distance(a, b string) (int, int) { a = strings.ToLower(a) b = strings.ToLower(b) } + runesA, runesB := []rune(a), []rune(b) // Initialize cost slice. prevCol := make([]int, lenB+1) @@ -92,7 +93,7 @@ func (m *Levenshtein) distance(a, b string) (int, int) { insCost := col[j] + m.InsertCost subCost := prevCol[j] - if a[i] != b[j] { + if runesA[i] != runesB[j] { subCost += m.ReplaceCost } From b2c7f89fca1d8571e841d51ec6f58d7aa6907f62 Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Wed, 22 Mar 2023 11:30:17 +0200 Subject: [PATCH 05/11] Improve internal stringutil.CommonPrefix test case --- internal/stringutil/stringutil_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internal/stringutil/stringutil_test.go b/internal/stringutil/stringutil_test.go index 5022181..b04dcd2 100644 --- a/internal/stringutil/stringutil_test.go +++ b/internal/stringutil/stringutil_test.go @@ -22,6 +22,9 @@ func TestCommonPrefix(t *testing.T) { {"忧郁的", stringutil.CommonPrefix("忧郁的乌龟", "忧郁的")}, {"", stringutil.CommonPrefix("忧郁的乌龟", "郁的乌龟")}, {"", stringutil.CommonPrefix("郁的乌龟", "忧郁的乌龟")}, + {"\u2019", stringutil.CommonPrefix("\u2019a", "\u2019b")}, + {"a\u2019bc", stringutil.CommonPrefix("a\u2019bcd", "a\u2019bce")}, + {"abc", stringutil.CommonPrefix("abc\u2019d", "abc\u2020d")}, }) } From 29820035bad34d937e9cf5c643e521332a5c74ea Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Wed, 22 Mar 2023 11:50:16 +0200 Subject: [PATCH 06/11] Improve test cases for metrics --- metrics/metrics_test.go | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/metrics/metrics_test.go b/metrics/metrics_test.go index 3f18910..f2bf94e 100644 --- a/metrics/metrics_test.go +++ b/metrics/metrics_test.go @@ -17,6 +17,9 @@ func TestHamming(t *testing.T) { require.Equal(t, 0, h.Distance("", "")) require.Equal(t, "0.75", sf(h.Compare("text", "test"))) require.Equal(t, "0.50", sf(h.Compare("once", "one"))) + require.Equal(t, "1.00", sf(h.Compare("ab\u2019c", "ab\u2019c"))) + require.Equal(t, "0.75", sf(h.Compare("ab\u2019d", "ab\u2019c"))) + require.Equal(t, "0.75", sf(h.Compare("ab\u2018c", "ab\u2019c"))) h.CaseSensitive = false require.Equal(t, "0.50", sf(h.Compare("one", "ONCE"))) } @@ -25,6 +28,9 @@ func TestJaccard(t *testing.T) { j := metrics.NewJaccard() require.Equal(t, "1.00", sf(j.Compare("", ""))) require.Equal(t, "0.00", sf(j.Compare("a", "b"))) + require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c"))) + require.Equal(t, "0.50", sf(j.Compare("ab\u2019d", "ab\u2019c"))) + require.Equal(t, "0.20", sf(j.Compare("ab\u2018c", "ab\u2019c"))) require.Equal(t, "0.43", sf(j.Compare("night", "alright"))) j.NgramSize = 0 require.Equal(t, "0.43", sf(j.Compare("night", "alright"))) @@ -37,6 +43,9 @@ func TestJaro(t *testing.T) { j := metrics.NewJaro() require.Equal(t, "1.00", sf(j.Compare("", ""))) require.Equal(t, "0.00", sf(j.Compare("test", ""))) + require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c"))) + require.Equal(t, "0.83", sf(j.Compare("ab\u2019d", "ab\u2019c"))) + require.Equal(t, "0.83", sf(j.Compare("ab\u2018c", "ab\u2019c"))) require.Equal(t, "0.00", sf(j.Compare("a", "b"))) require.Equal(t, "0.78", sf(j.Compare("sort", "shirt"))) require.Equal(t, "0.64", sf(j.Compare("sort", "report"))) @@ -48,6 +57,9 @@ func TestJaroWinkler(t *testing.T) { j := metrics.NewJaroWinkler() require.Equal(t, "1.00", sf(j.Compare("", ""))) require.Equal(t, "0.00", sf(j.Compare("test", ""))) + require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c"))) + require.Equal(t, "0.88", sf(j.Compare("ab\u2019d", "ab\u2019c"))) + require.Equal(t, "0.87", sf(j.Compare("ab\u2018c", "ab\u2019c"))) require.Equal(t, "0.80", sf(j.Compare("sort", "shirt"))) require.Equal(t, "0.94", sf(j.Compare("charm", "charmed"))) j.CaseSensitive = false @@ -59,11 +71,19 @@ func TestLevenshtein(t *testing.T) { require.Equal(t, 0, l.Distance("", "")) require.Equal(t, 4, l.Distance("test", "")) require.Equal(t, 4, l.Distance("", "test")) + require.Equal(t, 0, l.Distance("ab\u2019c", "ab\u2019c")) + require.Equal(t, 1, l.Distance("ab\u2019d", "ab\u2019c")) + require.Equal(t, 1, l.Distance("ab\u2018c", "ab\u2019c")) require.Equal(t, "0.40", sf(l.Compare("book", "brick"))) + require.Equal(t, "0.75", sf(l.Compare("ab\u2019d", "ab\u2019c"))) + require.Equal(t, "0.75", sf(l.Compare("ab\u2018c", "ab\u2019c"))) l.CaseSensitive = false require.Equal(t, "0.80", sf(l.Compare("hello", "jello"))) l.ReplaceCost = 2 require.Equal(t, "0.60", sf(l.Compare("hello", "JELLO"))) + require.Equal(t, "1.00", sf(l.Compare("ab\u2019c", "ab\u2019c"))) + require.Equal(t, "0.50", sf(l.Compare("ab\u2019d", "ab\u2019c"))) + require.Equal(t, "0.50", sf(l.Compare("ab\u2018c", "ab\u2019c"))) } func TestOperlapCoefficient(t *testing.T) { @@ -72,6 +92,9 @@ func TestOperlapCoefficient(t *testing.T) { require.Equal(t, "0.75", sf(o.Compare("night", "alright"))) require.Equal(t, "0.00", sf(o.Compare("aa", ""))) require.Equal(t, "0.00", sf(o.Compare("bb", ""))) + require.Equal(t, "1.00", sf(o.Compare("ab\u2019c", "ab\u2019c"))) + require.Equal(t, "0.67", sf(o.Compare("ab\u2019d", "ab\u2019c"))) + require.Equal(t, "0.33", sf(o.Compare("ab\u2018c", "ab\u2019c"))) o.NgramSize = 0 require.Equal(t, "0.75", sf(o.Compare("night", "alright"))) require.Equal(t, "1.00", sf(o.Compare("aa", "aaaa"))) @@ -87,6 +110,9 @@ func TestSmithWatermanGotoh(t *testing.T) { require.Equal(t, "0.00", sf(s.Compare("test", ""))) require.Equal(t, "0.00", sf(s.Compare("", "test"))) require.Equal(t, "0.88", sf(s.Compare("a pink kitten", "a kitten"))) + require.Equal(t, "1.00", sf(s.Compare("ab\u2019c", "ab\u2019c"))) + require.Equal(t, "0.75", sf(s.Compare("ab\u2019d", "ab\u2019c"))) + require.Equal(t, "0.50", sf(s.Compare("ab\u2018c", "ab\u2019c"))) s.Substitution = nil require.Equal(t, "0.88", sf(s.Compare("a pink kitten", "a kitten"))) s.CaseSensitive = false @@ -103,6 +129,9 @@ func TestSorensenDice(t *testing.T) { require.Equal(t, "1.00", sf(s.Compare("", ""))) require.Equal(t, "0.00", sf(s.Compare("a", "b"))) require.Equal(t, "0.60", sf(s.Compare("night", "alright"))) + require.Equal(t, "1.00", sf(s.Compare("ab\u2019c", "ab\u2019c"))) + require.Equal(t, "0.67", sf(s.Compare("ab\u2019d", "ab\u2019c"))) + require.Equal(t, "0.33", sf(s.Compare("ab\u2018c", "ab\u2019c"))) s.NgramSize = 0 require.Equal(t, "0.60", sf(s.Compare("night", "alright"))) s.CaseSensitive = false From a03b045ef461b0128b356896005843daf543e23a Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Fri, 19 May 2023 17:25:46 +0300 Subject: [PATCH 07/11] Bump github.com/stretchr/testify from 1.8.2 to 1.8.3 --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 2daa717..d0660c4 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,4 @@ module github.com/adrg/strutil go 1.14 -require github.com/stretchr/testify v1.8.2 +require github.com/stretchr/testify v1.8.3 diff --git a/go.sum b/go.sum index 6a56e69..c3467ce 100644 --- a/go.sum +++ b/go.sum @@ -8,8 +8,8 @@ github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSS github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8= -github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY= +github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= From 10a6ebf72a9a6e3166e3620e62d162ea80e0dc6e Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Fri, 19 May 2023 17:25:57 +0300 Subject: [PATCH 08/11] Bump CI Go version to 1.17 --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8233d01..f98fc43 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - go: ['1.16'] + go: ['1.17'] steps: - name: Setup uses: actions/setup-go@v3 From 5b907ea79bbe3963ff709512fe8b59eb6427f0ae Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Wed, 31 May 2023 15:51:50 +0300 Subject: [PATCH 09/11] Bump github.com/stretchr/testify from 1.8.2 to 1.8.4 --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index d0660c4..485a5f0 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,4 @@ module github.com/adrg/strutil go 1.14 -require github.com/stretchr/testify v1.8.3 +require github.com/stretchr/testify v1.8.4 diff --git a/go.sum b/go.sum index c3467ce..479781e 100644 --- a/go.sum +++ b/go.sum @@ -8,8 +8,8 @@ github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSS github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY= -github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= From c924f4e24d05e1822892a88f58c1560028b6ea2b Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Thu, 7 Sep 2023 11:19:07 +0300 Subject: [PATCH 10/11] Bump actions/checkout from 3 to 4 --- .github/workflows/ci.yml | 2 +- .github/workflows/codeql-analysis.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f98fc43..f2452bb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,7 +18,7 @@ jobs: go-version: ${{ matrix.go }} - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies run: | diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index b1fed34..fd97148 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -24,7 +24,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Initialize CodeQL uses: github/codeql-action/init@v2 From f86c21ba9d3b52e6097c62f3bf3b619eebfa64c9 Mon Sep 17 00:00:00 2001 From: Adrian-George Bostan Date: Wed, 27 Sep 2023 18:46:50 +0300 Subject: [PATCH 11/11] Improve Levenshtein metric performance slightly --- metrics/levenshtein.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/metrics/levenshtein.go b/metrics/levenshtein.go index 2f4f191..e0e9e9c 100644 --- a/metrics/levenshtein.go +++ b/metrics/levenshtein.go @@ -2,7 +2,6 @@ package metrics import ( "strings" - "unicode/utf8" "github.com/adrg/strutil/internal/mathutil" ) @@ -56,8 +55,15 @@ func (m *Levenshtein) Distance(a, b string) int { } func (m *Levenshtein) distance(a, b string) (int, int) { + // Lower terms if case insensitive comparison is specified. + if !m.CaseSensitive { + a = strings.ToLower(a) + b = strings.ToLower(b) + } + runesA, runesB := []rune(a), []rune(b) + // Check if both terms are empty. - lenA, lenB := utf8.RuneCountInString(a), utf8.RuneCountInString(b) + lenA, lenB := len(runesA), len(runesB) if lenA == 0 && lenB == 0 { return 0, 0 } @@ -71,13 +77,6 @@ func (m *Levenshtein) distance(a, b string) (int, int) { return m.DeleteCost * lenA, maxLen } - // Lower terms if case insensitive comparison is specified. - if !m.CaseSensitive { - a = strings.ToLower(a) - b = strings.ToLower(b) - } - runesA, runesB := []rune(a), []rune(b) - // Initialize cost slice. prevCol := make([]int, lenB+1) for i := 0; i <= lenB; i++ {