Skip to content

Commit

Permalink
Merge branch 'origin/master' into '7.6-couchbase' (#1986)
Browse files Browse the repository at this point in the history
```
* 5c7445c Abhinav Dangeti | Fix merge conflict
*   a0cb65a Abhinav Dangeti | Merge remote-tracking branch 'origin/master' into 7.6-couchbase
|\
| * 5f1f45a Sergio Vera | Fixed spanish accents normalization (#1957)
| * e26eace Mohd Shaad Khan | MB-60207 fix facets merge (#1946)
| * c8e3daf Likith B | #1873: Added timeout option in the Search Handler (#1898)
| * 6dee5e9 Aditi Ahuja | Added missing nil check (#1905)
| * 907c83e Rahul Rampure | Added a document that demonstrates the performance benefits of docvalues (#1897)
* | 8b9206a Abhi Dangeti | MB-60739: Upgrade go-faiss & zapx/v16 (#1985)
```
  • Loading branch information
abhinavdangeti authored Feb 13, 2024
1 parent 8b9206a commit f8af28e
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 15 deletions.
5 changes: 5 additions & 0 deletions analysis/lang/es/analyzer_es.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ func AnalyzerConstructor(config map[string]interface{},
if err != nil {
return nil, err
}
normalizeEsFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
stopEsFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
Expand All @@ -47,6 +51,7 @@ func AnalyzerConstructor(config map[string]interface{},
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopEsFilter,
normalizeEsFilter,
lightStemmerEsFilter,
},
}
Expand Down
15 changes: 0 additions & 15 deletions analysis/lang/es/light_stemmer_es.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,21 +46,6 @@ func stem(input []rune) []rune {
return input
}

for i, r := range input {
switch r {
case 'à', 'á', 'â', 'ä':
input[i] = 'a'
case 'ò', 'ó', 'ô', 'ö':
input[i] = 'o'
case 'è', 'é', 'ê', 'ë':
input[i] = 'e'
case 'ù', 'ú', 'û', 'ü':
input[i] = 'u'
case 'ì', 'í', 'î', 'ï':
input[i] = 'i'
}
}

switch input[l-1] {
case 'o', 'a', 'e':
return input[:l-1]
Expand Down
67 changes: 67 additions & 0 deletions analysis/lang/es/spanish_normalize.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package es

import (
"bytes"

"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

// NormalizeName is the name under which this package registers the Spanish
// normalization token filter with the bleve registry.
const NormalizeName = "normalize_es"

// SpanishNormalizeFilter is a token filter that replaces accented lowercase
// vowels in each token's term with their unaccented form. It carries no
// configuration or state.
type SpanishNormalizeFilter struct{}

// NewSpanishNormalizeFilter returns a ready-to-use SpanishNormalizeFilter.
func NewSpanishNormalizeFilter() *SpanishNormalizeFilter {
	return new(SpanishNormalizeFilter)
}

// Filter replaces the Term of every token in input with its normalized form
// and returns the same stream. Tokens are updated in place; no new stream is
// allocated.
func (s *SpanishNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for i := range input {
		input[i].Term = normalize(input[i].Term)
	}
	return input
}

// normalize returns a copy of input in which every lowercase vowel carrying a
// grave, acute, circumflex or diaeresis accent is replaced by the plain vowel.
// All other characters, including uppercase accented vowels and 'ñ', are
// copied through unchanged. The input slice is never modified and the result
// is always a fresh, non-nil slice.
//
// The replacement is done in a single pass with bytes.Map, which avoids
// building an intermediate rune slice and a second oversized output buffer
// for every token. Bytes that do not form valid UTF-8 come out as the Unicode
// replacement character (U+FFFD).
func normalize(input []byte) []byte {
	return bytes.Map(func(r rune) rune {
		switch r {
		case 'à', 'á', 'â', 'ä':
			return 'a'
		case 'ò', 'ó', 'ô', 'ö':
			return 'o'
		case 'è', 'é', 'ê', 'ë':
			return 'e'
		case 'ù', 'ú', 'û', 'ü':
			return 'u'
		case 'ì', 'í', 'î', 'ï':
			return 'i'
		}
		// Anything else is kept exactly as it was.
		return r
	}, input)
}

// NormalizerFilterConstructor is the registry constructor for the Spanish
// normalization token filter. The filter takes no options, so config and
// cache are not consulted and the returned error is always nil.
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	filter := NewSpanishNormalizeFilter()
	return filter, nil
}

// init registers the Spanish normalization filter constructor under
// NormalizeName so that it can be looked up by name through the registry.
func init() {
registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
}
112 changes: 112 additions & 0 deletions analysis/lang/es/spanish_normalize_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package es

import (
"reflect"
"testing"

"github.com/blevesearch/bleve/v2/analysis"
)

// TestSpanishNormalizeFilter runs single-token streams through the Spanish
// normalization filter and checks that accented lowercase vowels are replaced
// by their plain form while everything else is left alone.
func TestSpanishNormalizeFilter(t *testing.T) {
	cases := []struct {
		in   string
		want string
	}{
		{in: "Guía", want: "Guia"},
		{in: "Belcebú", want: "Belcebu"},
		{in: "Limón", want: "Limon"},
		{in: "agüero", want: "aguero"},
		{in: "laúd", want: "laud"},
		// empty
		{in: "", want: ""},
	}

	filter := NewSpanishNormalizeFilter()
	for _, c := range cases {
		expected := analysis.TokenStream{
			&analysis.Token{Term: []byte(c.want)},
		}
		actual := filter.Filter(analysis.TokenStream{
			&analysis.Token{Term: []byte(c.in)},
		})
		if reflect.DeepEqual(actual, expected) {
			continue
		}
		t.Errorf("expected %#v, got %#v", expected, actual)
		t.Errorf("expected %s(% x), got %s(% x)", expected[0].Term, expected[0].Term, actual[0].Term, actual[0].Term)
	}
}

0 comments on commit f8af28e

Please sign in to comment.