Merge pull request #62 from izihawa/development

Development
izihawa · Jul 7, 2022 · 106856f · 106856f
2 parents 07a568c + ab57bd4
commit 106856f
Show file tree

Hide file tree

Showing 16 changed files with 278 additions and 68 deletions.
diff --git a/BUILD.bazel b/BUILD.bazel
@@ -31,7 +31,7 @@ rust_library(
     ),
     edition = "2021",
     deps = all_crate_deps() + [":proto_grpc_rust"],
-    version = "0.8.5",
+    version = "0.8.7",
 )
 
 rust_binary(
@@ -62,4 +62,4 @@ container_push(
    registry = "index.docker.io",
    repository = "izihawa/summa-server",
    tag = "latest",
-)
+)
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 edition = "2021"
 name = "summa"
-version = "0.8.5"
+version = "0.8.7"
 license-file = "LICENSE"
 description = "Fast full-text search server"
 homepage = "https://github.com/izihawa/summa"

diff --git a/WORKSPACE b/WORKSPACE
@@ -36,9 +36,9 @@ http_archive(
 )
 http_archive(
     name = "rules_python",
-    sha256 = "5fa3c738d33acca3b97622a13a741129f67ef43f5fdfcec63b29374cc0574c29",
-    strip_prefix = "rules_python-0.9.0",
-    urls = ["https://github.com/bazelbuild/rules_python/archive/0.9.0.tar.gz"],
+    sha256 = "95525d542c925bc2f4a7ac9b68449fc96ca52cfba15aa883f7193cdf745c38ff",
+    strip_prefix = "rules_python-cccbfb920c8b100744c53c0c03900f1be4040fe8",
+    url = "https://github.com/ppodolsky/rules_python/archive/cccbfb920c8b100744c53c0c03900f1be4040fe8.tar.gz",
 )
 
 # GRPC
@@ -53,7 +53,9 @@ grpc_extra_deps()
 # Rust
 load("@rules_rust//rust:repositories.bzl", "rust_register_toolchains", "rules_rust_dependencies")
 rules_rust_dependencies()
-rust_register_toolchains()
+rust_register_toolchains(
+    version="1.62.0",
+)
 load("@rules_rust//crate_universe:repositories.bzl", "crate_universe_dependencies")
 crate_universe_dependencies(bootstrap = True)
 load("@rules_rust//crate_universe:defs.bzl", "crate", "crates_repository", "render_config")

diff --git a/aiosumma/BUILD.bazel b/aiosumma/BUILD.bazel
@@ -67,7 +67,7 @@ py_wheel(
         "summa/proto/summa_py_pb",
         "aiosumma",
     ],
-    version = "2.3.15",
+    version = "2.3.23",
     deps = [
         ":aiosumma",
         ":data",

diff --git a/aiosumma/aiosumma/parser/elements.py b/aiosumma/aiosumma/parser/elements.py
@@ -75,7 +75,7 @@ def to_summa_query(self):
         elif isinstance(self.expr, Regex):
             return {'regex': {'field': self.name, 'value': self.expr.value}}
         elif isinstance(self.expr, Proximity):
-            return {'phrase': {'field': self.name, 'value': self.expr.term, 'slop': self.expr.slop}}
+            return {'phrase': {'field': self.name, 'value': self.expr.term.value, 'slop': self.expr.slop}}
         else:
             raise UnsupportedQueryError(error=f'{self.expr} in search field `{self.name}`')
 
@@ -253,10 +253,10 @@ def __str__(self):
 class BaseApprox(Item):
     """Base for approximations, that is fuzziness and proximity
     """
-    _equality_attrs = ['term', 'degree']
+    _equality_attrs = ['term', 'slop']
 
     def __repr__(self):  # pragma: no cover
-        return "%s(%s, %s)" % (self.__class__.__name__, self.term.__repr__(), self.degree)
+        return "%s(%s, %s)" % (self.__class__.__name__, self.term.__repr__(), self.slop)
 
     @property
     def children(self):
@@ -266,17 +266,17 @@ def children(self):
 class Fuzzy(BaseApprox):
     """Fuzzy search on word
     :param Word term: the approximated term
-    :param degree: the degree which will be converted to :py:class:`decimal.Decimal`.
+    :param slop: the degree which will be converted to :py:class:`decimal.Decimal`.
     """
-    def __init__(self, term, degree=None):
+    def __init__(self, term, slop=None):
         super().__init__()
         self.term = term
-        if degree is None:
-            degree = 0.5
-        self.degree = Decimal(degree).normalize()
+        if slop is None:
+            slop = 0.5
+        self.slop = Decimal(slop).normalize()
 
     def __str__(self):
-        return "%s~%s" % (self.term, self.degree)
+        return "%s~%s" % (self.term, self.slop)
 
 
 class Proximity(BaseApprox):

diff --git a/aiosumma/aiosumma/parser/parser.py b/aiosumma/aiosumma/parser/parser.py
@@ -123,7 +123,7 @@
 
 # r'(?P<phrase>"(?:[^\\"]|\\"|\\[^"])*")' # this is quite complicated to handle \"
 # modifiers after term or phrase
-APPROX_RE = r'~(?P<degree>[0-9.]+)?'
+APPROX_RE = r'~(?P<slop>[0-9.]+)?'
 BOOST_RE = r'\^(?P<score>[0-9.]+)?'
 
 # regex
@@ -199,7 +199,7 @@ def t_REGEX(t):
 @lex.TOKEN(APPROX_RE)
 def t_APPROX(t):
     m = re.match(APPROX_RE, t.value)
-    t.value = m.group("degree")
+    t.value = m.group("slop")
     return t
 
 

diff --git a/aiosumma/aiosumma/tests/test_processor.py b/aiosumma/aiosumma/tests/test_processor.py
@@ -85,51 +85,51 @@ def test_production_chain():
                                     'query': {'match': {'value': 'claudio'}}},
                                    {'occur': 'should',
                                     'query': {'boost': {'query': {'match': {'value': 'claudios'}},
-                                                        'score': '0.85'}}},
+                                                        'score': '0.65'}}},
                                    {'occur': 'should',
                                     'query': {'match': {'value': 'claude'}}},
                                    {'occur': 'should',
                                     'query': {'boost': {'query': {'match': {'value': 'clauded'}},
-                                                        'score': '0.85'}}},
+                                                        'score': '0.65'}}},
                                    {'occur': 'should',
                                     'query': {'boost': {'query': {'match': {'value': 'claudes'}},
-                                                        'score': '0.85'}}},
+                                                        'score': '0.65'}}},
                                    {'occur': 'should',
                                     'query': {'boost': {'query': {'match': {'value': 'clauding'}},
-                                                        'score': '0.85'}}},
+                                                        'score': '0.65'}}},
                                    {'occur': 'should',
                                     'query': {'match': {'value': 'rugarli'}}},
                                    {'occur': 'should',
                                     'query': {'boost': {'query': {'match': {'value': 'rugarlis'}},
-                                                        'score': '0.85'}}}]},
+                                                        'score': '0.65'}}}]},
     }
     processed_query = query_processor.process('+(search engine) -car', 'en')
     assert processed_query.to_summa_query() == {
         'boolean': {'subqueries': [
             {'occur': 'must', 'query': {'match': {'value': 'search'}}},
-            {'occur': 'must', 'query': {'boost': {'query': {'match': {'value': 'searches'}}, 'score': '0.85'}}},
+            {'occur': 'must', 'query': {'boost': {'query': {'match': {'value': 'searches'}}, 'score': '0.65'}}},
             {'occur': 'must', 'query': {'match': {'value': 'engine'}}},
-            {'occur': 'must', 'query': {'boost': {'query': {'match': {'value': 'engines'}}, 'score': '0.85'}}},
+            {'occur': 'must', 'query': {'boost': {'query': {'match': {'value': 'engines'}}, 'score': '0.65'}}},
             {'occur': 'must_not', 'query': {'match': {'value': 'car'}}},
-            {'occur': 'must_not', 'query': {'boost': {'query': {'match': {'value': 'cars'}}, 'score': '0.85'}}}]}}
+            {'occur': 'must_not', 'query': {'boost': {'query': {'match': {'value': 'cars'}}, 'score': '0.65'}}}]}}
 
     processed_query = query_processor.process('search engine', 'en')
     assert processed_query.to_summa_query() == {'boolean': {'subqueries': [
         {'occur': 'should', 'query': {'match': {'value': 'search'}}},
-        {'occur': 'should', 'query': {'boost': {'query': {'match': {'value': 'searches'}}, 'score': '0.85'}}},
+        {'occur': 'should', 'query': {'boost': {'query': {'match': {'value': 'searches'}}, 'score': '0.65'}}},
         {'occur': 'should', 'query': {'match': {'value': 'engine'}}},
-        {'occur': 'should', 'query': {'boost': {'query': {'match': {'value': 'engines'}}, 'score': '0.85'}}}
+        {'occur': 'should', 'query': {'boost': {'query': {'match': {'value': 'engines'}}, 'score': '0.65'}}}
     ]}}
     processed_query = query_processor.process('author:Smith +"title book"', 'en')
     assert processed_query.to_summa_query() == {'boolean': {'subqueries': [
         {'occur': 'should', 'query': {'term': {'field': 'author', 'value': 'smith'}}},
         {'occur': 'should', 'query':
-            {'boost': {'query': {'term': {'field': 'author', 'value': 'smiths'}}, 'score': '0.85'}}},
+            {'boost': {'query': {'term': {'field': 'author', 'value': 'smiths'}}, 'score': '0.65'}}},
         {'occur': 'must', 'query': {'match': {'value': '"title book"'}}}]}}
     processed_query = query_processor.process('science +year:[2010 TO *]', 'en')
     assert processed_query.to_summa_query() == {'boolean': {'subqueries': [
         {'occur': 'should', 'query': {'match': {'value': 'science'}}},
-        {'occur': 'should', 'query': {'boost': {'query': {'match': {'value': 'sciences'}}, 'score': '0.85'}}},
+        {'occur': 'should', 'query': {'boost': {'query': {'match': {'value': 'sciences'}}, 'score': '0.65'}}},
         {'occur': 'must', 'query': {'range': {
             'field': 'year', 'value': {'including_left': True, 'including_right': True, 'left': '2010', 'right': '*'}}}}
     ]}}
@@ -140,9 +140,9 @@ def test_unknown_language_transformer():
     processed_query = query_processor.process('search engine', 'zz')
     assert processed_query.to_summa_query() == {'boolean': {'subqueries': [
         {'occur': 'should', 'query': {'match': {'value': 'search'}}},
-        {'occur': 'should', 'query': {'boost': {'query': {'match': {'value': 'searches'}}, 'score': '0.85'}}},
+        {'occur': 'should', 'query': {'boost': {'query': {'match': {'value': 'searches'}}, 'score': '0.65'}}},
         {'occur': 'should', 'query': {'match': {'value': 'engine'}}},
-        {'occur': 'should', 'query': {'boost': {'query': {'match': {'value': 'engines'}}, 'score': '0.85'}}}
+        {'occur': 'should', 'query': {'boost': {'query': {'match': {'value': 'engines'}}, 'score': '0.65'}}}
     ]}}
 
 
@@ -166,7 +166,7 @@ def test_exact_match_transformers():
         {'occur': 'should', 'query': {'match': {'value': 'search'}}},
         {'occur': 'should', 'query': {'match': {'value': 'engine'}}},
         {'occur': 'should', 'query': {'boost': {'query': {
-            'phrase': {'field': 'title', 'value': 'search engine'}}, 'score': '2'}}
+            'phrase': {'field': 'title', 'slop': 3, 'value': 'search engine'}}, 'score': '2'}}
          }
     ]}}
 

diff --git a/aiosumma/aiosumma/tree_transformers/exact_match.py b/aiosumma/aiosumma/tree_transformers/exact_match.py
@@ -5,6 +5,7 @@
 
 from ..parser.elements import (
     Boost,
+    Phrase,
     Proximity,
     SearchField,
     SynonymsGroup,
@@ -35,16 +36,22 @@ def visit_group(self, node, context, parents=None):
                 phrase.append(operand.value)
             elif isinstance(operand, SynonymsGroup):
                 phrase.append(operand.operands[0].value)
+            elif isinstance(operand, SearchField):
+                continue
             else:
                 return node, False
+
+        if not phrase:
+            return node, False
+
         phrase = ' '.join(phrase)
 
         score = self.score
         if callable(score):
             score = score(node, context)
         if self.default_phrase_field:
-            new_operands.append(Boost(SearchField(self.default_phrase_field, Proximity(phrase, slop=3)), score))
+            new_operands.append(Boost(SearchField(self.default_phrase_field, Proximity(Phrase(phrase), slop=3)), score))
         else:
-            new_operands.append(Boost(Proximity(phrase, slop=3), score))
+            new_operands.append(Boost(Proximity(Phrase(phrase), slop=3), score))
         node.operands = new_operands
         return node, False
diff --git a/aiosumma/aiosumma/tree_transformers/morphy.py b/aiosumma/aiosumma/tree_transformers/morphy.py
@@ -16,10 +16,11 @@ class MorphyTreeTransformer(TreeTransformer):
         'en': EnglishMorphology('en_core_web_sm'),
     }
 
-    def __init__(self, enable_morph=True, enable_accent=True, ignore_nodes=None):
+    def __init__(self, enable_morph=True, enable_accent=True, score: str = '0.65', ignore_nodes=None):
         super().__init__(ignore_nodes=ignore_nodes)
         self.enable_morph = enable_morph
         self.enable_accent = enable_accent
+        self.score = score
 
     def visit_word(self, node, context, parents=None):
         forms = [node]
@@ -30,7 +31,7 @@ def visit_word(self, node, context, parents=None):
         if self.enable_morph and context.language in self.morphology:
             for w in self.morphology[context.language].derive_forms(node.value):
                 if node.value != w:
-                    forms.append(Boost(Word(w), score='0.85'))
+                    forms.append(Boost(Word(w), score=self.score))
 
         if len(forms) == 1:
             return node, True

diff --git a/aiosumma/aiosumma/tree_transformers/tantivy.py b/aiosumma/aiosumma/tree_transformers/tantivy.py
@@ -21,6 +21,8 @@ def visit_phrase(self, node, context, parents=None):
         return node, False
 
     def visit_minus(self, node, context, parents=None):
+        if parents is None:
+            return node.a, False
         if isinstance(node.a, BaseGroup):
             op = node.a
             new_operands = []