add Python implementation of Editops/Opcodes

mazlum · Oct 6, 2022 · 773a45e · 773a45e
1 parent aa6a88f
commit 773a45e
Show file tree

Hide file tree

Showing 73 changed files with 2,188 additions and 1,638 deletions.
diff --git a/.github/workflows/releasebuild.yml b/.github/workflows/releasebuild.yml
@@ -160,7 +160,7 @@ jobs:
       fail-fast: false
       matrix:
         arch: [auto, aarch64, ppc64le, s390x]
-        python_tag: [ "cp37-*", "cp38-*", "cp39-*", "cp310-*", "cp311-*", "pp37-*", "pp38-*", "pp39-*"]
+        python_tag: ["cp37-*", "cp38-*", "cp39-*", "cp310-*", "cp311-*", "pp37-*", "pp38-*", "pp39-*"]
         exclude:
           # PyPy builds not available for these platforms
           - arch: ppc64le

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -97,6 +97,21 @@ repos:
       - flake8-bugbear
       - pep8-naming
 
+# Flake8 also supports pre-commit natively (same author)
+- repo: https://github.com/PyCQA/flake8
+  rev: "5.0.4"
+  hooks:
+  - id: flake8
+    exclude: ^(docs/.*|tools/.*)$
+    additional_dependencies: *flake8_dependencies
+
+# PyLint has native support - not always usable, but works for us
+- repo: https://github.com/PyCQA/pylint
+  rev: "v2.15.3"
+  hooks:
+  - id: pylint
+    files: ^pybind11
+
 # CMake formatting
 - repo: https://github.com/cheshirekow/cmake-format-precommit
   rev: "v0.6.13"
@@ -106,6 +121,15 @@ repos:
     types: [file]
     files: (\.cmake|CMakeLists.txt)(.in)?$
 
+# Check static types with mypy
+#- repo: https://github.com/pre-commit/mirrors-mypy
+#  rev: "v0.971"
+#  hooks:
+#  - id: mypy
+#    args: []
+#    exclude: ^(tests|docs)/
+#    additional_dependencies: [nox, rich]
+
 # Checks the manifest for missing files (native support)
 - repo: https://github.com/mgedmin/check-manifest
   rev: "0.48"

diff --git a/bench/benchmark_fuzz.py b/bench/benchmark_fuzz.py
@@ -1,7 +1,6 @@
 # todo combine benchmarks of scorers into common code base
 import timeit
 
-import numpy as np
 import pandas
 
 
@@ -63,7 +62,7 @@ def scorer_benchmark(funcname):
 # token_ratio is unique to RapidFuzz
 time_token_ratio = benchmark(
     "token_ratio",
-    f"[rfuzz.token_ratio(a, b, processor=None) for b in b_list]",
+    "[rfuzz.token_ratio(a, b, processor=None) for b in b_list]",
     setup,
     lengths,
     count,
@@ -72,7 +71,7 @@ def scorer_benchmark(funcname):
 # this gets very slow, so only benchmark it for smaller values
 time_token_ratio_simple = benchmark(
     "fuzzywuzzy",
-    f"[max(rfuzz.token_sort_ratio(a, b, processor=None), rfuzz.token_set_ratio(a, b, processor=None)) for b in b_list]",
+    "[max(rfuzz.token_sort_ratio(a, b, processor=None), rfuzz.token_set_ratio(a, b, processor=None)) for b in b_list]",
     setup,
     lengths,
     count,
@@ -86,12 +85,12 @@ def scorer_benchmark(funcname):
     }
 )
 
-df.to_csv(f"results/token_ratio.csv", sep=",", index=False)
+df.to_csv("results/token_ratio.csv", sep=",", index=False)
 
 # partial_token_ratio is unique to RapidFuzz
 time_partial_token_ratio = benchmark(
     "token_ratio",
-    f"[rfuzz.partial_token_ratio(a, b, processor=None) for b in b_list]",
+    "[rfuzz.partial_token_ratio(a, b, processor=None) for b in b_list]",
     setup,
     lengths,
     count,
@@ -100,7 +99,10 @@ def scorer_benchmark(funcname):
 # this gets very slow, so only benchmark it for smaller values
 time_partial_token_ratio_simple = benchmark(
     "fuzzywuzzy",
-    f"[max(rfuzz.partial_token_sort_ratio(a, b, processor=None), rfuzz.partial_token_set_ratio(a, b, processor=None)) for b in b_list]",
+    (
+        "[max(rfuzz.partial_token_sort_ratio(a, b, processor=None), "
+        "rfuzz.partial_token_set_ratio(a, b, processor=None)) for b in b_list]"
+    ),
     setup,
     lengths,
     count,
@@ -114,4 +116,4 @@ def scorer_benchmark(funcname):
     }
 )
 
-df.to_csv(f"results/partial_token_ratio.csv", sep=",", index=False)
+df.to_csv("results/partial_token_ratio.csv", sep=",", index=False)
diff --git a/pyproject.toml b/pyproject.toml
@@ -8,3 +8,36 @@ requires = [
 ]
 build-backend = "backend"
 backend-path = ["_custom_build"]
+
+[tool.isort]
+profile = "black"
+
+[tool.mypy]
+files = "src"
+python_version = "3.7"
+strict = true
+show_error_codes = true
+enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"]
+warn_unreachable = true
+
+[tool.pytest.ini_options]
+minversion = "6.0"
+addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"]
+xfail_strict = true
+filterwarnings = ["error"]
+log_cli_level = "info"
+testpaths = ["tests"]
+
+[tool.pylint]
+master.py-version = "3.6"
+reports.output-format = "colorized"
+messages_control.disable = [
+  "design",
+  "fixme",
+  "imports",
+  "line-too-long",
+  "imports",
+  "invalid-name",
+  "protected-access",
+  "missing-module-docstring",
+]
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,6 @@
+[flake8]
+max-line-length = 120
+show_source = True
+exclude = .git, __pycache__, build, dist, docs, tools, venv
+extend-ignore = E203, E722, B903, B950, N801, N802, N806
+extend-select = B9
diff --git a/setup.py b/setup.py
@@ -69,7 +69,7 @@ def run_setup(with_binary):
 else:
     try:
         run_setup(True)
-    except:
+    except BaseException:
         show_message(
             "WARNING: The C extension could not be compiled, speedups"
             " are not enabled.",

diff --git a/src/rapidfuzz/_utils.py b/src/rapidfuzz/_utils.py
@@ -1,24 +1,33 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2022 Max Bachmann
 
+from __future__ import annotations
 
-def _GetScorerFlagsDistance(**kwargs):
+from typing import Any, Callable
+
+
+def _get_scorer_flags_distance(**_kwargs: Any) -> dict[str, Any]:
     return {"optimal_score": 0, "worst_score": 2**63 - 1, "flags": (1 << 6)}
 
 
-def _GetScorerFlagsSimilarity(**kwargs):
+def _get_scorer_flags_similarity(**_kwargs: Any) -> dict[str, Any]:
     return {"optimal_score": 2**63 - 1, "worst_score": 0, "flags": (1 << 6)}
 
 
-def _GetScorerFlagsNormalizedDistance(**kwargs):
+def _get_scorer_flags_normalized_distance(**_kwargs: Any) -> dict[str, Any]:
     return {"optimal_score": 0, "worst_score": 1, "flags": (1 << 5)}
 
 
-def _GetScorerFlagsNormalizedSimilarity(**kwargs):
+def _get_scorer_flags_normalized_similarity(**_kwargs: Any) -> dict[str, Any]:
     return {"optimal_score": 1, "worst_score": 0, "flags": (1 << 5)}
 
 
-def fallback_import(module: str, name: str, set_attrs: bool = True):
+def fallback_import(
+    module: str,
+    name: str,
+    cached_scorer_call: dict[str, Callable[..., dict[str, Any]]] | None = None,
+    set_attrs: bool = True,
+) -> Any:
     """
     import library function and possibly fall back to a pure Python version
     when no C++ implementation is available
@@ -35,6 +44,9 @@ def fallback_import(module: str, name: str, set_attrs: bool = True):
             f"cannot import name '{name}' from '{py_mod.__name__}' ({py_mod.__file__})"
         )
 
+    if cached_scorer_call:
+        py_func._RF_ScorerPy = cached_scorer_call
+
     if impl == "cpp":
         cpp_mod = importlib.import_module(module + "_cpp")
     elif impl == "python":
@@ -55,14 +67,22 @@ def fallback_import(module: str, name: str, set_attrs: bool = True):
     if set_attrs:
         cpp_func.__name__ = py_func.__name__
         cpp_func.__doc__ = py_func.__doc__
+
+    if cached_scorer_call:
+        cpp_func._RF_ScorerPy = cached_scorer_call
+
     return cpp_func
 
 
-default_distance_attribute = {"get_scorer_flags": _GetScorerFlagsDistance}
-default_similarity_attribute = {"get_scorer_flags": _GetScorerFlagsSimilarity}
-default_normalized_distance_attribute = {
-    "get_scorer_flags": _GetScorerFlagsNormalizedDistance
+default_distance_attribute: dict[str, Callable[..., dict[str, Any]]] = {
+    "get_scorer_flags": _get_scorer_flags_distance
+}
+default_similarity_attribute: dict[str, Callable[..., dict[str, Any]]] = {
+    "get_scorer_flags": _get_scorer_flags_similarity
+}
+default_normalized_distance_attribute: dict[str, Callable[..., dict[str, Any]]] = {
+    "get_scorer_flags": _get_scorer_flags_normalized_distance
 }
-default_normalized_similarity_attribute = {
-    "get_scorer_flags": _GetScorerFlagsNormalizedSimilarity
+default_normalized_similarity_attribute: dict[str, Callable[..., dict[str, Any]]] = {
+    "get_scorer_flags": _get_scorer_flags_normalized_similarity
 }
diff --git a/src/rapidfuzz/_utils.pyi b/src/rapidfuzz/_utils.pyi
diff --git a/src/rapidfuzz/cpp_common.pxd b/src/rapidfuzz/cpp_common.pxd
@@ -2,18 +2,25 @@
 # cython: language_level=3, binding=True, linetrace=True
 
 from cpython.object cimport PyObject
-from cpython.pycapsule cimport (PyCapsule_GetPointer, PyCapsule_IsValid,
-                                PyCapsule_New)
+from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_IsValid, PyCapsule_New
 from libc.stddef cimport wchar_t
 from libc.stdint cimport int64_t, uint64_t
 from libc.stdlib cimport free, malloc
 from libcpp cimport bool
 from libcpp.utility cimport move, pair
 from libcpp.vector cimport vector
-from rapidfuzz_capi cimport (SCORER_STRUCT_VERSION, RF_GetScorerFlags,
-                             RF_Kwargs, RF_KwargsInit, RF_Preprocessor,
-                             RF_Scorer, RF_ScorerFlags, RF_ScorerFuncInit,
-                             RF_String, RF_StringType)
+from rapidfuzz_capi cimport (
+    SCORER_STRUCT_VERSION,
+    RF_GetScorerFlags,
+    RF_Kwargs,
+    RF_KwargsInit,
+    RF_Preprocessor,
+    RF_Scorer,
+    RF_ScorerFlags,
+    RF_ScorerFuncInit,
+    RF_String,
+    RF_StringType,
+)
 
 from array import array
 

diff --git a/src/rapidfuzz/distance/DamerauLevenshtein.py b/src/rapidfuzz/distance/DamerauLevenshtein.py
@@ -1,19 +1,20 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2022 Max Bachmann
 
+from __future__ import annotations
+
 from rapidfuzz._utils import default_distance_attribute as _dist_attr
 from rapidfuzz._utils import default_normalized_distance_attribute as _norm_dist_attr
 from rapidfuzz._utils import default_normalized_similarity_attribute as _norm_sim_attr
 from rapidfuzz._utils import default_similarity_attribute as _sim_attr
 from rapidfuzz._utils import fallback_import as _fallback_import
 
 _mod = "rapidfuzz.distance.DamerauLevenshtein"
-distance = _fallback_import(_mod, "distance")
-similarity = _fallback_import(_mod, "similarity")
-normalized_distance = _fallback_import(_mod, "normalized_distance")
-normalized_similarity = _fallback_import(_mod, "normalized_similarity")
-
-distance._RF_ScorerPy = _dist_attr
-similarity._RF_ScorerPy = _sim_attr
-normalized_distance._RF_ScorerPy = _norm_dist_attr
-normalized_similarity._RF_ScorerPy = _norm_sim_attr
+distance = _fallback_import(_mod, "distance", cached_scorer_call=_dist_attr)
+similarity = _fallback_import(_mod, "similarity", cached_scorer_call=_sim_attr)
+normalized_distance = _fallback_import(
+    _mod, "normalized_distance", cached_scorer_call=_norm_dist_attr
+)
+normalized_similarity = _fallback_import(
+    _mod, "normalized_similarity", cached_scorer_call=_norm_sim_attr
+)
diff --git a/src/rapidfuzz/distance/DamerauLevenshtein.pyi b/src/rapidfuzz/distance/DamerauLevenshtein.pyi
@@ -1,48 +1,35 @@
-from typing import Any, Callable, Dict, Hashable, Optional, Sequence, TypeVar
+# SPDX-License-Identifier: MIT
+# Copyright (C) 2022 Max Bachmann
 
-from typing_extensions import Protocol
+from __future__ import annotations
 
-from rapidfuzz.distance import Editops, Opcodes
+from typing import Callable, Hashable, Sequence
 
-class _ScorerAttributes(Protocol):
-    _RF_ScorerPy: Dict
-
-def _attr_decorator(func: Any) -> _ScorerAttributes:
-    return func
-
-_StringType = Sequence[Hashable]
-_S1 = TypeVar("_S1")
-_S2 = TypeVar("_S2")
-
-@_attr_decorator
 def distance(
-    s1: _S1,
-    s2: _S2,
+    s1: Sequence[Hashable],
+    s2: Sequence[Hashable],
     *,
-    processor: Optional[Callable[..., _StringType]] = None,
-    score_cutoff: Optional[int] = None
+    processor: Callable[..., Sequence[Hashable]] | None = None,
+    score_cutoff: int | None = None,
 ) -> int: ...
-@_attr_decorator
-def normalized_distance(
-    s1: _S1,
-    s2: _S2,
-    *,
-    processor: Optional[Callable[..., _StringType]] = None,
-    score_cutoff: Optional[float] = 0
-) -> float: ...
-@_attr_decorator
 def similarity(
-    s1: _S1,
-    s2: _S2,
+    s1: Sequence[Hashable],
+    s2: Sequence[Hashable],
     *,
-    processor: Optional[Callable[..., _StringType]] = None,
-    score_cutoff: Optional[int] = None
+    processor: Callable[..., Sequence[Hashable]] | None = None,
+    score_cutoff: int | None = None,
 ) -> int: ...
-@_attr_decorator
+def normalized_distance(
+    s1: Sequence[Hashable],
+    s2: Sequence[Hashable],
+    *,
+    processor: Callable[..., Sequence[Hashable]] | None = None,
+    score_cutoff: float | None = None,
+) -> float: ...
 def normalized_similarity(
-    s1: _S1,
-    s2: _S2,
+    s1: Sequence[Hashable],
+    s2: Sequence[Hashable],
     *,
-    processor: Optional[Callable[..., _StringType]] = None,
-    score_cutoff: Optional[float] = 0
+    processor: Callable[..., Sequence[Hashable]] | None = None,
+    score_cutoff: float | None = None,
 ) -> float: ...