From 773a45e0d15101dfd0381de28f87cf2674b805d7 Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Thu, 6 Oct 2022 22:59:04 +0200 Subject: [PATCH] add Python implementation of Editops/Opcodes --- .github/workflows/releasebuild.yml | 2 +- .pre-commit-config.yaml | 24 + bench/benchmark_fuzz.py | 16 +- pyproject.toml | 33 + setup.cfg | 6 + setup.py | 2 +- src/rapidfuzz/_utils.py | 42 +- src/rapidfuzz/_utils.pyi | 7 - src/rapidfuzz/cpp_common.pxd | 19 +- src/rapidfuzz/distance/DamerauLevenshtein.py | 19 +- src/rapidfuzz/distance/DamerauLevenshtein.pyi | 59 +- .../distance/DamerauLevenshtein_py.py | 44 +- .../distance/DamerauLevenshtein_py.pyi | 46 -- src/rapidfuzz/distance/Hamming.py | 19 +- src/rapidfuzz/distance/Hamming.pyi | 53 +- src/rapidfuzz/distance/Hamming_py.py | 52 +- src/rapidfuzz/distance/Hamming_py.pyi | 54 -- src/rapidfuzz/distance/Indel.py | 19 +- src/rapidfuzz/distance/Indel.pyi | 53 +- src/rapidfuzz/distance/Indel_py.py | 73 +- src/rapidfuzz/distance/Indel_py.pyi | 54 -- src/rapidfuzz/distance/Jaro.py | 19 +- src/rapidfuzz/distance/Jaro.pyi | 51 +- src/rapidfuzz/distance/JaroWinkler.py | 19 +- src/rapidfuzz/distance/JaroWinkler.pyi | 51 +- src/rapidfuzz/distance/JaroWinkler_py.py | 37 +- src/rapidfuzz/distance/JaroWinkler_py.pyi | 50 -- src/rapidfuzz/distance/Jaro_py.py | 125 +-- src/rapidfuzz/distance/Jaro_py.pyi | 46 -- src/rapidfuzz/distance/LCSseq.py | 19 +- src/rapidfuzz/distance/LCSseq.pyi | 53 +- src/rapidfuzz/distance/LCSseq_py.py | 61 +- src/rapidfuzz/distance/LCSseq_py.pyi | 54 -- src/rapidfuzz/distance/Levenshtein.py | 19 +- src/rapidfuzz/distance/Levenshtein.pyi | 73 +- src/rapidfuzz/distance/Levenshtein_py.py | 77 +- src/rapidfuzz/distance/Levenshtein_py.pyi | 66 -- src/rapidfuzz/distance/OSA.py | 19 +- src/rapidfuzz/distance/OSA.pyi | 51 +- src/rapidfuzz/distance/OSA_cpp.py | 2 +- src/rapidfuzz/distance/OSA_py.py | 40 +- src/rapidfuzz/distance/OSA_py.pyi | 46 -- src/rapidfuzz/distance/__init__.py | 18 +- src/rapidfuzz/distance/__init__.pyi | 88 ++- src/rapidfuzz/distance/_initialize_cpp.pyx | 125 ++- src/rapidfuzz/distance/_initialize_py.py | 721 ++++++++++++++---- src/rapidfuzz/distance/metrics_cpp.pyi | 70 +- src/rapidfuzz/distance/metrics_cpp.pyx | 28 +- src/rapidfuzz/fuzz.py | 49 +- src/rapidfuzz/fuzz.pyi | 101 +-- src/rapidfuzz/fuzz_cpp.pyx | 26 +- src/rapidfuzz/fuzz_py.py | 216 +++--- src/rapidfuzz/fuzz_py.pyi | 105 --- src/rapidfuzz/process.py | 4 +- src/rapidfuzz/process.pyi | 1 - src/rapidfuzz/process_cpp.py | 38 +- src/rapidfuzz/process_cpp_impl.pyx | 28 +- src/rapidfuzz/process_py.py | 209 +++-- src/rapidfuzz/string_metric.py | 86 ++- src/rapidfuzz/string_metric.pyi | 56 -- src/rapidfuzz/utils.py | 2 + src/rapidfuzz/utils.pyi | 5 + src/rapidfuzz/utils_cpp.pyx | 12 +- src/rapidfuzz/utils_py.py | 4 +- tests/distance/test_Indel.py | 2 +- tests/distance/test_JaroWinkler.py | 7 +- tests/distance/test_OSA.py | 1 - tests/distance/test_init.py | 102 ++- tests/test_cpp_fallback.py | 2 +- tests/test_fuzz.py | 5 +- tests/test_hypothesis.py | 37 +- tests/test_pure_python_fallback.py | 2 +- tests/test_utils.py | 2 +- 73 files changed, 2188 insertions(+), 1638 deletions(-) create mode 100644 setup.cfg delete mode 100644 src/rapidfuzz/_utils.pyi delete mode 100644 src/rapidfuzz/distance/DamerauLevenshtein_py.pyi delete mode 100644 src/rapidfuzz/distance/Hamming_py.pyi delete mode 100644 src/rapidfuzz/distance/Indel_py.pyi delete mode 100644 src/rapidfuzz/distance/JaroWinkler_py.pyi delete mode 100644 src/rapidfuzz/distance/Jaro_py.pyi delete mode 100644 
src/rapidfuzz/distance/LCSseq_py.pyi delete mode 100644 src/rapidfuzz/distance/Levenshtein_py.pyi delete mode 100644 src/rapidfuzz/distance/OSA_py.pyi delete mode 100644 src/rapidfuzz/fuzz_py.pyi delete mode 100644 src/rapidfuzz/string_metric.pyi diff --git a/.github/workflows/releasebuild.yml b/.github/workflows/releasebuild.yml index fbf7d897..b05c9048 100644 --- a/.github/workflows/releasebuild.yml +++ b/.github/workflows/releasebuild.yml @@ -160,7 +160,7 @@ jobs: fail-fast: false matrix: arch: [auto, aarch64, ppc64le, s390x] - python_tag: [ "cp37-*", "cp38-*", "cp39-*", "cp310-*", "cp311-*", "pp37-*", "pp38-*", "pp39-*"] + python_tag: ["cp37-*", "cp38-*", "cp39-*", "cp310-*", "cp311-*", "pp37-*", "pp38-*", "pp39-*"] exclude: # PyPy builds not available for these platforms - arch: ppc64le diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a4c5fe83..afcd3a3b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -97,6 +97,21 @@ repos: - flake8-bugbear - pep8-naming +# Flake8 also supports pre-commit natively (same author) +- repo: https://github.com/PyCQA/flake8 + rev: "5.0.4" + hooks: + - id: flake8 + exclude: ^(docs/.*|tools/.*)$ + additional_dependencies: *flake8_dependencies + +# PyLint has native support - not always usable, but works for us +- repo: https://github.com/PyCQA/pylint + rev: "v2.15.3" + hooks: + - id: pylint + files: ^pybind11 + # CMake formatting - repo: https://github.com/cheshirekow/cmake-format-precommit rev: "v0.6.13" @@ -106,6 +121,15 @@ repos: types: [file] files: (\.cmake|CMakeLists.txt)(.in)?$ +# Check static types with mypy +#- repo: https://github.com/pre-commit/mirrors-mypy +# rev: "v0.971" +# hooks: +# - id: mypy +# args: [] +# exclude: ^(tests|docs)/ +# additional_dependencies: [nox, rich] + # Checks the manifest for missing files (native support) - repo: https://github.com/mgedmin/check-manifest rev: "0.48" diff --git a/bench/benchmark_fuzz.py b/bench/benchmark_fuzz.py index b12dc1e9..d762795a 100644 --- a/bench/benchmark_fuzz.py +++ b/bench/benchmark_fuzz.py @@ -1,7 +1,6 @@ # todo combine benchmarks of scorers into common code base import timeit -import numpy as np import pandas @@ -63,7 +62,7 @@ def scorer_benchmark(funcname): # token_ratio is unique to RapidFuzz time_token_ratio = benchmark( "token_ratio", - f"[rfuzz.token_ratio(a, b, processor=None) for b in b_list]", + "[rfuzz.token_ratio(a, b, processor=None) for b in b_list]", setup, lengths, count, @@ -72,7 +71,7 @@ def scorer_benchmark(funcname): # this gets very slow, so only benchmark it for smaller values time_token_ratio_simple = benchmark( "fuzzywuzzy", - f"[max(rfuzz.token_sort_ratio(a, b, processor=None), rfuzz.token_set_ratio(a, b, processor=None)) for b in b_list]", + "[max(rfuzz.token_sort_ratio(a, b, processor=None), rfuzz.token_set_ratio(a, b, processor=None)) for b in b_list]", setup, lengths, count, @@ -86,12 +85,12 @@ def scorer_benchmark(funcname): } ) -df.to_csv(f"results/token_ratio.csv", sep=",", index=False) +df.to_csv("results/token_ratio.csv", sep=",", index=False) # partial_token_ratio is unique to RapidFuzz time_partial_token_ratio = benchmark( "token_ratio", - f"[rfuzz.partial_token_ratio(a, b, processor=None) for b in b_list]", + "[rfuzz.partial_token_ratio(a, b, processor=None) for b in b_list]", setup, lengths, count, @@ -100,7 +99,10 @@ def scorer_benchmark(funcname): # this gets very slow, so only benchmark it for smaller values time_partial_token_ratio_simple = benchmark( "fuzzywuzzy", - f"[max(rfuzz.partial_token_sort_ratio(a, b, 
processor=None), rfuzz.partial_token_set_ratio(a, b, processor=None)) for b in b_list]", + ( + "[max(rfuzz.partial_token_sort_ratio(a, b, processor=None), " + "rfuzz.partial_token_set_ratio(a, b, processor=None)) for b in b_list]" + ), setup, lengths, count, @@ -114,4 +116,4 @@ def scorer_benchmark(funcname): } ) -df.to_csv(f"results/partial_token_ratio.csv", sep=",", index=False) +df.to_csv("results/partial_token_ratio.csv", sep=",", index=False) diff --git a/pyproject.toml b/pyproject.toml index 4fcb1d73..d2571a66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,3 +8,36 @@ requires = [ ] build-backend = "backend" backend-path = ["_custom_build"] + +[tool.isort] +profile = "black" + +[tool.mypy] +files = "src" +python_version = "3.7" +strict = true +show_error_codes = true +enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"] +warn_unreachable = true + +[tool.pytest.ini_options] +minversion = "6.0" +addopts = ["-ra", "--showlocals", "--strict-markers", "--strict-config"] +xfail_strict = true +filterwarnings = ["error"] +log_cli_level = "info" +testpaths = ["tests"] + +[tool.pylint] +master.py-version = "3.6" +reports.output-format = "colorized" +messages_control.disable = [ + "design", + "fixme", + "imports", + "line-too-long", + "imports", + "invalid-name", + "protected-access", + "missing-module-docstring", +] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..804a966b --- /dev/null +++ b/setup.cfg @@ -0,0 +1,6 @@ +[flake8] +max-line-length = 120 +show_source = True +exclude = .git, __pycache__, build, dist, docs, tools, venv +extend-ignore = E203, E722, B903, B950, N801, N802, N806 +extend-select = B9 diff --git a/setup.py b/setup.py index efdcfb39..41d841b9 100644 --- a/setup.py +++ b/setup.py @@ -69,7 +69,7 @@ def run_setup(with_binary): else: try: run_setup(True) - except: + except BaseException: show_message( "WARNING: The C extension could not be compiled, speedups" " are not enabled.", diff --git a/src/rapidfuzz/_utils.py b/src/rapidfuzz/_utils.py index d67f165c..28a02bf4 100644 --- a/src/rapidfuzz/_utils.py +++ b/src/rapidfuzz/_utils.py @@ -1,24 +1,33 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations -def _GetScorerFlagsDistance(**kwargs): +from typing import Any, Callable + + +def _get_scorer_flags_distance(**_kwargs: Any) -> dict[str, Any]: return {"optimal_score": 0, "worst_score": 2**63 - 1, "flags": (1 << 6)} -def _GetScorerFlagsSimilarity(**kwargs): +def _get_scorer_flags_similarity(**_kwargs: Any) -> dict[str, Any]: return {"optimal_score": 2**63 - 1, "worst_score": 0, "flags": (1 << 6)} -def _GetScorerFlagsNormalizedDistance(**kwargs): +def _get_scorer_flags_normalized_distance(**_kwargs: Any) -> dict[str, Any]: return {"optimal_score": 0, "worst_score": 1, "flags": (1 << 5)} -def _GetScorerFlagsNormalizedSimilarity(**kwargs): +def _get_scorer_flags_normalized_similarity(**_kwargs: Any) -> dict[str, Any]: return {"optimal_score": 1, "worst_score": 0, "flags": (1 << 5)} -def fallback_import(module: str, name: str, set_attrs: bool = True): +def fallback_import( + module: str, + name: str, + cached_scorer_call: dict[str, Callable[..., dict[str, Any]]] | None = None, + set_attrs: bool = True, +) -> Any: """ import library function and possibly fall back to a pure Python version when no C++ implementation is available @@ -35,6 +44,9 @@ def fallback_import(module: str, name: str, set_attrs: bool = True): f"cannot import name '{name}' from '{py_mod.__name__}' 
({py_mod.__file__})" ) + if cached_scorer_call: + py_func._RF_ScorerPy = cached_scorer_call + if impl == "cpp": cpp_mod = importlib.import_module(module + "_cpp") elif impl == "python": @@ -55,14 +67,22 @@ def fallback_import(module: str, name: str, set_attrs: bool = True): if set_attrs: cpp_func.__name__ = py_func.__name__ cpp_func.__doc__ = py_func.__doc__ + + if cached_scorer_call: + cpp_func._RF_ScorerPy = cached_scorer_call + return cpp_func -default_distance_attribute = {"get_scorer_flags": _GetScorerFlagsDistance} -default_similarity_attribute = {"get_scorer_flags": _GetScorerFlagsSimilarity} -default_normalized_distance_attribute = { - "get_scorer_flags": _GetScorerFlagsNormalizedDistance +default_distance_attribute: dict[str, Callable[..., dict[str, Any]]] = { + "get_scorer_flags": _get_scorer_flags_distance +} +default_similarity_attribute: dict[str, Callable[..., dict[str, Any]]] = { + "get_scorer_flags": _get_scorer_flags_similarity +} +default_normalized_distance_attribute: dict[str, Callable[..., dict[str, Any]]] = { + "get_scorer_flags": _get_scorer_flags_normalized_distance } -default_normalized_similarity_attribute = { - "get_scorer_flags": _GetScorerFlagsNormalizedSimilarity +default_normalized_similarity_attribute: dict[str, Callable[..., dict[str, Any]]] = { + "get_scorer_flags": _get_scorer_flags_normalized_similarity } diff --git a/src/rapidfuzz/_utils.pyi b/src/rapidfuzz/_utils.pyi deleted file mode 100644 index 4b191a48..00000000 --- a/src/rapidfuzz/_utils.pyi +++ /dev/null @@ -1,7 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2022 Max Bachmann - -default_distance_attribute: dict -default_similarity_attribute: dict -default_normalized_distance_attribute: dict -default_normalized_similarity_attribute: dict diff --git a/src/rapidfuzz/cpp_common.pxd b/src/rapidfuzz/cpp_common.pxd index 5ad74b8d..fde9bb38 100644 --- a/src/rapidfuzz/cpp_common.pxd +++ b/src/rapidfuzz/cpp_common.pxd @@ -2,18 +2,25 @@ # cython: language_level=3, binding=True, linetrace=True from cpython.object cimport PyObject -from cpython.pycapsule cimport (PyCapsule_GetPointer, PyCapsule_IsValid, - PyCapsule_New) +from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_IsValid, PyCapsule_New from libc.stddef cimport wchar_t from libc.stdint cimport int64_t, uint64_t from libc.stdlib cimport free, malloc from libcpp cimport bool from libcpp.utility cimport move, pair from libcpp.vector cimport vector -from rapidfuzz_capi cimport (SCORER_STRUCT_VERSION, RF_GetScorerFlags, - RF_Kwargs, RF_KwargsInit, RF_Preprocessor, - RF_Scorer, RF_ScorerFlags, RF_ScorerFuncInit, - RF_String, RF_StringType) +from rapidfuzz_capi cimport ( + SCORER_STRUCT_VERSION, + RF_GetScorerFlags, + RF_Kwargs, + RF_KwargsInit, + RF_Preprocessor, + RF_Scorer, + RF_ScorerFlags, + RF_ScorerFuncInit, + RF_String, + RF_StringType, +) from array import array diff --git a/src/rapidfuzz/distance/DamerauLevenshtein.py b/src/rapidfuzz/distance/DamerauLevenshtein.py index 8be41ac2..3f6af28a 100644 --- a/src/rapidfuzz/distance/DamerauLevenshtein.py +++ b/src/rapidfuzz/distance/DamerauLevenshtein.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + from rapidfuzz._utils import default_distance_attribute as _dist_attr from rapidfuzz._utils import default_normalized_distance_attribute as _norm_dist_attr from rapidfuzz._utils import default_normalized_similarity_attribute as _norm_sim_attr @@ -8,12 +10,11 @@ from rapidfuzz._utils import fallback_import as 
_fallback_import _mod = "rapidfuzz.distance.DamerauLevenshtein" -distance = _fallback_import(_mod, "distance") -similarity = _fallback_import(_mod, "similarity") -normalized_distance = _fallback_import(_mod, "normalized_distance") -normalized_similarity = _fallback_import(_mod, "normalized_similarity") - -distance._RF_ScorerPy = _dist_attr -similarity._RF_ScorerPy = _sim_attr -normalized_distance._RF_ScorerPy = _norm_dist_attr -normalized_similarity._RF_ScorerPy = _norm_sim_attr +distance = _fallback_import(_mod, "distance", cached_scorer_call=_dist_attr) +similarity = _fallback_import(_mod, "similarity", cached_scorer_call=_sim_attr) +normalized_distance = _fallback_import( + _mod, "normalized_distance", cached_scorer_call=_norm_dist_attr +) +normalized_similarity = _fallback_import( + _mod, "normalized_similarity", cached_scorer_call=_norm_sim_attr +) diff --git a/src/rapidfuzz/distance/DamerauLevenshtein.pyi b/src/rapidfuzz/distance/DamerauLevenshtein.pyi index c5cb5d58..4f8d3a58 100644 --- a/src/rapidfuzz/distance/DamerauLevenshtein.pyi +++ b/src/rapidfuzz/distance/DamerauLevenshtein.pyi @@ -1,48 +1,35 @@ -from typing import Any, Callable, Dict, Hashable, Optional, Sequence, TypeVar +# SPDX-License-Identifier: MIT +# Copyright (C) 2022 Max Bachmann -from typing_extensions import Protocol +from __future__ import annotations -from rapidfuzz.distance import Editops, Opcodes +from typing import Callable, Hashable, Sequence -class _ScorerAttributes(Protocol): - _RF_ScorerPy: Dict - -def _attr_decorator(func: Any) -> _ScorerAttributes: - return func - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -@_attr_decorator def distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, ) -> int: ... -@_attr_decorator -def normalized_distance( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator def similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, ) -> int: ... -@_attr_decorator +def normalized_distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: ... def normalized_similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, ) -> float: ... 
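The wrapper-module change above is the pattern this patch repeats for every metric: instead of assigning the `_RF_ScorerPy` attribute after the import, the attribute dict is handed to `fallback_import` through the new `cached_scorer_call` keyword, which attaches it to the pure-Python function and, when available, to the C++ one. A minimal sketch of the before/after, using only names from this patch:

    from rapidfuzz._utils import default_distance_attribute as _dist_attr
    from rapidfuzz._utils import fallback_import as _fallback_import

    # before: the attribute was only set on whichever implementation was returned
    distance = _fallback_import("rapidfuzz.distance.DamerauLevenshtein", "distance")
    distance._RF_ScorerPy = _dist_attr

    # after: fallback_import sets py_func._RF_ScorerPy (and cpp_func._RF_ScorerPy),
    # so the scorer metadata is present regardless of which implementation wins
    distance = _fallback_import(
        "rapidfuzz.distance.DamerauLevenshtein", "distance", cached_scorer_call=_dist_attr
    )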
diff --git a/src/rapidfuzz/distance/DamerauLevenshtein_py.py b/src/rapidfuzz/distance/DamerauLevenshtein_py.py index de87184b..737490e2 100644 --- a/src/rapidfuzz/distance/DamerauLevenshtein_py.py +++ b/src/rapidfuzz/distance/DamerauLevenshtein_py.py @@ -1,10 +1,16 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations -def _damerau_levenshtein_distance_zhao(s1, s2): +from typing import Callable, Hashable, Sequence + + +def _damerau_levenshtein_distance_zhao( + s1: Sequence[Hashable], s2: Sequence[Hashable] +) -> int: maxVal = max(len(s1), len(s2)) + 1 - last_row_id = {} + last_row_id: dict[Hashable, int] = {} last_row_id_get = last_row_id.get size = len(s2) + 2 FR = [maxVal] * size @@ -31,7 +37,7 @@ def _damerau_levenshtein_distance_zhao(s1, s2): T = last_i2l1 # save H_i-2,l-1 else: k = last_row_id_get(s2[j - 1], -1) - l = last_col_id + l = last_col_id # noqa: E741 if (j - l) == 1: transpose = FR[j] + (i - k) @@ -49,7 +55,13 @@ def _damerau_levenshtein_distance_zhao(s1, s2): return dist -def distance(s1, s2, *, processor=None, score_cutoff=None): +def distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, +) -> int: """ Calculates the Damerau-Levenshtein distance. @@ -89,7 +101,13 @@ def distance(s1, s2, *, processor=None, score_cutoff=None): return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1 -def similarity(s1, s2, *, processor=None, score_cutoff=None): +def similarity( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, +) -> int: """ Calculates the Damerau-Levenshtein similarity in the range [max, 0]. @@ -125,7 +143,13 @@ def similarity(s1, s2, *, processor=None, score_cutoff=None): return sim if (score_cutoff is None or sim >= score_cutoff) else 0 -def normalized_distance(s1, s2, *, processor=None, score_cutoff=None): +def normalized_distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates a normalized Damerau-Levenshtein similarity in the range [1, 0]. @@ -160,7 +184,13 @@ def normalized_distance(s1, s2, *, processor=None, score_cutoff=None): return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1 -def normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): +def normalized_similarity( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates a normalized Damerau-Levenshtein similarity in the range [0, 1]. 
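The `_damerau_levenshtein_distance_zhao` helper above computes the unrestricted Damerau-Levenshtein distance, which may edit a transposed pair again in a later step; the optimal string alignment variant in OSA_py.py may not. A quick check of the difference, using the classic textbook pair (assuming both modules import as shown elsewhere in this patch):

    from rapidfuzz.distance import OSA, DamerauLevenshtein

    # "CA" -> "AC" (transposition) -> "ABC" (insertion): 2 edits
    assert DamerauLevenshtein.distance("CA", "ABC") == 2
    # optimal string alignment may not touch the transposed pair again: 3 edits
    assert OSA.distance("CA", "ABC") == 3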
diff --git a/src/rapidfuzz/distance/DamerauLevenshtein_py.pyi b/src/rapidfuzz/distance/DamerauLevenshtein_py.pyi deleted file mode 100644 index b27fcd9e..00000000 --- a/src/rapidfuzz/distance/DamerauLevenshtein_py.pyi +++ /dev/null @@ -1,46 +0,0 @@ -from typing import Any, Callable, Dict, Hashable, Optional, Sequence, TypeVar - -from typing_extensions import Protocol - -class _ScorerAttributes(Protocol): - _RF_ScorerPy: Dict - -def _attr_decorator(func: Any) -> _ScorerAttributes: - return func - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -@_attr_decorator -def distance( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> int: ... -@_attr_decorator -def normalized_distance( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def similarity( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> int: ... -@_attr_decorator -def normalized_similarity( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... diff --git a/src/rapidfuzz/distance/Hamming.py b/src/rapidfuzz/distance/Hamming.py index 3bbdd407..5ed1a48c 100644 --- a/src/rapidfuzz/distance/Hamming.py +++ b/src/rapidfuzz/distance/Hamming.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + from rapidfuzz._utils import default_distance_attribute as _dist_attr from rapidfuzz._utils import default_normalized_distance_attribute as _norm_dist_attr from rapidfuzz._utils import default_normalized_similarity_attribute as _norm_sim_attr @@ -8,14 +10,13 @@ from rapidfuzz._utils import fallback_import as _fallback_import _mod = "rapidfuzz.distance.Hamming" -distance = _fallback_import(_mod, "distance") -similarity = _fallback_import(_mod, "similarity") -normalized_distance = _fallback_import(_mod, "normalized_distance") -normalized_similarity = _fallback_import(_mod, "normalized_similarity") +distance = _fallback_import(_mod, "distance", cached_scorer_call=_dist_attr) +similarity = _fallback_import(_mod, "similarity", cached_scorer_call=_sim_attr) +normalized_similarity = _fallback_import( + _mod, "normalized_similarity", cached_scorer_call=_norm_sim_attr +) +normalized_distance = _fallback_import( + _mod, "normalized_distance", cached_scorer_call=_norm_dist_attr +) editops = _fallback_import(_mod, "editops") opcodes = _fallback_import(_mod, "opcodes") - -distance._RF_ScorerPy = _dist_attr -similarity._RF_ScorerPy = _sim_attr -normalized_distance._RF_ScorerPy = _norm_dist_attr -normalized_similarity._RF_ScorerPy = _norm_sim_attr diff --git a/src/rapidfuzz/distance/Hamming.pyi b/src/rapidfuzz/distance/Hamming.pyi index 2ad1f0ee..95860df1 100644 --- a/src/rapidfuzz/distance/Hamming.pyi +++ b/src/rapidfuzz/distance/Hamming.pyi @@ -1,42 +1,49 @@ -from typing import Callable, Hashable, Optional, Sequence, TypeVar +# SPDX-License-Identifier: MIT +# Copyright (C) 2022 Max Bachmann -from rapidfuzz.distance import Editops, Opcodes + +from __future__ import annotations + +from typing import Callable, Hashable, Sequence -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") +from rapidfuzz.distance import Editops, Opcodes def distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - 
processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, ) -> int: ... def normalized_distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... def similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, ) -> int: ... def normalized_similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... def editops( - s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, ) -> Editops: ... def opcodes( - s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, ) -> Opcodes: ... diff --git a/src/rapidfuzz/distance/Hamming_py.py b/src/rapidfuzz/distance/Hamming_py.py index 88417f66..1aee8c68 100644 --- a/src/rapidfuzz/distance/Hamming_py.py +++ b/src/rapidfuzz/distance/Hamming_py.py @@ -1,8 +1,20 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations -def distance(s1, s2, *, processor=None, score_cutoff=None): +from typing import Callable, Hashable, Sequence + +from rapidfuzz.distance import Editops, Opcodes + + +def distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, +) -> int: """ Calculates the Hamming distance between two strings. The hamming distance is defined as the number of positions @@ -48,7 +60,13 @@ def distance(s1, s2, *, processor=None, score_cutoff=None): return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1 -def similarity(s1, s2, *, processor=None, score_cutoff=None): +def similarity( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, +) -> int: """ Calculates the Hamming similarity between two strings. @@ -90,7 +108,13 @@ def similarity(s1, s2, *, processor=None, score_cutoff=None): return sim if (score_cutoff is None or sim >= score_cutoff) else 0 -def normalized_distance(s1, s2, *, processor=None, score_cutoff=None): +def normalized_distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates a normalized levenshtein similarity in the range [1, 0]. 
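For intuition, the Hamming scores defined in this file compare the two inputs position by position, so the normalized variants simply divide the raw count by the common length. A short sketch, assuming equal-length inputs (this module rejects sequences of different length):

    from rapidfuzz.distance import Hamming

    # "karolin" and "kathrin" differ at 3 of their 7 positions
    assert Hamming.distance("karolin", "kathrin") == 3
    assert Hamming.normalized_distance("karolin", "kathrin") == 3 / 7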
@@ -126,7 +150,13 @@ def normalized_distance(s1, s2, *, processor=None, score_cutoff=None): return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0 -def normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): +def normalized_similarity( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates a normalized hamming similarity in the range [0, 1]. @@ -157,7 +187,12 @@ def normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0 -def editops(s1, s2, *, processor=None): +def editops( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, +) -> Editops: """ Return Editops describing how to turn s1 into s2. @@ -179,7 +214,12 @@ def editops(s1, s2, *, processor=None): raise NotImplementedError -def opcodes(s1, s2, *, processor=None): +def opcodes( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, +) -> Opcodes: """ Return Opcodes describing how to turn s1 into s2. diff --git a/src/rapidfuzz/distance/Hamming_py.pyi b/src/rapidfuzz/distance/Hamming_py.pyi deleted file mode 100644 index e64381bd..00000000 --- a/src/rapidfuzz/distance/Hamming_py.pyi +++ /dev/null @@ -1,54 +0,0 @@ -from typing import Any, Callable, Dict, Hashable, Optional, Sequence, TypeVar - -from typing_extensions import Protocol - -from rapidfuzz.distance import Editops, Opcodes - -class _ScorerAttributes(Protocol): - _RF_ScorerPy: Dict - -def _attr_decorator(func: Any) -> _ScorerAttributes: - return func - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -@_attr_decorator -def distance( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> int: ... -@_attr_decorator -def normalized_distance( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def similarity( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> int: ... -@_attr_decorator -def normalized_similarity( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -def editops( - s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None -) -> Editops: ... -def opcodes( - s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None -) -> Opcodes: ... 
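The Indel module that follows derives everything from the longest common subsequence: as its `_block_distance` helper shows, the distance is `maximum - 2 * lcs_sim` with `maximum = len(s1) + len(s2)`, i.e. every character outside the LCS must be deleted from one side or inserted into the other. As a sketch:

    from rapidfuzz.distance import Indel, LCSseq

    s1, s2 = "lewenstein", "levenshtein"
    lcs = LCSseq.similarity(s1, s2)  # length of the longest common subsequence
    assert Indel.distance(s1, s2) == len(s1) + len(s2) - 2 * lcs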
diff --git a/src/rapidfuzz/distance/Indel.py b/src/rapidfuzz/distance/Indel.py index 1527eac6..c91d48ef 100644 --- a/src/rapidfuzz/distance/Indel.py +++ b/src/rapidfuzz/distance/Indel.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + from rapidfuzz._utils import default_distance_attribute as _dist_attr from rapidfuzz._utils import default_normalized_distance_attribute as _norm_dist_attr from rapidfuzz._utils import default_normalized_similarity_attribute as _norm_sim_attr @@ -8,14 +10,13 @@ from rapidfuzz._utils import fallback_import as _fallback_import _mod = "rapidfuzz.distance.Indel" -distance = _fallback_import(_mod, "distance") -similarity = _fallback_import(_mod, "similarity") -normalized_distance = _fallback_import(_mod, "normalized_distance") -normalized_similarity = _fallback_import(_mod, "normalized_similarity") +distance = _fallback_import(_mod, "distance", cached_scorer_call=_dist_attr) +similarity = _fallback_import(_mod, "similarity", cached_scorer_call=_sim_attr) +normalized_distance = _fallback_import( + _mod, "normalized_distance", cached_scorer_call=_norm_dist_attr +) +normalized_similarity = _fallback_import( + _mod, "normalized_similarity", cached_scorer_call=_norm_sim_attr +) editops = _fallback_import(_mod, "editops") opcodes = _fallback_import(_mod, "opcodes") - -distance._RF_ScorerPy = _dist_attr -similarity._RF_ScorerPy = _sim_attr -normalized_distance._RF_ScorerPy = _norm_dist_attr -normalized_similarity._RF_ScorerPy = _norm_sim_attr diff --git a/src/rapidfuzz/distance/Indel.pyi b/src/rapidfuzz/distance/Indel.pyi index 2ad1f0ee..95860df1 100644 --- a/src/rapidfuzz/distance/Indel.pyi +++ b/src/rapidfuzz/distance/Indel.pyi @@ -1,42 +1,49 @@ -from typing import Callable, Hashable, Optional, Sequence, TypeVar +# SPDX-License-Identifier: MIT +# Copyright (C) 2022 Max Bachmann -from rapidfuzz.distance import Editops, Opcodes +from __future__ import annotations + +from typing import Callable, Hashable, Sequence -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") +from rapidfuzz.distance import Editops, Opcodes def distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, ) -> int: ... def normalized_distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... def similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, ) -> int: ... def normalized_similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... 
def editops( - s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, ) -> Editops: ... def opcodes( - s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, ) -> Opcodes: ... diff --git a/src/rapidfuzz/distance/Indel_py.py b/src/rapidfuzz/distance/Indel_py.py index 9eba4e23..6042a1b2 100644 --- a/src/rapidfuzz/distance/Indel_py.py +++ b/src/rapidfuzz/distance/Indel_py.py @@ -1,11 +1,23 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + +from typing import Callable, Hashable, Sequence + +from rapidfuzz.distance import Editops, Opcodes + from .LCSseq_py import _block_similarity as lcs_seq_block_similarity from .LCSseq_py import similarity as lcs_seq_similarity -def distance(s1, s2, *, processor=None, score_cutoff=None): +def distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, +) -> int: """ Calculates the minimum number of insertions and deletions required to change one sequence into the other. This is equivalent to the @@ -56,14 +68,25 @@ def distance(s1, s2, *, processor=None, score_cutoff=None): return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1 -def _block_distance(block, s1, s2, score_cutoff=None): +def _block_distance( + block: dict[Hashable, int], + s1: Sequence[Hashable], + s2: Sequence[Hashable], + score_cutoff: int | None = None, +) -> int: maximum = len(s1) + len(s2) lcs_sim = lcs_seq_block_similarity(block, s1, s2) dist = maximum - 2 * lcs_sim return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1 -def similarity(s1, s2, *, processor=None, score_cutoff=None): +def similarity( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, +) -> int: """ Calculates the Indel similarity in the range [max, 0]. @@ -99,7 +122,13 @@ def similarity(s1, s2, *, processor=None, score_cutoff=None): return sim if (score_cutoff is None or sim >= score_cutoff) else 0 -def normalized_distance(s1, s2, *, processor=None, score_cutoff=None): +def normalized_distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates a normalized levenshtein similarity in the range [1, 0]. 
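The normalized Indel similarity defined in the hunks that follow is the quantity `fuzz.ratio` reports, scaled to 0-100. A small consistency check, assuming that documented relationship between the two APIs:

    from rapidfuzz import fuzz
    from rapidfuzz.distance import Indel

    a, b = "this is a test", "this is a test!"
    assert fuzz.ratio(a, b) == 100 * Indel.normalized_similarity(a, b)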
@@ -134,14 +163,25 @@ def normalized_distance(s1, s2, *, processor=None, score_cutoff=None): return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1 -def _block_normalized_distance(block, s1, s2, score_cutoff=None): +def _block_normalized_distance( + block: dict[Hashable, int], + s1: Sequence[Hashable], + s2: Sequence[Hashable], + score_cutoff: float | None = None, +) -> float: maximum = len(s1) + len(s2) dist = _block_distance(block, s1, s2) norm_dist = dist / maximum if maximum else 0 return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1 -def normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): +def normalized_similarity( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates a normalized indel similarity in the range [0, 1]. @@ -194,13 +234,23 @@ def normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0 -def _block_normalized_similarity(block, s1, s2, score_cutoff=None): +def _block_normalized_similarity( + block: dict[Hashable, int], + s1: Sequence[Hashable], + s2: Sequence[Hashable], + score_cutoff: float | None = None, +) -> float: norm_dist = _block_normalized_distance(block, s1, s2) norm_sim = 1.0 - norm_dist return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0 -def editops(s1, s2, *, processor=None): +def editops( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, +) -> Editops: """ Return Editops describing how to turn s1 into s2. @@ -242,7 +292,12 @@ def editops(s1, s2, *, processor=None): raise NotImplementedError -def opcodes(s1, s2, *, processor=None): +def opcodes( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, +) -> Opcodes: """ Return Opcodes describing how to turn s1 into s2. diff --git a/src/rapidfuzz/distance/Indel_py.pyi b/src/rapidfuzz/distance/Indel_py.pyi deleted file mode 100644 index e64381bd..00000000 --- a/src/rapidfuzz/distance/Indel_py.pyi +++ /dev/null @@ -1,54 +0,0 @@ -from typing import Any, Callable, Dict, Hashable, Optional, Sequence, TypeVar - -from typing_extensions import Protocol - -from rapidfuzz.distance import Editops, Opcodes - -class _ScorerAttributes(Protocol): - _RF_ScorerPy: Dict - -def _attr_decorator(func: Any) -> _ScorerAttributes: - return func - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -@_attr_decorator -def distance( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> int: ... -@_attr_decorator -def normalized_distance( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def similarity( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> int: ... -@_attr_decorator -def normalized_similarity( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -def editops( - s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None -) -> Editops: ... 
-def opcodes( - s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None -) -> Opcodes: ... diff --git a/src/rapidfuzz/distance/Jaro.py b/src/rapidfuzz/distance/Jaro.py index 9517e3eb..7ea28c7e 100644 --- a/src/rapidfuzz/distance/Jaro.py +++ b/src/rapidfuzz/distance/Jaro.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + from rapidfuzz._utils import default_distance_attribute as _dist_attr from rapidfuzz._utils import default_normalized_distance_attribute as _norm_dist_attr from rapidfuzz._utils import default_normalized_similarity_attribute as _norm_sim_attr @@ -8,12 +10,11 @@ from rapidfuzz._utils import fallback_import as _fallback_import _mod = "rapidfuzz.distance.Jaro" -distance = _fallback_import(_mod, "distance") -similarity = _fallback_import(_mod, "similarity") -normalized_distance = _fallback_import(_mod, "normalized_distance") -normalized_similarity = _fallback_import(_mod, "normalized_similarity") - -distance._RF_ScorerPy = _dist_attr -similarity._RF_ScorerPy = _sim_attr -normalized_distance._RF_ScorerPy = _norm_dist_attr -normalized_similarity._RF_ScorerPy = _norm_sim_attr +distance = _fallback_import(_mod, "distance", cached_scorer_call=_dist_attr) +similarity = _fallback_import(_mod, "similarity", cached_scorer_call=_sim_attr) +normalized_distance = _fallback_import( + _mod, "normalized_distance", cached_scorer_call=_norm_dist_attr +) +normalized_similarity = _fallback_import( + _mod, "normalized_similarity", cached_scorer_call=_norm_sim_attr +) diff --git a/src/rapidfuzz/distance/Jaro.pyi b/src/rapidfuzz/distance/Jaro.pyi index b4243b25..3ddbefaf 100644 --- a/src/rapidfuzz/distance/Jaro.pyi +++ b/src/rapidfuzz/distance/Jaro.pyi @@ -1,46 +1,35 @@ -from typing import Any, Callable, Dict, Hashable, Optional, Sequence, TypeVar +# SPDX-License-Identifier: MIT +# Copyright (C) 2022 Max Bachmann -from typing_extensions import Protocol +from __future__ import annotations -class _ScorerAttributes(Protocol): - _RF_ScorerPy: Dict +from typing import Callable, Hashable, Sequence -def _attr_decorator(func: Any) -> _ScorerAttributes: - return func - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -@_attr_decorator def distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, ) -> float: ... -@_attr_decorator def normalized_distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... -@_attr_decorator def similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, ) -> float: ... -@_attr_decorator def normalized_similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... 
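The Jaro module above uses the standard definition sim = (m/|s1| + m/|s2| + (m - t)/m) / 3, where m counts characters that match within the sliding window and t is half the number of out-of-order matches; the renamed pure-Python implementation appears further down in this patch. The textbook example as a sanity check:

    from rapidfuzz.distance import Jaro

    # "martha" / "marhta": m = 6 matches, t = 1 transposition
    # sim = (6/6 + 6/6 + 5/6) / 3 = 17/18
    assert abs(Jaro.similarity("martha", "marhta") - 17 / 18) < 1e-12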
diff --git a/src/rapidfuzz/distance/JaroWinkler.py b/src/rapidfuzz/distance/JaroWinkler.py index d1ce7413..be8bf6d4 100644 --- a/src/rapidfuzz/distance/JaroWinkler.py +++ b/src/rapidfuzz/distance/JaroWinkler.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + from rapidfuzz._utils import default_distance_attribute as _dist_attr from rapidfuzz._utils import default_normalized_distance_attribute as _norm_dist_attr from rapidfuzz._utils import default_normalized_similarity_attribute as _norm_sim_attr @@ -8,12 +10,11 @@ from rapidfuzz._utils import fallback_import as _fallback_import _mod = "rapidfuzz.distance.JaroWinkler" -distance = _fallback_import(_mod, "distance") -similarity = _fallback_import(_mod, "similarity") -normalized_distance = _fallback_import(_mod, "normalized_distance") -normalized_similarity = _fallback_import(_mod, "normalized_similarity") - -distance._RF_ScorerPy = _dist_attr -similarity._RF_ScorerPy = _sim_attr -normalized_distance._RF_ScorerPy = _norm_dist_attr -normalized_similarity._RF_ScorerPy = _norm_sim_attr +distance = _fallback_import(_mod, "distance", cached_scorer_call=_dist_attr) +similarity = _fallback_import(_mod, "similarity", cached_scorer_call=_sim_attr) +normalized_distance = _fallback_import( + _mod, "normalized_distance", cached_scorer_call=_norm_dist_attr +) +normalized_similarity = _fallback_import( + _mod, "normalized_similarity", cached_scorer_call=_norm_sim_attr +) diff --git a/src/rapidfuzz/distance/JaroWinkler.pyi b/src/rapidfuzz/distance/JaroWinkler.pyi index b09539be..9128519c 100644 --- a/src/rapidfuzz/distance/JaroWinkler.pyi +++ b/src/rapidfuzz/distance/JaroWinkler.pyi @@ -1,50 +1,39 @@ -from typing import Any, Callable, Dict, Hashable, Optional, Sequence, TypeVar +# SPDX-License-Identifier: MIT +# Copyright (C) 2022 Max Bachmann -from typing_extensions import Protocol +from __future__ import annotations -class _ScorerAttributes(Protocol): - _RF_ScorerPy: Dict +from typing import Callable, Hashable, Sequence -def _attr_decorator(func: Any) -> _ScorerAttributes: - return func - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -@_attr_decorator def distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, prefix_weight: float = 0.1, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, ) -> float: ... -@_attr_decorator def normalized_distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, prefix_weight: float = 0.1, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... -@_attr_decorator def similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, prefix_weight: float = 0.1, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, ) -> float: ... 
-@_attr_decorator def normalized_similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, prefix_weight: float = 0.1, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... diff --git a/src/rapidfuzz/distance/JaroWinkler_py.py b/src/rapidfuzz/distance/JaroWinkler_py.py index e7897487..e9eef340 100644 --- a/src/rapidfuzz/distance/JaroWinkler_py.py +++ b/src/rapidfuzz/distance/JaroWinkler_py.py @@ -1,11 +1,20 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + +from typing import Callable, Hashable, Sequence + from rapidfuzz.distance import Jaro def similarity( - s1, s2, *, prefix_weight=0.1, processor=None, score_cutoff=None + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + prefix_weight: float = 0.1, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, ) -> float: """ Calculates the jaro winkler similarity @@ -37,9 +46,6 @@ def similarity( ValueError If prefix_weight is invalid """ - if s1 is None or s2 is None: - return 0 - if processor is not None: s1 = processor(s1) s2 = processor(s2) @@ -77,7 +83,12 @@ def similarity( def normalized_similarity( - s1, s2, *, prefix_weight=0.1, processor=None, score_cutoff=None + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + prefix_weight: float = 0.1, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, ) -> float: """ Calculates the normalized jaro winkler similarity @@ -118,7 +129,14 @@ def normalized_similarity( ) -def distance(s1, s2, *, processor=None, prefix_weight=0.1, score_cutoff=None) -> float: +def distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + prefix_weight: float = 0.1, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates the jaro winkler distance @@ -162,7 +180,12 @@ def distance(s1, s2, *, processor=None, prefix_weight=0.1, score_cutoff=None) -> def normalized_distance( - s1, s2, *, prefix_weight=0.1, processor=None, score_cutoff=None + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + prefix_weight: float = 0.1, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, ) -> float: """ Calculates the normalized jaro winkler distance diff --git a/src/rapidfuzz/distance/JaroWinkler_py.pyi b/src/rapidfuzz/distance/JaroWinkler_py.pyi deleted file mode 100644 index b09539be..00000000 --- a/src/rapidfuzz/distance/JaroWinkler_py.pyi +++ /dev/null @@ -1,50 +0,0 @@ -from typing import Any, Callable, Dict, Hashable, Optional, Sequence, TypeVar - -from typing_extensions import Protocol - -class _ScorerAttributes(Protocol): - _RF_ScorerPy: Dict - -def _attr_decorator(func: Any) -> _ScorerAttributes: - return func - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -@_attr_decorator -def distance( - s1: _S1, - s2: _S2, - *, - prefix_weight: float = 0.1, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> float: ... -@_attr_decorator -def normalized_distance( - s1: _S1, - s2: _S2, - *, - prefix_weight: float = 0.1, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... 
-@_attr_decorator -def similarity( - s1: _S1, - s2: _S2, - *, - prefix_weight: float = 0.1, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> float: ... -@_attr_decorator -def normalized_similarity( - s1: _S1, - s2: _S2, - *, - prefix_weight: float = 0.1, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... diff --git a/src/rapidfuzz/distance/Jaro_py.py b/src/rapidfuzz/distance/Jaro_py.py index 337b426b..780aebfd 100644 --- a/src/rapidfuzz/distance/Jaro_py.py +++ b/src/rapidfuzz/distance/Jaro_py.py @@ -1,64 +1,78 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + +from typing import Callable, Hashable, Sequence + def _jaro_calculate_similarity( - P_len: int, T_len: int, CommonChars: int, Transpositions: int + pattern_len: int, text_len: int, common_chars: int, transpositions: int ) -> float: - Transpositions //= 2 - Sim = 0.0 - Sim += CommonChars / P_len - Sim += CommonChars / T_len - Sim += (CommonChars - Transpositions) / CommonChars - return Sim / 3.0 + transpositions //= 2 + sim = 0.0 + sim += common_chars / pattern_len + sim += common_chars / text_len + sim += (common_chars - transpositions) / common_chars + return sim / 3.0 -def _jaro_length_filter(P_len: int, T_len: int, score_cutoff: float) -> bool: +def _jaro_length_filter(pattern_len: int, text_len: int, score_cutoff: float) -> bool: """ filter matches below score_cutoff based on string lengths """ - if not P_len or not T_len: + if not pattern_len or not text_len: return False - sim = _jaro_calculate_similarity(P_len, T_len, min(P_len, T_len), 0) + sim = _jaro_calculate_similarity( + pattern_len, text_len, min(pattern_len, text_len), 0 + ) return sim >= score_cutoff def _jaro_common_char_filter( - P_len: int, T_len: int, CommonChars: int, score_cutoff: float + pattern_len: int, text_len: int, common_chars: int, score_cutoff: float ) -> bool: """ filter matches below score_cutoff based on string lengths and common characters """ - if not CommonChars: + if not common_chars: return False - sim = _jaro_calculate_similarity(P_len, T_len, CommonChars, 0) + sim = _jaro_calculate_similarity(pattern_len, text_len, common_chars, 0) return sim >= score_cutoff -def _jaro_bounds(s1, s2): +def _jaro_bounds( + s1: Sequence[Hashable], s2: Sequence[Hashable] +) -> tuple[Sequence[Hashable], Sequence[Hashable], int]: """ find bounds and skip out of bound parts of the sequences """ - P_len = len(s1) - T_len = len(s2) + pattern_len = len(s1) + text_len = len(s2) # since jaro uses a sliding window some parts of T/P might never be in # range an can be removed ahead of time - Bound = 0 - if T_len > P_len: - Bound = T_len // 2 - 1 - if T_len > P_len + Bound: - s2 = s2[: P_len + Bound] + bound = 0 + if text_len > pattern_len: + bound = text_len // 2 - 1 + if text_len > pattern_len + bound: + s2 = s2[: pattern_len + bound] else: - Bound = P_len // 2 - 1 - if P_len > T_len + Bound: - s1 = s1[: T_len + Bound] - return s1, s2, Bound - - -def similarity(s1, s2, *, processor=None, score_cutoff=None) -> float: + bound = pattern_len // 2 - 1 + if pattern_len > text_len + bound: + s1 = s1[: text_len + bound] + return s1, s2, bound + + +def similarity( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates the jaro similarity @@ -81,9 +95,6 @@ def similarity(s1, s2, *, 
processor=None, score_cutoff=None) -> float: similarity : float similarity between s1 and s2 as a float between 0 and 1.0 """ - if s1 is None or s2 is None: - return 0 - if processor is not None: s1 = processor(s1) s2 = processor(s2) @@ -91,35 +102,35 @@ def similarity(s1, s2, *, processor=None, score_cutoff=None) -> float: if score_cutoff is None: score_cutoff = 0 - P_len = len(s1) - T_len = len(s2) + pattern_len = len(s1) + text_len = len(s2) # short circuit if score_cutoff can not be reached - if not _jaro_length_filter(P_len, T_len, score_cutoff): + if not _jaro_length_filter(pattern_len, text_len, score_cutoff): return 0 - if P_len == 1 and T_len == 1: + if pattern_len == 1 and text_len == 1: return float(s1[0] == s2[0]) - s1, s2, Bound = _jaro_bounds(s1, s2) + s1, s2, bound = _jaro_bounds(s1, s2) - s1_flags = [False] * P_len - s2_flags = [False] * T_len + s1_flags = [False] * pattern_len + s2_flags = [False] * text_len # todo use bitparallel implementation # looking only within search range, count & flag matched pairs - CommonChars = 0 + common_chars = 0 for i, s1_ch in enumerate(s1): - low = max(0, i - Bound) - hi = min(i + Bound, T_len - 1) + low = max(0, i - bound) + hi = min(i + bound, text_len - 1) for j in range(low, hi + 1): if not s2_flags[j] and s2[j] == s1_ch: s1_flags[i] = s2_flags[j] = True - CommonChars += 1 + common_chars += 1 break # short circuit if score_cutoff can not be reached - if not _jaro_common_char_filter(P_len, T_len, CommonChars, score_cutoff): + if not _jaro_common_char_filter(pattern_len, text_len, common_chars, score_cutoff): return 0 # todo use bitparallel implementation @@ -127,17 +138,23 @@ def similarity(s1, s2, *, processor=None, score_cutoff=None) -> float: k = trans_count = 0 for i, s1_f in enumerate(s1_flags): if s1_f: - for j in range(k, T_len): + for j in range(k, text_len): if s2_flags[j]: k = j + 1 break if s1[i] != s2[j]: trans_count += 1 - return _jaro_calculate_similarity(P_len, T_len, CommonChars, trans_count) + return _jaro_calculate_similarity(pattern_len, text_len, common_chars, trans_count) -def normalized_similarity(s1, s2, *, processor=None, score_cutoff=None) -> float: +def normalized_similarity( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates the normalized jaro similarity @@ -163,7 +180,13 @@ def normalized_similarity(s1, s2, *, processor=None, score_cutoff=None) -> float return similarity(s1, s2, processor=processor, score_cutoff=score_cutoff) -def distance(s1, s2, *, processor=None, score_cutoff=None) -> float: +def distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates the jaro distance @@ -198,7 +221,13 @@ def distance(s1, s2, *, processor=None, score_cutoff=None) -> float: return dist if (score_cutoff is None or dist <= score_cutoff) else 1.0 -def normalized_distance(s1, s2, *, processor=None, score_cutoff=None) -> float: +def normalized_distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates the normalized jaro distance diff --git a/src/rapidfuzz/distance/Jaro_py.pyi b/src/rapidfuzz/distance/Jaro_py.pyi deleted file mode 100644 index b4243b25..00000000 --- a/src/rapidfuzz/distance/Jaro_py.pyi +++ /dev/null @@ -1,46 
+0,0 @@ -from typing import Any, Callable, Dict, Hashable, Optional, Sequence, TypeVar - -from typing_extensions import Protocol - -class _ScorerAttributes(Protocol): - _RF_ScorerPy: Dict - -def _attr_decorator(func: Any) -> _ScorerAttributes: - return func - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -@_attr_decorator -def distance( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> float: ... -@_attr_decorator -def normalized_distance( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def similarity( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> float: ... -@_attr_decorator -def normalized_similarity( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... diff --git a/src/rapidfuzz/distance/LCSseq.py b/src/rapidfuzz/distance/LCSseq.py index c2744528..b535cbab 100644 --- a/src/rapidfuzz/distance/LCSseq.py +++ b/src/rapidfuzz/distance/LCSseq.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + from rapidfuzz._utils import default_distance_attribute as _dist_attr from rapidfuzz._utils import default_normalized_distance_attribute as _norm_dist_attr from rapidfuzz._utils import default_normalized_similarity_attribute as _norm_sim_attr @@ -8,14 +10,13 @@ from rapidfuzz._utils import fallback_import as _fallback_import _mod = "rapidfuzz.distance.LCSseq" -distance = _fallback_import(_mod, "distance") -similarity = _fallback_import(_mod, "similarity") -normalized_distance = _fallback_import(_mod, "normalized_distance") -normalized_similarity = _fallback_import(_mod, "normalized_similarity") +distance = _fallback_import(_mod, "distance", cached_scorer_call=_dist_attr) +similarity = _fallback_import(_mod, "similarity", cached_scorer_call=_sim_attr) +normalized_distance = _fallback_import( + _mod, "normalized_distance", cached_scorer_call=_norm_dist_attr +) +normalized_similarity = _fallback_import( + _mod, "normalized_similarity", cached_scorer_call=_norm_sim_attr +) editops = _fallback_import(_mod, "editops") opcodes = _fallback_import(_mod, "opcodes") - -distance._RF_ScorerPy = _dist_attr -similarity._RF_ScorerPy = _sim_attr -normalized_distance._RF_ScorerPy = _norm_dist_attr -normalized_similarity._RF_ScorerPy = _norm_sim_attr diff --git a/src/rapidfuzz/distance/LCSseq.pyi b/src/rapidfuzz/distance/LCSseq.pyi index 2ad1f0ee..95860df1 100644 --- a/src/rapidfuzz/distance/LCSseq.pyi +++ b/src/rapidfuzz/distance/LCSseq.pyi @@ -1,42 +1,49 @@ -from typing import Callable, Hashable, Optional, Sequence, TypeVar +# SPDX-License-Identifier: MIT +# Copyright (C) 2022 Max Bachmann -from rapidfuzz.distance import Editops, Opcodes +from __future__ import annotations + +from typing import Callable, Hashable, Sequence -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") +from rapidfuzz.distance import Editops, Opcodes def distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, ) -> int: ... 
def normalized_distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... def similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, ) -> int: ... def normalized_similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... def editops( - s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, ) -> Editops: ... def opcodes( - s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, ) -> Opcodes: ... diff --git a/src/rapidfuzz/distance/LCSseq_py.py b/src/rapidfuzz/distance/LCSseq_py.py index f766ebb6..a2618f64 100644 --- a/src/rapidfuzz/distance/LCSseq_py.py +++ b/src/rapidfuzz/distance/LCSseq_py.py @@ -1,8 +1,20 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations -def similarity(s1, s2, *, processor=None, score_cutoff=None): +from typing import Callable, Hashable, Sequence + +from rapidfuzz.distance import Editops, Opcodes + + +def similarity( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, +) -> int: """ Calculates the length of the longest common subsequence @@ -34,7 +46,7 @@ def similarity(s1, s2, *, processor=None, score_cutoff=None): return 0 S = (1 << len(s1)) - 1 - block = {} + block: dict[Hashable, int] = {} block_get = block.get x = 1 for ch1 in s1: @@ -51,7 +63,12 @@ def similarity(s1, s2, *, processor=None, score_cutoff=None): return res if (score_cutoff is None or res >= score_cutoff) else 0 -def _block_similarity(block, s1, s2, score_cutoff=None): +def _block_similarity( + block: dict[Hashable, int], + s1: Sequence[Hashable], + s2: Sequence[Hashable], + score_cutoff: int | None = None, +) -> int: if not s1: return 0 @@ -68,7 +85,13 @@ def _block_similarity(block, s1, s2, score_cutoff=None): return res if (score_cutoff is None or res >= score_cutoff) else 0 -def distance(s1, s2, *, processor=None, score_cutoff=None): +def distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, +) -> int: """ Calculates the LCS distance in the range [0, max]. 
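The loop that consumes `block` in LCSseq similarity() sits outside the hunk above; the update it is assumed to perform is Hyyrö's bit-parallel LCS recurrence, S = (S + u) | (S - u) with u = S & matches. A cross-check against the naive dynamic program, with function names chosen here for illustration:

    def lcs_length_dp(s1, s2):
        # classic O(len1 * len2) dynamic-programming table, one row at a time
        prev = [0] * (len(s2) + 1)
        for ch1 in s1:
            cur = [0] * (len(s2) + 1)
            for j, ch2 in enumerate(s2):
                cur[j + 1] = prev[j] + 1 if ch1 == ch2 else max(prev[j + 1], cur[j])
            prev = cur
        return prev[-1]

    def lcs_length_bitparallel(s1, s2):
        S = (1 << len(s1)) - 1
        block = {}
        x = 1
        for ch1 in s1:  # same character-mask preprocessing as above
            block[ch1] = block.get(ch1, 0) | x
            x <<= 1
        for ch2 in s2:
            u = S & block.get(ch2, 0)
            S = (S + u) | (S - u)  # Hyyrö's LCS recurrence
        # the LCS length equals the number of zero bits in the len(s1) window
        return len(s1) - bin(S & ((1 << len(s1)) - 1)).count("1")

    for a, b in [("AGCAT", "GAC"), ("pattern", "text"), ("", "abc")]:
        assert lcs_length_dp(a, b) == lcs_length_bitparallel(a, b)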
@@ -119,7 +142,13 @@ def distance(s1, s2, *, processor=None, score_cutoff=None): return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1 -def normalized_distance(s1, s2, *, processor=None, score_cutoff=None): +def normalized_distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates a normalized LCS similarity in the range [1, 0]. @@ -156,7 +185,13 @@ def normalized_distance(s1, s2, *, processor=None, score_cutoff=None): return norm_sim if (score_cutoff is None or norm_sim <= score_cutoff) else 1 -def normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): +def normalized_similarity( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates a normalized LCS similarity in the range [0, 1]. @@ -208,7 +243,12 @@ def normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0 -def editops(s1, s2, *, processor=None): +def editops( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, +) -> Editops: """ Return Editops describing how to turn s1 into s2. @@ -250,7 +290,12 @@ def editops(s1, s2, *, processor=None): raise NotImplementedError -def opcodes(s1, s2, *, processor=None): +def opcodes( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, +) -> Opcodes: """ Return Opcodes describing how to turn s1 into s2. diff --git a/src/rapidfuzz/distance/LCSseq_py.pyi b/src/rapidfuzz/distance/LCSseq_py.pyi deleted file mode 100644 index 79630bef..00000000 --- a/src/rapidfuzz/distance/LCSseq_py.pyi +++ /dev/null @@ -1,54 +0,0 @@ -from typing import Any, Callable, Dict, Hashable, Optional, Sequence, TypeVar - -from typing_extensions import Protocol - -from rapidfuzz.distance import Editops, Opcodes - -class _ScorerAttributes(Protocol): - _RF_ScorerPy7: Dict - -def _attr_decorator(func: Any) -> _ScorerAttributes: - return func - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -@_attr_decorator -def distance( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> int: ... -@_attr_decorator -def normalized_distance( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def similarity( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> int: ... -@_attr_decorator -def normalized_similarity( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -def editops( - s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None -) -> Editops: ... -def opcodes( - s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None -) -> Opcodes: ... diff --git a/src/rapidfuzz/distance/Levenshtein.py b/src/rapidfuzz/distance/Levenshtein.py index 4429e217..3c83f5e8 100644 --- a/src/rapidfuzz/distance/Levenshtein.py +++ b/src/rapidfuzz/distance/Levenshtein.py @@ -7,6 +7,8 @@ substitutions required to transform s1 into s2. 
""" +from __future__ import annotations + from rapidfuzz._utils import default_distance_attribute as _dist_attr from rapidfuzz._utils import default_normalized_distance_attribute as _norm_dist_attr from rapidfuzz._utils import default_normalized_similarity_attribute as _norm_sim_attr @@ -14,14 +16,13 @@ from rapidfuzz._utils import fallback_import as _fallback_import _mod = "rapidfuzz.distance.Levenshtein" -distance = _fallback_import(_mod, "distance") -similarity = _fallback_import(_mod, "similarity") -normalized_distance = _fallback_import(_mod, "normalized_distance") -normalized_similarity = _fallback_import(_mod, "normalized_similarity") +distance = _fallback_import(_mod, "distance", cached_scorer_call=_dist_attr) +similarity = _fallback_import(_mod, "similarity", cached_scorer_call=_sim_attr) +normalized_distance = _fallback_import( + _mod, "normalized_distance", cached_scorer_call=_norm_dist_attr +) +normalized_similarity = _fallback_import( + _mod, "normalized_similarity", cached_scorer_call=_norm_sim_attr +) editops = _fallback_import(_mod, "editops") opcodes = _fallback_import(_mod, "opcodes") - -distance._RF_ScorerPy = _dist_attr -similarity._RF_ScorerPy = _sim_attr -normalized_distance._RF_ScorerPy = _norm_dist_attr -normalized_similarity._RF_ScorerPy = _norm_sim_attr diff --git a/src/rapidfuzz/distance/Levenshtein.pyi b/src/rapidfuzz/distance/Levenshtein.pyi index bb8c15d1..7433b927 100644 --- a/src/rapidfuzz/distance/Levenshtein.pyi +++ b/src/rapidfuzz/distance/Levenshtein.pyi @@ -1,54 +1,61 @@ -from typing import Callable, Hashable, Optional, Sequence, Tuple, TypeVar +# SPDX-License-Identifier: MIT +# Copyright (C) 2022 Max Bachmann +""" +The Levenshtein (edit) distance is a string metric to measure the +difference between two strings/sequences s1 and s2. +It's defined as the minimum number of insertions, deletions or +substitutions required to transform s1 into s2. +""" -from rapidfuzz.distance import Editops, Opcodes +from __future__ import annotations + +from typing import Callable, Hashable, Sequence -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") +from rapidfuzz.distance import Editops, Opcodes def distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - weights: Optional[Tuple[int, int, int]] = (1, 1, 1), - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + weights: tuple[int, int, int] | None = (1, 1, 1), + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, ) -> int: ... def normalized_distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - weights: Optional[Tuple[int, int, int]] = (1, 1, 1), - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + weights: tuple[int, int, int] | None = (1, 1, 1), + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... def similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - weights: Optional[Tuple[int, int, int]] = (1, 1, 1), - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + weights: tuple[int, int, int] | None = (1, 1, 1), + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, ) -> int: ... 
def normalized_similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - weights: Optional[Tuple[int, int, int]] = (1, 1, 1), - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + weights: tuple[int, int, int] | None = (1, 1, 1), + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... def editops( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_hint: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_hint: int | None = None, ) -> Editops: ... def opcodes( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_hint: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_hint: int | None = None, ) -> Opcodes: ... diff --git a/src/rapidfuzz/distance/Levenshtein_py.py b/src/rapidfuzz/distance/Levenshtein_py.py index 51a27707..b23f58c2 100644 --- a/src/rapidfuzz/distance/Levenshtein_py.py +++ b/src/rapidfuzz/distance/Levenshtein_py.py @@ -1,10 +1,16 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann -from .Indel_py import distance as indel_distance +from __future__ import annotations +from typing import Callable, Hashable, Sequence -def _levenshtein_maximum(s1, s2, weights): +from rapidfuzz.distance import Editops, Indel, Opcodes + + +def _levenshtein_maximum( + s1: Sequence[Hashable], s2: Sequence[Hashable], weights: tuple[int, int, int] +) -> int: len1 = len(s1) len2 = len(s2) insert, delete, replace = weights @@ -19,7 +25,9 @@ def _levenshtein_maximum(s1, s2, weights): return max_dist -def _uniform_generic(s1, s2, weights): +def _uniform_generic( + s1: Sequence[Hashable], s2: Sequence[Hashable], weights: tuple[int, int, int] +) -> int: len1 = len(s1) insert, delete, replace = weights cache = list(range(0, (len1 + 1) * delete, delete)) @@ -37,7 +45,7 @@ def _uniform_generic(s1, s2, weights): return cache[-1] -def _uniform_distance(s1, s2): +def _uniform_distance(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int: if not s1: return len(s2) @@ -46,7 +54,7 @@ def _uniform_distance(s1, s2): currDist = len(s1) mask = 1 << (len(s1) - 1) - block = {} + block: dict[Hashable, int] = {} block_get = block.get x = 1 for ch1 in s1: @@ -73,7 +81,14 @@ def _uniform_distance(s1, s2): return currDist -def distance(s1, s2, *, weights=(1, 1, 1), processor=None, score_cutoff=None): +def distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + weights: tuple[int, int, int] | None = (1, 1, 1), + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, +) -> int: """ Calculates the minimum number of insertions, deletions, and substitutions required to change one sequence into the other according to Levenshtein with custom @@ -132,17 +147,24 @@ def distance(s1, s2, *, weights=(1, 1, 1), processor=None, score_cutoff=None): s1 = processor(s1) s2 = processor(s2) - if weights == (1, 1, 1): + if weights is None or weights == (1, 1, 1): dist = _uniform_distance(s1, s2) elif weights == (1, 1, 2): - dist = indel_distance(s1, s2) + dist = Indel.distance(s1, s2) else: dist = _uniform_generic(s1, s2, weights) return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1 -def similarity(s1, s2, *, weights=(1, 1, 1), processor=None, score_cutoff=None): 
+def similarity( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + weights: tuple[int, int, int] | None = (1, 1, 1), + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, +) -> int: """ Calculates the levenshtein similarity in the range [max, 0] using custom costs for insertion, deletion and substitution. @@ -183,6 +205,7 @@ def similarity(s1, s2, *, weights=(1, 1, 1), processor=None, score_cutoff=None): s1 = processor(s1) s2 = processor(s2) + weights = weights or (1, 1, 1) maximum = _levenshtein_maximum(s1, s2, weights) dist = distance(s1, s2, weights=weights) sim = maximum - dist @@ -190,8 +213,13 @@ def similarity(s1, s2, *, weights=(1, 1, 1), processor=None, score_cutoff=None): def normalized_distance( - s1, s2, *, weights=(1, 1, 1), processor=None, score_cutoff=None -): + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + weights: tuple[int, int, int] | None = (1, 1, 1), + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates a normalized levenshtein distance in the range [1, 0] using custom costs for insertion, deletion and substitution. @@ -231,6 +259,7 @@ def normalized_distance( s1 = processor(s1) s2 = processor(s2) + weights = weights or (1, 1, 1) maximum = _levenshtein_maximum(s1, s2, weights) dist = distance(s1, s2, weights=weights) norm_dist = dist / maximum if maximum else 0 @@ -238,8 +267,13 @@ def normalized_distance( def normalized_similarity( - s1, s2, *, weights=(1, 1, 1), processor=None, score_cutoff=None -): + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + weights: tuple[int, int, int] | None = (1, 1, 1), + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates a normalized levenshtein similarity in the range [0, 1] using custom costs for insertion, deletion and substitution. @@ -303,12 +337,19 @@ def normalized_similarity( s1 = processor(s1) s2 = processor(s2) + weights = weights or (1, 1, 1) norm_dist = normalized_distance(s1, s2, weights=weights) norm_sim = 1.0 - norm_dist return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0 -def editops(s1, s2, *, processor=None, score_hint=None): +def editops( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_hint: int | None = None, +) -> Editops: """ Return Editops describing how to turn s1 into s2. @@ -353,7 +394,13 @@ def editops(s1, s2, *, processor=None, score_hint=None): raise NotImplementedError -def opcodes(s1, s2, *, processor=None, score_hint=None): +def opcodes( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_hint: int | None = None, +) -> Opcodes: """ Return Opcodes describing how to turn s1 into s2. 
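The dispatch added to distance() above special-cases weights == (1, 1, 2): when a substitution costs two it is never cheaper than one deletion plus one insertion, so the weighted metric coincides with the Indel distance. A small consistency check (values verified by hand):

    from rapidfuzz.distance import Indel, Levenshtein

    a, b = "kitten", "sitting"
    print(Levenshtein.distance(a, b))                     # 3 with uniform costs
    print(Levenshtein.distance(a, b, weights=(1, 1, 2)))  # 5
    assert Levenshtein.distance(a, b, weights=(1, 1, 2)) == Indel.distance(a, b)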
diff --git a/src/rapidfuzz/distance/Levenshtein_py.pyi b/src/rapidfuzz/distance/Levenshtein_py.pyi deleted file mode 100644 index 36fc8895..00000000 --- a/src/rapidfuzz/distance/Levenshtein_py.pyi +++ /dev/null @@ -1,66 +0,0 @@ -from typing import Any, Callable, Dict, Hashable, Optional, Sequence, Tuple, TypeVar - -from typing_extensions import Protocol - -from rapidfuzz.distance import Editops, Opcodes - -class _ScorerAttributes(Protocol): - _RF_ScorerPy: Dict - -def _attr_decorator(func: Any) -> _ScorerAttributes: - return func - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -@_attr_decorator -def distance( - s1: _S1, - s2: _S2, - *, - weights: Optional[Tuple[int, int, int]] = (1, 1, 1), - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> int: ... -@_attr_decorator -def normalized_distance( - s1: _S1, - s2: _S2, - *, - weights: Optional[Tuple[int, int, int]] = (1, 1, 1), - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def similarity( - s1: _S1, - s2: _S2, - *, - weights: Optional[Tuple[int, int, int]] = (1, 1, 1), - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> int: ... -@_attr_decorator -def normalized_similarity( - s1: _S1, - s2: _S2, - *, - weights: Optional[Tuple[int, int, int]] = (1, 1, 1), - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -def editops( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_hint: Optional[int] = None -) -> Editops: ... -def opcodes( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_hint: Optional[int] = None -) -> Opcodes: ... 
diff --git a/src/rapidfuzz/distance/OSA.py b/src/rapidfuzz/distance/OSA.py index 9fe610dc..ee8e2441 100644 --- a/src/rapidfuzz/distance/OSA.py +++ b/src/rapidfuzz/distance/OSA.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + from rapidfuzz._utils import default_distance_attribute as _dist_attr from rapidfuzz._utils import default_normalized_distance_attribute as _norm_dist_attr from rapidfuzz._utils import default_normalized_similarity_attribute as _norm_sim_attr @@ -8,12 +10,11 @@ from rapidfuzz._utils import fallback_import as _fallback_import _mod = "rapidfuzz.distance.OSA" -distance = _fallback_import(_mod, "distance") -similarity = _fallback_import(_mod, "similarity") -normalized_distance = _fallback_import(_mod, "normalized_distance") -normalized_similarity = _fallback_import(_mod, "normalized_similarity") - -distance._RF_ScorerPy = _dist_attr -similarity._RF_ScorerPy = _sim_attr -normalized_distance._RF_ScorerPy = _norm_dist_attr -normalized_similarity._RF_ScorerPy = _norm_sim_attr +distance = _fallback_import(_mod, "distance", cached_scorer_call=_dist_attr) +similarity = _fallback_import(_mod, "similarity", cached_scorer_call=_sim_attr) +normalized_distance = _fallback_import( + _mod, "normalized_distance", cached_scorer_call=_norm_dist_attr +) +normalized_similarity = _fallback_import( + _mod, "normalized_similarity", cached_scorer_call=_norm_sim_attr +) diff --git a/src/rapidfuzz/distance/OSA.pyi b/src/rapidfuzz/distance/OSA.pyi index b27fcd9e..ab64c917 100644 --- a/src/rapidfuzz/distance/OSA.pyi +++ b/src/rapidfuzz/distance/OSA.pyi @@ -1,46 +1,35 @@ -from typing import Any, Callable, Dict, Hashable, Optional, Sequence, TypeVar +# SPDX-License-Identifier: MIT +# Copyright (C) 2022 Max Bachmann -from typing_extensions import Protocol +from __future__ import annotations -class _ScorerAttributes(Protocol): - _RF_ScorerPy: Dict +from typing import Callable, Hashable, Sequence -def _attr_decorator(func: Any) -> _ScorerAttributes: - return func - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -@_attr_decorator def distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, ) -> int: ... -@_attr_decorator def normalized_distance( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... -@_attr_decorator def similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, ) -> int: ... -@_attr_decorator def normalized_similarity( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... 
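The cached_scorer_call keyword replaces the `_RF_ScorerPy` assignments that each wrapper module previously repeated after its imports. rapidfuzz._utils is not shown in this excerpt, so the following is only a plausible sketch of the contract implied by the call sites, not the real helper:

    import importlib

    def fallback_import(module, name, cached_scorer_call=None, set_attrs=True):
        # Prefer the C++ backend, fall back to the pure-Python module; the
        # real helper also honors the RAPIDFUZZ_IMPLEMENTATION override.
        try:
            impl = getattr(importlib.import_module(module + "_cpp"), name)
        except ImportError:
            impl = getattr(importlib.import_module(module + "_py"), name)
        if set_attrs and cached_scorer_call is not None:
            # scorer metadata consumed by the rapidfuzz.process fast paths
            impl._RF_ScorerPy = cached_scorer_call
        return impl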
diff --git a/src/rapidfuzz/distance/OSA_cpp.py b/src/rapidfuzz/distance/OSA_cpp.py index 3043cad7..5a692e67 100644 --- a/src/rapidfuzz/distance/OSA_cpp.py +++ b/src/rapidfuzz/distance/OSA_cpp.py @@ -10,4 +10,4 @@ ) from rapidfuzz.distance.metrics_cpp import osa_similarity as similarity -__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"] +__all__ = ["distance", "similarity", "normalized_distance", "normalized_similarity"] diff --git a/src/rapidfuzz/distance/OSA_py.py b/src/rapidfuzz/distance/OSA_py.py index 91e6555d..18d9f9dd 100644 --- a/src/rapidfuzz/distance/OSA_py.py +++ b/src/rapidfuzz/distance/OSA_py.py @@ -1,8 +1,12 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations -def _osa_distance_hyrroe2003(s1, s2): +from typing import Callable, Hashable, Sequence + + +def _osa_distance_hyrroe2003(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int: if not s1: return len(s2) @@ -13,7 +17,7 @@ def _osa_distance_hyrroe2003(s1, s2): currDist = len(s1) mask = 1 << (len(s1) - 1) - block = {} + block: dict[Hashable, int] = {} block_get = block.get x = 1 for ch1 in s1: @@ -45,7 +49,13 @@ def _osa_distance_hyrroe2003(s1, s2): return currDist -def distance(s1, s2, *, processor=None, score_cutoff=None): +def distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, +) -> int: """ Calculates the optimal string alignment (OSA) distance. @@ -87,7 +97,13 @@ def distance(s1, s2, *, processor=None, score_cutoff=None): return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1 -def similarity(s1, s2, *, processor=None, score_cutoff=None): +def similarity( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | None = None, +) -> int: """ Calculates the optimal string alignment (OSA) similarity in the range [max, 0]. @@ -123,7 +139,13 @@ def similarity(s1, s2, *, processor=None, score_cutoff=None): return sim if (score_cutoff is None or sim >= score_cutoff) else 0 -def normalized_distance(s1, s2, *, processor=None, score_cutoff=None): +def normalized_distance( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates a normalized optimal string alignment (OSA) similarity in the range [1, 0]. @@ -158,7 +180,13 @@ def normalized_distance(s1, s2, *, processor=None, score_cutoff=None): return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1 -def normalized_similarity(s1, s2, *, processor=None, score_cutoff=None): +def normalized_similarity( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates a normalized optimal string alignment (OSA) similarity in the range [0, 1]. 
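Unlike the unrestricted Damerau-Levenshtein distance, the OSA metric implemented above may edit each substring at most once, so a transposed pair cannot be modified again afterwards. The classic pair separating the two metrics, with expected values as comments:

    from rapidfuzz.distance import OSA, DamerauLevenshtein

    print(OSA.distance("CA", "ABC"))                 # 3: no further edits after a swap
    print(DamerauLevenshtein.distance("CA", "ABC"))  # 2: swap to "AC", then insert "B"
    print(OSA.distance("CA", "AC"))                  # 1: a single adjacent transposition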
diff --git a/src/rapidfuzz/distance/OSA_py.pyi b/src/rapidfuzz/distance/OSA_py.pyi deleted file mode 100644 index b27fcd9e..00000000 --- a/src/rapidfuzz/distance/OSA_py.pyi +++ /dev/null @@ -1,46 +0,0 @@ -from typing import Any, Callable, Dict, Hashable, Optional, Sequence, TypeVar - -from typing_extensions import Protocol - -class _ScorerAttributes(Protocol): - _RF_ScorerPy: Dict - -def _attr_decorator(func: Any) -> _ScorerAttributes: - return func - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -@_attr_decorator -def distance( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> int: ... -@_attr_decorator -def normalized_distance( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def similarity( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None -) -> int: ... -@_attr_decorator -def normalized_similarity( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... diff --git a/src/rapidfuzz/distance/__init__.py b/src/rapidfuzz/distance/__init__.py index dc3d13ab..95d2f47a 100644 --- a/src/rapidfuzz/distance/__init__.py +++ b/src/rapidfuzz/distance/__init__.py @@ -1,18 +1,20 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann -from rapidfuzz._utils import fallback_import as _fallback_import +from __future__ import annotations -_mod = "rapidfuzz.distance._initialize" -Editop = _fallback_import(_mod, "Editop", False) -Editops = _fallback_import(_mod, "Editops", False) -Opcode = _fallback_import(_mod, "Opcode", False) -Opcodes = _fallback_import(_mod, "Opcodes", False) -ScoreAlignment = _fallback_import(_mod, "ScoreAlignment", False) -MatchingBlock = _fallback_import(_mod, "MatchingBlock", False) +from rapidfuzz._utils import fallback_import as _fallback_import from . import DamerauLevenshtein, Hamming, Indel, Jaro, JaroWinkler, LCSseq, Levenshtein +_mod = "rapidfuzz.distance._initialize" +Editop = _fallback_import(_mod, "Editop", set_attrs=False) +Editops = _fallback_import(_mod, "Editops", set_attrs=False) +Opcode = _fallback_import(_mod, "Opcode", set_attrs=False) +Opcodes = _fallback_import(_mod, "Opcodes", set_attrs=False) +ScoreAlignment = _fallback_import(_mod, "ScoreAlignment", set_attrs=False) +MatchingBlock = _fallback_import(_mod, "MatchingBlock", set_attrs=False) + __all__ = [ "Editop", "Editops", diff --git a/src/rapidfuzz/distance/__init__.pyi b/src/rapidfuzz/distance/__init__.pyi index fae164d1..d5c9f6a5 100644 --- a/src/rapidfuzz/distance/__init__.pyi +++ b/src/rapidfuzz/distance/__init__.pyi @@ -1,4 +1,9 @@ -from typing import List, Tuple, Union +# SPDX-License-Identifier: MIT +# Copyright (C) 2022 Max Bachmann + +from __future__ import annotations + +from typing import Iterator, List, Tuple, Union from . import DamerauLevenshtein as DamerauLevenshtein from . import Hamming as Hamming @@ -18,10 +23,11 @@ class MatchingBlock: b: int size: int - def __init__(self, a: int, b: int, size: int) -> None: ... + def __init__(self, a: int, b: int, size: int): ... def __len__(self) -> int: ... def __eq__(self, other: object) -> bool: ... - def __getitem__(self, i: int) -> Union[str, int]: ... + def __getitem__(self, i: int) -> int: ... + def __iter__(self) -> Iterator[int]: ... def __repr__(self) -> str: ... 
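The `__iter__` methods added to these stub classes (here for MatchingBlock, below for Editop, Opcode and ScoreAlignment) make the helper objects unpack like plain tuples, while `__eq__` keeps comparing element-wise. For example:

    from rapidfuzz.distance import MatchingBlock

    block = MatchingBlock(a=0, b=2, size=4)
    a, b, size = block           # unpacking now works via __iter__
    assert tuple(block) == (0, 2, 4)
    assert block == (0, 2, 4)    # element-wise comparison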
class Editop: @@ -29,29 +35,33 @@ src_pos: int dest_pos: int - def __init__(self, tag: str, src_pos: int, dest_pos: int) -> None: ... + def __init__(self, tag: str, src_pos: int, dest_pos: int): ... def __len__(self) -> int: ... def __eq__(self, other: object) -> bool: ... - def __getitem__(self, i: int) -> Union[str, int]: ... + def __getitem__(self, i: int) -> int | str: ... + def __iter__(self) -> Iterator[int | str]: ... def __repr__(self) -> str: ... class Editops: + + _src_len: int + _dest_len: int + _editops: list[Editop] + def __init__( - self, editops: _AnyOpList = None, src_len: int = 0, dest_len: int = 0 - ) -> None: ... + self, + editops: _AnyOpList | None = None, + src_len: int = 0, + dest_len: int = 0, + ): ... @classmethod def from_opcodes(cls, opcodes: Opcodes) -> Editops: ... - def as_opcodes(self) -> Opcodes: ... - def as_matching_blocks(self) -> List[MatchingBlock]: ... - def as_list(self) -> List[Tuple[str, int, int]]: ... - def __eq__(self, other: object) -> bool: ... - def __len__(self) -> int: ... + def as_opcodes(self) -> Opcodes: ... + def as_matching_blocks(self) -> list[MatchingBlock]: ... + def as_list(self) -> list[Editop]: ... def copy(self) -> Editops: ... def inverse(self) -> Editops: ... - def remove_subsequence(self, subsequence: Editops) -> Editops: ... - def apply( - self, source_string: Union[str, bytes], destination_string: Union[str, bytes] - ) -> str: ... + def remove_subsequence(self, subsequence: Editops) -> Editops: ... + def apply(self, source_string: str, destination_string: str) -> str: ... @property def src_len(self) -> int: ... @src_len.setter @@ -60,8 +70,11 @@ class Editops: def dest_len(self) -> int: ... @dest_len.setter def dest_len(self, value: int) -> None: ... - def __delitem__(self, item: Union[int, slice]) -> None: ... - def __getitem__(self, key: Union[int, slice]) -> Editop: ... + def __eq__(self, other: object) -> bool: ... + def __len__(self) -> int: ... + def __delitem__(self, key: int | slice) -> None: ... + def __getitem__(self, key: int | slice) -> Editops | Editop: ... + def __iter__(self) -> Iterator[Editop]: ... def __repr__(self) -> str: ... class Opcode: @@ -73,28 +86,31 @@ class Opcode: def __init__( self, tag: str, src_start: int, src_end: int, dest_start: int, dest_end: int - ) -> None: ... + ): ... def __len__(self) -> int: ... def __eq__(self, other: object) -> bool: ... - def __getitem__(self, key: int) -> Union[str, int]: ... - def __repr__(self) -> str: ... + def __getitem__(self, i: int) -> int | str: ... + def __iter__(self) -> Iterator[int | str]: ... + def __repr__(self) -> str: ... class Opcodes: + _src_len: int + _dest_len: int + _opcodes: list[Opcode] + def __init__( - self, opcodes: _AnyOpList = None, src_len: int = 0, dest_len: int = 0 - ) -> None: ... + self, + opcodes: _AnyOpList | None = None, + src_len: int = 0, + dest_len: int = 0, + ): ... @classmethod def from_editops(cls, editops: Editops) -> Opcodes: ... def as_editops(self) -> Editops: ... - def as_matching_blocks(self) -> List[MatchingBlock]: ... - def as_list(self) -> List[Tuple[str, int, int, int, int]]: ... - def __eq__(self, other: object) -> bool: ... - def __len__(self) -> int: ... + def as_matching_blocks(self) -> list[MatchingBlock]: ... + def as_list(self) -> list[Opcode]: ... def copy(self) -> Opcodes: ... def inverse(self) -> Opcodes: ... - def apply( - self, source_string: Union[str, bytes], destination_string: Union[str, bytes] - ) -> str: ... + def apply(self, source_string: str, destination_string: str) -> str: ... @property def src_len(self) -> int: ... 
@src_len.setter @@ -103,11 +119,14 @@ class Opcodes: def dest_len(self) -> int: ... @dest_len.setter def dest_len(self, value: int) -> None: ... + def __eq__(self, other: object) -> bool: ... + def __len__(self) -> int: ... def __getitem__(self, key: int) -> Opcode: ... + def __iter__(self) -> Iterator[Opcode]: ... def __repr__(self) -> str: ... class ScoreAlignment: - score: Union[int, float] + score: int | float src_start: int src_end: int dest_start: int @@ -115,13 +134,14 @@ class ScoreAlignment: def __init__( self, - score: Union[int, float], + score: int | float, src_start: int, src_end: int, dest_start: int, dest_end: int, - ) -> None: ... + ): ... def __len__(self) -> int: ... def __eq__(self, other: object) -> bool: ... - def __getitem__(self, i: int) -> Union[int, float]: ... + def __getitem__(self, i: int) -> int | float: ... + def __iter__(self) -> Iterator[int | float]: ... def __repr__(self) -> str: ... diff --git a/src/rapidfuzz/distance/_initialize_cpp.pyx b/src/rapidfuzz/distance/_initialize_cpp.pyx index de0cb35c..ca202bde 100644 --- a/src/rapidfuzz/distance/_initialize_cpp.pyx +++ b/src/rapidfuzz/distance/_initialize_cpp.pyx @@ -1,11 +1,9 @@ # distutils: language=c++ # cython: language_level=3, binding=True, linetrace=True -from cpp_common cimport (EditType, RfEditOp, RfOpcode, convert_string, - is_valid_string) +from cpp_common cimport EditType, RfEditOp, RfOpcode, convert_string, is_valid_string from cpython.list cimport PyList_New, PyList_SET_ITEM -from cpython.pycapsule cimport (PyCapsule_GetPointer, PyCapsule_IsValid, - PyCapsule_New) +from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_IsValid, PyCapsule_New from cpython.ref cimport Py_INCREF from libc.stdint cimport int64_t, uint32_t from libc.stdlib cimport free, malloc @@ -87,11 +85,12 @@ cdef RfEditops list_to_editops(ops, size_t src_len, size_t dest_len) except *: result.emplace_back(edit_type, src_pos, dest_pos) # validate order of editops - for i in range(0, ops_len - 1): - if result[i + 1].src_pos < result[i].src_pos or result[i + 1].dest_pos < result[i].dest_pos: - raise ValueError("List of edit operations out of order") - if result[i + 1].src_pos == result[i].src_pos and result[i + 1].dest_pos == result[i].dest_pos: - raise ValueError("Duplicated edit operation") + if (result.size()): + for i in range(0, result.size() - 1): + if result[i + 1].src_pos < result[i].src_pos or result[i + 1].dest_pos < result[i].dest_pos: + raise ValueError("List of edit operations out of order") + if result[i + 1].src_pos == result[i].src_pos and result[i + 1].dest_pos == result[i].dest_pos: + raise ValueError("Duplicated edit operation") result.shrink_to_fit() return result @@ -178,6 +177,10 @@ cdef list opcodes_to_list(const RfOpcodes& ops): return result_list cdef class MatchingBlock: + """ + Triple describing matching subsequences + """ + cdef public size_t a cdef public size_t b cdef public size_t size @@ -191,13 +194,16 @@ cdef class MatchingBlock: return 3 def __eq__(self, other): - if len(other) != 3: + try: + if len(other) != 3: + return False + + return (other[0] == self.a + and other[1] == self.b + and other[2] == self.size) + except: return False - return (other[0] == self.a - and other[1] == self.b - and other[2] == self.size) - def __getitem__(self, Py_ssize_t i): if i==0 or i==-3: return self.a if i==1 or i==-2: return self.b @@ -205,6 +211,11 @@ cdef class MatchingBlock: raise IndexError('MatchingBlock index out of range') + def __iter__(self): + yield self.a + yield self.b + yield self.size + def 
__repr__(self): return f"MatchingBlock(a={self.a}, b={self.b}, size={self.size})" @@ -324,13 +335,16 @@ cdef class Editop: return 3 def __eq__(self, other): - if len(other) != 3: + try: + if len(other) != 3: + return False + + return (other[0] == self.tag + and other[1] == self.src_pos + and other[2] == self.dest_pos) + except: return False - return (other[0] == self.tag - and other[1] == self.src_pos - and other[2] == self.dest_pos) - def __getitem__(self, Py_ssize_t i): if i==0 or i==-3: return self.tag if i==1 or i==-2: return self.src_pos @@ -338,6 +352,11 @@ cdef class Editop: raise IndexError('Editop index out of range') + def __iter__(self): + yield self.tag + yield self.src_pos + yield self.dest_pos + def __repr__(self): return f"Editop(tag='{self.tag}', src_pos={self.src_pos}, dest_pos={self.dest_pos})" @@ -550,6 +569,15 @@ cdef class Editops: else: raise TypeError("Expected index or slice") + def __iter__(self): + cdef size_t i + for i in range(self.editops.size()): + yield Editop( + edit_type_to_str(self.editops[i].type), + self.editops[i].src_pos, + self.editops[i].dest_pos + ) + def __repr__(self): return "Editops([" + ", ".join(repr(op) for op in self) + f"], src_len={self.editops.get_src_len()}, dest_len={self.editops.get_dest_len()})" @@ -598,15 +626,18 @@ cdef class Opcode: return 5 def __eq__(self, other): - if len(other) != 5: + try: + if len(other) != 5: + return False + + return (other[0] == self.tag + and other[1] == self.src_start + and other[2] == self.src_end + and other[3] == self.dest_start + and other[4] == self.dest_end) + except: return False - return (other[0] == self.tag - and other[1] == self.src_start - and other[2] == self.src_end - and other[3] == self.dest_start - and other[4] == self.dest_end) - def __getitem__(self, Py_ssize_t i): if i==0 or i==-5: return self.tag if i==1 or i==-4: return self.src_start @@ -616,6 +647,13 @@ cdef class Opcode: raise IndexError('Opcode index out of range') + def __iter__(self): + yield self.tag + yield self.src_start + yield self.src_end + yield self.dest_start + yield self.dest_end + def __repr__(self): return f"Opcode(tag='{self.tag}', src_start={self.src_start}, src_end={self.src_end}, dest_start={self.dest_start}, dest_end={self.dest_end})" @@ -788,6 +826,17 @@ cdef class Opcodes: else: raise TypeError("Expected index") + def __iter__(self): + cdef size_t i + for i in range(self.opcodes.size()): + yield Opcode( + edit_type_to_str(self.opcodes[i].type), + self.opcodes[i].src_begin, + self.opcodes[i].src_end, + self.opcodes[i].dest_begin, + self.opcodes[i].dest_end + ) + def __repr__(self): return "Opcodes([" + ", ".join(repr(op) for op in self) + f"], src_len={self.opcodes.get_src_len()}, dest_len={self.opcodes.get_dest_len()})" @@ -811,15 +860,18 @@ cdef class ScoreAlignment: return 5 def __eq__(self, other): - if len(other) != 5: + try: + if len(other) != 5: + return False + + return (other[0] == self.score + and other[1] == self.src_start + and other[2] == self.src_end + and other[3] == self.dest_start + and other[4] == self.dest_end) + except: return False - return (other[0] == self.score - and other[1] == self.src_start - and other[2] == self.src_end - and other[3] == self.dest_start - and other[4] == self.dest_end) - def __getitem__(self, Py_ssize_t i): if i==0 or i==-5: return self.score if i==1 or i==-4: return self.src_start @@ -829,5 +881,12 @@ cdef class ScoreAlignment: raise IndexError('Opcode index out of range') + def __iter__(self): + yield self.score + yield self.src_start + yield self.src_end + yield 
self.dest_start + yield self.dest_end + def __repr__(self): return f"ScoreAlignment(score={self.score}, src_start={self.src_start}, src_end={self.src_end}, dest_start={self.dest_start}, dest_end={self.dest_end})" diff --git a/src/rapidfuzz/distance/_initialize_py.py b/src/rapidfuzz/distance/_initialize_py.py index 0174e7c8..09352caa 100644 --- a/src/rapidfuzz/distance/_initialize_py.py +++ b/src/rapidfuzz/distance/_initialize_py.py @@ -1,33 +1,156 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + +from typing import TYPE_CHECKING, Iterator, Sequence, Tuple, Union + + +def _list_to_editops( + ops: _AnyOpList | None, + src_len: int, + dest_len: int, +) -> list[Editop]: + if not ops: + return [] + + if len(ops[0]) == 5: + return Opcodes(ops, src_len, dest_len).as_editops()._editops + + blocks: list[Editop] = [] + for op in ops: + edit_type: str + src_pos: int + dest_pos: int + edit_type, src_pos, dest_pos = op # type: ignore[misc, assignment] + + if src_pos > src_len or dest_pos > dest_len: + raise ValueError("List of edit operations invalid") + + if src_pos == src_len and edit_type != "insert": + raise ValueError("List of edit operations invalid") + if dest_pos == dest_len and edit_type != "delete": + raise ValueError("List of edit operations invalid") + + # keep operations are not relevant in editops + if edit_type == "equal": + continue + + blocks.append(Editop(edit_type, src_pos, dest_pos)) + + # validate order of editops + for i in range(0, len(blocks) - 1): + if ( + blocks[i + 1].src_pos < blocks[i].src_pos + or blocks[i + 1].dest_pos < blocks[i].dest_pos + ): + raise ValueError("List of edit operations out of order") + if ( + blocks[i + 1].src_pos == blocks[i].src_pos + and blocks[i + 1].dest_pos == blocks[i].dest_pos + ): + raise ValueError("Duplicated edit operation") + + return blocks + + +def _list_to_opcodes( + ops: _AnyOpList | None, + src_len: int, + dest_len: int, +) -> list[Opcode]: + if not ops or len(ops[0]) == 3: + return Editops(ops, src_len, dest_len).as_opcodes()._opcodes + + blocks: list[Opcode] = [] + for op in ops: + edit_type: str + src_start: int + src_end: int + dest_start: int + dest_end: int + edit_type, src_start, src_end, dest_start, dest_end = op # type: ignore[misc, assignment] + + if src_end > src_len or dest_end > dest_len: + raise ValueError("List of edit operations invalid") + if src_end < src_start or dest_end < dest_start: + raise ValueError("List of edit operations invalid") + + if edit_type in {"equal", "replace"}: + if src_end - src_start != dest_end - dest_start or src_start == src_end: + raise ValueError("List of edit operations invalid") + if edit_type == "insert": + if src_start != src_end or dest_start == dest_end: + raise ValueError("List of edit operations invalid") + elif edit_type == "delete": + if src_start == src_end or dest_start != dest_end: + raise ValueError("List of edit operations invalid") + + # merge similar adjacent blocks + if blocks: + if ( + blocks[-1].tag == edit_type + and blocks[-1].src_end == src_start + and blocks[-1].dest_end == dest_start + ): + blocks[-1].src_end = src_end + blocks[-1].dest_end = dest_end + continue + + blocks.append(Opcode(edit_type, src_start, src_end, dest_start, dest_end)) + + # check if edit operations span the complete string + if blocks[0].src_start != 0 or blocks[0].dest_start != 0: + raise ValueError("List of edit operations does not start at position 0") + if blocks[-1].src_end != src_len or blocks[-1].dest_end != dest_len: + raise 
ValueError("List of edit operations does not end at the string ends") + for i in range(0, len(blocks) - 1): + if ( + blocks[i + 1].src_start != blocks[i].src_end + or blocks[i + 1].dest_start != blocks[i].dest_end + ): + raise ValueError("List of edit operations is not continuous") + + return blocks + class MatchingBlock: - def __init__(self, a, b, size): - self.a = a - self.b = b - self.size = size + """ + Triple describing matching subsequences + """ + + def __init__(self, a: int, b: int, size: int): + self.a: int = a + self.b: int = b + self.size: int = size - def __len__(self): + def __len__(self) -> int: return 3 - def __eq__(self, other): - if len(other) != 3: - return False + def __eq__(self, other: object) -> bool: + try: + if len(other) != 3: # type: ignore[arg-type] + return False - return other[0] == self.a and other[1] == self.b and other[2] == self.size + return bool(other[0] == self.a and other[1] == self.b and other[2] == self.size) # type: ignore[index] + except TypeError: + return False - def __getitem__(self, i): - if i == 0 or i == -3: + def __getitem__(self, i: int) -> int: + if i in {0, -3}: return self.a - if i == 1 or i == -2: + if i in {1, -2}: return self.b - if i == 2 or i == -1: + if i in {2, -1}: return self.size raise IndexError("MatchingBlock index out of range") - def __repr__(self): + def __iter__(self) -> Iterator[int]: + for i in range(3): + yield self[i] + + def __repr__(self) -> str: return f"MatchingBlock(a={self.a}, b={self.b}, size={self.size})" @@ -49,35 +172,42 @@ class Editop: +-----------+---------------------------------------------------+ """ - def __init__(self, tag, src_pos, dest_pos): - self.tag = tag - self.src_pos = src_pos - self.dest_pos = dest_pos + def __init__(self, tag: str, src_pos: int, dest_pos: int): + self.tag: str = tag + self.src_pos: int = src_pos + self.dest_pos: int = dest_pos - def __len__(self): + def __len__(self) -> int: return 3 - def __eq__(self, other): - if len(other) != 3: + def __eq__(self, other: object) -> bool: + try: + if len(other) != 3: # type: ignore[arg-type] + return False + + return bool( + other[0] == self.tag # type: ignore[index] + and other[1] == self.src_pos # type: ignore[index] + and other[2] == self.dest_pos # type: ignore[index] + ) + except TypeError: return False - return ( - other[0] == self.tag - and other[1] == self.src_pos - and other[2] == self.dest_pos - ) - - def __getitem__(self, i): - if i == 0 or i == -3: + def __getitem__(self, i: int) -> int | str: + if i in {0, -3}: return self.tag - if i == 1 or i == -2: + if i in {1, -2}: return self.src_pos - if i == 2 or i == -1: + if i in {2, -1}: return self.dest_pos raise IndexError("Editop index out of range") - def __repr__(self): + def __iter__(self) -> Iterator[int | str]: + for i in range(3): + yield self[i] + + def __repr__(self) -> str: return ( f"Editop(tag={self.tag}, src_pos={self.src_pos}, dest_pos={self.dest_pos})" ) @@ -88,11 +218,18 @@ class Editops: List like object of Editos describing how to turn s1 into s2. 
""" - def __init__(self, editops=None, src_len=0, dest_len=0): - raise NotImplementedError + def __init__( + self, + editops: _AnyOpList | None = None, + src_len: int = 0, + dest_len: int = 0, + ): + self._src_len: int = src_len + self._dest_len: int = dest_len + self._editops: list[Editop] = _list_to_editops(editops, src_len, dest_len) @classmethod - def from_opcodes(cls, opcodes): + def from_opcodes(cls, opcodes: Opcodes) -> Editops: """ Create Editops from Opcodes @@ -106,9 +243,9 @@ def from_opcodes(cls, opcodes): editops : Editops Opcodes converted to Editops """ - raise NotImplementedError + return opcodes.as_editops() - def as_opcodes(self): + def as_opcodes(self) -> Opcodes: """ Convert to Opcodes @@ -117,26 +254,114 @@ def as_opcodes(self): opcodes : Opcodes Editops converted to Opcodes """ - raise NotImplementedError - - def as_matching_blocks(self): - raise NotImplementedError + x = Opcodes.__new__(Opcodes) + x._src_len = self._src_len + x._dest_len = self._dest_len + blocks = [] + src_pos = 0 + dest_pos = 0 + i = 0 + while i < len(self._editops): + if ( + src_pos < self._editops[i].src_pos + or dest_pos < self._editops[i].dest_pos + ): + blocks.append( + Opcode( + "equal", + src_pos, + self._editops[i].src_pos, + dest_pos, + self._editops[i].dest_pos, + ) + ) + src_pos = self._editops[i].src_pos + dest_pos = self._editops[i].dest_pos + + src_begin = src_pos + dest_begin = dest_pos + tag = self._editops[i].tag + while ( + i < len(self._editops) + and self._editops[i].tag == tag + and src_pos == self._editops[i].src_pos + and dest_pos == self._editops[i].dest_pos + ): + if tag == "replace": + src_pos += 1 + dest_pos += 1 + elif tag == "insert": + dest_pos += 1 + elif tag == "delete": + src_pos += 1 + + i += 1 + + blocks.append(Opcode(tag, src_begin, src_pos, dest_begin, dest_pos)) + + if src_pos < self.src_len or dest_pos < self.dest_len: + blocks.append( + Opcode("equal", src_pos, self.src_len, dest_pos, self.dest_len) + ) + + x._opcodes = blocks + return x + + def as_matching_blocks(self) -> list[MatchingBlock]: + """ + Convert to matching blocks - def as_list(self): + Returns + ------- + matching blocks : list[MatchingBlock] + Editops converted to matching blocks + """ + blocks = [] + src_pos = 0 + dest_pos = 0 + for op in self: + if src_pos < op.src_pos or dest_pos < op.dest_pos: + length = min(op.src_pos - src_pos, op.dest_pos - dest_pos) + if length > 0: + blocks.append(MatchingBlock(src_pos, dest_pos, length)) + src_pos = op.src_pos + dest_pos = op.dest_pos + + if op.tag == "replace": + src_pos += 1 + dest_pos += 1 + elif op.tag == "delete": + src_pos += 1 + elif op.tag == "insert": + dest_pos += 1 + + if src_pos < self.src_len or dest_pos < self.dest_len: + length = min(self.src_len - src_pos, self.dest_len - dest_pos) + if length > 0: + blocks.append(MatchingBlock(src_pos, dest_pos, length)) + + blocks.append(MatchingBlock(self.src_len, self.dest_len, 0)) + return blocks + + def as_list(self) -> list[Editop]: """ Convert Editops to a list of tuples. This is the equivalent of ``[x for x in editops]`` """ - raise NotImplementedError + return self._editops - def copy(self): + def copy(self) -> Editops: """ performs copy of Editops """ - raise NotImplementedError + x = Editops.__new__(Editops) + x._src_len = self._src_len + x._dest_len = self._dest_len + x._editops = self._editops[::] + return x - def inverse(self): + def inverse(self) -> Editops: """ Invert Editops, so it describes how to transform the destination string to the source string. 
@@ -159,44 +384,112 @@ def inverse(self): Editop(tag=replace, src_pos=2, dest_pos=3), Editop(tag=delete, src_pos=3, dest_pos=4)] """ - raise NotImplementedError + blocks = [] + for op in self: + tag = op.tag + if tag == "delete": + tag = "insert" + elif tag == "insert": + tag = "delete" + + blocks.append(Editop(tag, op.dest_pos, op.src_pos)) + + x = Editops.__new__(Editops) + x._src_len = self.dest_len + x._dest_len = self.src_len + x._editops = blocks + return x + + def remove_subsequence(self, subsequence: Editops) -> Editops: + """ + remove a subsequence + + Parameters + ---------- + subsequence : Editops + subsequence to remove (has to be a subset of editops) - def remove_subsequence(self, subsequence): + Returns + ------- + sequence : Editops + a copy of the editops without the subsequence + """ raise NotImplementedError - def apply(self, source_string, destination_string): + def apply(self, source_string: str, destination_string: str) -> str: + """ + apply editops to source_string + + Parameters + ---------- + source_string : str | bytes + string to apply editops to + destination_string : str | bytes + string to use for replacements / insertions into source_string + + Returns + ------- + mod_string : str + modified source_string + + """ raise NotImplementedError @property - def src_len(self): - raise NotImplementedError + def src_len(self) -> int: + return self._src_len @src_len.setter - def src_len(self, value): - raise NotImplementedError + def src_len(self, value: int) -> None: + self._src_len = value @property - def dest_len(self): - raise NotImplementedError + def dest_len(self) -> int: + return self._dest_len @dest_len.setter - def dest_len(self, value): - raise NotImplementedError + def dest_len(self, value: int) -> None: + self._dest_len = value - def __eq__(self, other): - raise NotImplementedError + def __eq__(self, other: object) -> bool: + if not isinstance(other, Editops): + return False - def __len__(self): - raise NotImplementedError + return ( + self.dest_len == other.dest_len + and self.src_len == other.src_len + and self._editops == other._editops + ) - def __delitem__(self, item) -> None: - raise NotImplementedError + def __len__(self) -> int: + return len(self._editops) - def __getitem__(self, key): - raise NotImplementedError + def __delitem__(self, key: int | slice) -> None: + del self._editops[key] + + def __getitem__(self, key: int | slice) -> Editops | Editop: + if isinstance(key, int): + return self._editops[key] + + start, stop, step = key.indices(len(self._editops)) + if step < 0: + raise ValueError("step sizes below 0 lead to an invalid order of editops") + + x = Editops.__new__(Editops) + x._src_len = self._src_len + x._dest_len = self._dest_len + x._editops = self._editops[start:stop:step] + return x - def __repr__(self): - return "[" + ", ".join(repr(op) for op in self) + "]" + def __iter__(self) -> Iterator[Editop]: + yield from self._editops + + def __repr__(self) -> str: + return ( + "Editops([" + + ", ".join(repr(op) for op in self) + + f"], src_len={self.src_len}, dest_len={self.dest_len})" + ) class Opcode: @@ -228,44 +521,56 @@ class Opcode: interoperable """ - def __init__(self, tag, src_start, src_end, dest_start, dest_end): - self.tag = tag - self.src_start = src_start - self.src_end = src_end - self.dest_start = dest_start - self.dest_end = dest_end + def __init__( + self, tag: str, src_start: int, src_end: int, dest_start: int, dest_end: int + ): + self.tag: str = tag + self.src_start: int = src_start + self.src_end: int = src_end + 
self.dest_start: int = dest_start + self.dest_end: int = dest_end - def __len__(self): + def __len__(self) -> int: return 5 - def __eq__(self, other): - if len(other) != 5: + def __eq__(self, other: object) -> bool: + try: + if len(other) != 5: # type: ignore[arg-type] + return False + + return bool( + other[0] == self.tag # type: ignore[index] + and other[1] == self.src_start # type: ignore[index] + and other[2] == self.src_end # type: ignore[index] + and other[3] == self.dest_start # type: ignore[index] + and other[4] == self.dest_end # type: ignore[index] + ) + except TypeError: return False - return ( - other[0] == self.tag - and other[1] == self.src_start - and other[2] == self.src_end - and other[3] == self.dest_start - and other[4] == self.dest_end - ) - - def __getitem__(self, i): - if i == 0 or i == -5: + def __getitem__(self, i: int) -> int | str: + if i in {0, -5}: return self.tag - if i == 1 or i == -4: + if i in {1, -4}: return self.src_start - if i == 2 or i == -3: + if i in {2, -3}: return self.src_end - if i == 3 or i == -2: + if i in {3, -2}: return self.dest_start - if i == 4 or i == -1: + if i in {4, -1}: return self.dest_end raise IndexError("Opcode index out of range") - def __repr__(self): - return f"Opcode(tag={self.tag}, src_start={self.src_start}, src_end={self.src_end}, dest_start={self.dest_start}, dest_end={self.dest_end})" + def __iter__(self) -> Iterator[int | str]: + for i in range(5): + yield self[i] + + def __repr__(self) -> str: + return ( + f"Opcode(tag={self.tag}, src_start={self.src_start}, src_end={self.src_end}, " + f"dest_start={self.dest_start}, dest_end={self.dest_end})" + ) class Opcodes: @@ -276,11 +581,18 @@ class Opcodes: and likewise for dest_start == the previous dest_end. """ - def __init__(self, opcodes=None, src_len=0, dest_len=0): - raise NotImplementedError + def __init__( + self, + opcodes: _AnyOpList | None = None, + src_len: int = 0, + dest_len: int = 0, + ): + self._src_len: int = src_len + self._dest_len: int = dest_len + self._opcodes: list[Opcode] = _list_to_opcodes(opcodes, src_len, dest_len) @classmethod - def from_editops(cls, editops): + def from_editops(cls, editops: Editops) -> Opcodes: """ Create Opcodes from Editops @@ -294,9 +606,9 @@ def from_editops(cls, editops): opcodes : Opcodes Editops converted to Opcodes """ - raise NotImplementedError + return editops.as_opcodes() - def as_editops(self): + def as_editops(self) -> Editops: """ Convert Opcodes to Editops @@ -305,27 +617,65 @@ def as_editops(self): editops : Editops Opcodes converted to Editops """ - raise NotImplementedError + x = Editops.__new__(Editops) + x._src_len = self._src_len + x._dest_len = self._dest_len + blocks = [] + for op in self: + if op.tag == "replace": + for j in range(op.src_end - op.src_start): + blocks.append( + Editop("replace", op.src_start + j, op.dest_start + j) + ) + elif op.tag == "insert": + for j in range(op.dest_end - op.dest_start): + blocks.append(Editop("insert", op.src_start, op.dest_start + j)) + elif op.tag == "delete": + for j in range(op.src_end - op.src_start): + blocks.append(Editop("delete", op.src_start + j, op.dest_start)) + + x._editops = blocks + return x + + def as_matching_blocks(self) -> list[MatchingBlock]: + """ + Convert to matching blocks - def as_matching_blocks(self): - raise NotImplementedError + Returns + ------- + matching blocks : list[MatchingBlock] + Opcodes converted to matching blocks + """ + blocks = [] + for op in self: + if op.tag == "equal": + length = min(op.src_end - op.src_start, op.dest_end - 
op.dest_start) + if length > 0: + blocks.append(MatchingBlock(op.src_start, op.dest_start, length)) + + blocks.append(MatchingBlock(self.src_len, self.dest_len, 0)) + return blocks - def as_list(self): + def as_list(self) -> list[Opcode]: """ Convert Opcodes to a list of tuples, which is compatible with the opcodes of difflibs SequenceMatcher. This is the equivalent of ``[x for x in opcodes]`` """ - raise NotImplementedError + return self._opcodes[::] - def copy(self): + def copy(self) -> Opcodes: """ performs copy of Opcodes """ - raise NotImplementedError + x = Opcodes.__new__(Opcodes) + x._src_len = self._src_len + x._dest_len = self._dest_len + x._opcodes = self._opcodes[::] + return x - def inverse(self): + def inverse(self) -> Opcodes: """ Invert Opcodes, so it describes how to transform the destination string to the source string. @@ -350,38 +700,87 @@ def inverse(self): Opcode(tag=replace, src_start=2, src_end=3, dest_start=3, dest_end=4), Opcode(tag=delete, src_start=3, src_end=4, dest_start=4, dest_end=4)] """ - raise NotImplementedError + blocks = [] + for op in self: + tag = op.tag + if tag == "delete": + tag = "insert" + elif tag == "insert": + tag = "delete" + + blocks.append( + Opcode(tag, op.dest_start, op.dest_end, op.src_start, op.src_end) + ) + + x = Opcodes.__new__(Opcodes) + x._src_len = self.dest_len + x._dest_len = self.src_len + x._opcodes = blocks + return x + + def apply(self, source_string: str, destination_string: str) -> str: + """ + apply opcodes to source_string + + Parameters + ---------- + source_string : str | bytes + string to apply opcodes to + destination_string : str | bytes + string to use for replacements / insertions into source_string - def apply(self, source_string, destination_string): + Returns + ------- + mod_string : str + modified source_string + + """ raise NotImplementedError @property - def src_len(self): - raise NotImplementedError + def src_len(self) -> int: + return self._src_len @src_len.setter - def src_len(self, value): - raise NotImplementedError + def src_len(self, value: int) -> None: + self._src_len = value @property - def dest_len(self): - raise NotImplementedError + def dest_len(self) -> int: + return self._dest_len @dest_len.setter - def dest_len(self, value): - raise NotImplementedError + def dest_len(self, value: int) -> None: + self._dest_len = value - def __eq__(self, other): - raise NotImplementedError + def __eq__(self, other: object) -> bool: + if not isinstance(other, Opcodes): + return False - def __len__(self): - raise NotImplementedError + return ( + self.dest_len == other.dest_len + and self.src_len == other.src_len + and self._opcodes == other._opcodes + ) - def __getitem__(self, key): - raise NotImplementedError + def __len__(self) -> int: + return len(self._opcodes) + + def __getitem__(self, key: int) -> Opcode: + if isinstance(key, int): + return self._opcodes[key] - def __repr__(self): - return "[" + ", ".join(repr(op) for op in self) + "]" + raise TypeError("Expected index") + + def __iter__(self) -> Iterator[Opcode]: + yield from self._opcodes + + def __repr__(self) -> str: + return ( + "Opcodes([" + + ", ".join(repr(op) for op in self) + + f"], src_len={self.src_len}, dest_len={self.dest_len})" + ) class ScoreAlignment: @@ -393,41 +792,65 @@ class ScoreAlignment: src[src_start:src_end] and dest[dest_start:dest_end] """ - def __init__(self, score, src_start, src_end, dest_start, dest_end): - self.score = score - self.src_start = src_start - self.src_end = src_end - self.dest_start = dest_start - 
self.dest_end = dest_end - - def __len__(self): + def __init__( + self, + score: int | float, + src_start: int, + src_end: int, + dest_start: int, + dest_end: int, + ): + self.score: int | float = score + self.src_start: int = src_start + self.src_end: int = src_end + self.dest_start: int = dest_start + self.dest_end: int = dest_end + + def __len__(self) -> int: return 5 - def __eq__(self, other): - if len(other) != 5: + def __eq__(self, other: object) -> bool: + try: + if len(other) != 5: # type: ignore[arg-type] + return False + + return bool( + other[0] == self.score # type: ignore[index] + and other[1] == self.src_start # type: ignore[index] + and other[2] == self.src_end # type: ignore[index] + and other[3] == self.dest_start # type: ignore[index] + and other[4] == self.dest_end # type: ignore[index] + ) + except TypeError: return False - return ( - other[0] == self.score - and other[1] == self.src_start - and other[2] == self.src_end - and other[3] == self.dest_start - and other[4] == self.dest_end - ) - - def __getitem__(self, i): - if i == 0 or i == -5: + def __getitem__(self, i: int) -> int | float: + if i in {0, -5}: return self.score - if i == 1 or i == -4: + if i in {1, -4}: return self.src_start - if i == 2 or i == -3: + if i in {2, -3}: return self.src_end - if i == 3 or i == -2: + if i in {3, -2}: return self.dest_start - if i == 4 or i == -1: + if i in {4, -1}: return self.dest_end raise IndexError("ScoreAlignment index out of range") - def __repr__(self): - return f"ScoreAlignment(score={self.score}, src_start={self.src_start}, src_end={self.src_end}, dest_start={self.dest_start}, dest_end={self.dest_end})" + def __iter__(self) -> Iterator[int | float]: + for i in range(5): + yield self[i] + + def __repr__(self) -> str: + return ( + f"ScoreAlignment(score={self.score}, src_start={self.src_start}, " + f"src_end={self.src_end}, dest_start={self.dest_start}, dest_end={self.dest_end})" + ) + + +if TYPE_CHECKING: + _AnyOpList = Union[ + Sequence[Union[Editop, Tuple[str, int, int]]], + Sequence[Union[Opcode, Tuple[str, int, int, int, int]]], + ] diff --git a/src/rapidfuzz/distance/metrics_cpp.pyi b/src/rapidfuzz/distance/metrics_cpp.pyi index d5a2e60b..1597a9a2 100644 --- a/src/rapidfuzz/distance/metrics_cpp.pyi +++ b/src/rapidfuzz/distance/metrics_cpp.pyi @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import Callable, Hashable, Optional, Sequence, Tuple, TypeVar from rapidfuzz.distance import Editops, Opcodes @@ -12,7 +14,7 @@ def levenshtein_distance( *, weights: Optional[Tuple[int, int, int]] = (1, 1, 1), processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> int: ... def levenshtein_normalized_distance( s1: _S1, s2: _S2, *, weights: Optional[Tuple[int, int, int]] = (1, 1, 1), processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def levenshtein_similarity( s1: _S1, s2: _S2, *, weights: Optional[Tuple[int, int, int]] = (1, 1, 1), processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> int: ... 
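These stubs follow the two cutoff conventions used throughout the metrics: the plain distance functions take an integer score_cutoff and report anything beyond it as score_cutoff + 1, while the normalized variants take a float. A minimal usage sketch through the public Levenshtein module; the example strings are illustrative, not part of this patch:

# Sketch: integer cutoff for plain distances, float cutoff for normalized ones.
from rapidfuzz.distance import Levenshtein

assert Levenshtein.distance("lewenstein", "levenshtein") == 2
# the real distance 2 exceeds the cutoff of 1, so the result saturates at score_cutoff + 1
assert Levenshtein.distance("lewenstein", "levenshtein", score_cutoff=1) == 2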
def levenshtein_normalized_similarity( s1: _S1, @@ -36,49 +38,49 @@ def levenshtein_normalized_similarity( *, weights: Optional[Tuple[int, int, int]] = (1, 1, 1), processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def levenshtein_editops( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_hint: Optional[int] = None + score_hint: Optional[int] = None, ) -> Editops: ... def levenshtein_opcodes( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_hint: Optional[int] = None + score_hint: Optional[int] = None, ) -> Opcodes: ... def indel_distance( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> int: ... def indel_normalized_distance( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def indel_similarity( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> int: ... def indel_normalized_similarity( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def indel_editops( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None @@ -91,28 +93,28 @@ def lcs_seq_distance( s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> int: ... def lcs_seq_normalized_distance( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def lcs_seq_similarity( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> int: ... def lcs_seq_normalized_similarity( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def lcs_seq_editops( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None @@ -125,28 +127,28 @@ def hamming_distance( s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> int: ... def hamming_normalized_distance( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def hamming_similarity( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> int: ... def hamming_normalized_similarity( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def hamming_editops( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None @@ -159,84 +161,84 @@ def damerau_levenshtein_distance( s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> int: ... 
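With the pure Python Editops/Opcodes in place, the levenshtein_editops and levenshtein_opcodes stubs above are backed by both implementations. A quick round-trip sketch, assuming the public Levenshtein module (the strings are illustrative):

# Sketch: Editops and Opcodes describe the same alignment and convert losslessly.
from rapidfuzz.distance import Levenshtein

ops = Levenshtein.editops("qabxcd", "abycdf")
assert ops.as_opcodes().as_editops() == ops
assert ops.as_opcodes() == Levenshtein.opcodes("qabxcd", "abycdf")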
def damerau_levenshtein_normalized_distance( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def damerau_levenshtein_similarity( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> int: ... def damerau_levenshtein_normalized_similarity( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def osa_distance( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> int: ... def osa_normalized_distance( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def osa_similarity( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> int: ... def osa_normalized_similarity( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def jaro_distance( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> float: ... def jaro_normalized_distance( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def jaro_similarity( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> float: ... def jaro_normalized_similarity( s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def jaro_winkler_distance( s1: _S1, @@ -244,7 +246,7 @@ def jaro_winkler_distance( *, prefix_weight: float = 0.1, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> float: ... def jaro_winkler_normalized_distance( s1: _S1, @@ -252,7 +254,7 @@ def jaro_winkler_normalized_distance( *, prefix_weight: float = 0.1, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... def jaro_winkler_similarity( s1: _S1, @@ -260,7 +262,7 @@ def jaro_winkler_similarity( *, prefix_weight: float = 0.1, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[int] = None + score_cutoff: Optional[int] = None, ) -> float: ... def jaro_winkler_normalized_similarity( s1: _S1, @@ -268,5 +270,5 @@ def jaro_winkler_normalized_similarity( *, prefix_weight: float = 0.1, processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + score_cutoff: Optional[float] = 0, ) -> float: ... 
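The Jaro-Winkler stubs expose prefix_weight, which scales the bonus granted for a common prefix of up to four characters; with a weight of 0 the score reduces to plain Jaro. A small sketch (illustrative values, not from the patch):

# Sketch: a shared prefix can only raise the Jaro-Winkler score above plain Jaro.
from rapidfuzz.distance import Jaro, JaroWinkler

plain = Jaro.similarity("news", "newspaper")
boosted = JaroWinkler.similarity("news", "newspaper", prefix_weight=0.1)
assert boosted >= plain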
diff --git a/src/rapidfuzz/distance/metrics_cpp.pyx b/src/rapidfuzz/distance/metrics_cpp.pyx index fdd1c2dc..41cf3242 100644 --- a/src/rapidfuzz/distance/metrics_cpp.pyx +++ b/src/rapidfuzz/distance/metrics_cpp.pyx @@ -3,13 +3,19 @@ from ._initialize_cpp import Editops -from rapidfuzz_capi cimport (RF_SCORER_FLAG_MULTI_STRING_CALL, - RF_SCORER_FLAG_MULTI_STRING_INIT, - RF_SCORER_FLAG_RESULT_F64, - RF_SCORER_FLAG_RESULT_I64, - RF_SCORER_FLAG_SYMMETRIC, RF_Kwargs, - RF_Preprocess, RF_Scorer, RF_ScorerFlags, - RF_ScorerFunc, RF_String) +from rapidfuzz_capi cimport ( + RF_SCORER_FLAG_MULTI_STRING_CALL, + RF_SCORER_FLAG_MULTI_STRING_INIT, + RF_SCORER_FLAG_RESULT_F64, + RF_SCORER_FLAG_RESULT_I64, + RF_SCORER_FLAG_SYMMETRIC, + RF_Kwargs, + RF_Preprocess, + RF_Scorer, + RF_ScorerFlags, + RF_ScorerFunc, + RF_String, +) from ._initialize_cpp cimport Editops, RfEditops @@ -17,8 +23,12 @@ from ._initialize_cpp cimport Editops, RfEditops from array import array -from cpp_common cimport (CreateScorerContext, NoKwargsInit, RF_StringWrapper, - preprocess_strings) +from cpp_common cimport ( + CreateScorerContext, + NoKwargsInit, + RF_StringWrapper, + preprocess_strings, +) from cpython.pycapsule cimport PyCapsule_New from libc.stdint cimport INT64_MAX, int64_t from libc.stdlib cimport free, malloc diff --git a/src/rapidfuzz/fuzz.py b/src/rapidfuzz/fuzz.py index 4ad20d85..7633109f 100644 --- a/src/rapidfuzz/fuzz.py +++ b/src/rapidfuzz/fuzz.py @@ -1,17 +1,44 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2021 Max Bachmann +from __future__ import annotations + +from typing import Any, Callable + from rapidfuzz._utils import fallback_import as _fallback_import + +def _get_scorer_flags_fuzz(**_kwargs: Any) -> dict[str, Any]: + return {"optimal_score": 100, "worst_score": 0, "flags": (1 << 5)} + + +_fuzz_attribute: dict[str, Callable[..., dict[str, Any]]] = { + "get_scorer_flags": _get_scorer_flags_fuzz +} + _mod = "rapidfuzz.fuzz" -ratio = _fallback_import(_mod, "ratio") -partial_ratio = _fallback_import(_mod, "partial_ratio") -partial_ratio_alignment = _fallback_import(_mod, "partial_ratio_alignment") -token_sort_ratio = _fallback_import(_mod, "token_sort_ratio") -token_set_ratio = _fallback_import(_mod, "token_set_ratio") -token_ratio = _fallback_import(_mod, "token_ratio") -partial_token_sort_ratio = _fallback_import(_mod, "partial_token_sort_ratio") -partial_token_set_ratio = _fallback_import(_mod, "partial_token_set_ratio") -partial_token_ratio = _fallback_import(_mod, "partial_token_ratio") -WRatio = _fallback_import(_mod, "WRatio") -QRatio = _fallback_import(_mod, "QRatio") +ratio = _fallback_import(_mod, "ratio", cached_scorer_call=_fuzz_attribute) +partial_ratio = _fallback_import( + _mod, "partial_ratio", cached_scorer_call=_fuzz_attribute +) +partial_ratio_alignment = _fallback_import( + _mod, "partial_ratio_alignment", cached_scorer_call=_fuzz_attribute +) +token_sort_ratio = _fallback_import( + _mod, "token_sort_ratio", cached_scorer_call=_fuzz_attribute +) +token_set_ratio = _fallback_import( + _mod, "token_set_ratio", cached_scorer_call=_fuzz_attribute +) +token_ratio = _fallback_import(_mod, "token_ratio", cached_scorer_call=_fuzz_attribute) +partial_token_sort_ratio = _fallback_import( + _mod, "partial_token_sort_ratio", cached_scorer_call=_fuzz_attribute +) +partial_token_set_ratio = _fallback_import( + _mod, "partial_token_set_ratio", cached_scorer_call=_fuzz_attribute +) +partial_token_ratio = _fallback_import( + _mod, "partial_token_ratio", cached_scorer_call=_fuzz_attribute +) +WRatio = 
_fallback_import(_mod, "WRatio", cached_scorer_call=_fuzz_attribute) +QRatio = _fallback_import(_mod, "QRatio", cached_scorer_call=_fuzz_attribute) diff --git a/src/rapidfuzz/fuzz.pyi b/src/rapidfuzz/fuzz.pyi index 9951fec3..e0582aa1 100644 --- a/src/rapidfuzz/fuzz.pyi +++ b/src/rapidfuzz/fuzz.pyi @@ -1,86 +1,87 @@ -from typing import Callable, Hashable, Optional, Sequence, TypeVar +# SPDX-License-Identifier: MIT +# Copyright (C) 2021 Max Bachmann + +from __future__ import annotations + +from typing import Any, Callable, Hashable, Sequence from rapidfuzz.distance import ScoreAlignment from rapidfuzz.utils import default_process -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - def ratio( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... def partial_ratio( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, ) -> float: ... def partial_ratio_alignment( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> Optional[ScoreAlignment]: ... + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = 0, +) -> ScoreAlignment | None: ... def token_sort_ratio( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = default_process, + score_cutoff: float | None = 0, ) -> float: ... def token_set_ratio( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = default_process, + score_cutoff: float | None = 0, ) -> float: ... def token_ratio( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = default_process, + score_cutoff: float | None = 0, ) -> float: ... def partial_token_sort_ratio( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = default_process, + score_cutoff: float | None = 0, ) -> float: ... def partial_token_set_ratio( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = default_process, + score_cutoff: float | None = 0, ) -> float: ... 
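For context, fallback_import resolves each scorer to the compiled extension when it is importable and to the pure Python module otherwise, while cached_scorer_call attaches the scorer flags to the fallback. A simplified sketch of the dispatch idea (not the actual helper, which lives in rapidfuzz._utils; only the RAPIDFUZZ_IMPLEMENTATION environment variable is taken from the test suite):

# Simplified sketch of the fallback pattern, not the real implementation.
import importlib
import os

def fallback_import(module: str, name: str):
    impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
    if impl != "python":
        try:  # prefer the C++ implementation when available
            return getattr(importlib.import_module(module + "_cpp"), name)
        except ImportError:
            if impl == "cpp":
                raise
    return getattr(importlib.import_module(module + "_py"), name)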
def partial_token_ratio( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = default_process, + score_cutoff: float | None = 0, ) -> float: ... def WRatio( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = default_process, + score_cutoff: float | None = 0, ) -> float: ... def QRatio( - s1: _S1, - s2: _S2, + s1: Sequence[Hashable], + s2: Sequence[Hashable], *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 + processor: Callable[..., Sequence[Hashable]] | None = default_process, + score_cutoff: float | None = 0, ) -> float: ... diff --git a/src/rapidfuzz/fuzz_cpp.pyx b/src/rapidfuzz/fuzz_cpp.pyx index f3f9b00a..75c37aef 100644 --- a/src/rapidfuzz/fuzz_cpp.pyx +++ b/src/rapidfuzz/fuzz_cpp.pyx @@ -3,10 +3,16 @@ from .distance._initialize_cpp import ScoreAlignment -from rapidfuzz_capi cimport (RF_SCORER_FLAG_MULTI_STRING_INIT, - RF_SCORER_FLAG_RESULT_F64, - RF_SCORER_FLAG_SYMMETRIC, RF_Kwargs, RF_Scorer, - RF_ScorerFlags, RF_ScorerFunc, RF_String) +from rapidfuzz_capi cimport ( + RF_SCORER_FLAG_MULTI_STRING_INIT, + RF_SCORER_FLAG_RESULT_F64, + RF_SCORER_FLAG_SYMMETRIC, + RF_Kwargs, + RF_Scorer, + RF_ScorerFlags, + RF_ScorerFunc, + RF_String, +) # required for preprocess_strings @@ -14,9 +20,15 @@ from array import array from rapidfuzz.utils import default_process -from cpp_common cimport (AddScorerContext, CreateScorerContext, - CreateScorerContextPy, NoKwargsInit, RF_StringWrapper, - RfScoreAlignment, preprocess_strings) +from cpp_common cimport ( + AddScorerContext, + CreateScorerContext, + CreateScorerContextPy, + NoKwargsInit, + RF_StringWrapper, + RfScoreAlignment, + preprocess_strings, +) from libc.stdint cimport int64_t, uint32_t from libcpp cimport bool diff --git a/src/rapidfuzz/fuzz_py.py b/src/rapidfuzz/fuzz_py.py index b9364a4a..41a73d91 100644 --- a/src/rapidfuzz/fuzz_py.py +++ b/src/rapidfuzz/fuzz_py.py @@ -1,7 +1,11 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + from math import ceil +from typing import Callable, Hashable +from rapidfuzz.distance import ScoreAlignment from rapidfuzz.distance.Indel_py import ( _block_normalized_similarity as indel_block_normalized_similarity, ) @@ -11,23 +15,27 @@ ) from rapidfuzz.utils_py import default_process -from .distance._initialize_py import ScoreAlignment - -def _norm_distance(dist, lensum, score_cutoff): +def _norm_distance(dist: int, lensum: int, score_cutoff: float) -> float: score = (100 - 100 * dist / lensum) if lensum else 100 return score if score >= score_cutoff else 0 -def ratio(s1, s2, *, processor=None, score_cutoff=None): +def ratio( + s1: str | bytes, + s2: str | bytes, + *, + processor: Callable[..., str | bytes] | None | bool = None, + score_cutoff: float | None = None, +) -> float: """ Calculates the normalized Indel similarity. Parameters ---------- - s1 : Sequence[Hashable] + s1 : str | bytes First string to compare. - s2 : Sequence[Hashable] + s2 : str | bytes Second string to compare. 
processor: callable, optional Optional callable that is used to preprocess the strings before @@ -55,9 +63,6 @@ def ratio(s1, s2, *, processor=None, score_cutoff=None): >>> fuzz.ratio("this is a test", "this is a test!") 96.55171966552734 """ - if s1 is None or s2 is None: - return 0 - if processor is True: processor = default_process elif processor is False: @@ -72,7 +77,9 @@ def ratio(s1, s2, *, processor=None, score_cutoff=None): return score * 100 -def _partial_ratio_short_needle(s1, s2, score_cutoff): +def _partial_ratio_short_needle( + s1: str | bytes, s2: str | bytes, score_cutoff: float +) -> ScoreAlignment: """ implementation of partial_ratio for needles <= 64. assumes s1 is already the shorter string @@ -83,7 +90,7 @@ def _partial_ratio_short_needle(s1, s2, score_cutoff): res = ScoreAlignment(0, 0, len1, 0, len1) - block = {} + block: dict[Hashable, int] = {} block_get = block.get x = 1 for ch1 in s1: @@ -145,16 +152,22 @@ def _partial_ratio_short_needle(s1, s2, score_cutoff): return res -def partial_ratio(s1, s2, *, processor=None, score_cutoff=None): +def partial_ratio( + s1: str | bytes, + s2: str | bytes, + *, + processor: Callable[..., str | bytes] | None | bool = None, + score_cutoff: float | None = None, +) -> float: """ Searches for the optimal alignment of the shorter string in the longer string and returns the fuzz.ratio for this alignment. Parameters ---------- - s1 : Sequence[Hashable] + s1 : str | bytes First string to compare. - s2 : Sequence[Hashable] + s2 : str | bytes Second string to compare. processor: callable, optional Optional callable that is used to preprocess the strings before @@ -210,9 +223,6 @@ def partial_ratio(s1, s2, *, processor=None, score_cutoff=None): >>> fuzz.partial_ratio("this is a test", "this is a test!") 100.0 """ - if s1 is None or s2 is None: - return 0 - if processor is True: processor = default_process elif processor is False: @@ -238,7 +248,13 @@ def partial_ratio(s1, s2, *, processor=None, score_cutoff=None): return _partial_ratio_short_needle(shorter, longer, score_cutoff / 100).score -def partial_ratio_alignment(s1, s2, *, processor=None, score_cutoff=None): +def partial_ratio_alignment( + s1: str | bytes, + s2: str | bytes, + *, + processor: Callable[..., str | bytes] | None | bool = None, + score_cutoff: float | None = None, +) -> ScoreAlignment | None: """ Searches for the optimal alignment of the shorter string in the longer string and returns the fuzz.ratio and the corresponding @@ -246,9 +262,9 @@ def partial_ratio_alignment(s1, s2, *, processor=None, score_cutoff=None): Parameters ---------- - s1 : Sequence[Hashable] + s1 : str | bytes First string to compare. - s2 : Sequence[Hashable] + s2 : str | bytes Second string to compare. 
processor: callable, optional Optional callable that is used to preprocess the strings before @@ -276,9 +292,6 @@ def partial_ratio_alignment(s1, s2, *, processor=None, score_cutoff=None): >>> fuzz.ratio(s1[res.src_start:res.src_end], s2[res.dest_start:res.dest_end]) 83.33333333333334 """ - if s1 is None or s2 is None: - return None - if processor is True: processor = default_process elif processor is False: @@ -307,21 +320,27 @@ def partial_ratio_alignment(s1, s2, *, processor=None, score_cutoff=None): if len(s1) <= len(s2): return res - else: - return ScoreAlignment( - res.score, res.dest_start, res.dest_end, res.src_start, res.src_end - ) + return ScoreAlignment( + res.score, res.dest_start, res.dest_end, res.src_start, res.src_end + ) -def token_sort_ratio(s1, s2, *, processor=default_process, score_cutoff=None): + +def token_sort_ratio( + s1: str, + s2: str, + *, + processor: Callable[..., str] | None | bool = default_process, + score_cutoff: float | None = None, +) -> float: """ Sorts the words in the strings and calculates the fuzz.ratio between them Parameters ---------- - s1 : Sequence[Hashable] + s1 : str First string to compare. - s2 : Sequence[Hashable] + s2 : str Second string to compare. processor: callable, optional Optional callable that is used to preprocess the strings before @@ -345,9 +364,6 @@ def token_sort_ratio(s1, s2, *, processor=default_process, score_cutoff=None): >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear") 100.0 """ - if s1 is None or s2 is None: - return 0 - if processor is True: processor = default_process elif processor is False: @@ -362,16 +378,22 @@ def token_sort_ratio(s1, s2, *, processor=default_process, score_cutoff=None): return ratio(sorted_s1, sorted_s2, score_cutoff=score_cutoff) -def token_set_ratio(s1, s2, *, processor=default_process, score_cutoff=None): +def token_set_ratio( + s1: str, + s2: str, + *, + processor: Callable[..., str] | None | bool = default_process, + score_cutoff: float | None = None, +) -> float: """ Compares the words in the strings based on unique and common words between them using fuzz.ratio Parameters ---------- - s1 : Sequence[Hashable] + s1 : str First string to compare. - s2 : Sequence[Hashable] + s2 : str Second string to compare. 
processor: callable, optional Optional callable that is used to preprocess the strings before @@ -397,9 +419,6 @@ def token_set_ratio(s1, s2, *, processor=default_process, score_cutoff=None): >>> fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear") 100.0 """ - if s1 is None or s2 is None: - return 0 - if processor is True: processor = default_process elif processor is False: @@ -439,7 +458,7 @@ def token_set_ratio(s1, s2, *, processor=default_process, score_cutoff=None): sect_ab_len = sect_len + (sect_len != 0) + ab_len sect_ba_len = sect_len + (sect_len != 0) + ba_len - result = 0 + result = 0.0 cutoff_distance = ceil((sect_ab_len + sect_ba_len) * (1 - score_cutoff / 100)) dist = indel_distance(diff_ab_joined, diff_ba_joined, score_cutoff=cutoff_distance) @@ -462,16 +481,22 @@ def token_set_ratio(s1, s2, *, processor=default_process, score_cutoff=None): return max(result, sect_ab_ratio, sect_ba_ratio) -def token_ratio(s1, s2, *, processor=default_process, score_cutoff=None): +def token_ratio( + s1: str, + s2: str, + *, + processor: Callable[..., str] | None | bool = default_process, + score_cutoff: float | None = None, +) -> float: """ Helper method that returns the maximum of fuzz.token_set_ratio and fuzz.token_sort_ratio (faster than manually executing the two functions) Parameters ---------- - s1 : Sequence[Hashable] + s1 : str First string to compare. - s2 : Sequence[Hashable] + s2 : str Second string to compare. processor: callable, optional Optional callable that is used to preprocess the strings before @@ -490,9 +515,6 @@ def token_ratio(s1, s2, *, processor=default_process, score_cutoff=None): ----- .. image:: img/token_ratio.svg """ - if s1 is None or s2 is None: - return 0 - if processor is True: processor = default_process elif processor is False: @@ -509,15 +531,21 @@ def token_ratio(s1, s2, *, processor=default_process, score_cutoff=None): ) -def partial_token_sort_ratio(s1, s2, *, processor=default_process, score_cutoff=None): +def partial_token_sort_ratio( + s1: str, + s2: str, + *, + processor: Callable[..., str] | None | bool = default_process, + score_cutoff: float | None = None, +) -> float: """ sorts the words in the strings and calculates the fuzz.partial_ratio between them Parameters ---------- - s1 : Sequence[Hashable] + s1 : str First string to compare. - s2 : Sequence[Hashable] + s2 : str Second string to compare. processor: callable, optional Optional callable that is used to preprocess the strings before @@ -536,9 +564,6 @@ def partial_token_sort_ratio(s1, s2, *, processor=default_process, score_cutoff= ----- .. image:: img/partial_token_sort_ratio.svg """ - if s1 is None or s2 is None: - return 0 - if processor is True: processor = default_process elif processor is False: @@ -553,16 +578,22 @@ def partial_token_sort_ratio(s1, s2, *, processor=default_process, score_cutoff= return partial_ratio(sorted_s1, sorted_s2, score_cutoff=score_cutoff) -def partial_token_set_ratio(s1, s2, *, processor=default_process, score_cutoff=None): +def partial_token_set_ratio( + s1: str, + s2: str, + *, + processor: Callable[..., str] | None | bool = default_process, + score_cutoff: float | None = None, +) -> float: """ Compares the words in the strings based on unique and common words between them using fuzz.partial_ratio Parameters ---------- - s1 : Sequence[Hashable] + s1 : str First string to compare. - s2 : Sequence[Hashable] + s2 : str Second string to compare. 
processor: callable, optional Optional callable that is used to preprocess the strings before @@ -581,9 +612,6 @@ def partial_token_set_ratio(s1, s2, *, processor=default_process, score_cutoff=N ----- .. image:: img/partial_token_set_ratio.svg """ - if s1 is None or s2 is None: - return 0 - if processor is True: processor = default_process elif processor is False: @@ -609,16 +637,22 @@ def partial_token_set_ratio(s1, s2, *, processor=default_process, score_cutoff=N return partial_ratio(diff_ab, diff_ba, score_cutoff=score_cutoff) -def partial_token_ratio(s1, s2, *, processor=default_process, score_cutoff=None): +def partial_token_ratio( + s1: str, + s2: str, + *, + processor: Callable[..., str] | None | bool = default_process, + score_cutoff: float | None = None, +) -> float: """ Helper method that returns the maximum of fuzz.partial_token_set_ratio and fuzz.partial_token_sort_ratio (faster than manually executing the two functions) Parameters ---------- - s1 : Sequence[Hashable] + s1 : str First string to compare. - s2 : Sequence[Hashable] + s2 : str Second string to compare. processor: callable, optional Optional callable that is used to preprocess the strings before @@ -637,15 +671,12 @@ def partial_token_ratio(s1, s2, *, processor=default_process, score_cutoff=None) ----- .. image:: img/partial_token_ratio.svg """ - if s1 is None or s2 is None: - return 0 - if processor is True: processor = default_process elif processor is False: processor = None - if processor is not None and processor: + if processor is not None: s1 = processor(s1) s2 = processor(s2) @@ -685,15 +716,21 @@ def partial_token_ratio(s1, s2, *, processor=default_process, score_cutoff=None) ) -def WRatio(s1, s2, *, processor=default_process, score_cutoff=None): +def WRatio( + s1: str, + s2: str, + *, + processor: Callable[..., str] | None | bool = default_process, + score_cutoff: float | None = None, +) -> float: """ Calculates a weighted ratio based on the other ratio algorithms Parameters ---------- - s1 : Sequence[Hashable] + s1 : str First string to compare. - s2 : Sequence[Hashable] + s2 : str Second string to compare. processor: callable, optional Optional callable that is used to preprocess the strings before @@ -714,9 +751,6 @@ def WRatio(s1, s2, *, processor=default_process, score_cutoff=None): """ UNBASE_SCALE = 0.95 - if s1 is None or s2 is None: - return 0 - if processor is True: processor = default_process elif processor is False: @@ -762,7 +796,13 @@ def WRatio(s1, s2, *, processor=default_process, score_cutoff=None): ) -def QRatio(s1, s2, *, processor=default_process, score_cutoff=None): +def QRatio( + s1: str | bytes, + s2: str | bytes, + *, + processor: Callable[..., str | bytes] | None | bool = default_process, + score_cutoff: float | None = None, +) -> float: """ Calculates a quick ratio between two strings using fuzz.ratio. The only difference to fuzz.ratio is, that this preprocesses @@ -770,9 +810,9 @@ def QRatio(s1, s2, *, processor=default_process, score_cutoff=None): Parameters ---------- - s1 : Sequence[Hashable] + s1 : str | bytes First string to compare. - s2 : Sequence[Hashable] + s2 : str | bytes Second string to compare. 
processor: callable, optional Optional callable that is used to preprocess the strings before @@ -792,9 +832,6 @@ def QRatio(s1, s2, *, processor=default_process, score_cutoff=None): >>> fuzz.QRatio("this is a test", "THIS is a test!") 100.0 """ - if s1 is None or s2 is None: - return 0 - if processor is True: processor = default_process elif processor is False: @@ -809,28 +846,3 @@ def QRatio(s1, s2, *, processor=default_process, score_cutoff=None): return 0 return ratio(s1, s2, score_cutoff=score_cutoff) - - -def _GetScorerFlagsSimilarity(**kwargs): - return {"optimal_score": 100, "worst_score": 0, "flags": (1 << 5)} - - -ratio._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsSimilarity} - -partial_ratio._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsSimilarity} - -token_sort_ratio._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsSimilarity} - -partial_token_sort_ratio._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsSimilarity} - -token_set_ratio._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsSimilarity} - -partial_token_set_ratio._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsSimilarity} - -token_ratio._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsSimilarity} - -partial_token_ratio._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsSimilarity} - -WRatio._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsSimilarity} - -QRatio._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsSimilarity} diff --git a/src/rapidfuzz/fuzz_py.pyi b/src/rapidfuzz/fuzz_py.pyi deleted file mode 100644 index 633b3cfe..00000000 --- a/src/rapidfuzz/fuzz_py.pyi +++ /dev/null @@ -1,105 +0,0 @@ -from typing import Any, Callable, Dict, Hashable, Optional, Sequence, TypeVar - -from typing_extensions import Protocol - -from rapidfuzz.distance import ScoreAlignment -from rapidfuzz.utils import default_process - -class _ScorerAttributes(Protocol): - _RF_ScorerPy: Dict - -def _attr_decorator(func: Any) -> _ScorerAttributes: - return func - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -@_attr_decorator -def ratio( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def partial_ratio( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def partial_ratio_alignment( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> Optional[ScoreAlignment]: ... -@_attr_decorator -def token_sort_ratio( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def token_set_ratio( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def token_ratio( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def partial_token_sort_ratio( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def partial_token_set_ratio( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 -) -> float: ... 
-@_attr_decorator -def partial_token_ratio( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def WRatio( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 -) -> float: ... -@_attr_decorator -def QRatio( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = default_process, - score_cutoff: Optional[float] = 0 -) -> float: ... diff --git a/src/rapidfuzz/process.py b/src/rapidfuzz/process.py index 6defd442..190156bc 100644 --- a/src/rapidfuzz/process.py +++ b/src/rapidfuzz/process.py @@ -1,10 +1,12 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + from rapidfuzz._utils import fallback_import as _fallback_import _mod = "rapidfuzz.process" extract = _fallback_import(_mod, "extract") -extractOne = _fallback_import(_mod, "extractOne") +extractOne = _fallback_import(_mod, "extractOne") # noqa: N816 extract_iter = _fallback_import(_mod, "extract_iter") cdist = _fallback_import(_mod, "cdist") diff --git a/src/rapidfuzz/process.pyi b/src/rapidfuzz/process.pyi index c4c0ac94..da6fcf14 100644 --- a/src/rapidfuzz/process.pyi +++ b/src/rapidfuzz/process.pyi @@ -18,7 +18,6 @@ from typing import ( from rapidfuzz.fuzz import WRatio, ratio _StringType = Sequence[Hashable] -_AnyStringType = TypeVar("_AnyStringType", bound=_StringType) _S1 = TypeVar("_S1") _S2 = TypeVar("_S2") _ResultType = Union[int, float] diff --git a/src/rapidfuzz/process_cpp.py b/src/rapidfuzz/process_cpp.py index 4b48485a..2ccd81aa 100644 --- a/src/rapidfuzz/process_cpp.py +++ b/src/rapidfuzz/process_cpp.py @@ -1,6 +1,11 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann -from rapidfuzz.fuzz import ratio as _ratio + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Callable, Collection, Hashable, Sequence + +from rapidfuzz.fuzz import ratio from rapidfuzz.process_cpp_impl import FLOAT32 as _FLOAT32 from rapidfuzz.process_cpp_impl import FLOAT64 as _FLOAT64 from rapidfuzz.process_cpp_impl import INT8 as _INT8 @@ -12,12 +17,15 @@ from rapidfuzz.process_cpp_impl import UINT32 as _UINT32 from rapidfuzz.process_cpp_impl import UINT64 as _UINT64 from rapidfuzz.process_cpp_impl import cdist as _cdist -from rapidfuzz.process_cpp_impl import extract as extract -from rapidfuzz.process_cpp_impl import extract_iter as extract_iter -from rapidfuzz.process_cpp_impl import extractOne as extractOne +from rapidfuzz.process_cpp_impl import extract, extract_iter, extractOne + +__all__ = ["extract", "extract_iter", "extractOne", "cdist"] + +if TYPE_CHECKING: + import numpy as np -def _dtype_to_type_num(dtype): +def _dtype_to_type_num(dtype: np.dtype | None) -> int | None: import numpy as np if dtype is None: @@ -47,16 +55,16 @@ def _dtype_to_type_num(dtype): def cdist( - queries, - choices, + queries: Collection[Sequence[Hashable] | None], + choices: Collection[Sequence[Hashable] | None], *, - scorer=_ratio, - processor=None, - score_cutoff=None, - dtype=None, - workers=1, - **kwargs -): + scorer: Callable[..., int | float] = ratio, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | float | None = None, + dtype: np.dtype | None = None, + workers: int = 1, + **kwargs: Any, +) -> np.ndarray: import numpy as np dtype = _dtype_to_type_num(dtype) @@ -69,6 +77,6 @@ def cdist( score_cutoff=score_cutoff, 
dtype=dtype, workers=workers, - **kwargs + **kwargs, ) ) diff --git a/src/rapidfuzz/process_cpp_impl.pyx b/src/rapidfuzz/process_cpp_impl.pyx index c7982af0..a05b6a07 100644 --- a/src/rapidfuzz/process_cpp_impl.pyx +++ b/src/rapidfuzz/process_cpp_impl.pyx @@ -5,9 +5,14 @@ from rapidfuzz.fuzz import WRatio, ratio from rapidfuzz.utils import default_process cimport cython -from cpp_common cimport (PyObjectWrapper, RF_KwargsWrapper, RF_StringWrapper, - conv_sequence, get_score_cutoff_f64, - get_score_cutoff_i64) +from cpp_common cimport ( + PyObjectWrapper, + RF_KwargsWrapper, + RF_StringWrapper, + conv_sequence, + get_score_cutoff_f64, + get_score_cutoff_i64, +) from cpython cimport Py_buffer from cpython.buffer cimport PyBUF_F_CONTIGUOUS, PyBUF_ND, PyBUF_SIMPLE from cpython.exc cimport PyErr_CheckSignals @@ -24,11 +29,18 @@ import heapq from array import array from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_IsValid -from rapidfuzz_capi cimport (RF_SCORER_FLAG_RESULT_F64, - RF_SCORER_FLAG_RESULT_I64, - RF_SCORER_FLAG_SYMMETRIC, RF_Kwargs, - RF_Preprocess, RF_Preprocessor, RF_Scorer, - RF_ScorerFlags, RF_ScorerFunc, RF_String) +from rapidfuzz_capi cimport ( + RF_SCORER_FLAG_RESULT_F64, + RF_SCORER_FLAG_RESULT_I64, + RF_SCORER_FLAG_SYMMETRIC, + RF_Kwargs, + RF_Preprocess, + RF_Preprocessor, + RF_Scorer, + RF_ScorerFlags, + RF_ScorerFunc, + RF_String, +) cdef extern from "process_cpp.hpp": diff --git a/src/rapidfuzz/process_py.py b/src/rapidfuzz/process_py.py index d30d204c..d15110ba 100644 --- a/src/rapidfuzz/process_py.py +++ b/src/rapidfuzz/process_py.py @@ -1,13 +1,28 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + import heapq +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Collection, + Hashable, + Iterable, + Mapping, + Sequence, + overload, +) from rapidfuzz.fuzz import WRatio, ratio from rapidfuzz.utils import default_process +__all__ = ["extract", "extract_iter", "extractOne", "cdist"] + -def _get_scorer_flags_py(scorer, kwargs): +def _get_scorer_flags_py(scorer: Any, kwargs: dict[str, Any]) -> tuple[int, int]: params = getattr(scorer, "_RF_ScorerPy", None) if params is not None: flags = params["get_scorer_flags"](**kwargs) @@ -15,15 +30,42 @@ def _get_scorer_flags_py(scorer, kwargs): return (0, 100) +@overload +def extract_iter( + query: Sequence[Hashable] | None, + choices: Iterable[Sequence[Hashable] | None], + *, + scorer: Callable[..., int | float] = WRatio, + processor: Callable[..., Sequence[Hashable]] | None | bool = None, + score_cutoff: int | float | None = None, + **kwargs: Any, +) -> Iterable[tuple[Sequence[Hashable], int | float, int]]: + ... + + +@overload +def extract_iter( + query: Sequence[Hashable] | None, + choices: Mapping[Any, Sequence[Hashable] | None], + *, + scorer: Callable[..., int | float] = WRatio, + processor: Callable[..., Sequence[Hashable]] | None | bool = None, + score_cutoff: int | float | None = None, + **kwargs: Any, +) -> Iterable[tuple[Sequence[Hashable], int | float, Any]]: + ... 
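The paired extract_iter overloads encode that the third element of each yielded triple depends on the choices container: the enumerate index for plain iterables, the mapping key for dict-like inputs. Illustrative usage (scores abbreviated, not from the test suite):

# Sketch: the result key is a list index for iterables, the key for mappings.
from rapidfuzz import process

next(iter(process.extract_iter("spam", ["spam", "eggs"])))
# -> ("spam", 100.0, 0)
next(iter(process.extract_iter("spam", {"a": "spam", "b": "eggs"})))
# -> ("spam", 100.0, "a")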
+ + def extract_iter( - query, - choices, + query: Sequence[Hashable] | None, + choices: Iterable[Sequence[Hashable] | None] + | Mapping[Any, Sequence[Hashable] | None], *, - scorer=WRatio, - processor=default_process, - score_cutoff=None, - **kwargs -): + scorer: Callable[..., int | float] = WRatio, + processor: Callable[..., Sequence[Hashable]] | None | bool = default_process, + score_cutoff: int | float | None = None, + **kwargs: Any, +) -> Iterable[tuple[Sequence[Hashable], int | float, Any]]: """ Find the best match in a list of choices @@ -95,7 +137,8 @@ def extract_iter( if processor is not None: query = processor(query) - choices_iter = choices.items() if hasattr(choices, "items") else enumerate(choices) + choices_iter: Iterable[tuple[Any, Sequence[Hashable] | None]] + choices_iter = choices.items() if hasattr(choices, "items") else enumerate(choices) # type: ignore[union-attr] for key, choice in choices_iter: if choice is None: continue @@ -110,7 +153,7 @@ def extract_iter( processor(choice), processor=None, score_cutoff=score_cutoff, - **kwargs + **kwargs, ) if lowest_score_worst: @@ -121,15 +164,42 @@ def extract_iter( yield (choice, score, key) +@overload def extractOne( - query, - choices, + query: Sequence[Hashable] | None, + choices: Iterable[Sequence[Hashable] | None], *, - scorer=WRatio, - processor=default_process, - score_cutoff=None, - **kwargs -): + scorer: Callable[..., int | float] = WRatio, + processor: Callable[..., Sequence[Hashable]] | None | bool = None, + score_cutoff: int | float | None = None, + **kwargs: Any, +) -> tuple[Sequence[Hashable], int | float, int] | None: + ... + + +@overload +def extractOne( + query: Sequence[Hashable] | None, + choices: Mapping[Any, Sequence[Hashable] | None], + *, + scorer: Callable[..., int | float] = WRatio, + processor: Callable[..., Sequence[Hashable]] | None | bool = None, + score_cutoff: int | float | None = None, + **kwargs: Any, +) -> tuple[Sequence[Hashable], int | float, Any] | None: + ... + + +def extractOne( + query: Sequence[Hashable] | None, + choices: Iterable[Sequence[Hashable] | None] + | Mapping[Any, Sequence[Hashable] | None], + *, + scorer: Callable[..., int | float] = WRatio, + processor: Callable[..., Sequence[Hashable]] | None | bool = default_process, + score_cutoff: int | float | None = None, + **kwargs: Any, +) -> tuple[Sequence[Hashable], int | float, Any] | None: """ Find the best match in a list of choices. When multiple elements have the same similarity, the first element is returned. 
@@ -264,9 +334,10 @@ def extractOne( if processor is not None: query = processor(query) - result = None + result: tuple[Sequence[Hashable], int | float, Any] | None = None - choices_iter = choices.items() if hasattr(choices, "items") else enumerate(choices) + choices_iter: Iterable[tuple[Any, Sequence[Hashable] | None]] + choices_iter = choices.items() if hasattr(choices, "items") else enumerate(choices) # type: ignore[union-attr] for key, choice in choices_iter: if choice is None: continue @@ -281,7 +352,7 @@ def extractOne( processor(choice), processor=None, score_cutoff=score_cutoff, - **kwargs + **kwargs, ) if lowest_score_worst: @@ -299,16 +370,45 @@ def extractOne( return result +@overload def extract( - query, - choices, + query: Sequence[Hashable] | None, + choices: Collection[Sequence[Hashable] | None], *, - scorer=WRatio, - processor=default_process, - limit=5, - score_cutoff=None, - **kwargs -): + scorer: Callable[..., int | float] = WRatio, + processor: Callable[..., Sequence[Hashable]] | None | bool = None, + limit: int | None = None, + score_cutoff: int | float | None = None, + **kwargs: Any, +) -> list[tuple[Sequence[Hashable], int | float, int]]: + ... + + +@overload +def extract( + query: Sequence[Hashable] | None, + choices: Mapping[Any, Sequence[Hashable] | None], + *, + scorer: Callable[..., int | float] = WRatio, + processor: Callable[..., Sequence[Hashable]] | None | bool = None, + limit: int | None = None, + score_cutoff: int | float | None = None, + **kwargs: Any, +) -> list[tuple[Sequence[Hashable], int | float, Any]]: + ... + + +def extract( + query: Sequence[Hashable] | None, + choices: Collection[Sequence[Hashable] | None] + | Mapping[Any, Sequence[Hashable] | None], + *, + scorer: Callable[..., int | float] = WRatio, + processor: Callable[..., Sequence[Hashable]] | None | bool = default_process, + limit: int | None = 5, + score_cutoff: int | float | None = None, + **kwargs: Any, +) -> list[tuple[Sequence[Hashable], int | float, Any]]: """ Find the best matches in a list of choices. The list is sorted by the similarity. When multiple choices have the same similarity, they are sorted by their index @@ -379,11 +479,18 @@ def extract( ) if lowest_score_worst: return heapq.nlargest(limit, result_iter, key=lambda i: i[1]) - else: - return heapq.nsmallest(limit, result_iter, key=lambda i: i[1]) + return heapq.nsmallest(limit, result_iter, key=lambda i: i[1]) + + +if TYPE_CHECKING: + import numpy as np -def _dtype_to_type_num(dtype, scorer, **kwargs): +def _dtype_to_type_num( + dtype: np.dtype | None, + scorer: Callable[..., int | float], + **kwargs: dict[str, Any], +) -> np.dtype: import numpy as np if dtype is not None: @@ -394,23 +501,22 @@ def _dtype_to_type_num(dtype, scorer, **kwargs): flags = params["get_scorer_flags"](**kwargs) if flags["flags"] & (1 << 6): return np.int32 - else: - return np.float32 + return np.float32 return np.float32 def cdist( - queries, - choices, + queries: Collection[Sequence[Hashable] | None], + choices: Collection[Sequence[Hashable] | None], *, - scorer=ratio, - processor=None, - score_cutoff=None, - dtype=None, - workers=1, - **kwargs -): + scorer: Callable[..., int | float] = ratio, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: int | float | None = None, + dtype: np.dtype | None = None, + workers: int = 1, + **kwargs: Any, +) -> np.ndarray: """ Compute distance/similarity between each pair of the two collections of inputs. 
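A short sketch of the cdist contract documented here: the result is a len(queries) x len(choices) matrix of pairwise scores, matching the np.zeros allocation in the implementation below (example data is illustrative):

# Sketch: cdist scores every query against every choice.
from rapidfuzz import fuzz, process

queries = ["spam", "eggs"]
choices = ["spam", "bacon", "eggs"]
scores = process.cdist(queries, choices, scorer=fuzz.ratio)
assert scores.shape == (2, 3)
assert scores[0][0] == 100.0  # identical strings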
@@ -462,14 +568,14 @@ Returns a matrix of dtype with the distance/similarity between each pair of the two collections of inputs. """ import numpy as np dtype = _dtype_to_type_num(dtype, scorer, **kwargs) results = np.zeros((len(queries), len(choices)), dtype=dtype) if queries is choices: if processor is None: - proc_queries = queries + proc_queries = list(queries) else: proc_queries = [processor(x) for x in queries] @@ -483,20 +587,23 @@ proc_queries[j], processor=None, score_cutoff=score_cutoff, - **kwargs + **kwargs, ) else: if processor is None: - proc_queries = queries - proc_choices = choices + proc_choices = list(choices) else: - proc_queries = [processor(x) for x in queries] proc_choices = [processor(x) for x in choices] - for i, query in enumerate(proc_queries): + for i, query in enumerate(queries): + proc_query = processor(query) if processor else query for j, choice in enumerate(proc_choices): results[i, j] = scorer( - query, choice, processor=None, score_cutoff=score_cutoff, **kwargs + proc_query, + choice, + processor=None, + score_cutoff=score_cutoff, + **kwargs, ) return results diff --git a/src/rapidfuzz/string_metric.py b/src/rapidfuzz/string_metric.py index 36fbca2b..330e050c 100644 --- a/src/rapidfuzz/string_metric.py +++ b/src/rapidfuzz/string_metric.py @@ -1,14 +1,23 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + import warnings +from typing import Callable, Hashable, Sequence -from rapidfuzz.distance import Hamming, Jaro, JaroWinkler, Levenshtein +from rapidfuzz.distance import Editop, Hamming, Jaro, JaroWinkler, Levenshtein def levenshtein( - s1, s2, *, weights=(1, 1, 1), processor=None, max=None, score_cutoff=None -): + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + weights: tuple[int, int, int] | None = (1, 1, 1), + processor: Callable[..., Sequence[Hashable]] | None = None, + max: int | None = None, + score_cutoff: int | None = None, +) -> int: """ Calculates the minimum number of insertions, deletions, and substitutions required to change one sequence into the other according to Levenshtein with custom @@ -78,7 +87,12 @@ ) -def levenshtein_editops(s1, s2, *, processor=None): +def levenshtein_editops( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, +) -> list[Editop]: """ Return list of 3-tuples describing how to turn s1 into s2. Each tuple is of the form (tag, src_pos, dest_pos). @@ -125,8 +139,13 @@ def normalized_levenshtein( - s1, s2, *, weights=(1, 1, 1), processor=None, score_cutoff=None -): + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + weights: tuple[int, int, int] | None = (1, 1, 1), + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates a normalized levenshtein distance using custom costs for insertion, deletion and substitution. @@ -205,7 +224,14 @@ ) -def hamming(s1, s2, *, processor=None, max=None, score_cutoff=None): +def hamming( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + max: int | None = None, + score_cutoff: int | None = None, +) -> int: """ Calculates the Hamming distance between two strings. 
The hamming distance is defined as the number of positions @@ -250,7 +276,13 @@ def hamming(s1, s2, *, processor=None, max=None, score_cutoff=None): return Hamming.distance(s1, s2, processor=processor, score_cutoff=score_cutoff) -def normalized_hamming(s1, s2, *, processor=None, score_cutoff=None): +def normalized_hamming( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates a normalized hamming distance @@ -299,7 +331,13 @@ def normalized_hamming(s1, s2, *, processor=None, score_cutoff=None): ) -def jaro_similarity(s1, s2, *, processor=None, score_cutoff=None): +def jaro_similarity( + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates the jaro similarity @@ -335,8 +373,13 @@ def jaro_similarity(s1, s2, *, processor=None, score_cutoff=None): def jaro_winkler_similarity( - s1, s2, *, prefix_weight=0.1, processor=None, score_cutoff=None -): + s1: Sequence[Hashable], + s2: Sequence[Hashable], + *, + prefix_weight: float = 0.1, + processor: Callable[..., Sequence[Hashable]] | None = None, + score_cutoff: float | None = None, +) -> float: """ Calculates the jaro winkler similarity @@ -386,24 +429,3 @@ def jaro_winkler_similarity( ) * 100 ) - - -def _GetScorerFlagsDistance(**kwargs): - return {"optimal_score": 0, "worst_score": 2**63 - 1, "flags": (1 << 6)} - - -def _GetScorerFlagsSimilarity(**kwargs): - return {"optimal_score": 100, "worst_score": 0, "flags": (1 << 5)} - - -levenshtein._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsDistance} - -normalized_levenshtein._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsSimilarity} - -hamming._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsDistance} - -normalized_hamming._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsSimilarity} - -jaro_similarity._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsSimilarity} - -jaro_winkler_similarity._RF_ScorerPy = {"get_scorer_flags": _GetScorerFlagsSimilarity} diff --git a/src/rapidfuzz/string_metric.pyi b/src/rapidfuzz/string_metric.pyi deleted file mode 100644 index 0c028c6a..00000000 --- a/src/rapidfuzz/string_metric.pyi +++ /dev/null @@ -1,56 +0,0 @@ -from typing import Callable, Hashable, List, Optional, Sequence, Tuple, TypeVar - -_StringType = Sequence[Hashable] -_S1 = TypeVar("_S1") -_S2 = TypeVar("_S2") - -def levenshtein( - s1: _S1, - s2: _S2, - *, - weights: Optional[Tuple[int, int, int]] = (1, 1, 1), - processor: Optional[Callable[..., _StringType]] = None, - max: Optional[int] = None, - score_cutoff: Optional[int] = None -) -> int: ... -def normalized_levenshtein( - s1: _S1, - s2: _S2, - *, - weights: Optional[Tuple[int, int, int]] = (1, 1, 1), - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -def levenshtein_editops( - s1: _S1, s2: _S2, *, processor: Optional[Callable[..., _StringType]] = None -) -> List[Tuple[str, int, int]]: ... -def hamming( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - max: Optional[int] = None, - score_cutoff: Optional[int] = None -) -> int: ... -def normalized_hamming( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... 
-def jaro_similarity( - s1: _S1, - s2: _S2, - *, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... -def jaro_winkler_similarity( - s1: _S1, - s2: _S2, - *, - prefix_weight: float = 0.1, - processor: Optional[Callable[..., _StringType]] = None, - score_cutoff: Optional[float] = 0 -) -> float: ... diff --git a/src/rapidfuzz/utils.py b/src/rapidfuzz/utils.py index a9b06a8e..2f54a8bf 100644 --- a/src/rapidfuzz/utils.py +++ b/src/rapidfuzz/utils.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + from rapidfuzz._utils import fallback_import as _fallback_import default_process = _fallback_import("rapidfuzz.utils", "default_process") diff --git a/src/rapidfuzz/utils.pyi b/src/rapidfuzz/utils.pyi index 60c67976..3cfa4622 100644 --- a/src/rapidfuzz/utils.pyi +++ b/src/rapidfuzz/utils.pyi @@ -1,3 +1,8 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2022 Max Bachmann + +from __future__ import annotations + from typing import Hashable, Sequence def default_process(sentence: Sequence[Hashable]) -> Sequence[Hashable]: ... diff --git a/src/rapidfuzz/utils_cpp.pyx b/src/rapidfuzz/utils_cpp.pyx index 6c646267..bddfd333 100644 --- a/src/rapidfuzz/utils_cpp.pyx +++ b/src/rapidfuzz/utils_cpp.pyx @@ -1,12 +1,16 @@ # distutils: language=c++ # cython: language_level=3, binding=True, linetrace=True -from cpp_common cimport (conv_sequence, convert_string, hash_array, - hash_sequence, is_valid_string) +from cpp_common cimport ( + conv_sequence, + convert_string, + hash_array, + hash_sequence, + is_valid_string, +) from cpython.pycapsule cimport PyCapsule_New from libcpp cimport bool -from rapidfuzz_capi cimport (PREPROCESSOR_STRUCT_VERSION, RF_Preprocessor, - RF_String) +from rapidfuzz_capi cimport PREPROCESSOR_STRUCT_VERSION, RF_Preprocessor, RF_String from array import array diff --git a/src/rapidfuzz/utils_py.py b/src/rapidfuzz/utils_py.py index 36d27f0b..cafa6779 100644 --- a/src/rapidfuzz/utils_py.py +++ b/src/rapidfuzz/utils_py.py @@ -1,12 +1,14 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2022 Max Bachmann +from __future__ import annotations + import re _alnum_regex = re.compile(r"(?ui)\W") -def default_process(sentence): +def default_process(sentence: str) -> str: """ This function preprocesses a string by: diff --git a/tests/distance/test_Indel.py b/tests/distance/test_Indel.py index 111556dc..47dd8dd6 100644 --- a/tests/distance/test_Indel.py +++ b/tests/distance/test_Indel.py @@ -67,7 +67,7 @@ def test_different_strings(): assert Indel.normalized_similarity("aaaa", "bbbb") == 0.0 -def testIssue196(): +def test_issue_196(): """ Indel distance did not work correctly for score_cutoff=1 """ diff --git a/tests/distance/test_JaroWinkler.py b/tests/distance/test_JaroWinkler.py index 8583547f..872dd72d 100644 --- a/tests/distance/test_JaroWinkler.py +++ b/tests/distance/test_JaroWinkler.py @@ -2,8 +2,6 @@ import unittest -import pytest - from rapidfuzz.distance import JaroWinkler_cpp, JaroWinkler_py @@ -45,7 +43,10 @@ def test_edge_case_lengths(self): self._jaro_winkler_similarity(s2, s1, 0.95333) s1 = "00000000000000000000000000000000000000000000000000000000000000000" - s2 = "01000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000" + s2 = ( + "010000000000000000000000000000000000000000000000000000000000000000" + "00000000000000000000000000000000000000000000000000000000000000" + ) 
diff --git a/tests/distance/test_Indel.py b/tests/distance/test_Indel.py
index 111556dc..47dd8dd6 100644
--- a/tests/distance/test_Indel.py
+++ b/tests/distance/test_Indel.py
@@ -67,7 +67,7 @@ def test_different_strings():
     assert Indel.normalized_similarity("aaaa", "bbbb") == 0.0
 
 
-def testIssue196():
+def test_issue_196():
     """
     Indel distance did not work correctly for score_cutoff=1
     """
diff --git a/tests/distance/test_JaroWinkler.py b/tests/distance/test_JaroWinkler.py
index 8583547f..872dd72d 100644
--- a/tests/distance/test_JaroWinkler.py
+++ b/tests/distance/test_JaroWinkler.py
@@ -2,8 +2,6 @@
 
 import unittest
 
-import pytest
-
 from rapidfuzz.distance import JaroWinkler_cpp, JaroWinkler_py
 
 
@@ -45,7 +43,10 @@ def test_edge_case_lengths(self):
         self._jaro_winkler_similarity(s2, s1, 0.95333)
 
         s1 = "00000000000000000000000000000000000000000000000000000000000000000"
-        s2 = "01000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"
+        s2 = (
+            "010000000000000000000000000000000000000000000000000000000000000000"
+            "00000000000000000000000000000000000000000000000000000000000000"
+        )
         self._jaro_winkler_similarity(s2, s1, 0.85234)
diff --git a/tests/distance/test_OSA.py b/tests/distance/test_OSA.py
index d0b0cd65..71d5b281 100644
--- a/tests/distance/test_OSA.py
+++ b/tests/distance/test_OSA.py
@@ -2,7 +2,6 @@
 
 import unittest
 
-from rapidfuzz import process
 from rapidfuzz.distance import OSA as _OSA
 from rapidfuzz.distance import OSA_cpp, OSA_py
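Note: the test_init.py changes below parametrize each Editops/Opcodes test
over the new pure-Python backend (distance_py) and the Cython backend
(distance_cpp), so both are held to the same behavior. A minimal sketch of
the round-trip invariant these tests guard (public API as used in the tests;
the example strings are illustrative):

    from rapidfuzz.distance import Levenshtein

    ops = Levenshtein.editops("spam", "spotted")  # per-character edits
    codes = ops.as_opcodes()                      # difflib-style blocks

    # converting back and forth must not lose information
    assert codes.as_editops() == ops
    assert ops.src_len == 4 and ops.dest_len == 7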
diff --git a/tests/distance/test_init.py b/tests/distance/test_init.py
index 259fd7ba..93a72db2 100644
--- a/tests/distance/test_init.py
+++ b/tests/distance/test_init.py
@@ -7,14 +7,9 @@
 import pytest
 from hypothesis import given, settings
 
-from rapidfuzz.distance import (
-    Editop,
-    Editops,
-    Levenshtein,
-    MatchingBlock,
-    Opcode,
-    Opcodes,
-)
+import rapidfuzz.distance._initialize_cpp as distance_cpp
+import rapidfuzz.distance._initialize_py as distance_py
+from rapidfuzz.distance import Editops, Levenshtein, Opcodes
 
 
 def test_editops_comparison():
@@ -28,11 +23,12 @@ def test_editops_comparison():
     assert not (ops != ops.copy())
 
 
-def test_editops_get_index():
+@pytest.mark.parametrize("module", [distance_py, distance_cpp])
+def test_editops_get_index(module):
     """
     test __getitem__ with index of Editops
     """
-    ops = Editops(
+    ops = module.Editops(
         [
             ("delete", 1, 1),
             ("replace", 2, 1),
@@ -70,11 +66,12 @@ def test_editops_get_index():
         ops[-6]
 
 
-def test_editops_get_slice():
+@pytest.mark.parametrize("module", [distance_py, distance_cpp])
+def test_editops_get_slice(module):
     """
     test __getitem__ with slice of Editops
     """
-    ops = Editops(
+    ops = module.Editops(
         [
             ("delete", 1, 1),
             ("replace", 2, 1),
@@ -110,11 +107,12 @@ def test_editops_get_slice():
         ops[::-1]
 
 
-def test_editops_del_slice():
+@pytest.mark.parametrize("module", [distance_py, distance_cpp])
+def test_editops_del_slice(module):
    """
    test __delitem__ with slice of Editops
    """
-    ops = Editops(
+    ops = module.Editops(
         [
             ("delete", 1, 1),
             ("replace", 2, 1),
@@ -156,11 +154,12 @@ def del_test(key):
     del_test(slice(-4, -1, 2))
 
 
-def test_editops_inversion():
+@pytest.mark.parametrize("module", [distance_py, distance_cpp])
+def test_editops_inversion(module):
     """
     test correct inversion of Editops
     """
-    ops = Editops(
+    ops = module.Editops(
         [
             ("delete", 1, 1),
             ("replace", 2, 1),
@@ -192,11 +191,12 @@ def test_opcodes_comparison():
     assert not (ops != ops.copy())
 
 
-def test_opcode_get_index():
+@pytest.mark.parametrize("module", [distance_py, distance_cpp])
+def test_opcode_get_index(module):
     """
     test __getitem__ with index of Opcodes
     """
-    ops = Opcodes(
+    ops = module.Opcodes(
         [
             ("equal", 0, 1, 0, 1),
             ("delete", 1, 2, 1, 1),
@@ -238,11 +238,12 @@
         ops[-7]
 
 
-def test_opcode_inversion():
+@pytest.mark.parametrize("module", [distance_py, distance_cpp])
+def test_opcode_inversion(module):
     """
     test correct inversion of Opcodes
     """
-    ops = Opcodes(
+    ops = module.Opcodes(
         [
             ("equal", 0, 1, 0, 1),
             ("delete", 1, 2, 1, 1),
@@ -265,33 +266,35 @@
     ]
 
 
-def test_editops_empty():
+@pytest.mark.parametrize("module", [distance_py, distance_cpp])
+def test_opcodes_empty(module):
     """
-    test behavior of conversion between empty list and Editops
+    test behavior of conversion between empty list and Opcodes
     """
-    ops = Opcodes([], 0, 0)
+    ops = module.Opcodes([], 0, 0)
     assert ops.as_list() == []
     assert ops.src_len == 0
     assert ops.dest_len == 0
 
-    ops = Opcodes([], 0, 3)
+    ops = module.Opcodes([], 0, 3)
     assert ops.as_list() == [
-        Opcode(tag="equal", src_start=0, src_end=0, dest_start=0, dest_end=3)
+        module.Opcode(tag="equal", src_start=0, src_end=0, dest_start=0, dest_end=3)
     ]
     assert ops.src_len == 0
     assert ops.dest_len == 3
 
 
-def test_editops_empty():
+@pytest.mark.parametrize("module", [distance_py, distance_cpp])
+def test_editops_empty(module):
     """
-    test behavior of conversion between empty list and Opcodes
+    test behavior of conversion between empty list and Editops
     """
-    ops = Editops([], 0, 0)
+    ops = module.Editops([], 0, 0)
     assert ops.as_list() == []
     assert ops.src_len == 0
     assert ops.dest_len == 0
 
-    ops = Editops([], 0, 3)
+    ops = module.Editops([], 0, 3)
     assert ops.as_list() == []
     assert ops.src_len == 0
     assert ops.dest_len == 3
@@ -322,30 +325,49 @@ def test_list_initialization():
     assert ops.as_opcodes() == ops2
 
 
-def test_merge_adjacent_blocks():
+@pytest.mark.parametrize("module", [distance_py, distance_cpp])
+def test_merge_adjacent_blocks(module):
     """
     test whether adjacent blocks are merged
     """
-    ops1 = [Opcode(tag="equal", src_start=0, src_end=3, dest_start=0, dest_end=3)]
+    ops1 = [
+        module.Opcode(tag="equal", src_start=0, src_end=3, dest_start=0, dest_end=3)
+    ]
     ops2 = [
-        Opcode(tag="equal", src_start=0, src_end=1, dest_start=0, dest_end=1),
-        Opcode(tag="equal", src_start=1, src_end=3, dest_start=1, dest_end=3),
+        module.Opcode(tag="equal", src_start=0, src_end=1, dest_start=0, dest_end=1),
+        module.Opcode(tag="equal", src_start=1, src_end=3, dest_start=1, dest_end=3),
     ]
 
-    assert Opcodes(ops1, 3, 3) == Opcodes(ops2, 3, 3)
-    assert Opcodes(ops2, 3, 3) == Opcodes(ops2, 3, 3).as_editops().as_opcodes()
+    assert module.Opcodes(ops1, 3, 3) == module.Opcodes(ops2, 3, 3)
+    assert (
+        module.Opcodes(ops2, 3, 3)
+        == module.Opcodes(ops2, 3, 3).as_editops().as_opcodes()
+    )
 
 
-def test_empty_matching_blocks():
+@pytest.mark.parametrize("module", [distance_py, distance_cpp])
+def test_empty_matching_blocks(module):
     """
     test behavior for empty matching blocks
     """
-    assert Editops([], 0, 0).as_matching_blocks() == [MatchingBlock(a=0, b=0, size=0)]
-    assert Editops([], 0, 3).as_matching_blocks() == [MatchingBlock(a=0, b=3, size=0)]
-    assert Editops([], 3, 0).as_matching_blocks() == [MatchingBlock(a=3, b=0, size=0)]
+    assert module.Editops([], 0, 0).as_matching_blocks() == [
+        module.MatchingBlock(a=0, b=0, size=0)
+    ]
+    assert module.Editops([], 0, 3).as_matching_blocks() == [
+        module.MatchingBlock(a=0, b=3, size=0)
+    ]
+    assert module.Editops([], 3, 0).as_matching_blocks() == [
+        module.MatchingBlock(a=3, b=0, size=0)
+    ]
 
-    assert Opcodes([], 0, 0).as_matching_blocks() == [MatchingBlock(a=0, b=0, size=0)]
-    assert Opcodes([], 0, 3).as_matching_blocks() == [MatchingBlock(a=0, b=3, size=0)]
-    assert Opcodes([], 3, 0).as_matching_blocks() == [MatchingBlock(a=3, b=0, size=0)]
+    assert module.Opcodes([], 0, 0).as_matching_blocks() == [
+        module.MatchingBlock(a=0, b=0, size=0)
+    ]
+    assert module.Opcodes([], 0, 3).as_matching_blocks() == [
+        module.MatchingBlock(a=0, b=3, size=0)
+    ]
+    assert module.Opcodes([], 3, 0).as_matching_blocks() == [
+        module.MatchingBlock(a=3, b=0, size=0)
+    ]
 
 
 @given(s1=st.text(), s2=st.text())
diff --git a/tests/test_cpp_fallback.py b/tests/test_cpp_fallback.py
index 825f211a..6cb6aa13 100644
--- a/tests/test_cpp_fallback.py
+++ b/tests/test_cpp_fallback.py
@@ -2,4 +2,4 @@
 import os
 
 os.environ["RAPIDFUZZ_IMPLEMENTATION"] = "cpp"
-import rapidfuzz
+import rapidfuzz  # noqa: E402, F401
diff --git a/tests/test_fuzz.py b/tests/test_fuzz.py
index 1d8cdd82..e69af215 100644
--- a/tests/test_fuzz.py
+++ b/tests/test_fuzz.py
@@ -231,7 +231,10 @@ def testIssue196(self):
 
     def testIssue231(self):
         str1 = "er merkantilismus förderte handle und verkehr mit teils marktkonformen, teils dirigistischen maßnahmen."
-        str2 = "ils marktkonformen, teils dirigistischen maßnahmen. an der schwelle zum 19. jahrhundert entstand ein neu"
+        str2 = (
+            "ils marktkonformen, teils dirigistischen maßnahmen. "
+            "an der schwelle zum 19. jahrhundert entstand ein neu"
+        )
 
         alignment = fuzz.partial_ratio_alignment(str1, str2)
         self.assertEqual(alignment.src_start, 0)
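Note: testIssue231 above exercises fuzz.partial_ratio_alignment. A short
usage sketch (attribute names per the documented ScoreAlignment type;
the example strings are hypothetical and the concrete values depend on
the inputs):

    from rapidfuzz import fuzz

    # where does the best-scoring window of one string line up with the other?
    alignment = fuzz.partial_ratio_alignment("certitude", "aptitude")
    print(alignment.src_start, alignment.src_end)
    print(alignment.dest_start, alignment.dest_end)
    print(alignment.score)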
diff --git a/tests/test_hypothesis.py b/tests/test_hypothesis.py
index e9e10eda..948699ba 100644
--- a/tests/test_hypothesis.py
+++ b/tests/test_hypothesis.py
@@ -1,5 +1,4 @@
 import random
-from functools import partial
 from itertools import product
 from string import ascii_letters, digits, punctuation
 
@@ -112,20 +111,20 @@ def jarowinkler_similarity(*args, **kwargs):
     return sim1
 
 
-def jaro_similarity(P, T):
-    P_flag = [0] * (len(P) + 1)
-    T_flag = [0] * (len(T) + 1)
+def jaro_similarity(pattern, text):
+    P_flag = [0] * (len(pattern) + 1)
+    T_flag = [0] * (len(text) + 1)
 
-    Bound = max(len(P), len(T)) // 2
+    Bound = max(len(pattern), len(text)) // 2
     Bound = max(Bound - 1, 0)
 
     CommonChars = 0
-    for i in range(len(T)):
+    for i in range(len(text)):
         lowlim = i - Bound if i >= Bound else 0
-        hilim = i + Bound if i + Bound <= len(P) - 1 else len(P) - 1
+        hilim = i + Bound if i + Bound <= len(pattern) - 1 else len(pattern) - 1
         for j in range(lowlim, hilim + 1):
-            if not P_flag[j] and P[j] == T[i]:
+            if not P_flag[j] and pattern[j] == text[i]:
                 T_flag[i] = 1
                 P_flag[j] = 1
                 CommonChars += 1
@@ -136,39 +135,39 @@ def jaro_similarity(P, T):
     Transpositions = 0
     k = 0
 
-    for i in range(len(T)):
+    for i in range(len(text)):
         if T_flag[i]:
             j = k
-            while j < len(P):
+            while j < len(pattern):
                 if P_flag[j]:
                     k = j + 1
                     break
                 j += 1
 
-            if T[i] != P[j]:
+            if text[i] != pattern[j]:
                 Transpositions += 1
 
     Transpositions = Transpositions // 2
 
-    Sim = (
-        CommonChars / len(P)
-        + CommonChars / len(T)
+    sim = (
+        CommonChars / len(pattern)
+        + CommonChars / len(text)
         + (CommonChars - Transpositions) / CommonChars
     )
 
-    return Sim / 3
+    return sim / 3
 
 
-def jaro_winkler_similarity(P, T, prefix_weight=0.1):
-    min_len = min(len(P), len(T))
+def jaro_winkler_similarity(pattern, text, prefix_weight=0.1):
+    min_len = min(len(pattern), len(text))
     prefix = 0
     max_prefix = min(min_len, 4)
 
     while prefix < max_prefix:
-        if T[prefix] != P[prefix]:
+        if text[prefix] != pattern[prefix]:
             break
         prefix += 1
 
-    Sim = jaro_similarity(P, T)
+    Sim = jaro_similarity(pattern, text)
 
     if Sim > 0.7:
         Sim += prefix * prefix_weight * (1.0 - Sim)
diff --git a/tests/test_pure_python_fallback.py b/tests/test_pure_python_fallback.py
index ac409113..94c4b005 100644
--- a/tests/test_pure_python_fallback.py
+++ b/tests/test_pure_python_fallback.py
@@ -2,4 +2,4 @@
 import os
 
 os.environ["RAPIDFUZZ_IMPLEMENTATION"] = "python"
-import rapidfuzz
+import rapidfuzz  # noqa: E402, F401
diff --git a/tests/test_utils.py b/tests/test_utils.py
index ec42a65d..ead3f960 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,7 +2,7 @@
 
 import unittest
 
-from rapidfuzz import fuzz, process, utils
+from rapidfuzz import utils
 
 
 class UtilsTest(unittest.TestCase):
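Note: both fallback tests select a backend by setting RAPIDFUZZ_IMPLEMENTATION
before rapidfuzz is imported for the first time. The same switch works outside
the test suite; a minimal sketch (identical strings score 100 by definition of
the ratio):

    import os

    # must be set before the first import of rapidfuzz
    os.environ["RAPIDFUZZ_IMPLEMENTATION"] = "python"

    from rapidfuzz import fuzz  # noqa: E402

    # the pure-Python backend must agree with the C++ one
    assert fuzz.ratio("kitten", "kitten") == 100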